cld 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +27 -0
- data/Manifest +106 -0
- data/README.rdoc +173 -0
- data/Rakefile +15 -0
- data/base/basictypes.h +348 -0
- data/base/build_config.h +115 -0
- data/base/casts.h +156 -0
- data/base/commandlineflags.h +443 -0
- data/base/crash.h +41 -0
- data/base/dynamic_annotations.h +358 -0
- data/base/global_strip_options.h +59 -0
- data/base/log_severity.h +46 -0
- data/base/logging.h +1403 -0
- data/base/macros.h +243 -0
- data/base/port.h +54 -0
- data/base/scoped_ptr.h +428 -0
- data/base/stl_decl.h +0 -0
- data/base/stl_decl_msvc.h +107 -0
- data/base/string_util.h +29 -0
- data/base/strtoint.h +93 -0
- data/base/template_util.h +96 -0
- data/base/type_traits.h +198 -0
- data/base/vlog_is_on.h +143 -0
- data/build.sh +48 -0
- data/build.win.cmd +28 -0
- data/cld.gemspec +30 -0
- data/cld_encodings.h +95 -0
- data/encodings/compact_lang_det/#cldutil.cc# +905 -0
- data/encodings/compact_lang_det/#cldutil.h# +1205 -0
- data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
- data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
- data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
- data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
- data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
- data/encodings/compact_lang_det/#tote.cc# +299 -0
- data/encodings/compact_lang_det/#tote.h# +89 -0
- data/encodings/compact_lang_det/cldutil.cc +905 -0
- data/encodings/compact_lang_det/cldutil.h +1205 -0
- data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
- data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
- data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
- data/encodings/compact_lang_det/compact_lang_det.h +145 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
- data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
- data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
- data/encodings/compact_lang_det/compile.cmd +1 -0
- data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
- data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
- data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
- data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
- data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
- data/encodings/compact_lang_det/getonescriptspan.h +131 -0
- data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
- data/encodings/compact_lang_det/letterscript_enum.h +99 -0
- data/encodings/compact_lang_det/subsetsequence.cc +259 -0
- data/encodings/compact_lang_det/subsetsequence.h +44 -0
- data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
- data/encodings/compact_lang_det/tote.cc +299 -0
- data/encodings/compact_lang_det/tote.h +89 -0
- data/encodings/compact_lang_det/unittest_data.h +193 -0
- data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
- data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
- data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
- data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
- data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
- data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
- data/encodings/compact_lang_det/win/cld_google.h +18 -0
- data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
- data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_logging.h +21 -0
- data/encodings/compact_lang_det/win/cld_macros.h +19 -0
- data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
- data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
- data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
- data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
- data/encodings/compact_lang_det/win/cld_utf.h +24 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
- data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
- data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
- data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
- data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
- data/encodings/internal/encodings.cc +12 -0
- data/encodings/lang_enc.h +254 -0
- data/encodings/proto/encodings.pb.h +169 -0
- data/encodings/public/encodings.h +301 -0
- data/ext/cld/extconf.rb +7 -0
- data/languages/internal/#languages.cc# +337 -0
- data/languages/internal/languages.cc +337 -0
- data/languages/proto/languages.pb.h +179 -0
- data/languages/public/languages.h +379 -0
- data/lib/cld.rb +12 -0
- data/test/test.rb +570 -0
- data/thunk.cc +131 -0
- metadata +168 -0
data/base/vlog_is_on.h
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Defines the VLOG_IS_ON macro that controls the variable-verbosity
|
|
6
|
+
// conditional logging.
|
|
7
|
+
//
|
|
8
|
+
// It's used by VLOG and VLOG_IF in logging.h
|
|
9
|
+
// and by RAW_VLOG in raw_logging.h to trigger the logging.
|
|
10
|
+
//
|
|
11
|
+
// It can also be used directly e.g. like this:
|
|
12
|
+
// if (VLOG_IS_ON(2)) {
|
|
13
|
+
// // do some logging preparation and logging
|
|
14
|
+
// // that can't be accomplished e.g. via just VLOG(2) << ...;
|
|
15
|
+
// }
|
|
16
|
+
//
|
|
17
|
+
// The truth value that VLOG_IS_ON(level) returns is determined by
|
|
18
|
+
// the three verbosity level flags:
|
|
19
|
+
// --v=<n> Gives the default maximal active V-logging level;
|
|
20
|
+
// 0 is the default.
|
|
21
|
+
// Normally positive values are used for V-logging levels.
|
|
22
|
+
// --vmodule=<str> Gives the per-module maximal V-logging levels to override
|
|
23
|
+
// the value given by --v.
|
|
24
|
+
// E.g. "my_module=2,foo*=3" would change the logging level
|
|
25
|
+
// for all code in source files "my_module.*" and "foo*.*"
|
|
26
|
+
// ("-inl" suffixes are also disregarded for this matching).
|
|
27
|
+
// --silent_init When true has the effect of increasing
|
|
28
|
+
// the argument of VLOG_IS_ON by 1,
|
|
29
|
+
// thus suppressing one more level of verbose logging.
|
|
30
|
+
//
|
|
31
|
+
// SetVLOGLevel helper function is provided to do limited dynamic control over
|
|
32
|
+
// V-logging by overriding the per-module settings given via --vmodule flag.
|
|
33
|
+
//
|
|
34
|
+
// CAVEAT: --vmodule functionality is not available in non gcc compilers.
|
|
35
|
+
//
|
|
36
|
+
|
|
37
|
+
#ifndef BASE_VLOG_IS_ON_H_
|
|
38
|
+
#define BASE_VLOG_IS_ON_H_
|
|
39
|
+
|
|
40
|
+
#include "base/atomicops.h"
|
|
41
|
+
#include "base/basictypes.h"
|
|
42
|
+
#include "base/port.h"
|
|
43
|
+
#include "base/commandlineflags.h"
|
|
44
|
+
#include "base/log_severity.h"
|
|
45
|
+
|
|
46
|
+
DECLARE_int32(v); // in vlog_is_on.cc
|
|
47
|
+
DECLARE_bool(silent_init); // in google.cc
|
|
48
|
+
|
|
49
|
+
#if defined(__GNUC__)
|
|
50
|
+
// We pack an int16 verbosity level and an int16 epoch into an
|
|
51
|
+
// Atomic32 at every VLOG_IS_ON() call site. The level determines
|
|
52
|
+
// whether the site should log, and the epoch determines whether the
|
|
53
|
+
// site is stale and should be reinitialized. A verbosity level of
|
|
54
|
+
// kUseFlag (-1) indicates that the value of FLAGS_v should be used as
|
|
55
|
+
// the verbosity level. When the site is (re)initialized, a verbosity
|
|
56
|
+
// level for the current source file is retrieved from an internal
|
|
57
|
+
// list. This list is mutated through calls to SetVLOGLevel() and
|
|
58
|
+
// mutations to the --vmodule flag. New log sites are initialized
|
|
59
|
+
// with a stale epoch and a verbosity level of kUseFlag.
|
|
60
|
+
//
|
|
61
|
+
// TODO(llansing): Investigate using GCC's __builtin_constant_p() to
|
|
62
|
+
// generate less code at call sites where verboselevel is known to
|
|
63
|
+
// be a compile-time constant.
|
|
64
|
+
#define VLOG_IS_ON(verboselevel) \
|
|
65
|
+
({ static Atomic32 site__ = ::base::internal::kDefaultSite; \
|
|
66
|
+
::base::internal::VLogEnabled(&site__, (verboselevel), __FILE__); })
|
|
67
|
+
#else
|
|
68
|
+
// GNU extensions not available, so we do not support --vmodule.
|
|
69
|
+
// Dynamic value of FLAGS_v always controls the logging level.
|
|
70
|
+
//
|
|
71
|
+
// TODO(llansing): Investigate supporting --vmodule on other platforms.
|
|
72
|
+
#define VLOG_IS_ON(verboselevel) \
|
|
73
|
+
(FLAGS_v >= (verboselevel) + FLAGS_silent_init)
|
|
74
|
+
#endif
|
|
75
|
+
|
|
76
|
+
// Set VLOG(_IS_ON) level for module_pattern to log_level.
|
|
77
|
+
// This lets us dynamically control what is normally set by the --vmodule flag.
|
|
78
|
+
// Returns the level that previously applied to module_pattern.
|
|
79
|
+
// NOTE: To change the log level for VLOG(_IS_ON) sites
|
|
80
|
+
// that have already executed after/during InitGoogle,
|
|
81
|
+
// one needs to supply the exact --vmodule pattern that applied to them.
|
|
82
|
+
// (If no --vmodule pattern applied to them
|
|
83
|
+
// the value of FLAGS_v will continue to control them.)
|
|
84
|
+
int SetVLOGLevel(const char* module_pattern, int log_level);
|
|
85
|
+
|
|
86
|
+
// Private implementation details. No user-serviceable parts inside.
|
|
87
|
+
namespace base {
|
|
88
|
+
namespace internal {
|
|
89
|
+
|
|
90
|
+
// Each log site determines whether its log level is up to date by
|
|
91
|
+
// comparing its epoch to this global epoch. Whenever the program's
|
|
92
|
+
// vmodule configuration changes (ex: SetVLOGLevel is called), the
|
|
93
|
+
// global epoch is advanced, invalidating all site epochs.
|
|
94
|
+
extern Atomic32 vlog_epoch;
|
|
95
|
+
|
|
96
|
+
// A log level of kUseFlag means "read the logging level from FLAGS_v."
|
|
97
|
+
const int kUseFlag = -1;
|
|
98
|
+
|
|
99
|
+
// Log sites use FLAGS_v by default, and have an initial epoch of 0.
|
|
100
|
+
// Packs kUseFlag into the upper 16 bits with an epoch of 0 in the lower 16.
// Written as a multiplication rather than `kUseFlag << 16` because
// left-shifting a negative signed value is undefined behavior before C++20;
// -1 * 65536 yields the same value without relying on that.
const Atomic32 kDefaultSite = kUseFlag * (1 << 16);
|
|
101
|
+
|
|
102
|
+
// The global epoch is the least significant half of an Atomic32, and
|
|
103
|
+
// may only be accessed through atomic operations.
|
|
104
|
+
inline Atomic32 GlobalEpoch() {
  // Read vlog_epoch via Acquire_Load, then keep only the low 16 bits,
  // which is where the epoch is stored.
  const Atomic32 epoch_word = Acquire_Load(&vlog_epoch);
  return epoch_word & 0x0000FFFF;
}
|
|
107
|
+
|
|
108
|
+
// The least significant half of a site is the epoch.
|
|
109
|
+
inline int SiteEpoch(Atomic32 site) {
  // Mask off everything except the low 16 bits of the packed site value.
  const int kEpochMask = 0x0000FFFF;
  return site & kEpochMask;
}
|
|
110
|
+
|
|
111
|
+
// The most significant half of a site is the logging level.
|
|
112
|
+
inline int SiteLevel(Atomic32 site) {
  // Shift the 16 epoch bits out; what remains is the packed logging level.
  const int level = site >> 16;
  return level;
}
|
|
113
|
+
|
|
114
|
+
// Construct a logging site from a logging level and epoch.
|
|
115
|
+
inline Atomic32 Site(int level, int epoch) {
  // Each argument is truncated to 16 bits before being packed.
  const int level_bits = (level & 0x0000FFFF) << 16;  // upper half
  const int epoch_bits = epoch & 0x0000FFFF;          // lower half
  return level_bits | epoch_bits;
}
|
|
118
|
+
|
|
119
|
+
// Attempt to initialize or reinitialize a VLOG site. Returns the
|
|
120
|
+
// level of the log site, regardless of whether the attempt succeeds
|
|
121
|
+
// or fails.
|
|
122
|
+
// site: The address of the log site's state.
|
|
123
|
+
// fname: The filename of the current source file.
|
|
124
|
+
int InitVLOG(Atomic32* site, const char* fname);
|
|
125
|
+
|
|
126
|
+
// Determine whether verbose logging should occur at a given log site.
|
|
127
|
+
//
|
|
128
|
+
// TODO(llansing): Find a way to eliminate FLAGS_silent_init from this
|
|
129
|
+
// function while preserving the silent initialization behavior. The
|
|
130
|
+
// common-case code path shouldn't pay for silent initialization.
|
|
131
|
+
inline bool VLogEnabled(Atomic32* site, int32 level, const char* const file) {
  // Take one snapshot of the site so the epoch and level come from the same
  // value.
  const Atomic32 snapshot = Acquire_Load(site);

  // If the site's epoch matches the global epoch, use its stored level;
  // otherwise let InitVLOG (re)initialize the site and report the level.
  int32 effective_level;
  if (PREDICT_TRUE(SiteEpoch(snapshot) == GlobalEpoch())) {
    effective_level = SiteLevel(snapshot);
  } else {
    effective_level = InitVLOG(site, file);
  }

  // kUseFlag means "defer to FLAGS_v".
  if (effective_level == kUseFlag) {
    effective_level = FLAGS_v;
  }

  // FLAGS_silent_init raises the required level by one when set.
  return effective_level >= (level + FLAGS_silent_init);
}
|
|
139
|
+
|
|
140
|
+
} // namespace internal
|
|
141
|
+
} // namespace base
|
|
142
|
+
|
|
143
|
+
#endif // BASE_VLOG_IS_ON_H_
|
data/build.sh
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
|
|
3
|
+
CFLAGS="-fPIC -I. -O2 -DCLD_WINDOWS"
|
|
4
|
+
LDFLAGS=-L.
|
|
5
|
+
CC=g++
|
|
6
|
+
AR=ar
|
|
7
|
+
|
|
8
|
+
rm -f *.o
|
|
9
|
+
rm -f libcld.a
|
|
10
|
+
|
|
11
|
+
SOURCES="encodings/compact_lang_det/cldutil.cc \
|
|
12
|
+
encodings/compact_lang_det/cldutil_dbg_empty.cc \
|
|
13
|
+
encodings/compact_lang_det/compact_lang_det.cc \
|
|
14
|
+
encodings/compact_lang_det/compact_lang_det_impl.cc \
|
|
15
|
+
encodings/compact_lang_det/ext_lang_enc.cc \
|
|
16
|
+
encodings/compact_lang_det/getonescriptspan.cc \
|
|
17
|
+
encodings/compact_lang_det/letterscript_enum.cc \
|
|
18
|
+
encodings/compact_lang_det/tote.cc \
|
|
19
|
+
encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc \
|
|
20
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc \
|
|
21
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc \
|
|
22
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc \
|
|
23
|
+
encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc \
|
|
24
|
+
encodings/compact_lang_det/win/cld_htmlutils_windows.cc \
|
|
25
|
+
encodings/compact_lang_det/win/cld_unilib_windows.cc \
|
|
26
|
+
encodings/compact_lang_det/win/cld_utf8statetable.cc \
|
|
27
|
+
encodings/compact_lang_det/win/cld_utf8utils_windows.cc \
|
|
28
|
+
encodings/internal/encodings.cc \
|
|
29
|
+
languages/internal/languages.cc \
|
|
30
|
+
thunk.cc"
|
|
31
|
+
|
|
32
|
+
#encodings/compact_lang_det/win/cld_unicodetext.cc \
|
|
33
|
+
|
|
34
|
+
echo
|
|
35
|
+
echo "Compile..."
|
|
36
|
+
$CC -c $CFLAGS $SOURCES
|
|
37
|
+
|
|
38
|
+
echo
|
|
39
|
+
echo "Make libcld.a"
|
|
40
|
+
$AR rcs libcld.a *.o
|
|
41
|
+
|
|
42
|
+
echo
|
|
43
|
+
#$CC -DCLD_WINDOWS -I. -L. -o example example.cc -lcld -lstdc++
|
|
44
|
+
# Build a shared object: without -shared the linker tries to produce an
# executable and needs main(). Libraries follow the objects so that symbols
# referenced by the objects can be resolved from them.
$CC -shared -DCLD_WINDOWS -I. -L. -o cld.so *.o -lcld -lstdc++
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
echo
|
|
48
|
+
echo "Done!"
|
data/build.win.cmd
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
REM "c:\Program Files\Microsoft Visual Studio 8\vc\vcvarsall.bat"
|
|
2
|
+
|
|
3
|
+
set CFLAGS=/nologo /I. /O2 /DCLD_WINDOWS /DWIN32 /EHsc
|
|
4
|
+
set LDFLAGS=-L.
|
|
5
|
+
set CC=cl.exe
|
|
6
|
+
set AR=lib.exe
|
|
7
|
+
|
|
8
|
+
del *.obj
|
|
9
|
+
del libcld.lib
|
|
10
|
+
|
|
11
|
+
set SOURCES=encodings/compact_lang_det/cldutil.cc encodings/compact_lang_det/cldutil_dbg_empty.cc encodings/compact_lang_det/compact_lang_det.cc encodings/compact_lang_det/compact_lang_det_impl.cc encodings/compact_lang_det/ext_lang_enc.cc encodings/compact_lang_det/getonescriptspan.cc encodings/compact_lang_det/letterscript_enum.cc encodings/compact_lang_det/tote.cc encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc encodings/compact_lang_det/win/cld_htmlutils_windows.cc encodings/compact_lang_det/win/cld_unilib_windows.cc encodings/compact_lang_det/win/cld_utf8statetable.cc encodings/compact_lang_det/win/cld_utf8utils_windows.cc encodings/internal/encodings.cc languages/internal/languages.cc
|
|
12
|
+
|
|
13
|
+
REM encodings/compact_lang_det/win/cld_unicodetext.cc \
|
|
14
|
+
|
|
15
|
+
echo ""
|
|
16
|
+
echo "Compile..."
|
|
17
|
+
%CC% /c %CFLAGS% %SOURCES%
|
|
18
|
+
|
|
19
|
+
echo ""
|
|
20
|
+
echo "Make libcld"
|
|
21
|
+
%AR% *.obj -OUT:libcld.lib
|
|
22
|
+
|
|
23
|
+
echo ""
|
|
24
|
+
echo "Compile example.cc"
|
|
25
|
+
%CC% %CFLAGS% %LFLAGS% example.cc libcld.lib
|
|
26
|
+
|
|
27
|
+
echo
|
|
28
|
+
echo "Done!"
|
data/cld.gemspec
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
Gem::Specification.new do |s|
|
|
4
|
+
s.name = %q{cld}
|
|
5
|
+
s.version = "0.1.0"
|
|
6
|
+
|
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
|
8
|
+
s.authors = [%q{Jason Toy}]
|
|
9
|
+
s.date = %q{2011-11-03}
|
|
10
|
+
s.description = %q{Compact Language Detection from chrome}
|
|
11
|
+
s.email = %q{jtoy@jtoy.net}
|
|
12
|
+
s.extensions = [%q{ext/cld/extconf.rb}]
|
|
13
|
+
s.extra_rdoc_files = [%q{LICENSE}, %q{README.rdoc}, %q{ext/cld/extconf.rb}, %q{lib/cld.rb}]
|
|
14
|
+
s.files = [%q{LICENSE}, %q{README.rdoc}, %q{Rakefile}, %q{base/basictypes.h}, %q{base/build_config.h}, %q{base/casts.h}, %q{base/commandlineflags.h}, %q{base/crash.h}, %q{base/dynamic_annotations.h}, %q{base/global_strip_options.h}, %q{base/log_severity.h}, %q{base/logging.h}, %q{base/macros.h}, %q{base/port.h}, %q{base/scoped_ptr.h}, %q{base/stl_decl.h}, %q{base/stl_decl_msvc.h}, %q{base/string_util.h}, %q{base/strtoint.h}, %q{base/template_util.h}, %q{base/type_traits.h}, %q{base/vlog_is_on.h}, %q{build.sh}, %q{build.win.cmd}, %q{cld.gemspec}, %q{cld_encodings.h}, %q{encodings/compact_lang_det/#cldutil.cc#}, %q{encodings/compact_lang_det/#cldutil.h#}, %q{encodings/compact_lang_det/#compact_lang_det_impl.h#}, %q{encodings/compact_lang_det/#ext_lang_enc.cc#}, %q{encodings/compact_lang_det/#ext_lang_enc.h#}, %q{encodings/compact_lang_det/#getonescriptspan.cc#}, %q{encodings/compact_lang_det/#getonescriptspan.h#}, %q{encodings/compact_lang_det/#tote.cc#}, %q{encodings/compact_lang_det/#tote.h#}, %q{encodings/compact_lang_det/cldutil.cc}, %q{encodings/compact_lang_det/cldutil.h}, %q{encodings/compact_lang_det/cldutil_dbg.h}, %q{encodings/compact_lang_det/cldutil_dbg_empty.cc}, %q{encodings/compact_lang_det/compact_lang_det.cc}, %q{encodings/compact_lang_det/compact_lang_det.h}, %q{encodings/compact_lang_det/compact_lang_det_impl.cc}, %q{encodings/compact_lang_det/compact_lang_det_impl.h}, %q{encodings/compact_lang_det/compact_lang_det_unittest_small.cc}, %q{encodings/compact_lang_det/compile.cmd}, %q{encodings/compact_lang_det/ext_lang_enc.cc}, %q{encodings/compact_lang_det/ext_lang_enc.h}, %q{encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc}, %q{encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc}, 
%q{encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc}, %q{encodings/compact_lang_det/getonescriptspan.cc}, %q{encodings/compact_lang_det/getonescriptspan.h}, %q{encodings/compact_lang_det/letterscript_enum.cc}, %q{encodings/compact_lang_det/letterscript_enum.h}, %q{encodings/compact_lang_det/subsetsequence.cc}, %q{encodings/compact_lang_det/subsetsequence.h}, %q{encodings/compact_lang_det/subsetsequence_unittest.cc}, %q{encodings/compact_lang_det/tote.cc}, %q{encodings/compact_lang_det/tote.h}, %q{encodings/compact_lang_det/unittest_data.h}, %q{encodings/compact_lang_det/utf8propjustletter.h}, %q{encodings/compact_lang_det/utf8propletterscriptnum.h}, %q{encodings/compact_lang_det/utf8scannotjustletterspecial.h}, %q{encodings/compact_lang_det/win/#cld_unilib_windows.cc#}, %q{encodings/compact_lang_det/win/cld_basictypes.h}, %q{encodings/compact_lang_det/win/cld_commandlineflags.h}, %q{encodings/compact_lang_det/win/cld_google.h}, %q{encodings/compact_lang_det/win/cld_htmlutils.h}, %q{encodings/compact_lang_det/win/cld_htmlutils_google3.cc}, %q{encodings/compact_lang_det/win/cld_htmlutils_windows.cc}, %q{encodings/compact_lang_det/win/cld_logging.h}, %q{encodings/compact_lang_det/win/cld_macros.h}, %q{encodings/compact_lang_det/win/cld_strtoint.h}, %q{encodings/compact_lang_det/win/cld_unicodetext.cc}, %q{encodings/compact_lang_det/win/cld_unicodetext.h}, %q{encodings/compact_lang_det/win/cld_unilib.h}, %q{encodings/compact_lang_det/win/cld_unilib_google3.cc}, %q{encodings/compact_lang_det/win/cld_unilib_windows.cc}, 
%q{encodings/compact_lang_det/win/cld_utf.h}, %q{encodings/compact_lang_det/win/cld_utf8statetable.cc}, %q{encodings/compact_lang_det/win/cld_utf8statetable.h}, %q{encodings/compact_lang_det/win/cld_utf8utils.h}, %q{encodings/compact_lang_det/win/cld_utf8utils_google3.cc}, %q{encodings/compact_lang_det/win/cld_utf8utils_windows.cc}, %q{encodings/compact_lang_det/win/normalizedunicodetext.cc}, %q{encodings/compact_lang_det/win/normalizedunicodetext.h}, %q{encodings/internal/encodings.cc}, %q{encodings/lang_enc.h}, %q{encodings/proto/encodings.pb.h}, %q{encodings/public/encodings.h}, %q{ext/cld/extconf.rb}, %q{languages/internal/#languages.cc#}, %q{languages/internal/languages.cc}, %q{languages/proto/languages.pb.h}, %q{languages/public/languages.h}, %q{lib/cld.rb}, %q{test/test.rb}, %q{thunk.cc}, %q{Manifest}]
|
|
15
|
+
s.homepage = %q{http://github.com/jtoy/cld}
|
|
16
|
+
s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Cld}, %q{--main}, %q{README.rdoc}]
|
|
17
|
+
s.require_paths = [%q{lib}, %q{ext}]
|
|
18
|
+
s.rubyforge_project = %q{cld}
|
|
19
|
+
s.rubygems_version = %q{1.8.6.1}
|
|
20
|
+
s.summary = %q{Compact Language Detection from chrome}
|
|
21
|
+
|
|
22
|
+
if s.respond_to? :specification_version then
|
|
23
|
+
s.specification_version = 3
|
|
24
|
+
|
|
25
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
|
26
|
+
else
|
|
27
|
+
end
|
|
28
|
+
else
|
|
29
|
+
end
|
|
30
|
+
end
|
data/cld_encodings.h
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// MKM: I copied this from the PHP port
|
|
6
|
+
// (https://github.com/lstrojny/php-cld/blob/master/cld_encodings.h);
|
|
7
|
+
// it just copies the encodings out of encodings.pb.h and
|
|
8
|
+
// gives them matching string constants
|
|
9
|
+
|
|
10
|
+
#include "encodings/proto/encodings.pb.h"
|
|
11
|
+
|
|
12
|
+
struct cld_encoding {
|
|
13
|
+
const char *name;
|
|
14
|
+
Encoding encoding;
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
const cld_encoding cld_encoding_info[] = {
|
|
18
|
+
{"ISO_8859_1", ISO_8859_1},
|
|
19
|
+
{"ISO_8859_2", ISO_8859_2},
|
|
20
|
+
{"ISO_8859_3", ISO_8859_3},
|
|
21
|
+
{"ISO_8859_4", ISO_8859_4},
|
|
22
|
+
{"ISO_8859_5", ISO_8859_5},
|
|
23
|
+
{"ISO_8859_6", ISO_8859_6},
|
|
24
|
+
{"ISO_8859_7", ISO_8859_7},
|
|
25
|
+
{"ISO_8859_8", ISO_8859_8},
|
|
26
|
+
{"ISO_8859_9", ISO_8859_9},
|
|
27
|
+
{"ISO_8859_10", ISO_8859_10},
|
|
28
|
+
{"JAPANESE_EUC_JP", JAPANESE_EUC_JP},
|
|
29
|
+
{"JAPANESE_SHIFT_JIS", JAPANESE_SHIFT_JIS},
|
|
30
|
+
{"JAPANESE_JIS", JAPANESE_JIS},
|
|
31
|
+
{"CHINESE_BIG5", CHINESE_BIG5},
|
|
32
|
+
{"CHINESE_GB", CHINESE_GB},
|
|
33
|
+
{"CHINESE_EUC_CN", CHINESE_EUC_CN},
|
|
34
|
+
{"KOREAN_EUC_KR", KOREAN_EUC_KR},
|
|
35
|
+
{"UNICODE", UNICODE},
|
|
36
|
+
{"CHINESE_EUC_DEC", CHINESE_EUC_DEC},
|
|
37
|
+
{"CHINESE_CNS", CHINESE_CNS},
|
|
38
|
+
{"CHINESE_BIG5_CP950", CHINESE_BIG5_CP950},
|
|
39
|
+
{"JAPANESE_CP932", JAPANESE_CP932},
|
|
40
|
+
{"UTF8", UTF8},
|
|
41
|
+
{"UNKNOWN_ENCODING", UNKNOWN_ENCODING},
|
|
42
|
+
{"ASCII_7BIT", ASCII_7BIT},
|
|
43
|
+
{"RUSSIAN_KOI8_R", RUSSIAN_KOI8_R},
|
|
44
|
+
{"RUSSIAN_CP1251", RUSSIAN_CP1251},
|
|
45
|
+
{"MSFT_CP1252", MSFT_CP1252},
|
|
46
|
+
{"RUSSIAN_KOI8_RU", RUSSIAN_KOI8_RU},
|
|
47
|
+
{"MSFT_CP1250", MSFT_CP1250},
|
|
48
|
+
{"ISO_8859_15", ISO_8859_15},
|
|
49
|
+
{"MSFT_CP1254", MSFT_CP1254},
|
|
50
|
+
{"MSFT_CP1257", MSFT_CP1257},
|
|
51
|
+
{"ISO_8859_11", ISO_8859_11},
|
|
52
|
+
{"MSFT_CP874", MSFT_CP874},
|
|
53
|
+
{"MSFT_CP1256", MSFT_CP1256},
|
|
54
|
+
{"MSFT_CP1255", MSFT_CP1255},
|
|
55
|
+
{"ISO_8859_8_I", ISO_8859_8_I},
|
|
56
|
+
{"HEBREW_VISUAL", HEBREW_VISUAL},
|
|
57
|
+
{"CZECH_CP852", CZECH_CP852},
|
|
58
|
+
{"CZECH_CSN_369103", CZECH_CSN_369103},
|
|
59
|
+
{"MSFT_CP1253", MSFT_CP1253},
|
|
60
|
+
{"RUSSIAN_CP866", RUSSIAN_CP866},
|
|
61
|
+
{"ISO_8859_13", ISO_8859_13},
|
|
62
|
+
{"ISO_2022_KR", ISO_2022_KR},
|
|
63
|
+
{"GBK", GBK},
|
|
64
|
+
{"GB18030", GB18030},
|
|
65
|
+
{"BIG5_HKSCS", BIG5_HKSCS},
|
|
66
|
+
{"ISO_2022_CN", ISO_2022_CN},
|
|
67
|
+
{"TSCII", TSCII},
|
|
68
|
+
{"TAMIL_MONO", TAMIL_MONO},
|
|
69
|
+
{"TAMIL_BI", TAMIL_BI},
|
|
70
|
+
{"JAGRAN", JAGRAN},
|
|
71
|
+
{"MACINTOSH_ROMAN", MACINTOSH_ROMAN},
|
|
72
|
+
{"UTF7", UTF7},
|
|
73
|
+
{"BHASKAR", BHASKAR},
|
|
74
|
+
{"HTCHANAKYA", HTCHANAKYA},
|
|
75
|
+
{"UTF16BE", UTF16BE},
|
|
76
|
+
{"UTF16LE", UTF16LE},
|
|
77
|
+
{"UTF32BE", UTF32BE},
|
|
78
|
+
{"UTF32LE", UTF32LE},
|
|
79
|
+
{"BINARYENC", BINARYENC},
|
|
80
|
+
{"HZ_GB_2312", HZ_GB_2312},
|
|
81
|
+
{"UTF8UTF8", UTF8UTF8},
|
|
82
|
+
{"TAM_ELANGO", TAM_ELANGO},
|
|
83
|
+
{"TAM_LTTMBARANI", TAM_LTTMBARANI},
|
|
84
|
+
{"TAM_SHREE", TAM_SHREE},
|
|
85
|
+
{"TAM_TBOOMIS", TAM_TBOOMIS},
|
|
86
|
+
{"TAM_TMNEWS", TAM_TMNEWS},
|
|
87
|
+
{"TAM_WEBTAMIL", TAM_WEBTAMIL},
|
|
88
|
+
{"KDDI_SHIFT_JIS", KDDI_SHIFT_JIS},
|
|
89
|
+
{"DOCOMO_SHIFT_JIS", DOCOMO_SHIFT_JIS},
|
|
90
|
+
{"SOFTBANK_SHIFT_JIS", SOFTBANK_SHIFT_JIS},
|
|
91
|
+
{"KDDI_ISO_2022_JP", KDDI_ISO_2022_JP},
|
|
92
|
+
{"SOFTBANK_ISO_2022_JP", SOFTBANK_ISO_2022_JP},
|
|
93
|
+
};
|
|
94
|
+
|
|
95
|
+
COMPILE_ASSERT(arraysize(cld_encoding_info) == NUM_ENCODINGS, cld_encoding_info_length_is_wrong);
|
|
@@ -0,0 +1,905 @@
|
|
|
1
|
+
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
3
|
+
// found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
#include <string>
|
|
6
|
+
#include "encodings/compact_lang_det/cldutil.h"
|
|
7
|
+
#include "encodings/compact_lang_det/cldutil_dbg.h"
|
|
8
|
+
#include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
|
|
9
|
+
#include "encodings/compact_lang_det/utf8propletterscriptnum.h"
|
|
10
|
+
#include "encodings/compact_lang_det/win/cld_commandlineflags.h"
|
|
11
|
+
#include "encodings/compact_lang_det/win/cld_logging.h"
|
|
12
|
+
#include "encodings/compact_lang_det/win/cld_unilib.h"
|
|
13
|
+
#include "encodings/compact_lang_det/win/cld_utf.h"
|
|
14
|
+
#include "encodings/compact_lang_det/win/cld_utf8statetable.h"
|
|
15
|
+
|
|
16
|
+
// Runtime routines for hashing, looking up, and scoring
|
|
17
|
+
// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
|
|
18
|
+
// Unigrams and bigrams are for CJK languages only, including simplified/
|
|
19
|
+
// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
|
|
20
|
+
// Zhuang Han characters. Surrounding spaces are not considered.
|
|
21
|
+
// Quadgrams and octagrams are for non-CJK and include two bits indicating
|
|
22
|
+
// preceding and trailing spaces (word boundaries).
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
// Indicator bits for leading/trailing space around quad/octagram
|
|
26
|
+
// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
|
|
27
|
+
// 1-, 2-, or 3-bytes each.
|
|
28
|
+
static const uint32 kPreSpaceIndicator = 0x00004444;
|
|
29
|
+
static const uint32 kPostSpaceIndicator = 0x44440000;
|
|
30
|
+
|
|
31
|
+
// Little-endian masks for 0..24 bytes picked up as uint32's
|
|
32
|
+
static const uint32 kWordMask0[4] = {
|
|
33
|
+
0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
static const int kMinCJKUTF8CharBytes = 3;
|
|
37
|
+
|
|
38
|
+
static const int kMinGramCount = 3;
|
|
39
|
+
static const int kMaxGramCount = 16;
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
// Routines to access a hash table of <key:wordhash, value:probs> pairs
|
|
45
|
+
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
|
|
46
|
+
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
|
|
47
|
+
// bucket subscript.
|
|
48
|
+
// Probs is packed: three languages plus a subscript for probability table
|
|
49
|
+
// Buckets have all the keys together, then all the values. Key array never
|
|
50
|
+
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
|
|
51
|
+
// Match case may sometimes take an additional cache miss on value access.
|
|
52
|
+
//
|
|
53
|
+
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
|
|
54
|
+
// byte buckets with single cache miss.
|
|
55
|
+
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
|
|
56
|
+
//------------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
//------------------------------------------------------------------------------
|
|
60
|
+
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
|
|
61
|
+
//------------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
// Design principles for these hash functions
|
|
64
|
+
// - Few operations
|
|
65
|
+
// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
|
|
66
|
+
// Latin script; expect 1- and 2-byte mixtures.
|
|
67
|
+
// - Last byte of each character has about 5 bits of information
|
|
68
|
+
// - Spread good bits around so they can interact in at least two ways
|
|
69
|
+
// with other characters
|
|
70
|
+
// - Use add for additional mixing through carries
|
|
71
|
+
|
|
72
|
+
// CJK Three-byte bigram
|
|
73
|
+
// ....dddd..cccccc..bbbbbb....aaaa
|
|
74
|
+
// ..................ffffff..eeeeee
|
|
75
|
+
// make
|
|
76
|
+
// ....dddd..cccccc..bbbbbb....aaaa
|
|
77
|
+
// 000....dddd..cccccc..bbbbbb....a
|
|
78
|
+
// ..................ffffff..eeeeee
|
|
79
|
+
// ffffff..eeeeee000000000000000000
|
|
80
|
+
//
|
|
81
|
+
// CJK Four-byte bigram
|
|
82
|
+
// ..dddddd..cccccc....bbbb....aaaa
|
|
83
|
+
// ..hhhhhh..gggggg....ffff....eeee
|
|
84
|
+
// make
|
|
85
|
+
// ..dddddd..cccccc....bbbb....aaaa
|
|
86
|
+
// 000..dddddd..cccccc....bbbb....a
|
|
87
|
+
// ..hhhhhh..gggggg....ffff....eeee
|
|
88
|
+
// ..ffff....eeee000000000000000000
|
|
89
|
+
|
|
90
|
+
// BIGRAM
|
|
91
|
+
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
|
|
92
|
+
// OVERSHOOTS up to 3 bytes
|
|
93
|
+
// For runtime use of tables
|
|
94
|
+
uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
|
|
95
|
+
if (bytecount == 0) {
|
|
96
|
+
return 0;
|
|
97
|
+
}
|
|
98
|
+
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
|
99
|
+
uint32 word0, word1;
|
|
100
|
+
if (bytecount <= 4) {
|
|
101
|
+
word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
|
|
102
|
+
word0 = word0 ^ (word0 >> 3);
|
|
103
|
+
return word0;
|
|
104
|
+
}
|
|
105
|
+
// Else do 8 bytes
|
|
106
|
+
word0 = word_ptr32[0];
|
|
107
|
+
word0 = word0 ^ (word0 >> 3);
|
|
108
|
+
word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
|
|
109
|
+
word1 = word1 ^ (word1 << 18);
|
|
110
|
+
return word0 + word1;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
//
|
|
114
|
+
// Ascii-7 One-byte chars
|
|
115
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
|
116
|
+
// make
|
|
117
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
|
118
|
+
// 000...ddddd...ccccc...bbbbb...aa
|
|
119
|
+
//
|
|
120
|
+
// Latin 1- and 2-byte chars
|
|
121
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
|
122
|
+
// ...................fffff...eeeee
|
|
123
|
+
// make
|
|
124
|
+
// ...ddddd...ccccc...bbbbb...aaaaa
|
|
125
|
+
// 000...ddddd...ccccc...bbbbb...aa
|
|
126
|
+
// ...................fffff...eeeee
|
|
127
|
+
// ...............fffff...eeeee0000
|
|
128
|
+
//
|
|
129
|
+
// Non-CJK Two-byte chars
|
|
130
|
+
// ...ddddd...........bbbbb........
|
|
131
|
+
// ...hhhhh...........fffff........
|
|
132
|
+
// make
|
|
133
|
+
// ...ddddd...........bbbbb........
|
|
134
|
+
// 000...ddddd...........bbbbb.....
|
|
135
|
+
// ...hhhhh...........fffff........
|
|
136
|
+
// hhhh...........fffff........0000
|
|
137
|
+
//
|
|
138
|
+
// Non-CJK Three-byte chars
|
|
139
|
+
// ...........ccccc................
|
|
140
|
+
// ...................fffff........
|
|
141
|
+
// ...lllll...................iiiii
|
|
142
|
+
// make
|
|
143
|
+
// ...........ccccc................
|
|
144
|
+
// 000...........ccccc.............
|
|
145
|
+
// ...................fffff........
|
|
146
|
+
// ...............fffff........0000
|
|
147
|
+
// ...lllll...................iiiii
|
|
148
|
+
// .lllll...................iiiii00
|
|
149
|
+
//
|
|
150
|
+
|
|
151
|
+
// QUADGRAM
|
|
152
|
+
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
|
153
|
+
// OVERSHOOTS up to 3 bytes
|
|
154
|
+
// For runtime use of tables
|
|
155
|
+
uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
|
|
156
|
+
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
|
157
|
+
uint32 word0, word1, word2;
|
|
158
|
+
if (bytecount <= 4) {
|
|
159
|
+
word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
|
|
160
|
+
word0 = word0 ^ (word0 >> 3);
|
|
161
|
+
return word0 ^ prepost;
|
|
162
|
+
} else if (bytecount <= 8) {
|
|
163
|
+
word0 = word_ptr32[0];
|
|
164
|
+
word0 = word0 ^ (word0 >> 3);
|
|
165
|
+
word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
|
|
166
|
+
word1 = word1 ^ (word1 << 4);
|
|
167
|
+
return (word0 ^ prepost) + word1;
|
|
168
|
+
}
|
|
169
|
+
// else do 12 bytes
|
|
170
|
+
word0 = word_ptr32[0];
|
|
171
|
+
word0 = word0 ^ (word0 >> 3);
|
|
172
|
+
word1 = word_ptr32[1];
|
|
173
|
+
word1 = word1 ^ (word1 << 4);
|
|
174
|
+
word2 = word_ptr32[2] & kWordMask0[bytecount & 3];
|
|
175
|
+
word2 = word2 ^ (word2 << 2);
|
|
176
|
+
return (word0 ^ prepost) + word1 + word2;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
// QUADGRAM wrapper with surrounding spaces
|
|
181
|
+
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
|
|
182
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
183
|
+
// For runtime use of tables
|
|
184
|
+
uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
|
|
185
|
+
if (bytecount == 0) {
|
|
186
|
+
return 0;
|
|
187
|
+
}
|
|
188
|
+
uint32 prepost = 0;
|
|
189
|
+
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
|
190
|
+
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
|
191
|
+
return QuadHashV25Mix(word_ptr, bytecount, prepost);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// QUADGRAM wrapper with surrounding underscores (offline use)
|
|
195
|
+
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
|
|
196
|
+
// OVERSHOOTS up to 3 bytes
|
|
197
|
+
// For offline construction of tables
|
|
198
|
+
uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
|
|
199
|
+
if (bytecount == 0) {
|
|
200
|
+
return 0;
|
|
201
|
+
}
|
|
202
|
+
const char* local_word_ptr = word_ptr;
|
|
203
|
+
int local_bytecount = bytecount;
|
|
204
|
+
uint32 prepost = 0;
|
|
205
|
+
if (local_word_ptr[0] == '_') {
|
|
206
|
+
prepost |= kPreSpaceIndicator;
|
|
207
|
+
++local_word_ptr;
|
|
208
|
+
--local_bytecount;
|
|
209
|
+
}
|
|
210
|
+
if (local_word_ptr[local_bytecount - 1] == '_') {
|
|
211
|
+
prepost |= kPostSpaceIndicator;
|
|
212
|
+
--local_bytecount;
|
|
213
|
+
}
|
|
214
|
+
return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
// OCTAGRAM
|
|
219
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
|
220
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
221
|
+
//
|
|
222
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
|
223
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
|
224
|
+
// For runtime use of tables V3
|
|
225
|
+
uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
|
|
226
|
+
const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
|
|
227
|
+
uint64 word0;
|
|
228
|
+
uint64 word1;
|
|
229
|
+
uint64 sum;
|
|
230
|
+
|
|
231
|
+
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
|
232
|
+
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
|
233
|
+
switch ((bytecount - 1) >> 2) {
|
|
234
|
+
case 0: // 1..4 bytes
|
|
235
|
+
word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
|
|
236
|
+
sum = word0;
|
|
237
|
+
word0 = word0 ^ (word0 >> 3);
|
|
238
|
+
break;
|
|
239
|
+
case 1: // 5..8 bytes
|
|
240
|
+
word0 = word_ptr32[0];
|
|
241
|
+
sum = word0;
|
|
242
|
+
word0 = word0 ^ (word0 >> 3);
|
|
243
|
+
word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
|
|
244
|
+
sum += word1;
|
|
245
|
+
word1 = word1 ^ (word1 << 4);
|
|
246
|
+
word0 += word1;
|
|
247
|
+
break;
|
|
248
|
+
case 2: // 9..12 bytes
|
|
249
|
+
word0 = word_ptr32[0];
|
|
250
|
+
sum = word0;
|
|
251
|
+
word0 = word0 ^ (word0 >> 3);
|
|
252
|
+
word1 = word_ptr32[1];
|
|
253
|
+
sum += word1;
|
|
254
|
+
word1 = word1 ^ (word1 << 4);
|
|
255
|
+
word0 += word1;
|
|
256
|
+
word1 = word_ptr32[2] & kWordMask0[bytecount & 3];
|
|
257
|
+
sum += word1;
|
|
258
|
+
word1 = word1 ^ (word1 << 2);
|
|
259
|
+
word0 += word1;
|
|
260
|
+
break;
|
|
261
|
+
case 3: // 13..16 bytes
|
|
262
|
+
word0 = word_ptr32[0];
|
|
263
|
+
sum = word0;
|
|
264
|
+
word0 = word0 ^ (word0 >> 3);
|
|
265
|
+
word1 = word_ptr32[1];
|
|
266
|
+
sum += word1;
|
|
267
|
+
word1 = word1 ^ (word1 << 4);
|
|
268
|
+
word0 += word1;
|
|
269
|
+
word1 = word_ptr32[2];
|
|
270
|
+
sum += word1;
|
|
271
|
+
word1 = word1 ^ (word1 << 2);
|
|
272
|
+
word0 += word1;
|
|
273
|
+
word1 = word_ptr32[3] & kWordMask0[bytecount & 3];
|
|
274
|
+
sum += word1;
|
|
275
|
+
word1 = word1 ^ (word1 >> 8);
|
|
276
|
+
word0 += word1;
|
|
277
|
+
break;
|
|
278
|
+
case 4: // 17..20 bytes
|
|
279
|
+
word0 = word_ptr32[0];
|
|
280
|
+
sum = word0;
|
|
281
|
+
word0 = word0 ^ (word0 >> 3);
|
|
282
|
+
word1 = word_ptr32[1];
|
|
283
|
+
sum += word1;
|
|
284
|
+
word1 = word1 ^ (word1 << 4);
|
|
285
|
+
word0 += word1;
|
|
286
|
+
word1 = word_ptr32[2];
|
|
287
|
+
sum += word1;
|
|
288
|
+
word1 = word1 ^ (word1 << 2);
|
|
289
|
+
word0 += word1;
|
|
290
|
+
word1 = word_ptr32[3];
|
|
291
|
+
sum += word1;
|
|
292
|
+
word1 = word1 ^ (word1 >> 8);
|
|
293
|
+
word0 += word1;
|
|
294
|
+
word1 = word_ptr32[4] & kWordMask0[bytecount & 3];
|
|
295
|
+
sum += word1;
|
|
296
|
+
word1 = word1 ^ (word1 >> 4);
|
|
297
|
+
word0 += word1;
|
|
298
|
+
break;
|
|
299
|
+
default: // 21..24 bytes and higher (ignores beyond 24)
|
|
300
|
+
word0 = word_ptr32[0];
|
|
301
|
+
sum = word0;
|
|
302
|
+
word0 = word0 ^ (word0 >> 3);
|
|
303
|
+
word1 = word_ptr32[1];
|
|
304
|
+
sum += word1;
|
|
305
|
+
word1 = word1 ^ (word1 << 4);
|
|
306
|
+
word0 += word1;
|
|
307
|
+
word1 = word_ptr32[2];
|
|
308
|
+
sum += word1;
|
|
309
|
+
word1 = word1 ^ (word1 << 2);
|
|
310
|
+
word0 += word1;
|
|
311
|
+
word1 = word_ptr32[3];
|
|
312
|
+
sum += word1;
|
|
313
|
+
word1 = word1 ^ (word1 >> 8);
|
|
314
|
+
word0 += word1;
|
|
315
|
+
word1 = word_ptr32[4];
|
|
316
|
+
sum += word1;
|
|
317
|
+
word1 = word1 ^ (word1 >> 4);
|
|
318
|
+
word0 += word1;
|
|
319
|
+
word1 = word_ptr32[5] & kWordMask0[bytecount & 3];
|
|
320
|
+
sum += word1;
|
|
321
|
+
word1 = word1 ^ (word1 >> 6);
|
|
322
|
+
word0 += word1;
|
|
323
|
+
break;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3
|
|
327
|
+
sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3
|
|
328
|
+
sum = (sum & 0xff) << 32;
|
|
329
|
+
return (word0 ^ prepost) + sum;
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
// OCTAGRAM wrapper with surrounding spaces
|
|
333
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
|
334
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
335
|
+
//
|
|
336
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
|
337
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
|
338
|
+
// For runtime use of tables V3
|
|
339
|
+
uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
|
|
340
|
+
if (bytecount == 0) {
|
|
341
|
+
return 0;
|
|
342
|
+
}
|
|
343
|
+
uint64 prepost = 0;
|
|
344
|
+
if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
|
|
345
|
+
if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
|
|
346
|
+
return OctaHash40Mix(word_ptr, bytecount, prepost);
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
// OCTAGRAM wrapper with surrounding underscores (offline use)
|
|
351
|
+
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
|
|
352
|
+
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
|
|
353
|
+
//
|
|
354
|
+
// The low 32 bits follow the pattern from above, tuned to different scripts
|
|
355
|
+
// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
|
|
356
|
+
// For offline construction of tables
|
|
357
|
+
uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
|
|
358
|
+
if (bytecount == 0) {
|
|
359
|
+
return 0;
|
|
360
|
+
}
|
|
361
|
+
const char* local_word_ptr = word_ptr;
|
|
362
|
+
int local_bytecount = bytecount;
|
|
363
|
+
uint64 prepost = 0;
|
|
364
|
+
if (local_word_ptr[0] == '_') {
|
|
365
|
+
prepost |= kPreSpaceIndicator;
|
|
366
|
+
++local_word_ptr;
|
|
367
|
+
--local_bytecount;
|
|
368
|
+
}
|
|
369
|
+
if (local_word_ptr[local_bytecount - 1] == '_') {
|
|
370
|
+
prepost |= kPostSpaceIndicator;
|
|
371
|
+
--local_bytecount;
|
|
372
|
+
}
|
|
373
|
+
return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
//------------------------------------------------------------------------------
|
|
380
|
+
// Scoring single groups of letters
|
|
381
|
+
//------------------------------------------------------------------------------
|
|
382
|
+
|
|
383
|
+
// UNIGRAM score one => tote
|
|
384
|
+
// Input: 1-byte entry of subscript into unigram probs, plus
|
|
385
|
+
// an accumulator tote.
|
|
386
|
+
// Output: running sums in tote updated
|
|
387
|
+
void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
|
|
388
|
+
tote->AddGram();
|
|
389
|
+
const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
|
|
390
|
+
if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
|
|
391
|
+
if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
|
|
392
|
+
if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
|
|
393
|
+
if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
|
|
394
|
+
if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
|
|
395
|
+
if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
|
|
399
|
+
// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
|
|
400
|
+
// an accumulator tote. (language 0 means unused entry)
|
|
401
|
+
// Output: running sums in tote updated
|
|
402
|
+
void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
|
|
403
|
+
tote->AddGram();
|
|
404
|
+
uint8 prob123 = (probs >> 0) & 0xff;
|
|
405
|
+
const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
|
|
406
|
+
|
|
407
|
+
uint8 top1 = (probs >> 8) & 0xff;
|
|
408
|
+
if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
|
|
409
|
+
uint8 top2 = (probs >> 16) & 0xff;
|
|
410
|
+
if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
|
|
411
|
+
uint8 top3 = (probs >> 24) & 0xff;
|
|
412
|
+
if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
//------------------------------------------------------------------------------
|
|
417
|
+
// Routines to accumulate probabilities
|
|
418
|
+
//------------------------------------------------------------------------------
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
// UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
|
|
422
|
+
// Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
|
|
423
|
+
// Score up to n unigrams, returning number of bytes consumed
|
|
424
|
+
// Updates tote_grams
|
|
425
|
+
int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
|
|
426
|
+
const char* isrc, int srclen, int advance_by,
|
|
427
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote) {
|
|
428
|
+
const char* src = isrc;
|
|
429
|
+
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
|
|
430
|
+
|
|
431
|
+
// Property-based CJK unigram lookup
|
|
432
|
+
if (src[0] == ' ') {++src; --srclen;}
|
|
433
|
+
|
|
434
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
|
435
|
+
int usrclen = srclen;
|
|
436
|
+
|
|
437
|
+
while (usrclen > 0) {
|
|
438
|
+
int len = kAdvanceOneChar[usrc[0]];
|
|
439
|
+
// Look up property of one UTF-8 character and advance over it
|
|
440
|
+
// Return 0 if input length is zero
|
|
441
|
+
// Return 0 and advance one byte if input is ill-formed
|
|
442
|
+
|
|
443
|
+
int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
|
|
444
|
+
|
|
445
|
+
if (FLAGS_dbglookup) {
|
|
446
|
+
DbgUniTermToStderr(propval, usrc, len);
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
if (propval > 0) {
|
|
450
|
+
ProcessProbV25UniTote(propval, chunk_tote);
|
|
451
|
+
++(*tote_grams);
|
|
452
|
+
if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
// Advance by 1/2/4/8 characters (half of quad advance)
|
|
456
|
+
if (advance_by == 2) {
|
|
457
|
+
// Already advanced by 1
|
|
458
|
+
} else if (advance_by == 4) {
|
|
459
|
+
// Advance by 2 chars total, if not at end
|
|
460
|
+
if (UTFmax <= usrclen) {
|
|
461
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
462
|
+
}
|
|
463
|
+
} else if (advance_by == 8) {
|
|
464
|
+
// Advance by 4 chars total, if not at end
|
|
465
|
+
if ((UTFmax * 3) <= usrclen) {
|
|
466
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
467
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
468
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
469
|
+
}
|
|
470
|
+
} else {
|
|
471
|
+
// Advance by 8 chars total, if not at end
|
|
472
|
+
if ((UTFmax * 7) <= usrclen) {
|
|
473
|
+
int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
474
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
475
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
476
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
477
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
478
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
479
|
+
n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
DCHECK(usrclen >= 0);
|
|
483
|
+
|
|
484
|
+
if (*tote_grams >= gram_limit) {
|
|
485
|
+
break;
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
if (FLAGS_dbgscore) {
|
|
489
|
+
// With advance_by>2, we consume more input to get the same number of quads
|
|
490
|
+
int len = src - isrc;
|
|
491
|
+
DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
|
|
492
|
+
DbgScoreFlush();
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
|
|
496
|
+
return consumed2;
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
// BIGRAM, using hash table, always advancing by 1 char
|
|
501
|
+
// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
|
|
502
|
+
// Score all bigrams in isrc, using languages that have bigrams (CJK)
|
|
503
|
+
// Return number of bigrams that hit in the hash table
|
|
504
|
+
int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
|
|
505
|
+
const char* isrc, int srclen, Tote* chunk_tote) {
|
|
506
|
+
int hit_count = 0;
|
|
507
|
+
const char* src = isrc;
|
|
508
|
+
|
|
509
|
+
// Hashtable-based CJK bigram lookup
|
|
510
|
+
const uint8* usrc = reinterpret_cast<const uint8*>(src);
|
|
511
|
+
const uint8* usrclimit1 = usrc + srclen - UTFmax;
|
|
512
|
+
if (FLAGS_dbgscore) {
|
|
513
|
+
fprintf(stderr, " " );
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
while (usrc < usrclimit1) {
|
|
517
|
+
int len = kAdvanceOneChar[usrc[0]];
|
|
518
|
+
int len2 = kAdvanceOneChar[usrc[len]] + len;
|
|
519
|
+
|
|
520
|
+
if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
|
|
521
|
+
// Lookup and score this bigram
|
|
522
|
+
// Always ignore pre/post spaces
|
|
523
|
+
uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
|
|
524
|
+
uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
|
|
525
|
+
// Now go indirect on the subscript
|
|
526
|
+
probs = bigram_obj->kCLDTableInd[probs &
|
|
527
|
+
~bigram_obj->kCLDTableKeyMask];
|
|
528
|
+
|
|
529
|
+
// Process the bigram
|
|
530
|
+
if (FLAGS_dbglookup) {
|
|
531
|
+
const char* ssrc = reinterpret_cast<const char*>(usrc);
|
|
532
|
+
DbgBiTermToStderr(bihash, probs, ssrc, len2);
|
|
533
|
+
DbgScoreRecord(NULL, probs, len2);
|
|
534
|
+
} else if (FLAGS_dbgscore && (probs != 0)) {
|
|
535
|
+
const char* ssrc = reinterpret_cast<const char*>(usrc);
|
|
536
|
+
DbgScoreRecord(NULL, probs, len2);
|
|
537
|
+
string temp(ssrc, len2);
|
|
538
|
+
fprintf(stderr, "%s ", temp.c_str());
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
if (probs != 0) {
|
|
542
|
+
ProcessProbV25Tote(probs, chunk_tote);
|
|
543
|
+
++hit_count;
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
usrc += len; // Advance by one char
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
if (FLAGS_dbgscore) {
|
|
550
|
+
fprintf(stderr, "[%d bigrams scored]\n", hit_count);
|
|
551
|
+
DbgScoreState();
|
|
552
|
+
}
|
|
553
|
+
return hit_count;
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
// QUADGRAM, using hash table, advancing by 2/4/8/16 chars
|
|
559
|
+
// Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
|
|
560
|
+
// Score up to n quadgrams, returning number of bytes consumed
|
|
561
|
+
// Updates tote_grams
|
|
562
|
+
int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
|
|
563
|
+
const char* isrc, int srclen, int advance_by,
|
|
564
|
+
int* tote_grams, int gram_limit, Tote* chunk_tote) {
|
|
565
|
+
const char* src = isrc;
|
|
566
|
+
const char* srclimit = src + srclen;
|
|
567
|
+
// Limit is end, which has extra 20 20 20 00 past len
|
|
568
|
+
const char* srclimit7 = src + srclen - (UTFmax * 7);
|
|
569
|
+
const char* srclimit15 = src + srclen - (UTFmax * 15);
|
|
570
|
+
|
|
571
|
+
if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
|
|
572
|
+
|
|
573
|
+
// Run a little cache of last hits to catch overly-repetitive "text"
|
|
574
|
+
int next_prior = 0;
|
|
575
|
+
uint32 prior_quads[2] = {0, 0};
|
|
576
|
+
|
|
577
|
+
// Visit all quadgrams
|
|
578
|
+
if (src[0] == ' ') {++src;}
|
|
579
|
+
while (src < srclimit) {
|
|
580
|
+
// Find one quadgram
|
|
581
|
+
const char* src_end = src;
|
|
582
|
+
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
|
583
|
+
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
|
584
|
+
const char* src_mid = src_end;
|
|
585
|
+
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
|
586
|
+
src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
|
|
587
|
+
int len = src_end - src;
|
|
588
|
+
|
|
589
|
+
// Lookup and score this quadgram
|
|
590
|
+
uint32 quadhash = QuadHashV25(src, len);
|
|
591
|
+
uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
|
|
592
|
+
// Now go indirect on the subscript
|
|
593
|
+
probs = quadgram_obj->kCLDTableInd[probs &
|
|
594
|
+
~quadgram_obj->kCLDTableKeyMask];
|
|
595
|
+
|
|
596
|
+
// Process the quadgram
|
|
597
|
+
if (FLAGS_dbglookup) {
|
|
598
|
+
DbgQuadTermToStderr(quadhash, probs, src, len);
|
|
599
|
+
}
|
|
600
|
+
if (probs != 0) {
|
|
601
|
+
// Filter out recent repeats. If this works out, use in the other lookups
|
|
602
|
+
if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
|
|
603
|
+
prior_quads[next_prior] = quadhash;
|
|
604
|
+
next_prior = (next_prior + 1) & 1;
|
|
605
|
+
ProcessProbV25Tote(probs, chunk_tote);
|
|
606
|
+
++(*tote_grams);
|
|
607
|
+
if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
|
|
608
|
+
}
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
// Advance all the way past word if at end-of-word
|
|
612
|
+
if (src_end[0] == ' ') {
|
|
613
|
+
src_mid = src_end;
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
// Advance by 2/4/8/16 characters
|
|
617
|
+
if (advance_by == 2) {
|
|
618
|
+
src = src_mid;
|
|
619
|
+
} else if (advance_by == 4) {
|
|
620
|
+
src = src_end;
|
|
621
|
+
} else if (advance_by == 8) {
|
|
622
|
+
// Advance by 8 chars total (4 more), if not at end
|
|
623
|
+
if (src < srclimit7) {
|
|
624
|
+
src_end += kAdvanceOneChar[(uint8)src_end[0]];
|
|
625
|
+
src_end += kAdvanceOneChar[(uint8)src_end[0]];
|
|
626
|
+
src_end += kAdvanceOneChar[(uint8)src_end[0]];
|
|
627
|
+
src_end += kAdvanceOneChar[(uint8)src_end[0]];
|
|
628
|
+
}
|
|
629
|
+
src = src_end;
|
|
630
|
+
} else {
|
|
631
|
+
// Advance by 16 chars total (12 more), if not at end
|
|
632
|
+
if (src < srclimit15) {
|
|
633
|
+
// Advance by ~16 chars by adding 3 * current bytelen
|
|
634
|
+
int fourcharlen = src_end - src;
|
|
635
|
+
src = src_end + (3 * fourcharlen);
|
|
636
|
+
// Advance a bit more if mid-character
|
|
637
|
+
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
|
|
638
|
+
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
|
|
639
|
+
} else {
|
|
640
|
+
src = src_end;
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
DCHECK(src < srclimit);
|
|
644
|
+
src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
|
|
645
|
+
|
|
646
|
+
if (*tote_grams >= gram_limit) {
|
|
647
|
+
break;
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
if (FLAGS_dbgscore) {
|
|
652
|
+
// With advance_by>2, we consume more input to get the same number of quads
|
|
653
|
+
int len = src - isrc;
|
|
654
|
+
DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
|
|
655
|
+
DbgScoreFlush();
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
int consumed = src - isrc;
|
|
659
|
+
|
|
660
|
+
// If advancing by more than 2, src may have overshot srclimit
|
|
661
|
+
if (consumed > srclen) {
|
|
662
|
+
consumed = srclen;
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
return consumed;
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
// OCTAGRAM, using hash table, always advancing by 1 word
|
|
670
|
+
// Caller supplies table, such as &kLongWord8Table_obj
|
|
671
|
+
// Score all words in isrc, using languages that have quadgrams
|
|
672
|
+
// We don't normally use this routine except on the first quadgram run,
|
|
673
|
+
// but it can be used to resolve unreliable pages.
|
|
674
|
+
// This routine does not have an optimized advance_by
|
|
675
|
+
// SOON: Uses indirect language/probability longword
|
|
676
|
+
//
|
|
677
|
+
// Return number of words that hit in the hash table
|
|
678
|
+
int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
|
|
679
|
+
const char* isrc, int srclen, Tote* chunk_tote) {
|
|
680
|
+
int hit_count = 0;
|
|
681
|
+
const char* src = isrc;
|
|
682
|
+
const char* srclimit = src + srclen + 1;
|
|
683
|
+
// Limit is end+1, to include extra space char (0x20) off the end
|
|
684
|
+
//
|
|
685
|
+
// Score all words truncated to 8 characters
|
|
686
|
+
int charcount = 0;
|
|
687
|
+
// Skip any initial space
|
|
688
|
+
if (src[0] == ' ') {++src;}
|
|
689
|
+
const char* word_ptr = src;
|
|
690
|
+
const char* word_end = word_ptr;
|
|
691
|
+
if (FLAGS_dbgscore) {
|
|
692
|
+
fprintf(stderr, " " );
|
|
693
|
+
}
|
|
694
|
+
while (src < srclimit) {
|
|
695
|
+
// Terminate previous word or continue current word
|
|
696
|
+
if (src[0] == ' ') {
|
|
697
|
+
int bytecount = word_end - word_ptr;
|
|
698
|
+
if (bytecount == 0)
|
|
699
|
+
break;
|
|
700
|
+
// Lookup and score this word
|
|
701
|
+
uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
|
|
702
|
+
uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
|
|
703
|
+
// Now go indirect on the subscript
|
|
704
|
+
probs = octagram_obj->kCLDTableInd[probs &
|
|
705
|
+
~octagram_obj->kCLDTableKeyMask];
|
|
706
|
+
|
|
707
|
+
// // Lookup and score this word
|
|
708
|
+
// uint32 wordhash = QuadHashV25(word_ptr, bytecount);
|
|
709
|
+
// uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
|
|
710
|
+
// kLongWord8TableSize);
|
|
711
|
+
//
|
|
712
|
+
if (FLAGS_dbglookup) {
|
|
713
|
+
DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
|
|
714
|
+
DbgScoreRecord(NULL, probs, bytecount);
|
|
715
|
+
} else if (FLAGS_dbgscore && (probs != 0)) {
|
|
716
|
+
DbgScoreRecord(NULL, probs, bytecount);
|
|
717
|
+
string temp(word_ptr, bytecount);
|
|
718
|
+
fprintf(stderr, "%s ", temp.c_str());
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
if (probs != 0) {
|
|
722
|
+
ProcessProbV25Tote(probs, chunk_tote);
|
|
723
|
+
++hit_count;
|
|
724
|
+
}
|
|
725
|
+
charcount = 0;
|
|
726
|
+
word_ptr = src + 1; // Over the space
|
|
727
|
+
word_end = word_ptr;
|
|
728
|
+
} else {
|
|
729
|
+
++charcount;
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
// Advance to next char
|
|
733
|
+
src += cld_UniLib::OneCharLen(src);
|
|
734
|
+
if (charcount <= 8) {
|
|
735
|
+
word_end = src;
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
if (FLAGS_dbgscore) {
|
|
740
|
+
fprintf(stderr, "[%d words scored]\n", hit_count);
|
|
741
|
+
DbgScoreState();
|
|
742
|
+
}
|
|
743
|
+
return hit_count;
|
|
744
|
+
}
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
//------------------------------------------------------------------------------
|
|
749
|
+
// Reliability calculations, for single language and between languages
|
|
750
|
+
//------------------------------------------------------------------------------
|
|
751
|
+
|
|
752
|
+
// Return reliablity of result 0..100 for top two scores
|
|
753
|
+
// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
|
|
754
|
+
// (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
|
|
755
|
+
// Threshold is uni/quadgram increment count, bounded above and below.
|
|
756
|
+
//
|
|
757
|
+
// Requiring a factor of 3 improvement (e.g. +1 log base 3)
|
|
758
|
+
// for each scored quadgram is too stringent, so I've backed this off to a
|
|
759
|
+
// factor of 2 (e.g. +5/8 log base 3).
|
|
760
|
+
//
|
|
761
|
+
// I also somewhat lowered the Min/MaxGramCount limits above
|
|
762
|
+
//
|
|
763
|
+
// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
|
|
764
|
+
//
|
|
765
|
+
int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
|
|
766
|
+
int max_reliability_percent = 100;
|
|
767
|
+
if (gramcount < 8) {
|
|
768
|
+
max_reliability_percent = 12 * gramcount;
|
|
769
|
+
}
|
|
770
|
+
int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
|
|
771
|
+
if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
|
|
772
|
+
fully_reliable_thresh = kMinGramCount;
|
|
773
|
+
} else if (fully_reliable_thresh > kMaxGramCount) {
|
|
774
|
+
fully_reliable_thresh = kMaxGramCount;
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
int delta = value1 - value2;
|
|
778
|
+
if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
|
|
779
|
+
if (delta <= 0) {return 0;}
|
|
780
|
+
return cld::minint(max_reliability_percent,
|
|
781
|
+
(100 * delta) / fully_reliable_thresh);
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
// Return reliability of result 0..100 for top score vs. mainstream score
// Values are score per 1024 bytes of input
// ratio = max(top/mainstream, mainstream/top)
// Top at or below mainstream: ratio <= 2.0 is 100% reliable, > 4.0 is 0%
// Top above mainstream: ratio <= 3.0 is 100% reliable, > 5.0 is 0%
//   (short-text word scoring can give unusually good results, so
//    top may exceed mainstream by 4x at 50% reliable)
// In between, reliability ramps down linearly.
//
// topscore:   raw score of the top language
// len:        number of input bytes that produced topscore
// mean_score: expected score per 1024 bytes; 0 means no data available
int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
  if (mean_score == 0) {return 100;}    // No reliability data available yet
  if (topscore == 0) {return 0;}        // zero score = unreliable
  if (len == 0) {return 0;}             // zero len = unreliable
  int top_kb = (topscore << 10) / len;
  double ratio;
  double ratio_cutoff;
  if (top_kb > mean_score) {
    ratio = (1.0 * top_kb) / mean_score;
    ratio_cutoff = 5.0;     // ramp down from 100% to 0%: 3.0-5.0
  } else {
    // Integer division above can truncate a small score over a long input
    // to zero. Say "unreliable" explicitly rather than dividing by zero;
    // this is the same answer an infinite ratio would give.
    if (top_kb == 0) {return 0;}
    ratio = (1.0 * mean_score) / top_kb;
    ratio_cutoff = 4.0;     // ramp down from 100% to 0%: 2.0-4.0
  }
  if (ratio <= ratio_cutoff - 2.0) {return 100;}
  if (ratio > ratio_cutoff) {return 0;}

  int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
  return iratio;
}
|
|
810
|
+
|
|
811
|
+
// Calculate ratio of score per 1KB vs. expected score per 1KB
|
|
812
|
+
double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
|
|
813
|
+
int bytes, int score) {
|
|
814
|
+
// Average training-data score for this language-script combo, per 1KB
|
|
815
|
+
int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
|
|
816
|
+
if (lscript == ULScript_Common) {
|
|
817
|
+
// We don't know the script (only happens with second-chance score)
|
|
818
|
+
// Look for first non-zero mean value
|
|
819
|
+
for (int i = 0; i < 3; ++i) {
|
|
820
|
+
if (kMeanScore[lang * 4 + i] > 0) {
|
|
821
|
+
expected_score = kMeanScore[lang * 4 + i];
|
|
822
|
+
}
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
if (expected_score < 100) {
|
|
826
|
+
expected_score = 1000;
|
|
827
|
+
}
|
|
828
|
+
|
|
829
|
+
// Our score per 1KB
|
|
830
|
+
double our_score = (score << 10) / (bytes ? bytes : 1); // Avoid zdiv
|
|
831
|
+
double ratio = our_score / expected_score;
|
|
832
|
+
|
|
833
|
+
// Just the raw count normalized as though each language has mean=1000;
|
|
834
|
+
ratio = (score * 1000.0) / expected_score;
|
|
835
|
+
return ratio;
|
|
836
|
+
}
|
|
837
|
+
|
|
838
|
+
// Calculate reliablity of len bytes of script lscript with chunk_tote
|
|
839
|
+
int cld::GetReliability(int len, UnicodeLScript lscript,
|
|
840
|
+
const Tote* chunk_tote) {
|
|
841
|
+
Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
|
|
842
|
+
// Average score for this language-script combo
|
|
843
|
+
int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
|
|
844
|
+
if (lscript == ULScript_Common) {
|
|
845
|
+
// We don't know the script (only happens with second-chance score)
|
|
846
|
+
// Look for first non-zero mean value
|
|
847
|
+
for (int i = 0; i < 3; ++i) {
|
|
848
|
+
if (kMeanScore[cur_lang * 4 + i] > 0) {
|
|
849
|
+
mean_score = kMeanScore[cur_lang * 4 + i];
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
|
|
854
|
+
chunk_tote->Value(1),
|
|
855
|
+
chunk_tote->GetGramCount());
|
|
856
|
+
|
|
857
|
+
int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
|
|
858
|
+
len,
|
|
859
|
+
mean_score);
|
|
860
|
+
|
|
861
|
+
int reliability_min = minint(reliability_delta, reliability_main);
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
if (FLAGS_dbgreli) {
|
|
865
|
+
char temp1[4];
|
|
866
|
+
char temp2[4];
|
|
867
|
+
cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
|
|
868
|
+
if (temp1[2] == ' ') {temp1[2] = '\0';}
|
|
869
|
+
cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
|
|
870
|
+
if (temp2[2] == ' ') {temp2[2] = '\0';}
|
|
871
|
+
int srclen = len;
|
|
872
|
+
fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d "
|
|
873
|
+
"top/KB=%d mean/KB=%d del=%d%% reli=%d%% "
|
|
874
|
+
"lang/lscript %d %d\n",
|
|
875
|
+
chunk_tote->GetGramCount(),
|
|
876
|
+
chunk_tote->GetIncrCount(),
|
|
877
|
+
srclen,
|
|
878
|
+
temp1, chunk_tote->Value(0),
|
|
879
|
+
temp2, chunk_tote->Value(1),
|
|
880
|
+
(chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
|
|
881
|
+
mean_score,
|
|
882
|
+
reliability_delta,
|
|
883
|
+
reliability_main,
|
|
884
|
+
cur_lang, lscript);
|
|
885
|
+
}
|
|
886
|
+
|
|
887
|
+
return reliability_min;
|
|
888
|
+
}
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
//------------------------------------------------------------------------------
|
|
892
|
+
// Miscellaneous
|
|
893
|
+
//------------------------------------------------------------------------------
|
|
894
|
+
|
|
895
|
+
// Demote all languages except Top40 and plus_one
|
|
896
|
+
// Do this just before sorting chunk_tote results
|
|
897
|
+
void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
|
|
898
|
+
for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
|
|
899
|
+
if (chunk_tote->Key(sub) == 0) continue;
|
|
900
|
+
if (chunk_tote->Key(sub) == packed_plus_one) continue;
|
|
901
|
+
if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
|
|
902
|
+
// Quarter the score of others
|
|
903
|
+
chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
|
|
904
|
+
}
|
|
905
|
+
}
|