cld 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. data/LICENSE +27 -0
  2. data/Manifest +106 -0
  3. data/README.rdoc +173 -0
  4. data/Rakefile +15 -0
  5. data/base/basictypes.h +348 -0
  6. data/base/build_config.h +115 -0
  7. data/base/casts.h +156 -0
  8. data/base/commandlineflags.h +443 -0
  9. data/base/crash.h +41 -0
  10. data/base/dynamic_annotations.h +358 -0
  11. data/base/global_strip_options.h +59 -0
  12. data/base/log_severity.h +46 -0
  13. data/base/logging.h +1403 -0
  14. data/base/macros.h +243 -0
  15. data/base/port.h +54 -0
  16. data/base/scoped_ptr.h +428 -0
  17. data/base/stl_decl.h +0 -0
  18. data/base/stl_decl_msvc.h +107 -0
  19. data/base/string_util.h +29 -0
  20. data/base/strtoint.h +93 -0
  21. data/base/template_util.h +96 -0
  22. data/base/type_traits.h +198 -0
  23. data/base/vlog_is_on.h +143 -0
  24. data/build.sh +48 -0
  25. data/build.win.cmd +28 -0
  26. data/cld.gemspec +30 -0
  27. data/cld_encodings.h +95 -0
  28. data/encodings/compact_lang_det/#cldutil.cc# +905 -0
  29. data/encodings/compact_lang_det/#cldutil.h# +1205 -0
  30. data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  31. data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  32. data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  33. data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  34. data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  35. data/encodings/compact_lang_det/#tote.cc# +299 -0
  36. data/encodings/compact_lang_det/#tote.h# +89 -0
  37. data/encodings/compact_lang_det/cldutil.cc +905 -0
  38. data/encodings/compact_lang_det/cldutil.h +1205 -0
  39. data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  40. data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  41. data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  42. data/encodings/compact_lang_det/compact_lang_det.h +145 -0
  43. data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  44. data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  45. data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  46. data/encodings/compact_lang_det/compile.cmd +1 -0
  47. data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  48. data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  49. data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  50. data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  51. data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  52. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  53. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  54. data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  55. data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  56. data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  57. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  58. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  59. data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  60. data/encodings/compact_lang_det/getonescriptspan.h +131 -0
  61. data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  62. data/encodings/compact_lang_det/letterscript_enum.h +99 -0
  63. data/encodings/compact_lang_det/subsetsequence.cc +259 -0
  64. data/encodings/compact_lang_det/subsetsequence.h +44 -0
  65. data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  66. data/encodings/compact_lang_det/tote.cc +299 -0
  67. data/encodings/compact_lang_det/tote.h +89 -0
  68. data/encodings/compact_lang_det/unittest_data.h +193 -0
  69. data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  70. data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  71. data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  72. data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  73. data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  74. data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  75. data/encodings/compact_lang_det/win/cld_google.h +18 -0
  76. data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  77. data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  78. data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  79. data/encodings/compact_lang_det/win/cld_logging.h +21 -0
  80. data/encodings/compact_lang_det/win/cld_macros.h +19 -0
  81. data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  82. data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  83. data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  84. data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  85. data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  86. data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  87. data/encodings/compact_lang_det/win/cld_utf.h +24 -0
  88. data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  89. data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  90. data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  91. data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  92. data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  93. data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  94. data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  95. data/encodings/internal/encodings.cc +12 -0
  96. data/encodings/lang_enc.h +254 -0
  97. data/encodings/proto/encodings.pb.h +169 -0
  98. data/encodings/public/encodings.h +301 -0
  99. data/ext/cld/extconf.rb +7 -0
  100. data/languages/internal/#languages.cc# +337 -0
  101. data/languages/internal/languages.cc +337 -0
  102. data/languages/proto/languages.pb.h +179 -0
  103. data/languages/public/languages.h +379 -0
  104. data/lib/cld.rb +12 -0
  105. data/test/test.rb +570 -0
  106. data/thunk.cc +131 -0
  107. metadata +168 -0
@@ -0,0 +1,143 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Defines the VLOG_IS_ON macro that controls the variable-verbosity
6
+ // conditional logging.
7
+ //
8
+ // It's used by VLOG and VLOG_IF in logging.h
9
+ // and by RAW_VLOG in raw_logging.h to trigger the logging.
10
+ //
11
+ // It can also be used directly e.g. like this:
12
+ // if (VLOG_IS_ON(2)) {
13
+ // // do some logging preparation and logging
14
+ // // that can't be accomplished e.g. via just VLOG(2) << ...;
15
+ // }
16
+ //
17
+ // The truth value that VLOG_IS_ON(level) returns is determined by
18
+ // the three verbosity level flags:
19
+ // --v=<n> Gives the default maximal active V-logging level;
20
+ // 0 is the default.
21
+ // Normally positive values are used for V-logging levels.
22
+ // --vmodule=<str> Gives the per-module maximal V-logging levels to override
23
+ // the value given by --v.
24
+ // E.g. "my_module=2,foo*=3" would change the logging level
25
+ // for all code in source files "my_module.*" and "foo*.*"
26
+ // ("-inl" suffixes are also disregarded for this matching).
27
+ // --silent_init When true has the effect of increasing
28
+ // the argument of VLOG_IS_ON by 1,
29
+ // thus suppressing one more level of verbose logging.
30
+ //
31
+ // SetVLOGLevel helper function is provided to do limited dynamic control over
32
+ // V-logging by overriding the per-module settings given via --vmodule flag.
33
+ //
34
+ // CAVEAT: --vmodule functionality is not available in non gcc compilers.
35
+ //
36
+
37
+ #ifndef BASE_VLOG_IS_ON_H_
38
+ #define BASE_VLOG_IS_ON_H_
39
+
40
+ #include "base/atomicops.h"
41
+ #include "base/basictypes.h"
42
+ #include "base/port.h"
43
+ #include "base/commandlineflags.h"
44
+ #include "base/log_severity.h"
45
+
46
+ DECLARE_int32(v); // in vlog_is_on.cc
47
+ DECLARE_bool(silent_init); // in google.cc
48
+
49
+ #if defined(__GNUC__)
50
+ // We pack an int16 verbosity level and an int16 epoch into an
51
+ // Atomic32 at every VLOG_IS_ON() call site. The level determines
52
+ // whether the site should log, and the epoch determines whether the
53
+ // site is stale and should be reinitialized. A verbosity level of
54
+ // kUseFlag (-1) indicates that the value of FLAGS_v should be used as
55
+ // the verbosity level. When the site is (re)initialized, a verbosity
56
+ // level for the current source file is retrieved from an internal
57
+ // list. This list is mutated through calls to SetVLOGLevel() and
58
+ // mutations to the --vmodule flag. New log sites are initialized
59
+ // with a stale epoch and a verbosity level of kUseFlag.
60
+ //
61
+ // TODO(llansing): Investigate using GCC's __builtin_constant_p() to
62
+ // generate less code at call sites where verboselevel is known to
63
+ // be a compile-time constant.
64
+ #define VLOG_IS_ON(verboselevel) \
65
+ ({ static Atomic32 site__ = ::base::internal::kDefaultSite; \
66
+ ::base::internal::VLogEnabled(&site__, (verboselevel), __FILE__); })
67
+ #else
68
+ // GNU extensions not available, so we do not support --vmodule.
69
+ // Dynamic value of FLAGS_v always controls the logging level.
70
+ //
71
+ // TODO(llansing): Investigate supporting --vmodule on other platforms.
72
+ #define VLOG_IS_ON(verboselevel) \
73
+ (FLAGS_v >= (verboselevel) + FLAGS_silent_init)
74
+ #endif
75
+
76
+ // Set VLOG(_IS_ON) level for module_pattern to log_level.
77
+ // This lets us dynamically control what is normally set by the --vmodule flag.
78
+ // Returns the level that previously applied to module_pattern.
79
+ // NOTE: To change the log level for VLOG(_IS_ON) sites
80
+ // that have already executed after/during InitGoogle,
81
+ // one needs to supply the exact --vmodule pattern that applied to them.
82
+ // (If no --vmodule pattern applied to them
83
+ // the value of FLAGS_v will continue to control them.)
84
+ int SetVLOGLevel(const char* module_pattern, int log_level);
85
+
86
+ // Private implementation details. No user-serviceable parts inside.
87
+ namespace base {
88
+ namespace internal {
89
+
90
+ // Each log site determines whether its log level is up to date by
91
+ // comparing its epoch to this global epoch. Whenever the program's
92
+ // vmodule configuration changes (ex: SetVLOGLevel is called), the
93
+ // global epoch is advanced, invalidating all site epochs.
94
+ extern Atomic32 vlog_epoch;
95
+
96
+ // A log level of kUseFlag means "read the logging level from FLAGS_v."
97
+ const int kUseFlag = -1;
98
+
99
+ // Log sites use FLAGS_v by default, and have an initial epoch of 0.
100
+ const Atomic32 kDefaultSite = kUseFlag << 16;
101
+
102
+ // The global epoch is the least significant half of an Atomic32, and
103
+ // may only be accessed through atomic operations.
104
+ inline Atomic32 GlobalEpoch() {
105
+ return Acquire_Load(&vlog_epoch) & 0x0000FFFF;  // load via Acquire_Load, then keep only the low 16 bits (the epoch half)
106
+ }
107
+
108
+ // The least significant half of a site is the epoch.
109
+ inline int SiteEpoch(Atomic32 site) { return site & 0x0000FFFF; }  // mask keeps the low 16 bits of the packed site word
110
+
111
+ // The most significant half of a site is the logging level.
112
+ inline int SiteLevel(Atomic32 site) { return site >> 16; }  // NOTE(review): kUseFlag (-1) only round-trips if >> on a negative Atomic32 is an arithmetic shift -- confirm for the target compilers
113
+
114
+ // Construct a logging site from a logging level and epoch.
115
+ inline Atomic32 Site(int level, int epoch) {
116
+ return ((level & 0x0000FFFF) << 16) | (epoch & 0x0000FFFF);  // level truncated to 16 bits goes in the high half, epoch truncated to 16 bits in the low half
117
+ }
118
+
119
+ // Attempt to initialize or reinitialize a VLOG site. Returns the
120
+ // level of the log site, regardless of whether the attempt succeeds
121
+ // or fails.
122
+ // site: The address of the log site's state.
123
+ // fname: The filename of the current source file.
124
+ int InitVLOG(Atomic32* site, const char* fname);
125
+
126
+ // Determine whether verbose logging should occur at a given log site.
127
+ //
128
+ // TODO(llansing): Find a way to eliminate FLAGS_silent_init from this
129
+ // function while preserving the silent initialization behavior. The
130
+ // common-case code path shouldn't pay for silent initialization.
131
+ inline bool VLogEnabled(Atomic32* site, int32 level, const char* const file) {
132
+ const Atomic32 site_copy = Acquire_Load(site);  // snapshot the packed level/epoch word once
133
+ const int32 site_level =
134
+ PREDICT_TRUE(SiteEpoch(site_copy) == GlobalEpoch()) ?  // epochs match: the level cached in the site is current
135
+ SiteLevel(site_copy) : InitVLOG(site, file);  // otherwise InitVLOG supplies the level for this file
136
+ return (site_level == kUseFlag ? FLAGS_v : site_level) >=  // kUseFlag defers to the global FLAGS_v
137
+ (level + FLAGS_silent_init);  // FLAGS_silent_init, when true, raises the required level by one
138
+ }
139
+
140
+ } // namespace internal
141
+ } // namespace base
142
+
143
+ #endif // BASE_VLOG_IS_ON_H_
@@ -0,0 +1,48 @@
1
+ #!/bin/bash
2
+
3
+ CFLAGS="-fPIC -I. -O2 -DCLD_WINDOWS"
4
+ LDFLAGS=-L.
5
+ CC=g++
6
+ AR=ar
7
+
8
+ rm -f *.o
9
+ rm -f libcld.a
10
+
11
+ SOURCES="encodings/compact_lang_det/cldutil.cc \
12
+ encodings/compact_lang_det/cldutil_dbg_empty.cc \
13
+ encodings/compact_lang_det/compact_lang_det.cc \
14
+ encodings/compact_lang_det/compact_lang_det_impl.cc \
15
+ encodings/compact_lang_det/ext_lang_enc.cc \
16
+ encodings/compact_lang_det/getonescriptspan.cc \
17
+ encodings/compact_lang_det/letterscript_enum.cc \
18
+ encodings/compact_lang_det/tote.cc \
19
+ encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc \
20
+ encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc \
21
+ encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc \
22
+ encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc \
23
+ encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc \
24
+ encodings/compact_lang_det/win/cld_htmlutils_windows.cc \
25
+ encodings/compact_lang_det/win/cld_unilib_windows.cc \
26
+ encodings/compact_lang_det/win/cld_utf8statetable.cc \
27
+ encodings/compact_lang_det/win/cld_utf8utils_windows.cc \
28
+ encodings/internal/encodings.cc \
29
+ languages/internal/languages.cc \
30
+ thunk.cc"
31
+
32
+ #encodings/compact_lang_det/win/cld_unicodetext.cc \
33
+
34
+ echo
35
+ echo "Compile..."
36
+ $CC -c $CFLAGS $SOURCES
37
+
38
+ echo
39
+ echo "Make libcld.a"
40
+ $AR rcs libcld.a *.o
41
+
42
+ echo
43
+ #$CC -DCLD_WINDOWS -I. -L. -o example example.cc -lcld -lstdc++
44
+ $CC -DCLD_WINDOWS -I. -L. -o cld.so -lcld -lstdc++ *.o
45
+
46
+
47
+ echo
48
+ echo "Done!"
@@ -0,0 +1,28 @@
1
+ REM "c:\Program Files\Microsoft Visual Studio 8\vc\vcvarsall.bat"
2
+
3
+ set CFLAGS=/nologo /I. /O2 /DCLD_WINDOWS /DWIN32 /EHsc
4
+ set LDFLAGS=-L.
5
+ set CC=cl.exe
6
+ set AR=lib.exe
7
+
8
+ del *.obj
9
+ del libcld.lib
10
+
11
+ set SOURCES=encodings/compact_lang_det/cldutil.cc encodings/compact_lang_det/cldutil_dbg_empty.cc encodings/compact_lang_det/compact_lang_det.cc encodings/compact_lang_det/compact_lang_det_impl.cc encodings/compact_lang_det/ext_lang_enc.cc encodings/compact_lang_det/getonescriptspan.cc encodings/compact_lang_det/letterscript_enum.cc encodings/compact_lang_det/tote.cc encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc encodings/compact_lang_det/win/cld_htmlutils_windows.cc encodings/compact_lang_det/win/cld_unilib_windows.cc encodings/compact_lang_det/win/cld_utf8statetable.cc encodings/compact_lang_det/win/cld_utf8utils_windows.cc encodings/internal/encodings.cc languages/internal/languages.cc
12
+
13
+ REM encodings/compact_lang_det/win/cld_unicodetext.cc \
14
+
15
+ echo ""
16
+ echo "Compile..."
17
+ %CC% /c %CFLAGS% %SOURCES%
18
+
19
+ echo ""
20
+ echo "Make libcld"
21
+ %AR% *.obj -OUT:libcld.lib
22
+
23
+ echo ""
24
+ echo "Compile example.cc"
25
+ %CC% %CFLAGS% %LFLAGS% example.cc libcld.lib
26
+
27
+ echo
28
+ echo "Done!"
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = %q{cld}
5
+ s.version = "0.1.0"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
+ s.authors = [%q{Jason Toy}]
9
+ s.date = %q{2011-11-03}
10
+ s.description = %q{Compact Language Detection from chrome}
11
+ s.email = %q{jtoy@jtoy.net}
12
+ s.extensions = [%q{ext/cld/extconf.rb}]
13
+ s.extra_rdoc_files = [%q{LICENSE}, %q{README.rdoc}, %q{ext/cld/extconf.rb}, %q{lib/cld.rb}]
14
+ s.files = [%q{LICENSE}, %q{README.rdoc}, %q{Rakefile}, %q{base/basictypes.h}, %q{base/build_config.h}, %q{base/casts.h}, %q{base/commandlineflags.h}, %q{base/crash.h}, %q{base/dynamic_annotations.h}, %q{base/global_strip_options.h}, %q{base/log_severity.h}, %q{base/logging.h}, %q{base/macros.h}, %q{base/port.h}, %q{base/scoped_ptr.h}, %q{base/stl_decl.h}, %q{base/stl_decl_msvc.h}, %q{base/string_util.h}, %q{base/strtoint.h}, %q{base/template_util.h}, %q{base/type_traits.h}, %q{base/vlog_is_on.h}, %q{build.sh}, %q{build.win.cmd}, %q{cld.gemspec}, %q{cld_encodings.h}, %q{encodings/compact_lang_det/#cldutil.cc#}, %q{encodings/compact_lang_det/#cldutil.h#}, %q{encodings/compact_lang_det/#compact_lang_det_impl.h#}, %q{encodings/compact_lang_det/#ext_lang_enc.cc#}, %q{encodings/compact_lang_det/#ext_lang_enc.h#}, %q{encodings/compact_lang_det/#getonescriptspan.cc#}, %q{encodings/compact_lang_det/#getonescriptspan.h#}, %q{encodings/compact_lang_det/#tote.cc#}, %q{encodings/compact_lang_det/#tote.h#}, %q{encodings/compact_lang_det/cldutil.cc}, %q{encodings/compact_lang_det/cldutil.h}, %q{encodings/compact_lang_det/cldutil_dbg.h}, %q{encodings/compact_lang_det/cldutil_dbg_empty.cc}, %q{encodings/compact_lang_det/compact_lang_det.cc}, %q{encodings/compact_lang_det/compact_lang_det.h}, %q{encodings/compact_lang_det/compact_lang_det_impl.cc}, %q{encodings/compact_lang_det/compact_lang_det_impl.h}, %q{encodings/compact_lang_det/compact_lang_det_unittest_small.cc}, %q{encodings/compact_lang_det/compile.cmd}, %q{encodings/compact_lang_det/ext_lang_enc.cc}, %q{encodings/compact_lang_det/ext_lang_enc.h}, %q{encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc}, %q{encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc}, 
%q{encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc}, %q{encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc}, %q{encodings/compact_lang_det/getonescriptspan.cc}, %q{encodings/compact_lang_det/getonescriptspan.h}, %q{encodings/compact_lang_det/letterscript_enum.cc}, %q{encodings/compact_lang_det/letterscript_enum.h}, %q{encodings/compact_lang_det/subsetsequence.cc}, %q{encodings/compact_lang_det/subsetsequence.h}, %q{encodings/compact_lang_det/subsetsequence_unittest.cc}, %q{encodings/compact_lang_det/tote.cc}, %q{encodings/compact_lang_det/tote.h}, %q{encodings/compact_lang_det/unittest_data.h}, %q{encodings/compact_lang_det/utf8propjustletter.h}, %q{encodings/compact_lang_det/utf8propletterscriptnum.h}, %q{encodings/compact_lang_det/utf8scannotjustletterspecial.h}, %q{encodings/compact_lang_det/win/#cld_unilib_windows.cc#}, %q{encodings/compact_lang_det/win/cld_basictypes.h}, %q{encodings/compact_lang_det/win/cld_commandlineflags.h}, %q{encodings/compact_lang_det/win/cld_google.h}, %q{encodings/compact_lang_det/win/cld_htmlutils.h}, %q{encodings/compact_lang_det/win/cld_htmlutils_google3.cc}, %q{encodings/compact_lang_det/win/cld_htmlutils_windows.cc}, %q{encodings/compact_lang_det/win/cld_logging.h}, %q{encodings/compact_lang_det/win/cld_macros.h}, %q{encodings/compact_lang_det/win/cld_strtoint.h}, %q{encodings/compact_lang_det/win/cld_unicodetext.cc}, %q{encodings/compact_lang_det/win/cld_unicodetext.h}, %q{encodings/compact_lang_det/win/cld_unilib.h}, %q{encodings/compact_lang_det/win/cld_unilib_google3.cc}, %q{encodings/compact_lang_det/win/cld_unilib_windows.cc}, 
%q{encodings/compact_lang_det/win/cld_utf.h}, %q{encodings/compact_lang_det/win/cld_utf8statetable.cc}, %q{encodings/compact_lang_det/win/cld_utf8statetable.h}, %q{encodings/compact_lang_det/win/cld_utf8utils.h}, %q{encodings/compact_lang_det/win/cld_utf8utils_google3.cc}, %q{encodings/compact_lang_det/win/cld_utf8utils_windows.cc}, %q{encodings/compact_lang_det/win/normalizedunicodetext.cc}, %q{encodings/compact_lang_det/win/normalizedunicodetext.h}, %q{encodings/internal/encodings.cc}, %q{encodings/lang_enc.h}, %q{encodings/proto/encodings.pb.h}, %q{encodings/public/encodings.h}, %q{ext/cld/extconf.rb}, %q{languages/internal/#languages.cc#}, %q{languages/internal/languages.cc}, %q{languages/proto/languages.pb.h}, %q{languages/public/languages.h}, %q{lib/cld.rb}, %q{test/test.rb}, %q{thunk.cc}, %q{Manifest}]
15
+ s.homepage = %q{http://github.com/jtoy/cld}
16
+ s.rdoc_options = [%q{--line-numbers}, %q{--inline-source}, %q{--title}, %q{Cld}, %q{--main}, %q{README.rdoc}]
17
+ s.require_paths = [%q{lib}, %q{ext}]
18
+ s.rubyforge_project = %q{cld}
19
+ s.rubygems_version = %q{1.8.6.1}
20
+ s.summary = %q{Compact Language Detection from chrome}
21
+
22
+ if s.respond_to? :specification_version then
23
+ s.specification_version = 3
24
+
25
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
26
+ else
27
+ end
28
+ else
29
+ end
30
+ end
@@ -0,0 +1,95 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // MKM: I copied this from the PHP port
6
+ // (https://github.com/lstrojny/php-cld/blob/master/cld_encodings.h);
7
+ // it just copies the encodings out of encodings.pb.h and
8
+ // gives them matching string constants
9
+
10
+ #include "encodings/proto/encodings.pb.h"
11
+
12
+ struct cld_encoding {
13
+ const char *name;  // the encoding's identifier spelled out as a C string
14
+ Encoding encoding;  // the corresponding Encoding value from encodings/proto/encodings.pb.h
15
+ };
16
+
17
+ const cld_encoding cld_encoding_info[] = {
18
+ {"ISO_8859_1", ISO_8859_1},
19
+ {"ISO_8859_2", ISO_8859_2},
20
+ {"ISO_8859_3", ISO_8859_3},
21
+ {"ISO_8859_4", ISO_8859_4},
22
+ {"ISO_8859_5", ISO_8859_5},
23
+ {"ISO_8859_6", ISO_8859_6},
24
+ {"ISO_8859_7", ISO_8859_7},
25
+ {"ISO_8859_8", ISO_8859_8},
26
+ {"ISO_8859_9", ISO_8859_9},
27
+ {"ISO_8859_10", ISO_8859_10},
28
+ {"JAPANESE_EUC_JP", JAPANESE_EUC_JP},
29
+ {"JAPANESE_SHIFT_JIS", JAPANESE_SHIFT_JIS},
30
+ {"JAPANESE_JIS", JAPANESE_JIS},
31
+ {"CHINESE_BIG5", CHINESE_BIG5},
32
+ {"CHINESE_GB", CHINESE_GB},
33
+ {"CHINESE_EUC_CN", CHINESE_EUC_CN},
34
+ {"KOREAN_EUC_KR", KOREAN_EUC_KR},
35
+ {"UNICODE", UNICODE},
36
+ {"CHINESE_EUC_DEC", CHINESE_EUC_DEC},
37
+ {"CHINESE_CNS", CHINESE_CNS},
38
+ {"CHINESE_BIG5_CP950", CHINESE_BIG5_CP950},
39
+ {"JAPANESE_CP932", JAPANESE_CP932},
40
+ {"UTF8", UTF8},
41
+ {"UNKNOWN_ENCODING", UNKNOWN_ENCODING},
42
+ {"ASCII_7BIT", ASCII_7BIT},
43
+ {"RUSSIAN_KOI8_R", RUSSIAN_KOI8_R},
44
+ {"RUSSIAN_CP1251", RUSSIAN_CP1251},
45
+ {"MSFT_CP1252", MSFT_CP1252},
46
+ {"RUSSIAN_KOI8_RU", RUSSIAN_KOI8_RU},
47
+ {"MSFT_CP1250", MSFT_CP1250},
48
+ {"ISO_8859_15", ISO_8859_15},
49
+ {"MSFT_CP1254", MSFT_CP1254},
50
+ {"MSFT_CP1257", MSFT_CP1257},
51
+ {"ISO_8859_11", ISO_8859_11},
52
+ {"MSFT_CP874", MSFT_CP874},
53
+ {"MSFT_CP1256", MSFT_CP1256},
54
+ {"MSFT_CP1255", MSFT_CP1255},
55
+ {"ISO_8859_8_I", ISO_8859_8_I},
56
+ {"HEBREW_VISUAL", HEBREW_VISUAL},
57
+ {"CZECH_CP852", CZECH_CP852},
58
+ {"CZECH_CSN_369103", CZECH_CSN_369103},
59
+ {"MSFT_CP1253", MSFT_CP1253},
60
+ {"RUSSIAN_CP866", RUSSIAN_CP866},
61
+ {"ISO_8859_13", ISO_8859_13},
62
+ {"ISO_2022_KR", ISO_2022_KR},
63
+ {"GBK", GBK},
64
+ {"GB18030", GB18030},
65
+ {"BIG5_HKSCS", BIG5_HKSCS},
66
+ {"ISO_2022_CN", ISO_2022_CN},
67
+ {"TSCII", TSCII},
68
+ {"TAMIL_MONO", TAMIL_MONO},
69
+ {"TAMIL_BI", TAMIL_BI},
70
+ {"JAGRAN", JAGRAN},
71
+ {"MACINTOSH_ROMAN", MACINTOSH_ROMAN},
72
+ {"UTF7", UTF7},
73
+ {"BHASKAR", BHASKAR},
74
+ {"HTCHANAKYA", HTCHANAKYA},
75
+ {"UTF16BE", UTF16BE},
76
+ {"UTF16LE", UTF16LE},
77
+ {"UTF32BE", UTF32BE},
78
+ {"UTF32LE", UTF32LE},
79
+ {"BINARYENC", BINARYENC},
80
+ {"HZ_GB_2312", HZ_GB_2312},
81
+ {"UTF8UTF8", UTF8UTF8},
82
+ {"TAM_ELANGO", TAM_ELANGO},
83
+ {"TAM_LTTMBARANI", TAM_LTTMBARANI},
84
+ {"TAM_SHREE", TAM_SHREE},
85
+ {"TAM_TBOOMIS", TAM_TBOOMIS},
86
+ {"TAM_TMNEWS", TAM_TMNEWS},
87
+ {"TAM_WEBTAMIL", TAM_WEBTAMIL},
88
+ {"KDDI_SHIFT_JIS", KDDI_SHIFT_JIS},
89
+ {"DOCOMO_SHIFT_JIS", DOCOMO_SHIFT_JIS},
90
+ {"SOFTBANK_SHIFT_JIS", SOFTBANK_SHIFT_JIS},
91
+ {"KDDI_ISO_2022_JP", KDDI_ISO_2022_JP},
92
+ {"SOFTBANK_ISO_2022_JP", SOFTBANK_ISO_2022_JP},
93
+ };
94
+
95
+ COMPILE_ASSERT(arraysize(cld_encoding_info) == NUM_ENCODINGS, cld_encoding_info_length_is_wrong);
@@ -0,0 +1,905 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include <string>
6
+ #include "encodings/compact_lang_det/cldutil.h"
7
+ #include "encodings/compact_lang_det/cldutil_dbg.h"
8
+ #include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
9
+ #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
10
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
11
+ #include "encodings/compact_lang_det/win/cld_logging.h"
12
+ #include "encodings/compact_lang_det/win/cld_unilib.h"
13
+ #include "encodings/compact_lang_det/win/cld_utf.h"
14
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
15
+
16
+ // Runtime routines for hashing, looking up, and scoring
17
+ // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
18
+ // Unigrams and bigrams are for CJK languages only, including simplified/
19
+ // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
20
+ // Zhuang Han characters. Surrounding spaces are not considered.
21
+ // Quadgrams and octagrams for for non-CJK and include two bits indicating
22
+ // preceding and trailing spaces (word boundaries).
23
+
24
+
25
+ // Indicator bits for leading/trailing space around quad/octagram
26
+ // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
27
+ // 1-, 2-, or 3-bytes each.
28
+ static const uint32 kPreSpaceIndicator = 0x00004444;
29
+ static const uint32 kPostSpaceIndicator = 0x44440000;
30
+
31
+ // Little-endian masks for 0..24 bytes picked up as uint32's
32
+ static const uint32 kWordMask0[4] = {
33
+ 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
34
+ };
35
+
36
+ static const int kMinCJKUTF8CharBytes = 3;
37
+
38
+ static const int kMinGramCount = 3;
39
+ static const int kMaxGramCount = 16;
40
+
41
+
42
+
43
+
44
+ // Routines to access a hash table of <key:wordhash, value:probs> pairs
45
+ // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
46
+ // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
47
+ // bucket subscript.
48
+ // Probs is a packed: three languages plus a subscript for probability table
49
+ // Buckets have all the keys together, then all the values.Key array never
50
+ // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
51
+ // Match case may sometimes take an additional cache miss on value access.
52
+ //
53
+ // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
54
+ // byte buckets with single cache miss.
55
+ // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
56
+ //------------------------------------------------------------------------------
57
+
58
+
59
+ //------------------------------------------------------------------------------
60
+ // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
61
+ //------------------------------------------------------------------------------
62
+
63
+ // Design principles for these hash functions
64
+ // - Few operations
65
+ // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
66
+ // Latin script expect 1- and 2-byte mixtures.
67
+ // - Last byte of each character has about 5 bits of information
68
+ // - Spread good bits around so they can interact in at least two ways
69
+ // with other characters
70
+ // - Use add for additional mixing thorugh carries
71
+
72
+ // CJK Three-byte bigram
73
+ // ....dddd..cccccc..bbbbbb....aaaa
74
+ // ..................ffffff..eeeeee
75
+ // make
76
+ // ....dddd..cccccc..bbbbbb....aaaa
77
+ // 000....dddd..cccccc..bbbbbb....a
78
+ // ..................ffffff..eeeeee
79
+ // ffffff..eeeeee000000000000000000
80
+ //
81
+ // CJK Four-byte bigram
82
+ // ..dddddd..cccccc....bbbb....aaaa
83
+ // ..hhhhhh..gggggg....ffff....eeee
84
+ // make
85
+ // ..dddddd..cccccc....bbbb....aaaa
86
+ // 000..dddddd..cccccc....bbbb....a
87
+ // ..hhhhhh..gggggg....ffff....eeee
88
+ // ..ffff....eeee000000000000000000
89
+
90
+ // BIGRAM
91
+ // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
92
+ // OVERSHOOTS up to 3 bytes
93
+ // For runtime use of tables
94
+ uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
95
+ if (bytecount == 0) {
96
+ return 0;
97
+ }
98
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
99
+ uint32 word0, word1;
100
+ if (bytecount <= 4) {
101
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
102
+ word0 = word0 ^ (word0 >> 3);
103
+ return word0;
104
+ }
105
+ // Else do 8 bytes
106
+ word0 = word_ptr32[0];
107
+ word0 = word0 ^ (word0 >> 3);
108
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
109
+ word1 = word1 ^ (word1 << 18);
110
+ return word0 + word1;
111
+ }
112
+
113
+ //
114
+ // Ascii-7 One-byte chars
115
+ // ...ddddd...ccccc...bbbbb...aaaaa
116
+ // make
117
+ // ...ddddd...ccccc...bbbbb...aaaaa
118
+ // 000...ddddd...ccccc...bbbbb...aa
119
+ //
120
+ // Latin 1- and 2-byte chars
121
+ // ...ddddd...ccccc...bbbbb...aaaaa
122
+ // ...................fffff...eeeee
123
+ // make
124
+ // ...ddddd...ccccc...bbbbb...aaaaa
125
+ // 000...ddddd...ccccc...bbbbb...aa
126
+ // ...................fffff...eeeee
127
+ // ...............fffff...eeeee0000
128
+ //
129
+ // Non-CJK Two-byte chars
130
+ // ...ddddd...........bbbbb........
131
+ // ...hhhhh...........fffff........
132
+ // make
133
+ // ...ddddd...........bbbbb........
134
+ // 000...ddddd...........bbbbb.....
135
+ // ...hhhhh...........fffff........
136
+ // hhhh...........fffff........0000
137
+ //
138
+ // Non-CJK Three-byte chars
139
+ // ...........ccccc................
140
+ // ...................fffff........
141
+ // ...lllll...................iiiii
142
+ // make
143
+ // ...........ccccc................
144
+ // 000...........ccccc.............
145
+ // ...................fffff........
146
+ // ...............fffff........0000
147
+ // ...lllll...................iiiii
148
+ // .lllll...................iiiii00
149
+ //
150
+
151
+ // QUADGRAM
152
+ // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
153
+ // OVERSHOOTS up to 3 bytes
154
+ // For runtime use of tables
155
+ uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
156
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
157
+ uint32 word0, word1, word2;
158
+ if (bytecount <= 4) {
159
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
160
+ word0 = word0 ^ (word0 >> 3);
161
+ return word0 ^ prepost;
162
+ } else if (bytecount <= 8) {
163
+ word0 = word_ptr32[0];
164
+ word0 = word0 ^ (word0 >> 3);
165
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
166
+ word1 = word1 ^ (word1 << 4);
167
+ return (word0 ^ prepost) + word1;
168
+ }
169
+ // else do 12 bytes
170
+ word0 = word_ptr32[0];
171
+ word0 = word0 ^ (word0 >> 3);
172
+ word1 = word_ptr32[1];
173
+ word1 = word1 ^ (word1 << 4);
174
+ word2 = word_ptr32[2] & kWordMask0[bytecount & 3];
175
+ word2 = word2 ^ (word2 << 2);
176
+ return (word0 ^ prepost) + word1 + word2;
177
+ }
178
+
179
+
180
+ // QUADGRAM wrapper with surrounding spaces
181
+ // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
182
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
183
+ // For runtime use of tables
184
+ uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
185
+ if (bytecount == 0) {
186
+ return 0;
187
+ }
188
+ uint32 prepost = 0;
189
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
190
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
191
+ return QuadHashV25Mix(word_ptr, bytecount, prepost);
192
+ }
193
+
194
+ // QUADGRAM wrapper with surrounding underscores (offline use)
195
+ // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
196
+ // OVERSHOOTS up to 3 bytes
197
+ // For offline construction of tables
198
+ uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
199
+ if (bytecount == 0) {
200
+ return 0;
201
+ }
202
+ const char* local_word_ptr = word_ptr;
203
+ int local_bytecount = bytecount;
204
+ uint32 prepost = 0;
205
+ if (local_word_ptr[0] == '_') {
206
+ prepost |= kPreSpaceIndicator;
207
+ ++local_word_ptr;
208
+ --local_bytecount;
209
+ }
210
+ if (local_word_ptr[local_bytecount - 1] == '_') {
211
+ prepost |= kPostSpaceIndicator;
212
+ --local_bytecount;
213
+ }
214
+ return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
215
+ }
216
+
217
+
218
+ // OCTAGRAM
219
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
220
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
221
+ //
222
+ // The low 32 bits follow the pattern from above, tuned to different scripts
223
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
224
+ // For runtime use of tables V3
225
+ uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
226
+ const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
227
+ uint64 word0;
228
+ uint64 word1;
229
+ uint64 sum;
230
+
231
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
232
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
233
+ switch ((bytecount - 1) >> 2) {
234
+ case 0: // 1..4 bytes
235
+ word0 = word_ptr32[0] & kWordMask0[bytecount & 3];
236
+ sum = word0;
237
+ word0 = word0 ^ (word0 >> 3);
238
+ break;
239
+ case 1: // 5..8 bytes
240
+ word0 = word_ptr32[0];
241
+ sum = word0;
242
+ word0 = word0 ^ (word0 >> 3);
243
+ word1 = word_ptr32[1] & kWordMask0[bytecount & 3];
244
+ sum += word1;
245
+ word1 = word1 ^ (word1 << 4);
246
+ word0 += word1;
247
+ break;
248
+ case 2: // 9..12 bytes
249
+ word0 = word_ptr32[0];
250
+ sum = word0;
251
+ word0 = word0 ^ (word0 >> 3);
252
+ word1 = word_ptr32[1];
253
+ sum += word1;
254
+ word1 = word1 ^ (word1 << 4);
255
+ word0 += word1;
256
+ word1 = word_ptr32[2] & kWordMask0[bytecount & 3];
257
+ sum += word1;
258
+ word1 = word1 ^ (word1 << 2);
259
+ word0 += word1;
260
+ break;
261
+ case 3: // 13..16 bytes
262
+ word0 = word_ptr32[0];
263
+ sum = word0;
264
+ word0 = word0 ^ (word0 >> 3);
265
+ word1 = word_ptr32[1];
266
+ sum += word1;
267
+ word1 = word1 ^ (word1 << 4);
268
+ word0 += word1;
269
+ word1 = word_ptr32[2];
270
+ sum += word1;
271
+ word1 = word1 ^ (word1 << 2);
272
+ word0 += word1;
273
+ word1 = word_ptr32[3] & kWordMask0[bytecount & 3];
274
+ sum += word1;
275
+ word1 = word1 ^ (word1 >> 8);
276
+ word0 += word1;
277
+ break;
278
+ case 4: // 17..20 bytes
279
+ word0 = word_ptr32[0];
280
+ sum = word0;
281
+ word0 = word0 ^ (word0 >> 3);
282
+ word1 = word_ptr32[1];
283
+ sum += word1;
284
+ word1 = word1 ^ (word1 << 4);
285
+ word0 += word1;
286
+ word1 = word_ptr32[2];
287
+ sum += word1;
288
+ word1 = word1 ^ (word1 << 2);
289
+ word0 += word1;
290
+ word1 = word_ptr32[3];
291
+ sum += word1;
292
+ word1 = word1 ^ (word1 >> 8);
293
+ word0 += word1;
294
+ word1 = word_ptr32[4] & kWordMask0[bytecount & 3];
295
+ sum += word1;
296
+ word1 = word1 ^ (word1 >> 4);
297
+ word0 += word1;
298
+ break;
299
+ default: // 21..24 bytes and higher (ignores beyond 24)
300
+ word0 = word_ptr32[0];
301
+ sum = word0;
302
+ word0 = word0 ^ (word0 >> 3);
303
+ word1 = word_ptr32[1];
304
+ sum += word1;
305
+ word1 = word1 ^ (word1 << 4);
306
+ word0 += word1;
307
+ word1 = word_ptr32[2];
308
+ sum += word1;
309
+ word1 = word1 ^ (word1 << 2);
310
+ word0 += word1;
311
+ word1 = word_ptr32[3];
312
+ sum += word1;
313
+ word1 = word1 ^ (word1 >> 8);
314
+ word0 += word1;
315
+ word1 = word_ptr32[4];
316
+ sum += word1;
317
+ word1 = word1 ^ (word1 >> 4);
318
+ word0 += word1;
319
+ word1 = word_ptr32[5] & kWordMask0[bytecount & 3];
320
+ sum += word1;
321
+ word1 = word1 ^ (word1 >> 6);
322
+ word0 += word1;
323
+ break;
324
+ }
325
+
326
+ sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3
327
+ sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3
328
+ sum = (sum & 0xff) << 32;
329
+ return (word0 ^ prepost) + sum;
330
+ }
331
+
332
+ // OCTAGRAM wrapper with surrounding spaces
333
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
334
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
335
+ //
336
+ // The low 32 bits follow the pattern from above, tuned to different scripts
337
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
338
+ // For runtime use of tables V3
339
+ uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
340
+ if (bytecount == 0) {
341
+ return 0;
342
+ }
343
+ uint64 prepost = 0;
344
+ if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
345
+ if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
346
+ return OctaHash40Mix(word_ptr, bytecount, prepost);
347
+ }
348
+
349
+
350
+ // OCTAGRAM wrapper with surrounding underscores (offline use)
351
+ // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
352
+ // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
353
+ //
354
+ // The low 32 bits follow the pattern from above, tuned to different scripts
355
+ // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
356
+ // For offline construction of tables
357
+ uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
358
+ if (bytecount == 0) {
359
+ return 0;
360
+ }
361
+ const char* local_word_ptr = word_ptr;
362
+ int local_bytecount = bytecount;
363
+ uint64 prepost = 0;
364
+ if (local_word_ptr[0] == '_') {
365
+ prepost |= kPreSpaceIndicator;
366
+ ++local_word_ptr;
367
+ --local_bytecount;
368
+ }
369
+ if (local_word_ptr[local_bytecount - 1] == '_') {
370
+ prepost |= kPostSpaceIndicator;
371
+ --local_bytecount;
372
+ }
373
+ return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
374
+ }
375
+
376
+
377
+
378
+
379
+ //------------------------------------------------------------------------------
380
+ // Scoring single groups of letters
381
+ //------------------------------------------------------------------------------
382
+
383
+ // UNIGRAM score one => tote
384
+ // Input: 1-byte entry of subscript into unigram probs, plus
385
+ // an accumulator tote.
386
+ // Output: running sums in tote updated
387
+ void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
388
+ tote->AddGram();
389
+ const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
390
+ if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
391
+ if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
392
+ if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
393
+ if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
394
+ if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
395
+ if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
396
+ }
397
+
398
+ // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
399
+ // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
400
+ // an accumulator tote. (language 0 means unused entry)
401
+ // Output: running sums in tote updated
402
+ void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
403
+ tote->AddGram();
404
+ uint8 prob123 = (probs >> 0) & 0xff;
405
+ const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
406
+
407
+ uint8 top1 = (probs >> 8) & 0xff;
408
+ if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
409
+ uint8 top2 = (probs >> 16) & 0xff;
410
+ if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
411
+ uint8 top3 = (probs >> 24) & 0xff;
412
+ if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
413
+ }
414
+
415
+
416
+ //------------------------------------------------------------------------------
417
+ // Routines to accumulate probabilities
418
+ //------------------------------------------------------------------------------
419
+
420
+
421
+ // UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
422
+ // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
423
+ // Score up to n unigrams, returning number of bytes consumed
424
+ // Updates tote_grams
425
+ int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
426
+ const char* isrc, int srclen, int advance_by,
427
+ int* tote_grams, int gram_limit, Tote* chunk_tote) {
428
+ const char* src = isrc;
429
+ if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
430
+
431
+ // Property-based CJK unigram lookup
432
+ if (src[0] == ' ') {++src; --srclen;}
433
+
434
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
435
+ int usrclen = srclen;
436
+
437
+ while (usrclen > 0) {
438
+ int len = kAdvanceOneChar[usrc[0]];
439
+ // Look up property of one UTF-8 character and advance over it
440
+ // Return 0 if input length is zero
441
+ // Return 0 and advance one byte if input is ill-formed
442
+
443
+ int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
444
+
445
+ if (FLAGS_dbglookup) {
446
+ DbgUniTermToStderr(propval, usrc, len);
447
+ }
448
+
449
+ if (propval > 0) {
450
+ ProcessProbV25UniTote(propval, chunk_tote);
451
+ ++(*tote_grams);
452
+ if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
453
+ }
454
+
455
+ // Advance by 1/2/4/8 characters (half of quad advance)
456
+ if (advance_by == 2) {
457
+ // Already advanced by 1
458
+ } else if (advance_by == 4) {
459
+ // Advance by 2 chars total, if not at end
460
+ if (UTFmax <= usrclen) {
461
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
462
+ }
463
+ } else if (advance_by == 8) {
464
+ // Advance by 4 chars total, if not at end
465
+ if ((UTFmax * 3) <= usrclen) {
466
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
467
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
468
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
469
+ }
470
+ } else {
471
+ // Advance by 8 chars total, if not at end
472
+ if ((UTFmax * 7) <= usrclen) {
473
+ int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
474
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
475
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
476
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
477
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
478
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
479
+ n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
480
+ }
481
+ }
482
+ DCHECK(usrclen >= 0);
483
+
484
+ if (*tote_grams >= gram_limit) {
485
+ break;
486
+ }
487
+ }
488
+ if (FLAGS_dbgscore) {
489
+ // With advance_by>2, we consume more input to get the same number of quads
490
+ int len = src - isrc;
491
+ DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
492
+ DbgScoreFlush();
493
+ }
494
+
495
+ int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
496
+ return consumed2;
497
+ }
498
+
499
+
500
+ // BIGRAM, using hash table, always advancing by 1 char
501
+ // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
502
+ // Score all bigrams in isrc, using languages that have bigrams (CJK)
503
+ // Return number of bigrams that hit in the hash table
504
+ int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
505
+ const char* isrc, int srclen, Tote* chunk_tote) {
506
+ int hit_count = 0;
507
+ const char* src = isrc;
508
+
509
+ // Hashtable-based CJK bigram lookup
510
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
511
+ const uint8* usrclimit1 = usrc + srclen - UTFmax;
512
+ if (FLAGS_dbgscore) {
513
+ fprintf(stderr, " " );
514
+ }
515
+
516
+ while (usrc < usrclimit1) {
517
+ int len = kAdvanceOneChar[usrc[0]];
518
+ int len2 = kAdvanceOneChar[usrc[len]] + len;
519
+
520
+ if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
521
+ // Lookup and score this bigram
522
+ // Always ignore pre/post spaces
523
+ uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
524
+ uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
525
+ // Now go indirect on the subscript
526
+ probs = bigram_obj->kCLDTableInd[probs &
527
+ ~bigram_obj->kCLDTableKeyMask];
528
+
529
+ // Process the bigram
530
+ if (FLAGS_dbglookup) {
531
+ const char* ssrc = reinterpret_cast<const char*>(usrc);
532
+ DbgBiTermToStderr(bihash, probs, ssrc, len2);
533
+ DbgScoreRecord(NULL, probs, len2);
534
+ } else if (FLAGS_dbgscore && (probs != 0)) {
535
+ const char* ssrc = reinterpret_cast<const char*>(usrc);
536
+ DbgScoreRecord(NULL, probs, len2);
537
+ string temp(ssrc, len2);
538
+ fprintf(stderr, "%s ", temp.c_str());
539
+ }
540
+
541
+ if (probs != 0) {
542
+ ProcessProbV25Tote(probs, chunk_tote);
543
+ ++hit_count;
544
+ }
545
+ }
546
+ usrc += len; // Advance by one char
547
+ }
548
+
549
+ if (FLAGS_dbgscore) {
550
+ fprintf(stderr, "[%d bigrams scored]\n", hit_count);
551
+ DbgScoreState();
552
+ }
553
+ return hit_count;
554
+ }
555
+
556
+
557
+
558
+ // QUADGRAM, using hash table, advancing by 2/4/8/16 chars
559
+ // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
560
+ // Score up to n quadgrams, returning number of bytes consumed
561
+ // Updates tote_grams
562
+ int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
563
+ const char* isrc, int srclen, int advance_by,
564
+ int* tote_grams, int gram_limit, Tote* chunk_tote) {
565
+ const char* src = isrc;
566
+ const char* srclimit = src + srclen;
567
+ // Limit is end, which has extra 20 20 20 00 past len
568
+ const char* srclimit7 = src + srclen - (UTFmax * 7);
569
+ const char* srclimit15 = src + srclen - (UTFmax * 15);
570
+
571
+ if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
572
+
573
+ // Run a little cache of last hits to catch overly-repetitive "text"
574
+ int next_prior = 0;
575
+ uint32 prior_quads[2] = {0, 0};
576
+
577
+ // Visit all quadgrams
578
+ if (src[0] == ' ') {++src;}
579
+ while (src < srclimit) {
580
+ // Find one quadgram
581
+ const char* src_end = src;
582
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
583
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
584
+ const char* src_mid = src_end;
585
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
586
+ src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
587
+ int len = src_end - src;
588
+
589
+ // Lookup and score this quadgram
590
+ uint32 quadhash = QuadHashV25(src, len);
591
+ uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
592
+ // Now go indirect on the subscript
593
+ probs = quadgram_obj->kCLDTableInd[probs &
594
+ ~quadgram_obj->kCLDTableKeyMask];
595
+
596
+ // Process the quadgram
597
+ if (FLAGS_dbglookup) {
598
+ DbgQuadTermToStderr(quadhash, probs, src, len);
599
+ }
600
+ if (probs != 0) {
601
+ // Filter out recent repeats. If this works out, use in the other lookups
602
+ if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
603
+ prior_quads[next_prior] = quadhash;
604
+ next_prior = (next_prior + 1) & 1;
605
+ ProcessProbV25Tote(probs, chunk_tote);
606
+ ++(*tote_grams);
607
+ if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
608
+ }
609
+ }
610
+
611
+ // Advance all the way past word if at end-of-word
612
+ if (src_end[0] == ' ') {
613
+ src_mid = src_end;
614
+ }
615
+
616
+ // Advance by 2/4/8/16 characters
617
+ if (advance_by == 2) {
618
+ src = src_mid;
619
+ } else if (advance_by == 4) {
620
+ src = src_end;
621
+ } else if (advance_by == 8) {
622
+ // Advance by 8 chars total (4 more), if not at end
623
+ if (src < srclimit7) {
624
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
625
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
626
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
627
+ src_end += kAdvanceOneChar[(uint8)src_end[0]];
628
+ }
629
+ src = src_end;
630
+ } else {
631
+ // Advance by 16 chars total (12 more), if not at end
632
+ if (src < srclimit15) {
633
+ // Advance by ~16 chars by adding 3 * current bytelen
634
+ int fourcharlen = src_end - src;
635
+ src = src_end + (3 * fourcharlen);
636
+ // Advance a bit more if mid-character
637
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
638
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
639
+ } else {
640
+ src = src_end;
641
+ }
642
+ }
643
+ DCHECK(src < srclimit);
644
+ src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
645
+
646
+ if (*tote_grams >= gram_limit) {
647
+ break;
648
+ }
649
+ }
650
+
651
+ if (FLAGS_dbgscore) {
652
+ // With advance_by>2, we consume more input to get the same number of quads
653
+ int len = src - isrc;
654
+ DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
655
+ DbgScoreFlush();
656
+ }
657
+
658
+ int consumed = src - isrc;
659
+
660
+ // If advancing by more than 2, src may have overshot srclimit
661
+ if (consumed > srclen) {
662
+ consumed = srclen;
663
+ }
664
+
665
+ return consumed;
666
+ }
667
+
668
+
669
+ // OCTAGRAM, using hash table, always advancing by 1 word
670
+ // Caller supplies table, such as &kLongWord8Table_obj
671
+ // Score all words in isrc, using languages that have quadgrams
672
+ // We don't normally use this routine except on the first quadgram run,
673
+ // but it can be used to resolve unreliable pages.
674
+ // This routine does not have an optimized advance_by
675
+ // SOON: Uses indirect language/probability longword
676
+ //
677
+ // Return number of words that hit in the hash table
678
+ int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
679
+ const char* isrc, int srclen, Tote* chunk_tote) {
680
+ int hit_count = 0;
681
+ const char* src = isrc;
682
+ const char* srclimit = src + srclen + 1;
683
+ // Limit is end+1, to include extra space char (0x20) off the end
684
+ //
685
+ // Score all words truncated to 8 characters
686
+ int charcount = 0;
687
+ // Skip any initial space
688
+ if (src[0] == ' ') {++src;}
689
+ const char* word_ptr = src;
690
+ const char* word_end = word_ptr;
691
+ if (FLAGS_dbgscore) {
692
+ fprintf(stderr, " " );
693
+ }
694
+ while (src < srclimit) {
695
+ // Terminate previous word or continue current word
696
+ if (src[0] == ' ') {
697
+ int bytecount = word_end - word_ptr;
698
+ if (bytecount == 0)
699
+ break;
700
+ // Lookup and score this word
701
+ uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
702
+ uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
703
+ // Now go indirect on the subscript
704
+ probs = octagram_obj->kCLDTableInd[probs &
705
+ ~octagram_obj->kCLDTableKeyMask];
706
+
707
+ // // Lookup and score this word
708
+ // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
709
+ // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
710
+ // kLongWord8TableSize);
711
+ //
712
+ if (FLAGS_dbglookup) {
713
+ DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
714
+ DbgScoreRecord(NULL, probs, bytecount);
715
+ } else if (FLAGS_dbgscore && (probs != 0)) {
716
+ DbgScoreRecord(NULL, probs, bytecount);
717
+ string temp(word_ptr, bytecount);
718
+ fprintf(stderr, "%s ", temp.c_str());
719
+ }
720
+
721
+ if (probs != 0) {
722
+ ProcessProbV25Tote(probs, chunk_tote);
723
+ ++hit_count;
724
+ }
725
+ charcount = 0;
726
+ word_ptr = src + 1; // Over the space
727
+ word_end = word_ptr;
728
+ } else {
729
+ ++charcount;
730
+ }
731
+
732
+ // Advance to next char
733
+ src += cld_UniLib::OneCharLen(src);
734
+ if (charcount <= 8) {
735
+ word_end = src;
736
+ }
737
+ }
738
+
739
+ if (FLAGS_dbgscore) {
740
+ fprintf(stderr, "[%d words scored]\n", hit_count);
741
+ DbgScoreState();
742
+ }
743
+ return hit_count;
744
+ }
745
+
746
+
747
+
748
+ //------------------------------------------------------------------------------
749
+ // Reliability calculations, for single language and between languages
750
+ //------------------------------------------------------------------------------
751
+
752
+ // Return reliablity of result 0..100 for top two scores
753
+ // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
754
+ // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
755
+ // Threshold is uni/quadgram increment count, bounded above and below.
756
+ //
757
+ // Requiring a factor of 3 improvement (e.g. +1 log base 3)
758
+ // for each scored quadgram is too stringent, so I've backed this off to a
759
+ // factor of 2 (e.g. +5/8 log base 3).
760
+ //
761
+ // I also somewhat lowered the Min/MaxGramCount limits above
762
+ //
763
+ // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
764
+ //
765
+ int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
766
+ int max_reliability_percent = 100;
767
+ if (gramcount < 8) {
768
+ max_reliability_percent = 12 * gramcount;
769
+ }
770
+ int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
771
+ if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
772
+ fully_reliable_thresh = kMinGramCount;
773
+ } else if (fully_reliable_thresh > kMaxGramCount) {
774
+ fully_reliable_thresh = kMaxGramCount;
775
+ }
776
+
777
+ int delta = value1 - value2;
778
+ if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
779
+ if (delta <= 0) {return 0;}
780
+ return cld::minint(max_reliability_percent,
781
+ (100 * delta) / fully_reliable_thresh);
782
+ }
783
+
784
+ // Return reliablity of result 0..100 for top score vs. mainsteam score
785
+ // Values are score per 1024 bytes of input
786
+ // ratio = max(top/mainstream, mainstream/top)
787
+ // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
788
+ // Change: short-text word scoring can give unusually good results.
789
+ // Let top exceed mainstream by 4x at 50% reliable
790
+ int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
791
+ if (mean_score == 0) {return 100;} // No reliability data available yet
792
+ if (topscore == 0) {return 0;} // zero score = unreliable
793
+ if (len == 0) {return 0;} // zero len = unreliable
794
+ int top_kb = (topscore << 10) / len;
795
+ double ratio;
796
+ double ratio_cutoff;
797
+ if (top_kb > mean_score) {
798
+ ratio = (1.0 * top_kb) / mean_score;
799
+ ratio_cutoff = 5.0; // ramp down from 100% to 0%: 3.0-5.0
800
+ } else {
801
+ ratio = (1.0 * mean_score) / top_kb;
802
+ ratio_cutoff = 4.0; // ramp down from 100% to 0%: 2.0-4.0
803
+ }
804
+ if (ratio <= ratio_cutoff - 2.0) {return 100;}
805
+ if (ratio > ratio_cutoff) {return 0;}
806
+
807
+ int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
808
+ return iratio;
809
+ }
810
+
811
+ // Calculate ratio of score per 1KB vs. expected score per 1KB
812
+ double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
813
+ int bytes, int score) {
814
+ // Average training-data score for this language-script combo, per 1KB
815
+ int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
816
+ if (lscript == ULScript_Common) {
817
+ // We don't know the script (only happens with second-chance score)
818
+ // Look for first non-zero mean value
819
+ for (int i = 0; i < 3; ++i) {
820
+ if (kMeanScore[lang * 4 + i] > 0) {
821
+ expected_score = kMeanScore[lang * 4 + i];
822
+ }
823
+ }
824
+ }
825
+ if (expected_score < 100) {
826
+ expected_score = 1000;
827
+ }
828
+
829
+ // Our score per 1KB
830
+ double our_score = (score << 10) / (bytes ? bytes : 1); // Avoid zdiv
831
+ double ratio = our_score / expected_score;
832
+
833
+ // Just the raw count normalized as though each language has mean=1000;
834
+ ratio = (score * 1000.0) / expected_score;
835
+ return ratio;
836
+ }
837
+
838
+ // Calculate reliablity of len bytes of script lscript with chunk_tote
839
+ int cld::GetReliability(int len, UnicodeLScript lscript,
840
+ const Tote* chunk_tote) {
841
+ Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
842
+ // Average score for this language-script combo
843
+ int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
844
+ if (lscript == ULScript_Common) {
845
+ // We don't know the script (only happens with second-chance score)
846
+ // Look for first non-zero mean value
847
+ for (int i = 0; i < 3; ++i) {
848
+ if (kMeanScore[cur_lang * 4 + i] > 0) {
849
+ mean_score = kMeanScore[cur_lang * 4 + i];
850
+ }
851
+ }
852
+ }
853
+ int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
854
+ chunk_tote->Value(1),
855
+ chunk_tote->GetGramCount());
856
+
857
+ int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
858
+ len,
859
+ mean_score);
860
+
861
+ int reliability_min = minint(reliability_delta, reliability_main);
862
+
863
+
864
+ if (FLAGS_dbgreli) {
865
+ char temp1[4];
866
+ char temp2[4];
867
+ cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
868
+ if (temp1[2] == ' ') {temp1[2] = '\0';}
869
+ cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
870
+ if (temp2[2] == ' ') {temp2[2] = '\0';}
871
+ int srclen = len;
872
+ fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d "
873
+ "top/KB=%d mean/KB=%d del=%d%% reli=%d%% "
874
+ "lang/lscript %d %d\n",
875
+ chunk_tote->GetGramCount(),
876
+ chunk_tote->GetIncrCount(),
877
+ srclen,
878
+ temp1, chunk_tote->Value(0),
879
+ temp2, chunk_tote->Value(1),
880
+ (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
881
+ mean_score,
882
+ reliability_delta,
883
+ reliability_main,
884
+ cur_lang, lscript);
885
+ }
886
+
887
+ return reliability_min;
888
+ }
889
+
890
+
891
+ //------------------------------------------------------------------------------
892
+ // Miscellaneous
893
+ //------------------------------------------------------------------------------
894
+
895
+ // Demote all languages except Top40 and plus_one
896
+ // Do this just before sorting chunk_tote results
897
+ void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
898
+ for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
899
+ if (chunk_tote->Key(sub) == 0) continue;
900
+ if (chunk_tote->Key(sub) == packed_plus_one) continue;
901
+ if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
902
+ // Quarter the score of others
903
+ chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
904
+ }
905
+ }