krukid-cld 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. data/LICENSE +27 -0
  2. data/Manifest +106 -0
  3. data/README.rdoc +173 -0
  4. data/Rakefile +15 -0
  5. data/base/basictypes.h +348 -0
  6. data/base/build_config.h +115 -0
  7. data/base/casts.h +156 -0
  8. data/base/commandlineflags.h +443 -0
  9. data/base/crash.h +41 -0
  10. data/base/dynamic_annotations.h +358 -0
  11. data/base/global_strip_options.h +59 -0
  12. data/base/log_severity.h +46 -0
  13. data/base/logging.h +1403 -0
  14. data/base/macros.h +243 -0
  15. data/base/port.h +54 -0
  16. data/base/scoped_ptr.h +428 -0
  17. data/base/stl_decl.h +0 -0
  18. data/base/stl_decl_msvc.h +107 -0
  19. data/base/string_util.h +29 -0
  20. data/base/strtoint.h +93 -0
  21. data/base/template_util.h +96 -0
  22. data/base/type_traits.h +198 -0
  23. data/base/vlog_is_on.h +143 -0
  24. data/build.sh +48 -0
  25. data/build.win.cmd +28 -0
  26. data/cld.gemspec +33 -0
  27. data/cld_encodings.h +95 -0
  28. data/encodings/compact_lang_det/#cldutil.cc# +905 -0
  29. data/encodings/compact_lang_det/#cldutil.h# +1205 -0
  30. data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  31. data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  32. data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  33. data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  34. data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  35. data/encodings/compact_lang_det/#tote.cc# +299 -0
  36. data/encodings/compact_lang_det/#tote.h# +89 -0
  37. data/encodings/compact_lang_det/cldutil.cc +905 -0
  38. data/encodings/compact_lang_det/cldutil.h +1205 -0
  39. data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  40. data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  41. data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  42. data/encodings/compact_lang_det/compact_lang_det.h +145 -0
  43. data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  44. data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  45. data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  46. data/encodings/compact_lang_det/compile.cmd +1 -0
  47. data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  48. data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  49. data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  50. data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  51. data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  52. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  53. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  54. data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  55. data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  56. data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  57. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  58. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  59. data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  60. data/encodings/compact_lang_det/getonescriptspan.h +131 -0
  61. data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  62. data/encodings/compact_lang_det/letterscript_enum.h +99 -0
  63. data/encodings/compact_lang_det/subsetsequence.cc +259 -0
  64. data/encodings/compact_lang_det/subsetsequence.h +44 -0
  65. data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  66. data/encodings/compact_lang_det/tote.cc +299 -0
  67. data/encodings/compact_lang_det/tote.h +89 -0
  68. data/encodings/compact_lang_det/unittest_data.h +193 -0
  69. data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  70. data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  71. data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  72. data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  73. data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  74. data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  75. data/encodings/compact_lang_det/win/cld_google.h +18 -0
  76. data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  77. data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  78. data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  79. data/encodings/compact_lang_det/win/cld_logging.h +21 -0
  80. data/encodings/compact_lang_det/win/cld_macros.h +19 -0
  81. data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  82. data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  83. data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  84. data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  85. data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  86. data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  87. data/encodings/compact_lang_det/win/cld_utf.h +24 -0
  88. data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  89. data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  90. data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  91. data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  92. data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  93. data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  94. data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  95. data/encodings/internal/encodings.cc +12 -0
  96. data/encodings/lang_enc.h +254 -0
  97. data/encodings/proto/encodings.pb.h +169 -0
  98. data/encodings/public/encodings.h +301 -0
  99. data/ext/cld/extconf.rb +8 -0
  100. data/krukid-cld.gemspec +33 -0
  101. data/languages/internal/#languages.cc# +337 -0
  102. data/languages/internal/languages.cc +337 -0
  103. data/languages/proto/languages.pb.h +179 -0
  104. data/languages/public/languages.h +379 -0
  105. data/lib/cld.rb +12 -0
  106. data/test/test.rb +570 -0
  107. data/thunk.cc +131 -0
  108. metadata +196 -0
@@ -0,0 +1,76 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_
7
+
8
+ #include "encodings/compact_lang_det/cldutil.h"
9
+ #include <string>
10
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
11
+ #include "encodings/compact_lang_det/tote.h"
12
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
13
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
14
+
15
+ DECLARE_bool(dbgscore);
16
+ DECLARE_bool(dbglookup);
17
+ DECLARE_bool(dbgreli);
18
+
19
+ using std::string;
20
+
21
+ namespace cld {
22
+
23
+
24
+ //------------------------------------------------------------------------------
25
+ // Debugging. Not thread safe
26
+ //------------------------------------------------------------------------------
27
+
28
+ void DbgScoreInit(const char* src, int len);
29
+
30
+ // Return a 3-byte + NUL code for language
31
+ void DbgLangName3(Language lang, char* temp);
32
+
33
+ // Show all per-language totals
34
+ void DbgScoreState();
35
+
36
+ void DbgScoreTop(const char* src, int srclen, Tote* chunk_tote);
37
+
38
+ void DbgScoreFlush();
39
+
40
+ // Allow additional scoring debug output
41
+ void DbgScoreRecord(const char* src, uint32 probs, int len);
42
+
43
+ void DbgScoreRecordUni(const char* src, int propval, int len);
44
+
45
+ // Debug print language name(s)
46
+ void PrintLang(FILE* f, const Tote* chunk_tote,
47
+ const Language cur_lang, const bool cur_unreliable,
48
+ Language prior_lang, bool prior_unreliable);
49
+
50
+ // Debug print language name(s)
51
+ void PrintLang2(FILE* f,
52
+ const Language lang1, const Language lang2, bool diff_prior);
53
+
54
+ // Debug print text span
55
+ void PrintText(FILE* f, Language cur_lang, const string& str);
56
+
57
+ // Debug print text span with speculative language
58
+ void PrintTextSpeculative(FILE* f, Language cur_lang, const string& str);
59
+
60
+ // Debug print ignored text span
61
+ void PrintSkippedText(FILE* f, const string& str);
62
+
63
+ void DbgProbsToStderr(uint32 probs);
64
+ void DbgUniTermToStderr(int propval, const uint8* usrc, int len);
65
+ // No pre/post space
66
+ void DbgBiTermToStderr(uint32 bihash, uint32 probs,
67
+ const char* src, int len);
68
+ void DbgQuadTermToStderr(uint32 quadhash, uint32 probs,
69
+ const char* src, int len);
70
+ void DbgWordTermToStderr(uint64 wordhash, uint32 probs,
71
+ const char* src, int len);
72
+
73
+ } // End namespace cld
74
+
75
+
76
+ #endif // ENCODINGS_COMPACT_LANG_DET_CLDUTIL_DBG_H_
@@ -0,0 +1,76 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/cldutil_dbg.h"
6
+ //#include <string>
7
+
8
+ //#include "base/logging.h"
9
+ //#include "i18n/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
10
+ //#include "i18n/encodings/compact_lang_det/utf8propletterscriptnum.h"
11
+ //#include "third_party/utf/utf.h" // for UTFmax
12
+ //#include "util/utf8/unicodeprops.h"
13
+ //#include "util/utf8/unilib.h"
14
+ //#include "util/utf8/utf8statetable.h"
15
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
16
+
17
+ DEFINE_bool(dbgscore, false, "Print picture of score calculation");
18
+ DEFINE_bool(dbglookup, false, "Print every quad/uni lookup in score calc");
19
+ DEFINE_bool(dbgreli, false, "Print reliability in score calc");
20
+
21
+ namespace cld {
22
+
23
+
24
+ //------------------------------------------------------------------------------
25
+ // Debugging. Not thread safe
26
+ // This is the empty version -- routines return immediately
27
+ //------------------------------------------------------------------------------
28
+
29
+ void DbgScoreInit(const char* src, int len) {};  // no-op stub; the ';' after the body is a redundant empty declaration
30
+
31
+ // Return a 3-byte + NUL code for language
32
+ void DbgLangName3(Language lang, char* temp) {};  // no-op here: temp is NOT written by this empty version
33
+
34
+ // Show all per-language totals
35
+ void DbgScoreState() {};  // no-op stub
36
+
37
+ void DbgScoreTop(const char* src, int srclen, Tote* chunk_tote) {};  // no-op stub
38
+
39
+ void DbgScoreFlush() {};  // no-op stub
40
+
41
+ // Allow additional scoring debug output
42
+ void DbgScoreRecord(const char* src, uint32 probs, int len) {};  // no-op stub
43
+
44
+ void DbgScoreRecordUni(const char* src, int propval, int len) {};  // no-op stub
45
+
46
+ // Debug print language name(s)
47
+ void PrintLang(FILE* f, const Tote* chunk_tote,
48
+ const Language cur_lang, const bool cur_unreliable,
49
+ Language prior_lang, bool prior_unreliable) {};  // no-op stub: nothing is written to f
50
+
51
+ // Debug print language name(s)
52
+ void PrintLang2(FILE* f,
53
+ const Language lang1, const Language lang2, bool diff_prior) {};  // no-op stub: nothing is written to f
54
+
55
+ // Debug print text span
56
+ void PrintText(FILE* f, Language cur_lang, const string& str) {};  // no-op stub: nothing is written to f
57
+
58
+ // Debug print text span with speculative language
59
+ void PrintTextSpeculative(FILE* f, Language cur_lang, const string& str) {};  // no-op stub: nothing is written to f
60
+
61
+ // Debug print ignored text span
62
+ void PrintSkippedText(FILE* f, const string& str) {};  // no-op stub: nothing is written to f
63
+
64
+ void DbgProbsToStderr(uint32 probs) {};  // no-op stub: despite the name, nothing is printed
65
+ void DbgUniTermToStderr(int propval, const uint8* usrc, int len) {};  // no-op stub
66
+ // No pre/post space
67
+ void DbgBiTermToStderr(uint32 bihash, uint32 probs,
68
+ const char* src, int len) {};  // no-op stub
69
+ void DbgQuadTermToStderr(uint32 quadhash, uint32 probs,
70
+ const char* src, int len) {};  // no-op stub
71
+ void DbgWordTermToStderr(uint64 wordhash, uint32 probs,
72
+ const char* src, int len) {};  // no-op stub
73
+
74
+
75
+ } // End namespace cld
76
+
@@ -0,0 +1,62 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/compact_lang_det.h"
6
+ #include "encodings/compact_lang_det/compact_lang_det_impl.h"
7
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
8
+
9
+ // String is "code_version - data_scrape_date"
10
+ static const char* kDetectLanguageVersion = "V1.6 - 20081121";
11
+
12
+ // Large-table version for all ~160 languages (all Tiers)
13
+
14
+ Language CompactLangDet::DetectLanguage(
15
+ const DetectionTables* tables,
16
+ const char* buffer,
17
+ int buffer_length,
18
+ bool is_plain_text,
19
+ bool do_allow_extended_languages,
20
+ bool do_pick_summary_language,
21
+ bool do_remove_weak_matches,
22
+ const char* tld_hint, // "id" boosts Indonesian
23
+ int encoding_hint, // SJS boosts Japanese
24
+ Language language_hint, // ITALIAN boosts it
25
+ Language* language3,
26
+ int* percent3,
27
+ double* normalized_score3,
28
+ int* text_bytes,
29
+ bool* is_reliable) {
30
+ int flags = 0;  // always passed as 0; callers of this wrapper cannot set flags
31
+ Language plus_one = UNKNOWN_LANGUAGE;  // fixed value for the impl's plus_one argument; NOTE(review): its meaning is defined by the impl -- confirm
32
+
33
+ Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(  // thin wrapper: every argument is forwarded unchanged
34
+ tables,
35
+ buffer,
36
+ buffer_length,
37
+ is_plain_text,
38
+ do_pick_summary_language,
39
+ do_remove_weak_matches,
40
+ tld_hint, // "id" boosts Indonesian
41
+ encoding_hint, // SJS boosts Japanese
42
+ language_hint, // ITALIAN boosts it
43
+ do_allow_extended_languages,  // passed after the hints here, unlike this wrapper's own parameter order
44
+ flags,
45
+ plus_one,
46
+ language3,
47
+ percent3,
48
+ normalized_score3,
49
+ text_bytes,
50
+ is_reliable);
51
+ // Do not default to English
52
+ return lang;  // the impl's result is returned as-is, with no substitution
53
+ }
54
+
55
+
56
+
57
+ // Return version text string
58
+ // String is "code_version - data_scrape_date"
59
+ const char* CompactLangDet::DetectLanguageVersion() {
60
+ return kDetectLanguageVersion;  // pointer to the file-static version string; caller must not free or modify it
61
+ }
62
+
@@ -0,0 +1,145 @@
1
+ // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Baybayin (ancient script of the Philippines) is detected as TAGALOG.
6
+ // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
7
+ // HAITIAN_CREOLE is detected as such.
8
+ // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
9
+ // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
10
+ // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as MOLDAVIAN.
11
+ // SERBO_CROATIAN, BOSNIAN, CROATIAN, SERBIAN, MONTENEGRIN in the Latin script
12
+ // are all detected as CROATIAN; in the Cyrillic script as SERBIAN.
13
+ // Zhuang is detected in the Latin script only.
14
+ //
15
+ // The Google interface languages X_PIG_LATIN and X_KLINGON are detected in the
16
+ // extended calls ExtDetectLanguageSummary(). BorkBorkBork, ElmerFudd, and
17
+ // Hacker are not detected (too little training data).
18
+ //
19
+ // UNKNOWN_LANGUAGE is returned if no language's internal reliablity measure
20
+ // is high enough. This happens with non-text input such as the bytes of a
21
+ // JPEG, and also with some text in languages outside the Google Language
22
+ // enum, such as Ilonggo.
23
+ //
24
+ // The following languages are detected in multiple scripts:
25
+ // AZERBAIJANI (Latin, Cyrillic*, Arabic*)
26
+ // BURMESE (Latin, Myanmar)
27
+ // HAUSA (Latin, Arabic)
28
+ // KASHMIRI (Arabic, Devanagari)
29
+ // KAZAKH (Latin, Cyrillic, Arabic)
30
+ // KURDISH (Latin*, Arabic)
31
+ // KYRGYZ (Cyrillic, Arabic)
32
+ // LIMBU (Devanagari, Limbu)
33
+ // MONGOLIAN (Cyrillic, Mongolian)
34
+ // SANSKRIT (Latin, Devanagari)
35
+ // SINDHI (Arabic, Devanagari)
36
+ // TAGALOG (Latin, Tagalog)
37
+ // TAJIK (Cyrillic, Arabic*)
38
+ // TATAR (Latin, Cyrillic, Arabic)
39
+ // TURKMEN (Latin, Cyrillic, Arabic)
40
+ // UIGHUR (Latin, Cyrillic, Arabic)
41
+ // UZBEK (Latin, Cyrillic, Arabic)
42
+ //
43
+ // * Due to a shortage of training text, AZERBAIJANI is not currently detected
44
+ // in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
45
+ // Arabic script.
46
+ //
47
+
48
+ #ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
49
+ #define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
50
+
51
+ #include "languages/public/languages.h"
52
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
53
+
54
+ namespace cld {
55
+ struct CLDTableSummary;
56
+ } // namespace cld
57
+
58
+ namespace CompactLangDet {
59
+ // Scan interchange-valid UTF-8 bytes and detect most likely language,
60
+ // or set of languages.
61
+ //
62
+ // Design goals:
63
+ // Skip over big stretches of HTML tags
64
+ // Able to return ranges of different languages
65
+ // Relatively small tables and relatively fast processing
66
+ // Thread safe
67
+ //
68
+ // For HTML documents, tags are skipped, along with <script> ... </script>
69
+ // and <style> ... </style> sequences, and entities are expanded.
70
+ //
71
+ // We distinguish between bytes of the raw input buffer and bytes of non-tag
72
+ // text letters. Since tags can be over 50% of the bytes of an HTML Page,
73
+ // and are nearly all seven-bit ASCII English, we prefer to distinguish
74
+ // language mixture fractions based on just the non-tag text.
75
+ //
76
+ // Inputs: text and text_length
77
+ // Code skips HTML tags and expands HTML entities, unless
78
+ // is_plain_text is true
79
+ // Outputs:
80
+ // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
81
+ // percent3 is an array of the text percentages 0..100 of the top 3 languages
82
+ // text_bytes is the amount of non-tag/letters-only text found
83
+ // is_reliable set true if the returned Language is some amount more
84
+ // probable then the second-best Language. Calculation is a complex function
85
+ // of the length of the text and the different-script runs of text.
86
+ // Return value: the most likely Language for the majority of the input text
87
+ // Length 0 input returns UNKNOWN_LANGUAGE.
88
+ //
89
+ // Subsetting: For fast detection over large documents, these routines will
90
+ // scan non-tag text of the initial part of a document, then will
91
+ // skip 4-16 bytes and subsample text in the rest of the document, up to a
92
+ // fixed limit (currently 160KB of non-tag letters).
93
+ //
94
+
95
+ struct DetectionTables {  // pair of table pointers passed to DetectLanguage() as its first argument
96
+ const cld::CLDTableSummary* quadgram_obj;  // CLDTableSummary is only forward-declared in this header
97
+ const UTF8PropObj* unigram_obj;  // NOTE(review): UTF8PropObj presumably comes from win/cld_utf8statetable.h -- confirm
98
+ };
99
+
100
+ // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
101
+ //
102
+ // Accepts hints to bias languagepriors.
103
+ //
104
+ // Extended languages are additional Google interface languages and Unicode
105
+ // single-language scripts, from ext_lang_enc.h. They are experimental and
106
+ // this call may be removed.
107
+ //
108
+ // Returns internal language scores as a ratio to
109
+ // normal score for real text in that language. Scores close to 1.0 indicate
110
+ // normal text, while scores far away from 1.0 indicate badly-skewed text or
111
+ // gibberish
112
+ //
113
+ // If do_pick_summary_lang is true then CLD will sometimes
114
+ // not pick the top-scoring language; see CalcSummaryLang
115
+ // in compact_lang_det_impl.cc. If it's false then the
116
+ // top language is always returned.
117
+ //
118
+ // If do_remove_weak_matches is true then CLD will delete
119
+ // poor scoring languages from the results, so that if a
120
+ // language is returned there is some confidence it is
121
+ // correct.
122
+ //
123
+ Language DetectLanguage(
124
+ const DetectionTables* tables,
125
+ const char* buffer,
126
+ int buffer_length,
127
+ bool is_plain_text,
128
+ bool do_allow_extended_languages,
129
+ bool do_pick_summary_language,
130
+ bool do_remove_weak_matches,
131
+ const char* tld_hint, // "id" boosts Indonesian
132
+ int encoding_hint, // SJS boosts Japanese
133
+ Language language_hint, // ITALIAN boosts it
134
+ Language* language3,
135
+ int* percent3,
136
+ double* normalized_score3,
137
+ int* text_bytes,
138
+ bool* is_reliable);
139
+
140
+ // Return version text string
141
+ // String is "code_version - data_scrape_date"
142
+ const char* DetectLanguageVersion();
143
+ }; // End namespace CompactLangDet
144
+
145
+ #endif // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_