cld-fixed 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +6 -0
  5. data/LICENSE +27 -0
  6. data/README.md +34 -0
  7. data/Rakefile +5 -0
  8. data/cld.gemspec +22 -0
  9. data/ext/cld/Makefile.am +28 -0
  10. data/ext/cld/Makefile.in +790 -0
  11. data/ext/cld/aclocal.m4 +8895 -0
  12. data/ext/cld/base/basictypes.h +348 -0
  13. data/ext/cld/base/build_config.h +115 -0
  14. data/ext/cld/base/casts.h +156 -0
  15. data/ext/cld/base/commandlineflags.h +443 -0
  16. data/ext/cld/base/crash.h +41 -0
  17. data/ext/cld/base/dynamic_annotations.h +358 -0
  18. data/ext/cld/base/global_strip_options.h +59 -0
  19. data/ext/cld/base/log_severity.h +46 -0
  20. data/ext/cld/base/logging.h +1403 -0
  21. data/ext/cld/base/macros.h +243 -0
  22. data/ext/cld/base/port.h +54 -0
  23. data/ext/cld/base/scoped_ptr.h +428 -0
  24. data/ext/cld/base/stl_decl.h +0 -0
  25. data/ext/cld/base/stl_decl_msvc.h +107 -0
  26. data/ext/cld/base/string_util.h +29 -0
  27. data/ext/cld/base/strtoint.h +93 -0
  28. data/ext/cld/base/template_util.h +96 -0
  29. data/ext/cld/base/type_traits.h +198 -0
  30. data/ext/cld/base/vlog_is_on.h +143 -0
  31. data/ext/cld/build_aux/config.guess +1500 -0
  32. data/ext/cld/build_aux/config.sub +1616 -0
  33. data/ext/cld/build_aux/depcomp +584 -0
  34. data/ext/cld/build_aux/install-sh +507 -0
  35. data/ext/cld/build_aux/ltmain.sh +8745 -0
  36. data/ext/cld/build_aux/missing +367 -0
  37. data/ext/cld/cld_encodings.h +95 -0
  38. data/ext/cld/configure +17362 -0
  39. data/ext/cld/configure.ac +14 -0
  40. data/ext/cld/encodings/compact_lang_det/#cldutil.cc# +905 -0
  41. data/ext/cld/encodings/compact_lang_det/#cldutil.h# +1205 -0
  42. data/ext/cld/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  43. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  44. data/ext/cld/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  45. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  46. data/ext/cld/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  47. data/ext/cld/encodings/compact_lang_det/#tote.cc# +299 -0
  48. data/ext/cld/encodings/compact_lang_det/#tote.h# +89 -0
  49. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  50. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  51. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  52. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  53. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  54. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  55. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  56. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  57. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  58. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  59. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  60. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  61. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  62. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  63. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  64. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  65. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  66. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  67. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  68. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  69. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  70. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  71. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  72. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  73. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  74. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  75. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  76. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  77. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  78. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  79. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  80. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  81. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  82. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  83. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  84. data/ext/cld/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  85. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  86. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  87. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  88. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  89. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  90. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  91. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  92. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  93. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  94. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  95. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  96. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  97. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  98. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  99. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  100. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  101. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  102. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  103. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  104. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  105. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  106. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  107. data/ext/cld/encodings/internal/encodings.cc +12 -0
  108. data/ext/cld/encodings/lang_enc.h +254 -0
  109. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  110. data/ext/cld/encodings/public/encodings.h +301 -0
  111. data/ext/cld/extconf.rb +7 -0
  112. data/ext/cld/languages/internal/#languages.cc# +337 -0
  113. data/ext/cld/languages/internal/languages.cc +336 -0
  114. data/ext/cld/languages/proto/languages.pb.h +179 -0
  115. data/ext/cld/languages/public/languages.h +379 -0
  116. data/ext/cld/thunk.cc +55 -0
  117. data/lib/cld.rb +21 -0
  118. data/lib/cld/version.rb +3 -0
  119. data/spec/cld_spec.rb +67 -0
  120. data/spec/spec_helper.rb +6 -0
  121. metadata +193 -0
@@ -0,0 +1,131 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
7
+
8
+ #include "encodings/compact_lang_det/letterscript_enum.h"
9
+ #include "encodings/compact_lang_det/compact_lang_det_impl.h"
10
+
11
+ namespace getone {
12
// Sizing limits used by the script/language scanners.
static const int kMaxScriptBuffer = 4096;
// Lowercase buffer is one and a half times the script buffer.
static const int kMaxScriptLowerBuffer = (3 * kMaxScriptBuffer) / 2;
// Script buffer size less 8, to leave some room at the end.
static const int kMaxScriptBytes = kMaxScriptBuffer - 8;
// Size of each debugging answer buffer.
static const int kMaxAnswerBuffer = 256;
16
+
17
+ typedef enum UnicodeLScript ULScript;
18
+
19
+ typedef struct {
20
+ char* text; // Pointer to the span, somewhere
21
+ int text_bytes; // Number of bytes of text in the span
22
+ int offset; // Offset of start of span in original input buffer
23
+ ULScript script; // Script of all the letters in this span
24
+ Language lang; // Language identified for this span
25
+ bool truncated; // true if buffer filled up before a
26
+ // different script or EOF was found
27
+ } LangSpan;
28
+
29
+
30
// True when c has the bit pattern 10xxxxxx (byte values 0x80..0xBF), which is
// the form of a UTF-8 continuation byte.
static inline bool IsContinuationByte(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  return (byte & 0xC0) == 0x80;
}
33
+
34
+ // Gets lscript number for letters; always returns
35
+ // 0 (common script) for non-letters
36
+ int GetUTF8LetterScriptNum(const char* src);
37
+
38
+
39
+ // Update src pointer to point to next quadgram, +2..+5
40
+ // Looks at src[0..4]
41
+ const char* AdvanceQuad(const char* src);
42
+ } // end namespace getone
43
+
44
+
45
+
46
+
47
+
48
+
49
+ class ScriptScanner {
50
+ public:
51
+ ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
52
+ ~ScriptScanner();
53
+
54
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
55
+ bool GetOneScriptSpan(getone::LangSpan* span);
56
+
57
+ // Force Latin and Cyrillic scripts to be lowercase
58
+ void LowerScriptSpan(getone::LangSpan* span);
59
+
60
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
61
+ // Force Latin and Cyrillic scripts to be lowercase
62
+ bool GetOneScriptSpanLower(getone::LangSpan* span);
63
+
64
+ private:
65
+ int SkipToFrontOfSpan(const char* src, int len, int* script);
66
+
67
+ const char* start_byte_;
68
+ const char* next_byte_;
69
+ const char* next_byte_limit_;
70
+ int byte_length_;
71
+ bool is_plain_text_;
72
+ char* script_buffer_; // Holds text with expanded entities
73
+ char* script_buffer_lower_; // Holds lowercased text
74
+ };
75
+
76
+
77
+ class LangScanner {
78
+ public:
79
+ LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
80
+ getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
81
+ int maxlangs, int minlangspan);
82
+ ~LangScanner();
83
+
84
+
85
+ int script() {return script_;}
86
+
87
+ // Use new text
88
+ // Keep smoothing state if same script, otherwise reinit smoothing
89
+ void NewText(getone::LangSpan* spn);
90
+
91
+ bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
92
+ bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
93
+
94
+ // The real ones
95
+ bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
96
+ getone::LangSpan* span);
97
+ bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
98
+ getone::LangSpan* span);
99
+
100
+ // Increases language bias by delta
101
+ void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
102
+ Language key, int delta);
103
+
104
+ // For debugging output
105
+ int next_answer_;
106
+ char answer_buffer_[getone::kMaxAnswerBuffer];
107
+ char answer_buffer2_[getone::kMaxAnswerBuffer];
108
+ char answer_buffer3_[getone::kMaxAnswerBuffer];
109
+ char answer_buffer4_[getone::kMaxAnswerBuffer];
110
+
111
+ private:
112
+ const char* start_byte_;
113
+ const char* next_byte_limit_;
114
+ const char* next_byte_;
115
+ const char* onelangspan_begin_;
116
+ int byte_length_;
117
+ int script_;
118
+ Language spanlang_;
119
+ int smoothwidth_;
120
+ int smoothwidth_2_;
121
+ int smoothcandidates_;
122
+ int maxlangs_;
123
+ int minlangspan_;
124
+ int rb_size_;
125
+ int next_rb_;
126
+ int rb_mask_;
127
+ uint32* rb_;
128
+ int* offset_rb_;
129
+ };
130
+
131
+ #endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
@@ -0,0 +1,117 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/letterscript_enum.h"
6
+
7
+ #include "encodings/compact_lang_det/win/cld_logging.h"
8
+
9
+ static const char* kUnicodeLScriptNames[ULScript_NUM_SCRIPTS] = {
10
+ "Common",
11
+ "Latin",
12
+ "Greek",
13
+ "Cyrillic",
14
+ "Armenian",
15
+ "Hebrew",
16
+ "Arabic",
17
+ "Syriac",
18
+ "Thaana",
19
+ "Devanagari",
20
+ "Bengali",
21
+ "Gurmukhi",
22
+ "Gujarati",
23
+ "Oriya",
24
+ "Tamil",
25
+ "Telugu",
26
+ "Kannada",
27
+ "Malayalam",
28
+ "Sinhala",
29
+ "Thai",
30
+ "Lao",
31
+ "Tibetan",
32
+ "Myanmar",
33
+ "Georgian",
34
+ "HanCJK",
35
+ "Ethiopic",
36
+ "Cherokee",
37
+ "Canadian_Aboriginal",
38
+ "Ogham",
39
+ "Runic",
40
+ "Khmer",
41
+ "Mongolian",
42
+ "Yi",
43
+ "Old_Italic",
44
+ "Gothic",
45
+ "Deseret",
46
+ "Inherited",
47
+ "Tagalog",
48
+ "Hanunoo",
49
+ "Buhid",
50
+ "Tagbanwa",
51
+ "Limbu",
52
+ "Tai_Le",
53
+ "Linear_B",
54
+ "Ugaritic",
55
+ "Shavian",
56
+ "Osmanya",
57
+ "Cypriot",
58
+ "Buginese",
59
+ "Coptic",
60
+ "New_Tai_Lue",
61
+ "Glagolitic",
62
+ "Tifinagh",
63
+ "Syloti_Nagri",
64
+ "Old_Persian",
65
+ "Kharoshthi",
66
+ "Balinese",
67
+ "Cuneiform",
68
+ "Phoenician",
69
+ "Phags_Pa",
70
+ "Nko",
71
+
72
+ // Unicode 5.1 beta
73
+ "Sundanese",
74
+ "Lepcha",
75
+ "Ol_Chiki",
76
+ "Vai",
77
+ "Saurashtra",
78
+ "Kayah_Li",
79
+ "Rejang",
80
+ "Lycian",
81
+ "Carian",
82
+ "Lydian",
83
+ "Cham",
84
+ };
85
+
86
+
87
+ // Unicode 5.1 beta script names from
88
+ // http://www.unicode.org/Public/5.1.0/diffs/5.0.0-5.1.0.all.2.diffs
89
+ // NOTE: 'Vai ' => "Vaii" to make four letters, not three
90
+ // see http://unicode.org/iso15924/iso15924-codes.html
91
+ const char* const kLScriptName4[ULScript_NUM_SCRIPTS] = {
92
+ "Zyyy", "Latn", "Grek", "Cyrl", "Armn", "Hebr", "Arab", "Syrc",
93
+ "Thaa", "Deva", "Beng", "Guru", "Gujr", "Orya", "Taml", "Telu",
94
+ "Knda", "Mlym", "Sinh", "Thai", "Laoo", "Tibt", "Mymr", "Geor",
95
+ "Hani", "Ethi", "Cher", "Cans", "Ogam", "Runr", "Khmr", "Mong",
96
+
97
+ "Yiii", "Ital", "Goth", "Dsrt", "Zzzz", "Tglg", "Hano", "Buhd",
98
+ "Tagb", "Limb", "Tale", "Linb", "Ugar", "Shaw", "Osma", "Cprt",
99
+ "Bugi", "Copt", "Talu", "Glag", "Tfng", "Sylo", "Xpeo", "Khar",
100
+ "Bali", "Xsux", "Phnx", "Phag", "Nkoo",
101
+
102
+ // Unicode 5.1 beta
103
+ "Sund", "Lepc", "Olck", "Vaii", "Saur", "Kali", "Rjng", "Lyci",
104
+ "Cari", "Lydi", "Cham",
105
+ };
106
+
107
+
108
+ const char* UnicodeLScriptName(const UnicodeLScript ls) {
109
+ CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
110
+ return kUnicodeLScriptNames[ls];
111
+ }
112
+
113
+
114
+ const char* UnicodeLScriptCode(const UnicodeLScript ls) {
115
+ CHECK(ls >= 0 && ls < ULScript_NUM_SCRIPTS);
116
+ return kLScriptName4[ls];
117
+ }
@@ -0,0 +1,99 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
6
+ #define ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
7
+
8
// Letter scripts. Values are consecutive starting at 0 and are used as
// indexes, so the order must not change. ULScript_NUM_SCRIPTS is the count.
enum UnicodeLScript {
  ULScript_Common = 0,
  ULScript_Latin,
  ULScript_Greek,
  ULScript_Cyrillic,
  ULScript_Armenian,
  ULScript_Hebrew,
  ULScript_Arabic,
  ULScript_Syriac,
  ULScript_Thaana,                // 8
  ULScript_Devanagari,
  ULScript_Bengali,
  ULScript_Gurmukhi,
  ULScript_Gujarati,
  ULScript_Oriya,
  ULScript_Tamil,
  ULScript_Telugu,
  ULScript_Kannada,               // 16
  ULScript_Malayalam,
  ULScript_Sinhala,
  ULScript_Thai,
  ULScript_Lao,
  ULScript_Tibetan,
  ULScript_Myanmar,
  ULScript_Georgian,
  ULScript_HanCJK,                // 24
  ULScript_Ethiopic,
  ULScript_Cherokee,
  ULScript_Canadian_Aboriginal,
  ULScript_Ogham,
  ULScript_Runic,
  ULScript_Khmer,
  ULScript_Mongolian,
  ULScript_Yi,                    // 32
  ULScript_Old_Italic,
  ULScript_Gothic,
  ULScript_Deseret,
  ULScript_Inherited,
  ULScript_Tagalog,
  ULScript_Hanunoo,
  ULScript_Buhid,
  ULScript_Tagbanwa,              // 40
  ULScript_Limbu,
  ULScript_Tai_Le,
  ULScript_Linear_B,
  ULScript_Ugaritic,
  ULScript_Shavian,
  ULScript_Osmanya,
  ULScript_Cypriot,
  ULScript_Buginese,              // 48
  ULScript_Coptic,
  ULScript_New_Tai_Lue,
  ULScript_Glagolitic,
  ULScript_Tifinagh,
  ULScript_Syloti_Nagri,
  ULScript_Old_Persian,
  ULScript_Kharoshthi,
  ULScript_Balinese,              // 56
  ULScript_Cuneiform,
  ULScript_Phoenician,
  ULScript_Phags_Pa,
  ULScript_Nko,                   // 60

  // Unicode 5.1
  ULScript_Sundanese,             // 61
  ULScript_Lepcha,
  ULScript_Ol_Chiki,
  ULScript_Vai,
  ULScript_Saurashtra,
  ULScript_Kayah_Li,
  ULScript_Rejang,
  ULScript_Lycian,
  ULScript_Carian,
  ULScript_Lydian,
  ULScript_Cham,                  // 71
  ULScript_NUM_SCRIPTS            // 72
};


// The "unknown" script is represented by the Common script value.
static const UnicodeLScript UNKNOWN_LSCRIPT = ULScript_Common;
88
+
89
+
90
+ // Return the name corresponding to the script ls, e.g. "Latin".
91
+ // It is a fatal error if ls is not a valid UnicodeLScript.
92
+ const char* UnicodeLScriptName(const UnicodeLScript ls);
93
+
94
+
95
+ // Return the 4-letter code corresponding to the script ls, e.g. "Latn".
96
+ // It is a fatal error if ls is not a valid UnicodeLScript.
97
+ const char* UnicodeLScriptCode(const UnicodeLScript ls);
98
+
99
+ #endif // ENCODINGS_COMPACT_LANG_DET_LETTERSCRIPT_ENUM_H__
@@ -0,0 +1,259 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ // Remember a subset of a sequence of values, using a modest amount of memory
6
+
7
+ /***
8
+ Design:
9
+ Accumulate in powers of three, using 3-way median to collapse entries.
10
+ At any given time, there is one most-dense (highest power of 3) range of
11
+ entries and a series of less-dense ranges that hold 0..2 entries each. There
12
+ is a bounded-size storage array of S cells for all the entries.
13
+
14
+ The overflow detect is set up so that a new higher power of 3, K+1, is
15
+ triggered precisely when range K has 3n entries and all ranges < K have
16
+ zero entries.
17
+
18
+ In general, think of the range sizes as a multi-digit base 3 number, except
19
+ the highest digit may exceed 2:
20
+
21
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
22
+ 0 0 0 0 3n-1 2 2 unused:1
23
+
24
+ There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
25
+ one more than that, and we add a new 3**0 entry and "carry" by performing
26
+ medians on any group of 3 elements:
27
+
28
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
29
+ 0 0 0 0 3n-1 2 3 unused:0
30
+ 0 0 0 0 3n-1 3 0 carry unused:2
31
+ 0 0 0 0 3n 0 0 carry unused:4
32
+
33
+ To accumulate 2 entries at all levels < K and 3 just before the first carry at
34
+ level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
35
+ in this case. Since we only have 4 cells in the example above, we need to
36
+ make room by starting a new power of three:
37
+
38
+ 3**6 3**5 3**4 3**3 3**2 3**1 3**0
39
+ 0 0 0 0 3n 0 0 K=2 unused:4
40
+ 0 0 0 n 0 0 0 K=3 unused:2n+4
41
+
42
+ In the code below, we don't worry about overflow from the topmost place.
43
+
44
+
45
+ ***/
46
+
47
+ #include "encodings/compact_lang_det/subsetsequence.h"
48
+ #include <stdio.h>
49
+
50
+ #include "encodings/compact_lang_det/win/cld_logging.h"
51
+
52
// Debug helper: prints label, then the first n ints of v separated by
// spaces, then a newline, all to stdout.
void DumpInts(const char* label, const int* v, int n) {
  printf("%s ", label);
  int idx = 0;
  while (idx < n) {
    printf("%d ", v[idx]);
    ++idx;
  }
  printf("\n");
}
59
+
60
+ void DumpUint8s(const char* label, const uint8* v, int n) {
61
+ printf("%s ", label);
62
+ for (int i = 0; i < n; ++i) {
63
+ printf("%d ", v[i]);
64
+ }
65
+ printf("\n");
66
+ }
67
+
68
+ // Return median of seq_[sub] .. seq_[sub+2], favoring middle element
69
+ uint8 SubsetSequence::Median3(int sub) {
70
+ if (seq_[sub] == seq_[sub + 1]) {
71
+ return seq_[sub];
72
+ }
73
+ if (seq_[sub] == seq_[sub + 2]) {
74
+ return seq_[sub];
75
+ }
76
+ return seq_[sub + 1];
77
+ }
78
+
79
+ void SubsetSequence::Init() {
80
+ // printf("Init\n");
81
+
82
+ k_ = 0;
83
+ count_[0] = 0;
84
+ next_e_ = 0;
85
+ seq_[0] = 0; // Default value if no calls to Add
86
+
87
+ // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
88
+ int reserve = (2 * k_ + 1);
89
+ level_limit_e_ = kMaxSeq_ - reserve;
90
+ level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
91
+ limit_e_ = level_limit_e_;
92
+ }
93
+
94
+ // Compress level k by 3x, creating level k+1
95
+ void SubsetSequence::NewLevel() {
96
+ // printf("NewLevel 3 ** %d\n", k_ + 1);
97
+ //DumpUint8s("count[k]", count_, k_ + 1);
98
+ //DumpUint8s("seq[next]", seq_, next_e_);
99
+
100
+ // Incoming level must be an exact multiple of three in size
101
+ CHECK((count_[k_] % 3) == 0);
102
+ int k_size = count_[k_];
103
+ int new_size = k_size / 3;
104
+
105
+ // Compress down by 3x, via median
106
+ for (int j = 0; j < new_size; ++j) {
107
+ seq_[j] = Median3(j * 3);
108
+ }
109
+
110
+ // Update counts
111
+ count_[k_] = 0;
112
+ // Else Overflow -- just continue with 3x dense Level K
113
+ if (k_ < (kMaxLevel_ - 1)) {++k_;}
114
+ count_[k_] = new_size;
115
+
116
+ // Update limits
117
+ next_e_ = new_size;
118
+ limit_e_ = next_e_ + 3;
119
+
120
+ // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
121
+ int reserve = (2 * k_ + 1);
122
+ level_limit_e_ = kMaxSeq_ - reserve;
123
+ level_limit_e_ = (level_limit_e_ / 3) * 3; // Round down to multiple of 3
124
+ //
125
+ //DumpUint8s("after: count[k]", count_, k_ + 1);
126
+ //DumpUint8s("after: seq[next]", seq_, next_e_);
127
+ }
128
+
129
+ void SubsetSequence::DoCarries() {
130
+ CHECK(count_[k_] > 3); // We depend on count_[k_] being > 3 to stop while
131
+ // Make room by carrying
132
+
133
+ //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
134
+ //DumpUint8s("DoCarries seq[next]", seq_, next_e_);
135
+
136
+ int i = 0;
137
+ while (count_[i] == 3) {
138
+ next_e_ -= 3;
139
+ seq_[next_e_] = Median3(next_e_);
140
+ ++next_e_;
141
+ count_[i] = 0;
142
+ ++count_[i + 1];
143
+ ++i;
144
+ }
145
+ limit_e_ = next_e_ + 3;
146
+
147
+ //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
148
+ //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
149
+
150
+ // If we just fully carried into level K,
151
+ // Make sure there is now enough room, else start level K + 1
152
+ if (i >= k_) {
153
+ CHECK(count_[k_] == next_e_);
154
+ if (next_e_ >= level_limit_e_) {
155
+ NewLevel();
156
+ }
157
+ }
158
+ }
159
+
160
+ void SubsetSequence::Add(uint8 e) {
161
+ // Add an entry then carry as needed
162
+ seq_[next_e_] = e;
163
+ ++next_e_;
164
+ ++count_[0];
165
+
166
+ if (next_e_ >= limit_e_) {
167
+ DoCarries();
168
+ }
169
+ }
170
+
171
+
172
+ // Collapse tail end by simple median across disparate-weight values,
173
+ // dropping or duplicating last value if need be.
174
+ // This routine is idempotent.
175
+ void SubsetSequence::Flush() {
176
+ // printf("Flush %d\n", count_[k_]);
177
+ int start_tail = count_[k_];
178
+ int size_tail = next_e_ - start_tail;
179
+ if ((size_tail % 3) == 2) {
180
+ seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
181
+ ++size_tail;
182
+ }
183
+
184
+ // Compress tail down by 3x, via median
185
+ int new_size = size_tail / 3; // May delete last value
186
+ for (int j = 0; j < new_size; ++j) {
187
+ seq_[start_tail + j] = Median3(start_tail + j * 3);
188
+ }
189
+
190
+ next_e_ = start_tail + new_size;
191
+ count_[k_] = next_e_;
192
+ }
193
+
194
+
195
+ // Extract representative pattern of exactly N values into dst[0..n-1]
196
+ // This routine may be called multiple times, but it may downsample as a
197
+ // side effect, causing subsequent calls with larger N to get poor answers.
198
+ void SubsetSequence::Extract(int to_n, uint8* dst) {
199
+ // Collapse partial-carries in tail
200
+ Flush();
201
+
202
+ // Just use Bresenham to resample
203
+ int from_n = next_e_;
204
+ if (to_n >= from_n) {
205
+ // Up-sample from_n => to_n
206
+ int err = to_n - 1; // bias toward no overshoot
207
+ int j = 0;
208
+ for (int i = 0; i < to_n; ++i) {
209
+ dst[i] = seq_[j];
210
+ err -= from_n;
211
+ if (err < 0) {
212
+ ++j;
213
+ err += to_n;
214
+ }
215
+ }
216
+ } else {
217
+ // Get to the point that the number of samples is <= 3 * to_n
218
+ while (next_e_ > (to_n * 3)) {
219
+ // Compress down by 3x, via median
220
+ // printf("Extract, median %d / 3\n", next_e_);
221
+ if ((next_e_ % 3) == 2) {
222
+ seq_[next_e_] = seq_[next_e_ - 1]; // Duplicate last value
223
+ ++next_e_;
224
+ }
225
+ int new_size = next_e_ / 3; // May delete last value
226
+ for (int j = 0; j < new_size; ++j) {
227
+ seq_[j] = Median3(j * 3);
228
+ }
229
+ next_e_ = new_size;
230
+ count_[k_] = next_e_;
231
+ }
232
+ from_n = next_e_;
233
+
234
+ if (to_n == from_n) {
235
+ // Copy verbatim
236
+ for (int i = 0; i < to_n; ++i) {
237
+ dst[i] = seq_[i];
238
+ }
239
+ return;
240
+ }
241
+
242
+ // Down-sample from_n => to_n, using medians
243
+ int err = 0; // Bias to immediate median sample
244
+ int j = 0;
245
+ for (int i = 0; i < from_n; ++i) {
246
+ err -= to_n;
247
+ if (err < 0) {
248
+ if (i <= (next_e_ - 2)) {
249
+ dst[j] = Median3(i);
250
+ } else {
251
+ dst[j] = seq_[i];
252
+ }
253
+ ++j;
254
+ err += from_n;
255
+ }
256
+ }
257
+ }
258
+
259
+ }