language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,570 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/getonescriptspan.h"
6
+ #include <stdio.h>
7
+ #include <string.h>
8
+
9
+ #include "encodings/lang_enc.h"
10
+ #include "encodings/compact_lang_det/utf8propjustletter.h"
11
+ #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
12
+ #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
13
+
14
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
15
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
16
+ #include "encodings/compact_lang_det/win/cld_google.h"
17
+ #include "encodings/compact_lang_det/win/cld_htmlutils.h"
18
+ #include "encodings/compact_lang_det/win/cld_unilib.h"
19
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
20
+ #include "encodings/compact_lang_det/win/cld_utf8utils.h"
21
+
22
+ static const Language GRAY_LANG = (Language)254; // NOTE(review): not referenced in the visible part of this file -- confirm it is still needed
23
+
24
+ static const int kMaxUpToWordBoundary = 50; // span < this make longer,
25
+ // else make shorter
26
+ static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes
27
+ // to round to word boundary,
28
+ // direction above
29
+
30
+ static const char kSpecialSymbol[256] = { // indexed by byte value; 1 only at '&' (0x26), '<' (0x3c) and '>' (0x3e)
31
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
32
+ 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
33
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
34
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
35
+
36
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
37
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
38
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
39
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
40
+ };
41
+
42
+
43
+
44
+ #define LT 0 // <
45
+ #define GT 1 // >
46
+ #define EX 2 // !
47
+ #define HY 3 // -
48
+ #define QU 4 // "
49
+ #define AP 5 // '
50
+ #define SL 6 // /
51
+ #define S_ 7 // S or s
52
+ #define C_ 8 // C or c
53
+ #define R_ 9 // R or r
54
+ #define I_ 10 // I or i
55
+ #define P_ 11 // P or p
56
+ #define T_ 12 // T or t
57
+ #define Y_ 13 // Y or y
58
+ #define L_ 14 // L or l
59
+ #define E_ 15 // E or e
60
+ #define CR 16 // <cr> or <lf>
61
+ #define NL 17 // non-letter: ASCII whitespace, digit, punctuation; also bytes 0x80..0xbf
62
+ #define PL 18 // possible letter, incl. & and @, plus all bytes 0xc0..0xff
63
+ #define xx 19 // <unused>: never appears in the table below
64
+
65
+ // Map byte to one of ~20 interesting categories for cheap tag parsing; the category is the column index into kTagParseTbl_0
66
+ static const uint8 kCharToSub[256] = {
67
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
68
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
69
+ NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
70
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
71
+
72
+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
73
+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
74
+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
75
+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
76
+
77
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
78
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
79
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
80
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
81
+
82
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
83
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
84
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
85
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
86
+ };
87
+
88
+ #undef LT
89
+ #undef GT
90
+ #undef EX
91
+ #undef HY
92
+ #undef QU
93
+ #undef AP
94
+ #undef SL
95
+ #undef S_
96
+ #undef C_
97
+ #undef R_
98
+ #undef I_
99
+ #undef P_
100
+ #undef T_
101
+ #undef Y_
102
+ #undef L_
103
+ #undef E_
104
+ #undef CR
105
+ #undef NL
106
+ #undef PL
107
+ #undef xx
108
+
109
+
110
+ #define OK 0 // table entry: stop before this byte, a possible letter was reached
111
+ #define X_ 1 // table entry: stop before this byte, parse error
112
+
113
+ // State machine to do cheap parse of non-letter strings incl. tags; one row of 20 next-state entries per state, one column per kCharToSub category
114
+ // advances <tag>
115
+ // | |
116
+ // advances <tag> ... </tag> for <script> <style>
117
+ // | |
118
+ // advances <!-- ... <tag> ... -->
119
+ // | |
120
+ // advances <tag
121
+ // || (0)
122
+ // advances <tag <tag2>
123
+ // || (0)
124
+ static const uint8 kTagParseTbl_0[] = {
125
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
126
+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK
127
+ X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
128
+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL*
129
+ X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
130
+ X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
131
+ X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
132
+ 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
133
+ 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
134
+ 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
135
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
136
+ 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
137
+ 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
138
+ X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
139
+
140
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
141
+ X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
142
+ X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
143
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
144
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
145
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
146
+ X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
147
+ 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
148
+ 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
149
+ 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
150
+ 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
151
+ 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
152
+ 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
153
+ 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
154
+ 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
155
+ 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
156
+
157
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
158
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
159
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
160
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
161
+ X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
162
+ 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
163
+ 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
164
+ 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
165
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
166
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
167
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
168
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
169
+ 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
170
+ };
171
+
172
+ #undef OK
173
+ #undef X_
174
+
175
+
176
+ /*
177
+ // Convert GetTimeOfDay output to 64-bit usec
178
+ static inline uint64 Microseconds(const struct timeval& t) {
179
+ // The SumReducer uses uint64, so convert to (uint64) microseconds,
180
+ // not (double) seconds.
181
+ return t.tv_sec * 1000000ULL + t.tv_usec;
182
+ }
183
+ */
184
+
185
+
186
+ // Returns true if character is < > or &
187
+ bool inline IsSpecial(char c) {
188
+ if ((c & 0xe0) == 0x20) {
189
+ return kSpecialSymbol[static_cast<uint8>(c)];
190
+ }
191
+ return false;
192
+ }
193
+
194
+ // Quick Skip to next letter or < > & or to end of string (eos)
195
+ // Always return is_letter for eos
196
+ int ScanToLetterOrSpecial(const char* src, int len) {
197
+ int bytes_consumed;
198
+ cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
199
+ &bytes_consumed);
200
+ return bytes_consumed;
201
+ }
202
+
203
+
204
+
205
+ // src points to non-letter, such as tag-opening '<'
206
+ // Return length from here to next possible letter
207
+ // On eos or another < before >, return 1
208
+ // advances <tag>
209
+ // | |
210
+ // advances <tag> ... </tag> for <script> <style>
211
+ // | |
212
+ // advances <!-- ... <tag> ... -->
213
+ // | |
214
+ // advances <tag
215
+ // || (1)
216
+ // advances <tag <tag2>
217
+ // || (1)
218
+ int ScanToPossibleLetter(const char* isrc, int len) {
219
+ const uint8* src = reinterpret_cast<const uint8*>(isrc);
220
+ const uint8* srclimit = src + len;
221
+ const uint8* tagParseTbl = kTagParseTbl_0;
222
+ int e = 0;
223
+ while (src < srclimit) {
224
+ e = tagParseTbl[kCharToSub[*src++]];
225
+ if ((e & ~1) == 0) {
226
+ // We overshot by one byte
227
+ --src;
228
+ break;
229
+ }
230
+ tagParseTbl = &kTagParseTbl_0[e * 20];
231
+ }
232
+
233
+ if (src >= srclimit) {
234
+ // We fell off the end of the text.
235
+ // It looks like the most common case for this is a truncated file, not
236
+ // mismatched angle brackets. So we pretend that the last char was '>'
237
+ return len;
238
+ }
239
+
240
+ // OK to be in state 0 or state 2 at exit
241
+ if ((e != 0) && (e != 2)) {
242
+ // Error, '<' followed by '<'
243
+ // We want to back up to first <, then advance by one byte past it
244
+ int offset = src - reinterpret_cast<const uint8*>(isrc);
245
+ // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
246
+
247
+ // Backscan to first '<' and return enough length to just get past it
248
+ --offset; // back up over the second '<', which caused us to stop
249
+ while ((0 < offset) && (isrc[offset] != '<')) {
250
+ // Find the first '<', which is unmatched
251
+ --offset;
252
+ }
253
+ // skip to just beyond first '<'
254
+ // printf(" returning %d\n", offset + 1);
255
+ return offset + 1;
256
+ }
257
+
258
+ return src - reinterpret_cast<const uint8*>(isrc);
259
+ }
260
+
261
+
262
+
263
+ ScriptScanner::ScriptScanner(const char* buffer,
264
+ int buffer_length,
265
+ bool is_plain_text)
266
+ : start_byte_(buffer),
267
+ next_byte_(buffer),
268
+ next_byte_limit_(buffer + buffer_length),
269
+ byte_length_(buffer_length),
270
+ is_plain_text_(is_plain_text) {
271
+ script_buffer_ = new char[getone::kMaxScriptBuffer];
272
+ script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
273
+ }
274
+
275
+ ScriptScanner::~ScriptScanner() {
276
+ delete[] script_buffer_;
277
+ delete[] script_buffer_lower_;
278
+ }
279
+
280
+
281
+
282
+
283
+ // Get to the first real non-tag letter or entity that is a letter
284
+ // Sets script of that letter
285
+ // Return len if no more letters
286
+ int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
287
+ int sc = UNKNOWN_LSCRIPT;
288
+ int skip = 0;
289
+ int tlen, plen;
290
+
291
+ // Do run of non-letters (tag | &NL | NL)*
292
+ while (skip < len) {
293
+ // Do fast scan to next interesting byte
294
+ // int oldskip = skip;
295
+ skip += ScanToLetterOrSpecial(src + skip, len - skip);
296
+ // TEMP
297
+ // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
298
+ // oldskip, src[oldskip], skip, src[skip]);
299
+
300
+ // Check for no more letters/specials
301
+ if (skip >= len) {
302
+ // All done
303
+ return len;
304
+ }
305
+
306
+ // We are at a letter, nonletter, tag, or entity
307
+ if (IsSpecial(src[skip]) && !is_plain_text_) {
308
+ if (src[skip] == '<') {
309
+ // Begining of tag; skip to end and go around again
310
+ tlen = ScanToPossibleLetter(src + skip, len - skip);
311
+ sc = 0;
312
+ // printf("<...> ");
313
+ } else if (src[skip] == '>') {
314
+ // Unexpected end of tag; skip it and go around again
315
+ tlen = 1; // Over the >
316
+ sc = 0;
317
+ // printf("..> ");
318
+ } else if (src[skip] == '&') {
319
+ // Expand entity, no advance
320
+ char temp[4];
321
+ EntityToBuffer(src + skip, len - skip,
322
+ temp, &tlen, &plen);
323
+ sc = getone::GetUTF8LetterScriptNum(temp);
324
+ // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
325
+ }
326
+ } else {
327
+ // Update 1..4 bytes
328
+ tlen = cld_UniLib::OneCharLen(src + skip);
329
+ sc = getone::GetUTF8LetterScriptNum(src + skip);
330
+ // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
331
+ }
332
+ // TEMP
333
+ // printf("sc=%d ", sc);
334
+ if (sc != 0) {break;} // Letter found
335
+ skip += tlen; // Advance
336
+ }
337
+
338
+ *script = sc;
339
+ return skip;
340
+ }
341
+
342
+
343
+
344
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
345
+ // Buffer has leading space and all text is lowercased
346
+ bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
347
+ span->text = script_buffer_;
348
+ span->text_bytes = 0;
349
+ span->offset = next_byte_ - start_byte_;
350
+ span->script = UNKNOWN_LSCRIPT;
351
+ span->lang = UNKNOWN_LANGUAGE;
352
+ span->truncated = false;
353
+
354
+ // printf("GetOneScriptSpan[[ ");
355
+ // struct timeval script_start, script_mid, script_end;
356
+
357
+ int spanscript; // The script of this span
358
+ int sc = UNKNOWN_LSCRIPT; // The script of next character
359
+ int tlen, plen;
360
+
361
+
362
+ script_buffer_[0] = ' '; // Always a space at front of output
363
+ script_buffer_[1] = '\0';
364
+ int take = 0;
365
+ int put = 1; // Start after the initial space
366
+
367
+ // gettimeofday(&script_start, NULL);
368
+ // Get to the first real non-tag letter or entity that is a letter
369
+ int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
370
+ next_byte_ += skip;
371
+ byte_length_ -= skip;
372
+ if (byte_length_ <= 0) {
373
+ // printf("]]\n");
374
+ return false; // No more letters to be found
375
+ }
376
+
377
+ // gettimeofday(&script_mid, NULL);
378
+
379
+ // There is at least one letter, so we know the script for this span
380
+ // printf("{%d} ", spanscript);
381
+ span->script = (UnicodeLScript)spanscript;
382
+
383
+
384
+ // Go over alternating spans of same-script letters and non-letters,
385
+ // copying letters to buffer with single spaces for each run of non-letters
386
+ while (take < byte_length_) {
387
+ // Copy run of letters in same script (&LS | LS)*
388
+ int letter_count = 0; // Keep track of word length
389
+ bool need_break = false;
390
+ while (take < byte_length_) {
391
+ // We are at a letter, nonletter, tag, or entity
392
+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
393
+ // printf("\"%c\" ", next_byte_[take]);
394
+ if (next_byte_[take] == '<') {
395
+ // Begining of tag
396
+ sc = 0;
397
+ break;
398
+ } else if (next_byte_[take] == '>') {
399
+ // Unexpected end of tag
400
+ sc = 0;
401
+ break;
402
+ } else if (next_byte_[take] == '&') {
403
+ // Copy entity, no advance
404
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
405
+ script_buffer_ + put, &tlen, &plen);
406
+ sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
407
+ }
408
+ } else {
409
+ // Real letter, safely copy up to 4 bytes, increment by 1..4
410
+ // Will update by 1..4 bytes at Advance, below
411
+ tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
412
+ if (take < (byte_length_ - 3)) {
413
+ // Fast case
414
+ *reinterpret_cast<uint32*>(script_buffer_ + put) =
415
+ *reinterpret_cast<const uint32*>(next_byte_ + take);
416
+ } else {
417
+ // Slow case, happens 1-3 times per input document
418
+ memcpy(script_buffer_ + put, next_byte_ + take, plen);
419
+ }
420
+ sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
421
+ }
422
+ // printf("sc(%c)=%d ", next_byte_[take], sc);
423
+ // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
424
+ // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
425
+
426
+ // Allow continue across a single letter in a different script:
427
+ // A B D = three scripts, c = common script, i = inherited script,
428
+ // - = don't care, ( = take position before the += below
429
+ // AAA(A- continue
430
+ //
431
+ // AAA(BA continue
432
+ // AAA(BB break
433
+ // AAA(Bc continue (breaks after B)
434
+ // AAA(BD break
435
+ // AAA(Bi break
436
+ //
437
+ // AAA(c- break
438
+ //
439
+ // AAA(i- continue
440
+ //
441
+
442
+ if ((sc != spanscript) && (sc != ULScript_Inherited)) {
443
+ // Might need to break this script span
444
+ if (sc == ULScript_Common) {
445
+ need_break = true;
446
+ } else {
447
+ // Look at next following character, ignoring entity as Common
448
+ int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
449
+ if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
450
+ need_break = true;
451
+ }
452
+ }
453
+ }
454
+ if (need_break) {break;} // Non-letter or letter in wrong script
455
+
456
+ take += tlen; // Advance
457
+ put += plen; // Advance
458
+ ++letter_count;
459
+ if (put >= getone::kMaxScriptBytes) {
460
+ // Buffer is full
461
+ span->truncated = true;
462
+ break;
463
+ }
464
+ } // End while letters
465
+
466
+ // Do run of non-letters (tag | &NL | NL)*
467
+ while (take < byte_length_) {
468
+ // Do fast scan to next interesting byte
469
+ take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
470
+
471
+ // Check for no more letters/specials
472
+ if (take >= byte_length_) {
473
+ take = byte_length_;
474
+ break;
475
+ }
476
+
477
+ // We are at a letter, nonletter, tag, or entity
478
+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
479
+ // printf("\"%c\" ", next_byte_[take]);
480
+ if (next_byte_[take] == '<') {
481
+ // Begining of tag; skip to end and go around again
482
+ tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
483
+ sc = 0;
484
+ // printf("<...> ");
485
+ } else if (next_byte_[take] == '>') {
486
+ // Unexpected end of tag; skip it and go around again
487
+ tlen = 1; // Over the >
488
+ sc = 0;
489
+ // printf("..> ");
490
+ } else if (next_byte_[take] == '&') {
491
+ // Expand entity, no advance
492
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
493
+ script_buffer_ + put, &tlen, &plen);
494
+ sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
495
+ }
496
+ } else {
497
+ // Update 1..4
498
+ tlen = cld_UniLib::OneCharLen(next_byte_ + take);
499
+ sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
500
+ }
501
+ // printf("sc[%c]=%d ", next_byte_[take], sc);
502
+ if (sc != 0) {break;} // Letter found
503
+ take += tlen; // Advance
504
+ } // End while not-letters
505
+
506
+ script_buffer_[put++] = ' ';
507
+
508
+ // We are at a letter again (or eos), after letter* not-letter*
509
+ if (sc != spanscript) {break;} // Letter in wrong script
510
+ if (put >= getone::kMaxScriptBytes - 8) {
511
+ // Buffer is almost full
512
+ span->truncated = true;
513
+ break;
514
+ }
515
+ }
516
+
517
+ // Update input position
518
+ next_byte_ += take;
519
+ byte_length_ -= take;
520
+
521
+ // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
522
+ // kMaxScriptBytes | | put
523
+ script_buffer_[put + 0] = ' ';
524
+ script_buffer_[put + 1] = ' ';
525
+ script_buffer_[put + 2] = ' ';
526
+ script_buffer_[put + 3] = '\0';
527
+
528
+ span->text_bytes = put; // Does not include the last four chars above
529
+
530
+ // printf(" %d]]\n\n", put);
531
+ return true;
532
+ }
533
+
534
+ // Force Latin, Cyrillic, Greek scripts to be lowercase
535
+ void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
536
+ // On Windows, text is lowercased beforehand, so no need to do anything here.
537
+ #if !defined(CLD_WINDOWS)
538
+ // If needed, lowercase all the text. If we do it sooner, might miss
539
+ // lowercasing an entity such as &Aacute;
540
+ // We only need to do this for Latn and Cyrl scripts
541
+ if ((span->script == ULScript_Latin) ||
542
+ (span->script == ULScript_Cyrillic) ||
543
+ (span->script == ULScript_Greek)) {
544
+ // Full Unicode lowercase of the entire buffer, including
545
+ // four pad bytes off the end
546
+ int consumed, filled;
547
+ UniLib::ToLower(span->text, span->text_bytes + 4,
548
+ script_buffer_lower_, getone::kMaxScriptLowerBuffer,
549
+ &consumed, &filled);
550
+ span->text = script_buffer_lower_;
551
+ span->text_bytes = filled - 4;
552
+ }
553
+ #endif
554
+ }
555
+
556
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
557
+ // Force Latin and Cyrillic scripts to be lowercase
558
+ bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
559
+ bool ok = GetOneScriptSpan(span);
560
+ LowerScriptSpan(span);
561
+ return ok;
562
+ }
563
+
564
+ // Gets lscript number for letters; always returns
565
+ // 0 (common script) for non-letters
566
+ int getone::GetUTF8LetterScriptNum(const char* src) {
567
+ int srclen = cld_UniLib::OneCharLen(src);
568
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
569
+ return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
570
+ }
@@ -0,0 +1,131 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
7
+
8
+ #include "encodings/compact_lang_det/letterscript_enum.h"
9
+ #include "encodings/compact_lang_det/compact_lang_det_impl.h"
10
+
11
+ namespace getone {
12
+ static const int kMaxScriptBuffer = 4096;
13
+ static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
14
+ static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room
15
+ static const int kMaxAnswerBuffer = 256;
16
+
17
+ typedef enum UnicodeLScript ULScript;
18
+
19
+ typedef struct {
20
+ char* text; // Pointer to the span, somewhere
21
+ int text_bytes; // Number of bytes of text in the span
22
+ int offset; // Offset of start of span in original input buffer
23
+ ULScript script; // Script of all the letters in this span
24
+ Language lang; // Language identified for this span
25
+ bool truncated; // true if buffer filled up before a
26
+ // different script or EOF was found
27
+ } LangSpan;
28
+
29
+
30
+ static inline bool IsContinuationByte(char c) {
31
+ return static_cast<signed char>(c) < -64;
32
+ }
33
+
34
+ // Gets lscript number for letters; always returns
35
+ // 0 (common script) for non-letters
36
+ int GetUTF8LetterScriptNum(const char* src);
37
+
38
+
39
+ // Update src pointer to point to next quadgram, +2..+5
40
+ // Looks at src[0..4]
41
+ const char* AdvanceQuad(const char* src);
42
+ } // end namespace getone
43
+
44
+
45
+
46
+
47
+
48
+
49
+ class ScriptScanner {
50
+ public:
51
+ ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
52
+ ~ScriptScanner();
53
+
54
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
55
+ bool GetOneScriptSpan(getone::LangSpan* span);
56
+
57
+ // Force Latin and Cyrillic scripts to be lowercase
58
+ void LowerScriptSpan(getone::LangSpan* span);
59
+
60
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
61
+ // Force Latin and Cyrillic scripts to be lowercase
62
+ bool GetOneScriptSpanLower(getone::LangSpan* span);
63
+
64
+ private:
65
+ int SkipToFrontOfSpan(const char* src, int len, int* script);
66
+
67
+ const char* start_byte_;
68
+ const char* next_byte_;
69
+ const char* next_byte_limit_;
70
+ int byte_length_;
71
+ bool is_plain_text_;
72
+ char* script_buffer_; // Holds text with expanded entities
73
+ char* script_buffer_lower_; // Holds lowercased text
74
+ };
75
+
76
+
77
+ class LangScanner { // NOTE(review): declaration only; the member definitions are not in this part of the file
78
+ public:
79
+ LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
80
+ getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
81
+ int maxlangs, int minlangspan);
82
+ ~LangScanner();
83
+
84
+
85
+ int script() {return script_;} // Returns the stored script_ value
86
+
87
+ // Use new text
88
+ // Keep smoothing state if same script, otherwise reinit smoothing
89
+ void NewText(getone::LangSpan* spn);
90
+
91
+ bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
92
+ bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
93
+
94
+ // The real ones
95
+ bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
96
+ getone::LangSpan* span);
97
+ bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
98
+ getone::LangSpan* span);
99
+
100
+ // Increases language bias by delta
101
+ void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
102
+ Language key, int delta);
103
+
104
+ // For debugging output (these members are public)
105
+ int next_answer_;
106
+ char answer_buffer_[getone::kMaxAnswerBuffer];
107
+ char answer_buffer2_[getone::kMaxAnswerBuffer];
108
+ char answer_buffer3_[getone::kMaxAnswerBuffer];
109
+ char answer_buffer4_[getone::kMaxAnswerBuffer];
110
+
111
+ private:
112
+ const char* start_byte_;
113
+ const char* next_byte_limit_;
114
+ const char* next_byte_;
115
+ const char* onelangspan_begin_;
116
+ int byte_length_;
117
+ int script_; // Value returned by script()
118
+ Language spanlang_;
119
+ int smoothwidth_;
120
+ int smoothwidth_2_;
121
+ int smoothcandidates_;
122
+ int maxlangs_;
123
+ int minlangspan_;
124
+ int rb_size_;
125
+ int next_rb_;
126
+ int rb_mask_;
127
+ uint32* rb_; // NOTE(review): ownership not visible here -- presumably released by ~LangScanner(); confirm
128
+ int* offset_rb_; // NOTE(review): ownership not visible here -- presumably released by ~LangScanner(); confirm
129
+ };
130
+
131
+ #endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_