language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,570 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #include "encodings/compact_lang_det/getonescriptspan.h"
6
+ #include <stdio.h>
7
+ #include <string.h>
8
+
9
+ #include "encodings/lang_enc.h"
10
+ #include "encodings/compact_lang_det/utf8propjustletter.h"
11
+ #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
12
+ #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
13
+
14
+ #include "encodings/compact_lang_det/win/cld_basictypes.h"
15
+ #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
16
+ #include "encodings/compact_lang_det/win/cld_google.h"
17
+ #include "encodings/compact_lang_det/win/cld_htmlutils.h"
18
+ #include "encodings/compact_lang_det/win/cld_unilib.h"
19
+ #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
20
+ #include "encodings/compact_lang_det/win/cld_utf8utils.h"
21
+
22
+ static const Language GRAY_LANG = (Language)254;
23
+
24
+ static const int kMaxUpToWordBoundary = 50; // span < this make longer,
25
+ // else make shorter
26
+ static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes
27
+ // to round to word boundary,
28
+ // direction above
29
+
30
// Per-byte flag table: non-zero only for the three HTML-significant bytes
//   '&' (0x26), '<' (0x3c) and '>' (0x3e).
static const char kSpecialSymbol[256] = {
  // 0x00 - 0x1f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x20 - 0x3f:  '&' at +6, '<' at +28, '>' at +30
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
  // 0x40 - 0x5f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0x60 - 0x7f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  // 0x80 - 0x9f
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0xa0 - 0xbf
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0xc0 - 0xdf
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // 0xe0 - 0xff
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
41
+
42
+
43
+
44
+ #define LT 0 // <
45
+ #define GT 1 // >
46
+ #define EX 2 // !
47
+ #define HY 3 // -
48
+ #define QU 4 // "
49
+ #define AP 5 // '
50
+ #define SL 6 // /
51
+ #define S_ 7
52
+ #define C_ 8
53
+ #define R_ 9
54
+ #define I_ 10
55
+ #define P_ 11
56
+ #define T_ 12
57
+ #define Y_ 13
58
+ #define L_ 14
59
+ #define E_ 15
60
+ #define CR 16 // <cr> or <lf>
61
+ #define NL 17 // non-letter: ASCII whitespace, digit, punctuation
62
+ #define PL 18 // possible letter, incl. &
63
+ #define xx 19 // <unused>
64
+
65
+ // Map byte to one of ~20 interesting categories for cheap tag parsing
66
+ static const uint8 kCharToSub[256] = {
67
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
68
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
69
+ NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
70
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
71
+
72
+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
73
+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
74
+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
75
+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
76
+
77
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
78
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
79
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
80
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
81
+
82
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
83
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
84
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
85
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
86
+ };
87
+
88
+ #undef LT
89
+ #undef GT
90
+ #undef EX
91
+ #undef HY
92
+ #undef QU
93
+ #undef AP
94
+ #undef SL
95
+ #undef S_
96
+ #undef C_
97
+ #undef R_
98
+ #undef I_
99
+ #undef P_
100
+ #undef T_
101
+ #undef Y_
102
+ #undef L_
103
+ #undef E_
104
+ #undef CR
105
+ #undef NL
106
+ #undef PL
107
+ #undef xx
108
+
109
+
110
+ #define OK 0
111
+ #define X_ 1
112
+
113
+ // State machine to do cheap parse of non-letter strings incl. tags
114
+ // advances <tag>
115
+ // | |
116
+ // advances <tag> ... </tag> for <script> <style>
117
+ // | |
118
+ // advances <!-- ... <tag> ... -->
119
+ // | |
120
+ // advances <tag
121
+ // || (0)
122
+ // advances <tag <tag2>
123
+ // || (0)
124
+ static const uint8 kTagParseTbl_0[] = {
125
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
126
+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK
127
+ X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
128
+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL*
129
+ X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
130
+ X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
131
+ X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
132
+ 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
133
+ 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
134
+ 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
135
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
136
+ 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
137
+ 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
138
+ X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
139
+
140
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
141
+ X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
142
+ X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
143
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
144
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
145
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
146
+ X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
147
+ 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
148
+ 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
149
+ 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
150
+ 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
151
+ 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
152
+ 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
153
+ 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
154
+ 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
155
+ 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
156
+
157
+ // < > ! - " ' / S C R I P T Y L E CR NL PL xx
158
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
159
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
160
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
161
+ X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
162
+ 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
163
+ 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
164
+ 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
165
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
166
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
167
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
168
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
169
+ 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
170
+ };
171
+
172
+ #undef OK
173
+ #undef X_
174
+
175
+
176
+ /*
177
+ // Convert GetTimeOfDay output to 64-bit usec
178
+ static inline uint64 Microseconds(const struct timeval& t) {
179
+ // The SumReducer uses uint64, so convert to (uint64) microseconds,
180
+ // not (double) seconds.
181
+ return t.tv_sec * 1000000ULL + t.tv_usec;
182
+ }
183
+ */
184
+
185
+
186
+ // Returns true if character is < > or &
187
+ bool inline IsSpecial(char c) {
188
+ if ((c & 0xe0) == 0x20) {
189
+ return kSpecialSymbol[static_cast<uint8>(c)];
190
+ }
191
+ return false;
192
+ }
193
+
194
+ // Quick Skip to next letter or < > & or to end of string (eos)
195
+ // Always return is_letter for eos
196
+ int ScanToLetterOrSpecial(const char* src, int len) {
197
+ int bytes_consumed;
198
+ cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
199
+ &bytes_consumed);
200
+ return bytes_consumed;
201
+ }
202
+
203
+
204
+
205
+ // src points to non-letter, such as tag-opening '<'
206
+ // Return length from here to next possible letter
207
+ // On eos or another < before >, return 1
208
+ // advances <tag>
209
+ // | |
210
+ // advances <tag> ... </tag> for <script> <style>
211
+ // | |
212
+ // advances <!-- ... <tag> ... -->
213
+ // | |
214
+ // advances <tag
215
+ // || (1)
216
+ // advances <tag <tag2>
217
+ // || (1)
218
+ int ScanToPossibleLetter(const char* isrc, int len) {
219
+ const uint8* src = reinterpret_cast<const uint8*>(isrc);
220
+ const uint8* srclimit = src + len;
221
+ const uint8* tagParseTbl = kTagParseTbl_0;
222
+ int e = 0;
223
+ while (src < srclimit) {
224
+ e = tagParseTbl[kCharToSub[*src++]];
225
+ if ((e & ~1) == 0) {
226
+ // We overshot by one byte
227
+ --src;
228
+ break;
229
+ }
230
+ tagParseTbl = &kTagParseTbl_0[e * 20];
231
+ }
232
+
233
+ if (src >= srclimit) {
234
+ // We fell off the end of the text.
235
+ // It looks like the most common case for this is a truncated file, not
236
+ // mismatched angle brackets. So we pretend that the last char was '>'
237
+ return len;
238
+ }
239
+
240
+ // OK to be in state 0 or state 2 at exit
241
+ if ((e != 0) && (e != 2)) {
242
+ // Error, '<' followed by '<'
243
+ // We want to back up to first <, then advance by one byte past it
244
+ int offset = src - reinterpret_cast<const uint8*>(isrc);
245
+ // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
246
+
247
+ // Backscan to first '<' and return enough length to just get past it
248
+ --offset; // back up over the second '<', which caused us to stop
249
+ while ((0 < offset) && (isrc[offset] != '<')) {
250
+ // Find the first '<', which is unmatched
251
+ --offset;
252
+ }
253
+ // skip to just beyond first '<'
254
+ // printf(" returning %d\n", offset + 1);
255
+ return offset + 1;
256
+ }
257
+
258
+ return src - reinterpret_cast<const uint8*>(isrc);
259
+ }
260
+
261
+
262
+
263
+ ScriptScanner::ScriptScanner(const char* buffer,
264
+ int buffer_length,
265
+ bool is_plain_text)
266
+ : start_byte_(buffer),
267
+ next_byte_(buffer),
268
+ next_byte_limit_(buffer + buffer_length),
269
+ byte_length_(buffer_length),
270
+ is_plain_text_(is_plain_text) {
271
+ script_buffer_ = new char[getone::kMaxScriptBuffer];
272
+ script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
273
+ }
274
+
275
+ ScriptScanner::~ScriptScanner() {
276
+ delete[] script_buffer_;
277
+ delete[] script_buffer_lower_;
278
+ }
279
+
280
+
281
+
282
+
283
+ // Get to the first real non-tag letter or entity that is a letter
284
+ // Sets script of that letter
285
+ // Return len if no more letters
286
+ int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
287
+ int sc = UNKNOWN_LSCRIPT;
288
+ int skip = 0;
289
+ int tlen, plen;
290
+
291
+ // Do run of non-letters (tag | &NL | NL)*
292
+ while (skip < len) {
293
+ // Do fast scan to next interesting byte
294
+ // int oldskip = skip;
295
+ skip += ScanToLetterOrSpecial(src + skip, len - skip);
296
+ // TEMP
297
+ // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
298
+ // oldskip, src[oldskip], skip, src[skip]);
299
+
300
+ // Check for no more letters/specials
301
+ if (skip >= len) {
302
+ // All done
303
+ return len;
304
+ }
305
+
306
+ // We are at a letter, nonletter, tag, or entity
307
+ if (IsSpecial(src[skip]) && !is_plain_text_) {
308
+ if (src[skip] == '<') {
309
+ // Begining of tag; skip to end and go around again
310
+ tlen = ScanToPossibleLetter(src + skip, len - skip);
311
+ sc = 0;
312
+ // printf("<...> ");
313
+ } else if (src[skip] == '>') {
314
+ // Unexpected end of tag; skip it and go around again
315
+ tlen = 1; // Over the >
316
+ sc = 0;
317
+ // printf("..> ");
318
+ } else if (src[skip] == '&') {
319
+ // Expand entity, no advance
320
+ char temp[4];
321
+ EntityToBuffer(src + skip, len - skip,
322
+ temp, &tlen, &plen);
323
+ sc = getone::GetUTF8LetterScriptNum(temp);
324
+ // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
325
+ }
326
+ } else {
327
+ // Update 1..4 bytes
328
+ tlen = cld_UniLib::OneCharLen(src + skip);
329
+ sc = getone::GetUTF8LetterScriptNum(src + skip);
330
+ // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
331
+ }
332
+ // TEMP
333
+ // printf("sc=%d ", sc);
334
+ if (sc != 0) {break;} // Letter found
335
+ skip += tlen; // Advance
336
+ }
337
+
338
+ *script = sc;
339
+ return skip;
340
+ }
341
+
342
+
343
+
344
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
345
+ // Buffer has leading space and all text is lowercased
346
+ bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
347
+ span->text = script_buffer_;
348
+ span->text_bytes = 0;
349
+ span->offset = next_byte_ - start_byte_;
350
+ span->script = UNKNOWN_LSCRIPT;
351
+ span->lang = UNKNOWN_LANGUAGE;
352
+ span->truncated = false;
353
+
354
+ // printf("GetOneScriptSpan[[ ");
355
+ // struct timeval script_start, script_mid, script_end;
356
+
357
+ int spanscript; // The script of this span
358
+ int sc = UNKNOWN_LSCRIPT; // The script of next character
359
+ int tlen, plen;
360
+
361
+
362
+ script_buffer_[0] = ' '; // Always a space at front of output
363
+ script_buffer_[1] = '\0';
364
+ int take = 0;
365
+ int put = 1; // Start after the initial space
366
+
367
+ // gettimeofday(&script_start, NULL);
368
+ // Get to the first real non-tag letter or entity that is a letter
369
+ int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
370
+ next_byte_ += skip;
371
+ byte_length_ -= skip;
372
+ if (byte_length_ <= 0) {
373
+ // printf("]]\n");
374
+ return false; // No more letters to be found
375
+ }
376
+
377
+ // gettimeofday(&script_mid, NULL);
378
+
379
+ // There is at least one letter, so we know the script for this span
380
+ // printf("{%d} ", spanscript);
381
+ span->script = (UnicodeLScript)spanscript;
382
+
383
+
384
+ // Go over alternating spans of same-script letters and non-letters,
385
+ // copying letters to buffer with single spaces for each run of non-letters
386
+ while (take < byte_length_) {
387
+ // Copy run of letters in same script (&LS | LS)*
388
+ int letter_count = 0; // Keep track of word length
389
+ bool need_break = false;
390
+ while (take < byte_length_) {
391
+ // We are at a letter, nonletter, tag, or entity
392
+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
393
+ // printf("\"%c\" ", next_byte_[take]);
394
+ if (next_byte_[take] == '<') {
395
+ // Begining of tag
396
+ sc = 0;
397
+ break;
398
+ } else if (next_byte_[take] == '>') {
399
+ // Unexpected end of tag
400
+ sc = 0;
401
+ break;
402
+ } else if (next_byte_[take] == '&') {
403
+ // Copy entity, no advance
404
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
405
+ script_buffer_ + put, &tlen, &plen);
406
+ sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
407
+ }
408
+ } else {
409
+ // Real letter, safely copy up to 4 bytes, increment by 1..4
410
+ // Will update by 1..4 bytes at Advance, below
411
+ tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
412
+ if (take < (byte_length_ - 3)) {
413
+ // Fast case
414
+ *reinterpret_cast<uint32*>(script_buffer_ + put) =
415
+ *reinterpret_cast<const uint32*>(next_byte_ + take);
416
+ } else {
417
+ // Slow case, happens 1-3 times per input document
418
+ memcpy(script_buffer_ + put, next_byte_ + take, plen);
419
+ }
420
+ sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
421
+ }
422
+ // printf("sc(%c)=%d ", next_byte_[take], sc);
423
+ // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
424
+ // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
425
+
426
+ // Allow continue across a single letter in a different script:
427
+ // A B D = three scripts, c = common script, i = inherited script,
428
+ // - = don't care, ( = take position before the += below
429
+ // AAA(A- continue
430
+ //
431
+ // AAA(BA continue
432
+ // AAA(BB break
433
+ // AAA(Bc continue (breaks after B)
434
+ // AAA(BD break
435
+ // AAA(Bi break
436
+ //
437
+ // AAA(c- break
438
+ //
439
+ // AAA(i- continue
440
+ //
441
+
442
+ if ((sc != spanscript) && (sc != ULScript_Inherited)) {
443
+ // Might need to break this script span
444
+ if (sc == ULScript_Common) {
445
+ need_break = true;
446
+ } else {
447
+ // Look at next following character, ignoring entity as Common
448
+ int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
449
+ if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
450
+ need_break = true;
451
+ }
452
+ }
453
+ }
454
+ if (need_break) {break;} // Non-letter or letter in wrong script
455
+
456
+ take += tlen; // Advance
457
+ put += plen; // Advance
458
+ ++letter_count;
459
+ if (put >= getone::kMaxScriptBytes) {
460
+ // Buffer is full
461
+ span->truncated = true;
462
+ break;
463
+ }
464
+ } // End while letters
465
+
466
+ // Do run of non-letters (tag | &NL | NL)*
467
+ while (take < byte_length_) {
468
+ // Do fast scan to next interesting byte
469
+ take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
470
+
471
+ // Check for no more letters/specials
472
+ if (take >= byte_length_) {
473
+ take = byte_length_;
474
+ break;
475
+ }
476
+
477
+ // We are at a letter, nonletter, tag, or entity
478
+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
479
+ // printf("\"%c\" ", next_byte_[take]);
480
+ if (next_byte_[take] == '<') {
481
+ // Begining of tag; skip to end and go around again
482
+ tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
483
+ sc = 0;
484
+ // printf("<...> ");
485
+ } else if (next_byte_[take] == '>') {
486
+ // Unexpected end of tag; skip it and go around again
487
+ tlen = 1; // Over the >
488
+ sc = 0;
489
+ // printf("..> ");
490
+ } else if (next_byte_[take] == '&') {
491
+ // Expand entity, no advance
492
+ EntityToBuffer(next_byte_ + take, byte_length_ - take,
493
+ script_buffer_ + put, &tlen, &plen);
494
+ sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
495
+ }
496
+ } else {
497
+ // Update 1..4
498
+ tlen = cld_UniLib::OneCharLen(next_byte_ + take);
499
+ sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
500
+ }
501
+ // printf("sc[%c]=%d ", next_byte_[take], sc);
502
+ if (sc != 0) {break;} // Letter found
503
+ take += tlen; // Advance
504
+ } // End while not-letters
505
+
506
+ script_buffer_[put++] = ' ';
507
+
508
+ // We are at a letter again (or eos), after letter* not-letter*
509
+ if (sc != spanscript) {break;} // Letter in wrong script
510
+ if (put >= getone::kMaxScriptBytes - 8) {
511
+ // Buffer is almost full
512
+ span->truncated = true;
513
+ break;
514
+ }
515
+ }
516
+
517
+ // Update input position
518
+ next_byte_ += take;
519
+ byte_length_ -= take;
520
+
521
+ // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
522
+ // kMaxScriptBytes | | put
523
+ script_buffer_[put + 0] = ' ';
524
+ script_buffer_[put + 1] = ' ';
525
+ script_buffer_[put + 2] = ' ';
526
+ script_buffer_[put + 3] = '\0';
527
+
528
+ span->text_bytes = put; // Does not include the last four chars above
529
+
530
+ // printf(" %d]]\n\n", put);
531
+ return true;
532
+ }
533
+
534
+ // Force Latin, Cyrillic, Greek scripts to be lowercase
535
+ void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
536
+ // On Windows, text is lowercased beforehand, so no need to do anything here.
537
+ #if !defined(CLD_WINDOWS)
538
+ // If needed, lowercase all the text. If we do it sooner, might miss
539
+ // lowercasing an entity such as &Aacute;
540
+ // We only need to do this for Latn and Cyrl scripts
541
+ if ((span->script == ULScript_Latin) ||
542
+ (span->script == ULScript_Cyrillic) ||
543
+ (span->script == ULScript_Greek)) {
544
+ // Full Unicode lowercase of the entire buffer, including
545
+ // four pad bytes off the end
546
+ int consumed, filled;
547
+ UniLib::ToLower(span->text, span->text_bytes + 4,
548
+ script_buffer_lower_, getone::kMaxScriptLowerBuffer,
549
+ &consumed, &filled);
550
+ span->text = script_buffer_lower_;
551
+ span->text_bytes = filled - 4;
552
+ }
553
+ #endif
554
+ }
555
+
556
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
557
+ // Force Latin and Cyrillic scripts to be lowercase
558
+ bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
559
+ bool ok = GetOneScriptSpan(span);
560
+ LowerScriptSpan(span);
561
+ return ok;
562
+ }
563
+
564
+ // Gets lscript number for letters; always returns
565
+ // 0 (common script) for non-letters
566
+ int getone::GetUTF8LetterScriptNum(const char* src) {
567
+ int srclen = cld_UniLib::OneCharLen(src);
568
+ const uint8* usrc = reinterpret_cast<const uint8*>(src);
569
+ return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
570
+ }
@@ -0,0 +1,131 @@
1
+ // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2
+ // Use of this source code is governed by a BSD-style license that can be
3
+ // found in the LICENSE file.
4
+
5
+ #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
6
+ #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
7
+
8
+ #include "encodings/compact_lang_det/letterscript_enum.h"
9
+ #include "encodings/compact_lang_det/compact_lang_det_impl.h"
10
+
11
+ namespace getone {
12
+ static const int kMaxScriptBuffer = 4096;
13
+ static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
14
+ static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room
15
+ static const int kMaxAnswerBuffer = 256;
16
+
17
+ typedef enum UnicodeLScript ULScript;
18
+
19
+ typedef struct {
20
+ char* text; // Pointer to the span, somewhere
21
+ int text_bytes; // Number of bytes of text in the span
22
+ int offset; // Offset of start of span in original input buffer
23
+ ULScript script; // Script of all the letters in this span
24
+ Language lang; // Language identified for this span
25
+ bool truncated; // true if buffer filled up before a
26
+ // different script or EOF was found
27
+ } LangSpan;
28
+
29
+
30
+ static inline bool IsContinuationByte(char c) {
31
+ return static_cast<signed char>(c) < -64;
32
+ }
33
+
34
+ // Gets lscript number for letters; always returns
35
+ // 0 (common script) for non-letters
36
+ int GetUTF8LetterScriptNum(const char* src);
37
+
38
+
39
+ // Update src pointer to point to next quadgram, +2..+5
40
+ // Looks at src[0..4]
41
+ const char* AdvanceQuad(const char* src);
42
+ } // end namespace getone
43
+
44
+
45
+
46
+
47
+
48
+
49
+ class ScriptScanner {
50
+ public:
51
+ ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
52
+ ~ScriptScanner();
53
+
54
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
55
+ bool GetOneScriptSpan(getone::LangSpan* span);
56
+
57
+ // Force Latin and Cyrillic scripts to be lowercase
58
+ void LowerScriptSpan(getone::LangSpan* span);
59
+
60
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
61
+ // Force Latin and Cyrillic scripts to be lowercase
62
+ bool GetOneScriptSpanLower(getone::LangSpan* span);
63
+
64
+ private:
65
+ int SkipToFrontOfSpan(const char* src, int len, int* script);
66
+
67
+ const char* start_byte_;
68
+ const char* next_byte_;
69
+ const char* next_byte_limit_;
70
+ int byte_length_;
71
+ bool is_plain_text_;
72
+ char* script_buffer_; // Holds text with expanded entities
73
+ char* script_buffer_lower_; // Holds lowercased text
74
+ };
75
+
76
+
77
+ class LangScanner {
78
+ public:
79
+ LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
80
+ getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
81
+ int maxlangs, int minlangspan);
82
+ ~LangScanner();
83
+
84
+
85
+ int script() {return script_;}
86
+
87
+ // Use new text
88
+ // Keep smoothing state if same script, otherwise reinit smoothing
89
+ void NewText(getone::LangSpan* spn);
90
+
91
+ bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
92
+ bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
93
+
94
+ // The real ones
95
+ bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
96
+ getone::LangSpan* span);
97
+ bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
98
+ getone::LangSpan* span);
99
+
100
+ // Increases language bias by delta
101
+ void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
102
+ Language key, int delta);
103
+
104
+ // For debugging output
105
+ int next_answer_;
106
+ char answer_buffer_[getone::kMaxAnswerBuffer];
107
+ char answer_buffer2_[getone::kMaxAnswerBuffer];
108
+ char answer_buffer3_[getone::kMaxAnswerBuffer];
109
+ char answer_buffer4_[getone::kMaxAnswerBuffer];
110
+
111
+ private:
112
+ const char* start_byte_;
113
+ const char* next_byte_limit_;
114
+ const char* next_byte_;
115
+ const char* onelangspan_begin_;
116
+ int byte_length_;
117
+ int script_;
118
+ Language spanlang_;
119
+ int smoothwidth_;
120
+ int smoothwidth_2_;
121
+ int smoothcandidates_;
122
+ int maxlangs_;
123
+ int minlangspan_;
124
+ int rb_size_;
125
+ int next_rb_;
126
+ int rb_mask_;
127
+ uint32* rb_;
128
+ int* offset_rb_;
129
+ };
130
+
131
+ #endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_