cld 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. data/LICENSE +27 -0
  2. data/Manifest +106 -0
  3. data/README.rdoc +173 -0
  4. data/Rakefile +15 -0
  5. data/base/basictypes.h +348 -0
  6. data/base/build_config.h +115 -0
  7. data/base/casts.h +156 -0
  8. data/base/commandlineflags.h +443 -0
  9. data/base/crash.h +41 -0
  10. data/base/dynamic_annotations.h +358 -0
  11. data/base/global_strip_options.h +59 -0
  12. data/base/log_severity.h +46 -0
  13. data/base/logging.h +1403 -0
  14. data/base/macros.h +243 -0
  15. data/base/port.h +54 -0
  16. data/base/scoped_ptr.h +428 -0
  17. data/base/stl_decl.h +0 -0
  18. data/base/stl_decl_msvc.h +107 -0
  19. data/base/string_util.h +29 -0
  20. data/base/strtoint.h +93 -0
  21. data/base/template_util.h +96 -0
  22. data/base/type_traits.h +198 -0
  23. data/base/vlog_is_on.h +143 -0
  24. data/build.sh +48 -0
  25. data/build.win.cmd +28 -0
  26. data/cld.gemspec +30 -0
  27. data/cld_encodings.h +95 -0
  28. data/encodings/compact_lang_det/#cldutil.cc# +905 -0
  29. data/encodings/compact_lang_det/#cldutil.h# +1205 -0
  30. data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  31. data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  32. data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  33. data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  34. data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  35. data/encodings/compact_lang_det/#tote.cc# +299 -0
  36. data/encodings/compact_lang_det/#tote.h# +89 -0
  37. data/encodings/compact_lang_det/cldutil.cc +905 -0
  38. data/encodings/compact_lang_det/cldutil.h +1205 -0
  39. data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  40. data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  41. data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  42. data/encodings/compact_lang_det/compact_lang_det.h +145 -0
  43. data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  44. data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  45. data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  46. data/encodings/compact_lang_det/compile.cmd +1 -0
  47. data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  48. data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  49. data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  50. data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  51. data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  52. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  53. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  54. data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  55. data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  56. data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  57. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  58. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  59. data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  60. data/encodings/compact_lang_det/getonescriptspan.h +131 -0
  61. data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  62. data/encodings/compact_lang_det/letterscript_enum.h +99 -0
  63. data/encodings/compact_lang_det/subsetsequence.cc +259 -0
  64. data/encodings/compact_lang_det/subsetsequence.h +44 -0
  65. data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  66. data/encodings/compact_lang_det/tote.cc +299 -0
  67. data/encodings/compact_lang_det/tote.h +89 -0
  68. data/encodings/compact_lang_det/unittest_data.h +193 -0
  69. data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  70. data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  71. data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  72. data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  73. data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  74. data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  75. data/encodings/compact_lang_det/win/cld_google.h +18 -0
  76. data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  77. data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  78. data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  79. data/encodings/compact_lang_det/win/cld_logging.h +21 -0
  80. data/encodings/compact_lang_det/win/cld_macros.h +19 -0
  81. data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  82. data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  83. data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  84. data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  85. data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  86. data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  87. data/encodings/compact_lang_det/win/cld_utf.h +24 -0
  88. data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  89. data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  90. data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  91. data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  92. data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  93. data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  94. data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  95. data/encodings/internal/encodings.cc +12 -0
  96. data/encodings/lang_enc.h +254 -0
  97. data/encodings/proto/encodings.pb.h +169 -0
  98. data/encodings/public/encodings.h +301 -0
  99. data/ext/cld/extconf.rb +7 -0
  100. data/languages/internal/#languages.cc# +337 -0
  101. data/languages/internal/languages.cc +337 -0
  102. data/languages/proto/languages.pb.h +179 -0
  103. data/languages/public/languages.h +379 -0
  104. data/lib/cld.rb +12 -0
  105. data/test/test.rb +570 -0
  106. data/thunk.cc +131 -0
  107. metadata +168 -0
@@ -0,0 +1,131 @@
1
+ #include <stdio.h>
2
+ #include "encodings/compact_lang_det/compact_lang_det.h"
3
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
4
+ #include "encodings/compact_lang_det/unittest_data.h"
5
+ #include "encodings/proto/encodings.pb.h"
6
+
7
+
8
+ extern "C" {
9
+ int detectLanguageThunkInt(const char * src) { // C-linkage thunk: runs CompactLangDet::DetectLanguage on src and returns the detected Language through the int return type. src is passed to strlen, so it must be a non-NULL, NUL-terminated string.
10
+ bool is_plain_text = true; // Fixed detection options; callers of this thunk cannot change them.
11
+ bool do_allow_extended_languages = true;
12
+ bool do_pick_summary_language = false;
13
+ bool do_remove_weak_matches = false;
14
+ bool is_reliable; // Passed by address to DetectLanguage below; never read afterwards.
15
+ Language plus_one = UNKNOWN_LANGUAGE; // NOTE(review): declared but never used in this function.
16
+ const char* tld_hint = NULL; // No hints are supplied: NULL / UNKNOWN_* values are passed for all three.
17
+ int encoding_hint = UNKNOWN_ENCODING;
18
+ Language language_hint = UNKNOWN_LANGUAGE;
19
+
20
+ double normalized_score3[3]; // The three arrays and text_bytes are handed to DetectLanguage; their contents are not read here.
21
+ Language language3[3];
22
+ int percent3[3];
23
+ int text_bytes;
24
+
25
+ Language lang;
26
+ lang = CompactLangDet::DetectLanguage(0, // NOTE(review): the meaning of the leading 0 argument is not visible in this file — confirm against compact_lang_det.h.
27
+ src, strlen(src), // NOTE(review): strlen is used, but only <stdio.h> is included directly; <string.h> presumably arrives via a project header — confirm.
28
+ is_plain_text,
29
+ do_allow_extended_languages,
30
+ do_pick_summary_language,
31
+ do_remove_weak_matches,
32
+ tld_hint,
33
+ encoding_hint,
34
+ language_hint,
35
+ language3,
36
+ percent3,
37
+ normalized_score3,
38
+ &text_bytes,
39
+ &is_reliable);
40
+ return lang; // Only the detected language is returned; is_reliable and the per-language arrays are discarded.
41
+ }
42
+ /*
43
+ char * detectLanguageThunkString(const char * src) {
44
+ bool is_plain_text = true;
45
+ bool do_allow_extended_languages = true;
46
+ bool do_pick_summary_language = false;
47
+ bool do_remove_weak_matches = false;
48
+ bool is_reliable;
49
+ Language plus_one = UNKNOWN_LANGUAGE;
50
+ const char* tld_hint = NULL;
51
+ int encoding_hint = UNKNOWN_ENCODING;
52
+ Language language_hint = UNKNOWN_LANGUAGE;
53
+
54
+ double normalized_score3[3];
55
+ Language language3[3];
56
+ int percent3[3];
57
+ int text_bytes;
58
+
59
+ Language lang;
60
+ lang = CompactLangDet::DetectLanguage(0,
61
+ src, strlen(src),
62
+ is_plain_text,
63
+ do_allow_extended_languages,
64
+ do_pick_summary_language,
65
+ do_remove_weak_matches,
66
+ tld_hint,
67
+ encoding_hint,
68
+ language_hint,
69
+ language3,
70
+ percent3,
71
+ normalized_score3,
72
+ &text_bytes,
73
+ &is_reliable);
74
+ return LanguageName(lang);
75
+ }
76
+ */
77
+ }
78
+
79
+ int main(int argc, char **argv) { // Entry point whose whole body is commented out: it executes no statements and ignores argc/argv.
80
+ /* Disabled code: ran DetectLanguage on kTeststr_en and then kTeststr_ks, printing "LANG=<name>" for each.
81
+ bool is_plain_text = true;
82
+ bool do_allow_extended_languages = true;
83
+ bool do_pick_summary_language = false;
84
+ bool do_remove_weak_matches = false;
85
+ bool is_reliable;
86
+ Language plus_one = UNKNOWN_LANGUAGE;
87
+ const char* tld_hint = NULL;
88
+ int encoding_hint = UNKNOWN_ENCODING;
89
+ Language language_hint = UNKNOWN_LANGUAGE;
90
+
91
+ double normalized_score3[3];
92
+ Language language3[3];
93
+ int percent3[3];
94
+ int text_bytes;
95
+
96
+ const char* src = kTeststr_en;
97
+ Language lang;
98
+ lang = CompactLangDet::DetectLanguage(0,
99
+ src, strlen(src),
100
+ is_plain_text,
101
+ do_allow_extended_languages,
102
+ do_pick_summary_language,
103
+ do_remove_weak_matches,
104
+ tld_hint,
105
+ encoding_hint,
106
+ language_hint,
107
+ language3,
108
+ percent3,
109
+ normalized_score3,
110
+ &text_bytes,
111
+ &is_reliable);
112
+ printf("LANG=%s\n", LanguageName(lang));
113
+
114
+ src = kTeststr_ks;
115
+ lang = CompactLangDet::DetectLanguage(0,
116
+ src, strlen(src),
117
+ is_plain_text,
118
+ do_allow_extended_languages,
119
+ do_pick_summary_language,
120
+ do_remove_weak_matches,
121
+ tld_hint,
122
+ encoding_hint,
123
+ language_hint,
124
+ language3,
125
+ percent3,
126
+ normalized_score3,
127
+ &text_bytes,
128
+ &is_reliable);
129
+ printf("LANG=%s\n", LanguageName(lang));
130
+ */
131
+ } // No explicit return statement; control falls off the end of main.
metadata ADDED
@@ -0,0 +1,168 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cld
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - Jason Toy
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-11-03 00:00:00 Z
14
+ dependencies: []
15
+
16
+ description: Compact Language Detection from chrome
17
+ email: jtoy@jtoy.net
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/cld/extconf.rb
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.rdoc
25
+ - ext/cld/extconf.rb
26
+ - lib/cld.rb
27
+ files:
28
+ - LICENSE
29
+ - README.rdoc
30
+ - Rakefile
31
+ - base/basictypes.h
32
+ - base/build_config.h
33
+ - base/casts.h
34
+ - base/commandlineflags.h
35
+ - base/crash.h
36
+ - base/dynamic_annotations.h
37
+ - base/global_strip_options.h
38
+ - base/log_severity.h
39
+ - base/logging.h
40
+ - base/macros.h
41
+ - base/port.h
42
+ - base/scoped_ptr.h
43
+ - base/stl_decl.h
44
+ - base/stl_decl_msvc.h
45
+ - base/string_util.h
46
+ - base/strtoint.h
47
+ - base/template_util.h
48
+ - base/type_traits.h
49
+ - base/vlog_is_on.h
50
+ - build.sh
51
+ - build.win.cmd
52
+ - cld.gemspec
53
+ - cld_encodings.h
54
+ - encodings/compact_lang_det/#cldutil.cc#
55
+ - encodings/compact_lang_det/#cldutil.h#
56
+ - encodings/compact_lang_det/#compact_lang_det_impl.h#
57
+ - encodings/compact_lang_det/#ext_lang_enc.cc#
58
+ - encodings/compact_lang_det/#ext_lang_enc.h#
59
+ - encodings/compact_lang_det/#getonescriptspan.cc#
60
+ - encodings/compact_lang_det/#getonescriptspan.h#
61
+ - encodings/compact_lang_det/#tote.cc#
62
+ - encodings/compact_lang_det/#tote.h#
63
+ - encodings/compact_lang_det/cldutil.cc
64
+ - encodings/compact_lang_det/cldutil.h
65
+ - encodings/compact_lang_det/cldutil_dbg.h
66
+ - encodings/compact_lang_det/cldutil_dbg_empty.cc
67
+ - encodings/compact_lang_det/compact_lang_det.cc
68
+ - encodings/compact_lang_det/compact_lang_det.h
69
+ - encodings/compact_lang_det/compact_lang_det_impl.cc
70
+ - encodings/compact_lang_det/compact_lang_det_impl.h
71
+ - encodings/compact_lang_det/compact_lang_det_unittest_small.cc
72
+ - encodings/compact_lang_det/compile.cmd
73
+ - encodings/compact_lang_det/ext_lang_enc.cc
74
+ - encodings/compact_lang_det/ext_lang_enc.h
75
+ - encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc
76
+ - encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc
77
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc
78
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc
79
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc
80
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc
81
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc
82
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h
83
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc
84
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc
85
+ - encodings/compact_lang_det/getonescriptspan.cc
86
+ - encodings/compact_lang_det/getonescriptspan.h
87
+ - encodings/compact_lang_det/letterscript_enum.cc
88
+ - encodings/compact_lang_det/letterscript_enum.h
89
+ - encodings/compact_lang_det/subsetsequence.cc
90
+ - encodings/compact_lang_det/subsetsequence.h
91
+ - encodings/compact_lang_det/subsetsequence_unittest.cc
92
+ - encodings/compact_lang_det/tote.cc
93
+ - encodings/compact_lang_det/tote.h
94
+ - encodings/compact_lang_det/unittest_data.h
95
+ - encodings/compact_lang_det/utf8propjustletter.h
96
+ - encodings/compact_lang_det/utf8propletterscriptnum.h
97
+ - encodings/compact_lang_det/utf8scannotjustletterspecial.h
98
+ - encodings/compact_lang_det/win/#cld_unilib_windows.cc#
99
+ - encodings/compact_lang_det/win/cld_basictypes.h
100
+ - encodings/compact_lang_det/win/cld_commandlineflags.h
101
+ - encodings/compact_lang_det/win/cld_google.h
102
+ - encodings/compact_lang_det/win/cld_htmlutils.h
103
+ - encodings/compact_lang_det/win/cld_htmlutils_google3.cc
104
+ - encodings/compact_lang_det/win/cld_htmlutils_windows.cc
105
+ - encodings/compact_lang_det/win/cld_logging.h
106
+ - encodings/compact_lang_det/win/cld_macros.h
107
+ - encodings/compact_lang_det/win/cld_strtoint.h
108
+ - encodings/compact_lang_det/win/cld_unicodetext.cc
109
+ - encodings/compact_lang_det/win/cld_unicodetext.h
110
+ - encodings/compact_lang_det/win/cld_unilib.h
111
+ - encodings/compact_lang_det/win/cld_unilib_google3.cc
112
+ - encodings/compact_lang_det/win/cld_unilib_windows.cc
113
+ - encodings/compact_lang_det/win/cld_utf.h
114
+ - encodings/compact_lang_det/win/cld_utf8statetable.cc
115
+ - encodings/compact_lang_det/win/cld_utf8statetable.h
116
+ - encodings/compact_lang_det/win/cld_utf8utils.h
117
+ - encodings/compact_lang_det/win/cld_utf8utils_google3.cc
118
+ - encodings/compact_lang_det/win/cld_utf8utils_windows.cc
119
+ - encodings/compact_lang_det/win/normalizedunicodetext.cc
120
+ - encodings/compact_lang_det/win/normalizedunicodetext.h
121
+ - encodings/internal/encodings.cc
122
+ - encodings/lang_enc.h
123
+ - encodings/proto/encodings.pb.h
124
+ - encodings/public/encodings.h
125
+ - ext/cld/extconf.rb
126
+ - languages/internal/#languages.cc#
127
+ - languages/internal/languages.cc
128
+ - languages/proto/languages.pb.h
129
+ - languages/public/languages.h
130
+ - lib/cld.rb
131
+ - test/test.rb
132
+ - thunk.cc
133
+ - Manifest
134
+ homepage: http://github.com/jtoy/cld
135
+ licenses: []
136
+
137
+ post_install_message:
138
+ rdoc_options:
139
+ - --line-numbers
140
+ - --inline-source
141
+ - --title
142
+ - Cld
143
+ - --main
144
+ - README.rdoc
145
+ require_paths:
146
+ - lib
147
+ - ext
148
+ required_ruby_version: !ruby/object:Gem::Requirement
149
+ none: false
150
+ requirements:
151
+ - - ">="
152
+ - !ruby/object:Gem::Version
153
+ version: "0"
154
+ required_rubygems_version: !ruby/object:Gem::Requirement
155
+ none: false
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: "1.2"
160
+ requirements: []
161
+
162
+ rubyforge_project: cld
163
+ rubygems_version: 1.8.6.1
164
+ signing_key:
165
+ specification_version: 3
166
+ summary: Compact Language Detection from chrome
167
+ test_files: []
168
+