cld 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. data/LICENSE +27 -0
  2. data/Manifest +106 -0
  3. data/README.rdoc +173 -0
  4. data/Rakefile +15 -0
  5. data/base/basictypes.h +348 -0
  6. data/base/build_config.h +115 -0
  7. data/base/casts.h +156 -0
  8. data/base/commandlineflags.h +443 -0
  9. data/base/crash.h +41 -0
  10. data/base/dynamic_annotations.h +358 -0
  11. data/base/global_strip_options.h +59 -0
  12. data/base/log_severity.h +46 -0
  13. data/base/logging.h +1403 -0
  14. data/base/macros.h +243 -0
  15. data/base/port.h +54 -0
  16. data/base/scoped_ptr.h +428 -0
  17. data/base/stl_decl.h +0 -0
  18. data/base/stl_decl_msvc.h +107 -0
  19. data/base/string_util.h +29 -0
  20. data/base/strtoint.h +93 -0
  21. data/base/template_util.h +96 -0
  22. data/base/type_traits.h +198 -0
  23. data/base/vlog_is_on.h +143 -0
  24. data/build.sh +48 -0
  25. data/build.win.cmd +28 -0
  26. data/cld.gemspec +30 -0
  27. data/cld_encodings.h +95 -0
  28. data/encodings/compact_lang_det/#cldutil.cc# +905 -0
  29. data/encodings/compact_lang_det/#cldutil.h# +1205 -0
  30. data/encodings/compact_lang_det/#compact_lang_det_impl.h# +171 -0
  31. data/encodings/compact_lang_det/#ext_lang_enc.cc# +545 -0
  32. data/encodings/compact_lang_det/#ext_lang_enc.h# +119 -0
  33. data/encodings/compact_lang_det/#getonescriptspan.cc# +570 -0
  34. data/encodings/compact_lang_det/#getonescriptspan.h# +131 -0
  35. data/encodings/compact_lang_det/#tote.cc# +299 -0
  36. data/encodings/compact_lang_det/#tote.h# +89 -0
  37. data/encodings/compact_lang_det/cldutil.cc +905 -0
  38. data/encodings/compact_lang_det/cldutil.h +1205 -0
  39. data/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  40. data/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  41. data/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  42. data/encodings/compact_lang_det/compact_lang_det.h +145 -0
  43. data/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  44. data/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  45. data/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  46. data/encodings/compact_lang_det/compile.cmd +1 -0
  47. data/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  48. data/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  49. data/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  50. data/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  51. data/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  52. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  53. data/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  54. data/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  55. data/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  56. data/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  57. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  58. data/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  59. data/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  60. data/encodings/compact_lang_det/getonescriptspan.h +131 -0
  61. data/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  62. data/encodings/compact_lang_det/letterscript_enum.h +99 -0
  63. data/encodings/compact_lang_det/subsetsequence.cc +259 -0
  64. data/encodings/compact_lang_det/subsetsequence.h +44 -0
  65. data/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  66. data/encodings/compact_lang_det/tote.cc +299 -0
  67. data/encodings/compact_lang_det/tote.h +89 -0
  68. data/encodings/compact_lang_det/unittest_data.h +193 -0
  69. data/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  70. data/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  71. data/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  72. data/encodings/compact_lang_det/win/#cld_unilib_windows.cc# +29 -0
  73. data/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  74. data/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  75. data/encodings/compact_lang_det/win/cld_google.h +18 -0
  76. data/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  77. data/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  78. data/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  79. data/encodings/compact_lang_det/win/cld_logging.h +21 -0
  80. data/encodings/compact_lang_det/win/cld_macros.h +19 -0
  81. data/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  82. data/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  83. data/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  84. data/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  85. data/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  86. data/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  87. data/encodings/compact_lang_det/win/cld_utf.h +24 -0
  88. data/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  89. data/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  90. data/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  91. data/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  92. data/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  93. data/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  94. data/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  95. data/encodings/internal/encodings.cc +12 -0
  96. data/encodings/lang_enc.h +254 -0
  97. data/encodings/proto/encodings.pb.h +169 -0
  98. data/encodings/public/encodings.h +301 -0
  99. data/ext/cld/extconf.rb +7 -0
  100. data/languages/internal/#languages.cc# +337 -0
  101. data/languages/internal/languages.cc +337 -0
  102. data/languages/proto/languages.pb.h +179 -0
  103. data/languages/public/languages.h +379 -0
  104. data/lib/cld.rb +12 -0
  105. data/test/test.rb +570 -0
  106. data/thunk.cc +131 -0
  107. metadata +168 -0
@@ -0,0 +1,131 @@
1
+ #include <stdio.h>
2
+ #include "encodings/compact_lang_det/compact_lang_det.h"
3
+ #include "encodings/compact_lang_det/ext_lang_enc.h"
4
+ #include "encodings/compact_lang_det/unittest_data.h"
5
+ #include "encodings/proto/encodings.pb.h"
6
+
7
+
8
+ extern "C" {
9
+ int detectLanguageThunkInt(const char * src) {
10
+ bool is_plain_text = true;
11
+ bool do_allow_extended_languages = true;
12
+ bool do_pick_summary_language = false;
13
+ bool do_remove_weak_matches = false;
14
+ bool is_reliable;
15
+ Language plus_one = UNKNOWN_LANGUAGE;
16
+ const char* tld_hint = NULL;
17
+ int encoding_hint = UNKNOWN_ENCODING;
18
+ Language language_hint = UNKNOWN_LANGUAGE;
19
+
20
+ double normalized_score3[3];
21
+ Language language3[3];
22
+ int percent3[3];
23
+ int text_bytes;
24
+
25
+ Language lang;
26
+ lang = CompactLangDet::DetectLanguage(0,
27
+ src, strlen(src),
28
+ is_plain_text,
29
+ do_allow_extended_languages,
30
+ do_pick_summary_language,
31
+ do_remove_weak_matches,
32
+ tld_hint,
33
+ encoding_hint,
34
+ language_hint,
35
+ language3,
36
+ percent3,
37
+ normalized_score3,
38
+ &text_bytes,
39
+ &is_reliable);
40
+ return lang;
41
+ }
42
+ /*
43
+ char * detectLanguageThunkString(const char * src) {
44
+ bool is_plain_text = true;
45
+ bool do_allow_extended_languages = true;
46
+ bool do_pick_summary_language = false;
47
+ bool do_remove_weak_matches = false;
48
+ bool is_reliable;
49
+ Language plus_one = UNKNOWN_LANGUAGE;
50
+ const char* tld_hint = NULL;
51
+ int encoding_hint = UNKNOWN_ENCODING;
52
+ Language language_hint = UNKNOWN_LANGUAGE;
53
+
54
+ double normalized_score3[3];
55
+ Language language3[3];
56
+ int percent3[3];
57
+ int text_bytes;
58
+
59
+ Language lang;
60
+ lang = CompactLangDet::DetectLanguage(0,
61
+ src, strlen(src),
62
+ is_plain_text,
63
+ do_allow_extended_languages,
64
+ do_pick_summary_language,
65
+ do_remove_weak_matches,
66
+ tld_hint,
67
+ encoding_hint,
68
+ language_hint,
69
+ language3,
70
+ percent3,
71
+ normalized_score3,
72
+ &text_bytes,
73
+ &is_reliable);
74
+ return LanguageName(lang);
75
+ }
76
+ */
77
+ }
78
+
79
/*
 * Program entry point. It performs no work: the manual smoke test below is
 * compiled out, so the function simply reports success.
 */
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;

#if 0
  /* Disabled smoke test: runs the detector over two sample strings from
     unittest_data.h and prints the name of each detected language. */
  bool is_plain_text = true;
  bool do_allow_extended_languages = true;
  bool do_pick_summary_language = false;
  bool do_remove_weak_matches = false;
  bool is_reliable;
  const char* tld_hint = NULL;
  int encoding_hint = UNKNOWN_ENCODING;
  Language language_hint = UNKNOWN_LANGUAGE;

  double normalized_score3[3];
  Language language3[3];
  int percent3[3];
  int text_bytes;

  const char* const samples[] = { kTeststr_en, kTeststr_ks };
  for (int i = 0; i < 2; ++i) {
    const char* src = samples[i];
    Language lang = CompactLangDet::DetectLanguage(0,
                                                   src, strlen(src),
                                                   is_plain_text,
                                                   do_allow_extended_languages,
                                                   do_pick_summary_language,
                                                   do_remove_weak_matches,
                                                   tld_hint,
                                                   encoding_hint,
                                                   language_hint,
                                                   language3,
                                                   percent3,
                                                   normalized_score3,
                                                   &text_bytes,
                                                   &is_reliable);
    printf("LANG=%s\n", LanguageName(lang));
  }
#endif

  return 0;
}
metadata ADDED
@@ -0,0 +1,168 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cld
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - Jason Toy
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-11-03 00:00:00 Z
14
+ dependencies: []
15
+
16
+ description: Compact Language Detection from chrome
17
+ email: jtoy@jtoy.net
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/cld/extconf.rb
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README.rdoc
25
+ - ext/cld/extconf.rb
26
+ - lib/cld.rb
27
+ files:
28
+ - LICENSE
29
+ - README.rdoc
30
+ - Rakefile
31
+ - base/basictypes.h
32
+ - base/build_config.h
33
+ - base/casts.h
34
+ - base/commandlineflags.h
35
+ - base/crash.h
36
+ - base/dynamic_annotations.h
37
+ - base/global_strip_options.h
38
+ - base/log_severity.h
39
+ - base/logging.h
40
+ - base/macros.h
41
+ - base/port.h
42
+ - base/scoped_ptr.h
43
+ - base/stl_decl.h
44
+ - base/stl_decl_msvc.h
45
+ - base/string_util.h
46
+ - base/strtoint.h
47
+ - base/template_util.h
48
+ - base/type_traits.h
49
+ - base/vlog_is_on.h
50
+ - build.sh
51
+ - build.win.cmd
52
+ - cld.gemspec
53
+ - cld_encodings.h
54
+ - encodings/compact_lang_det/#cldutil.cc#
55
+ - encodings/compact_lang_det/#cldutil.h#
56
+ - encodings/compact_lang_det/#compact_lang_det_impl.h#
57
+ - encodings/compact_lang_det/#ext_lang_enc.cc#
58
+ - encodings/compact_lang_det/#ext_lang_enc.h#
59
+ - encodings/compact_lang_det/#getonescriptspan.cc#
60
+ - encodings/compact_lang_det/#getonescriptspan.h#
61
+ - encodings/compact_lang_det/#tote.cc#
62
+ - encodings/compact_lang_det/#tote.h#
63
+ - encodings/compact_lang_det/cldutil.cc
64
+ - encodings/compact_lang_det/cldutil.h
65
+ - encodings/compact_lang_det/cldutil_dbg.h
66
+ - encodings/compact_lang_det/cldutil_dbg_empty.cc
67
+ - encodings/compact_lang_det/compact_lang_det.cc
68
+ - encodings/compact_lang_det/compact_lang_det.h
69
+ - encodings/compact_lang_det/compact_lang_det_impl.cc
70
+ - encodings/compact_lang_det/compact_lang_det_impl.h
71
+ - encodings/compact_lang_det/compact_lang_det_unittest_small.cc
72
+ - encodings/compact_lang_det/compile.cmd
73
+ - encodings/compact_lang_det/ext_lang_enc.cc
74
+ - encodings/compact_lang_det/ext_lang_enc.h
75
+ - encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc
76
+ - encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc
77
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc
78
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc
79
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc
80
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc
81
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc
82
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h
83
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc
84
+ - encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc
85
+ - encodings/compact_lang_det/getonescriptspan.cc
86
+ - encodings/compact_lang_det/getonescriptspan.h
87
+ - encodings/compact_lang_det/letterscript_enum.cc
88
+ - encodings/compact_lang_det/letterscript_enum.h
89
+ - encodings/compact_lang_det/subsetsequence.cc
90
+ - encodings/compact_lang_det/subsetsequence.h
91
+ - encodings/compact_lang_det/subsetsequence_unittest.cc
92
+ - encodings/compact_lang_det/tote.cc
93
+ - encodings/compact_lang_det/tote.h
94
+ - encodings/compact_lang_det/unittest_data.h
95
+ - encodings/compact_lang_det/utf8propjustletter.h
96
+ - encodings/compact_lang_det/utf8propletterscriptnum.h
97
+ - encodings/compact_lang_det/utf8scannotjustletterspecial.h
98
+ - encodings/compact_lang_det/win/#cld_unilib_windows.cc#
99
+ - encodings/compact_lang_det/win/cld_basictypes.h
100
+ - encodings/compact_lang_det/win/cld_commandlineflags.h
101
+ - encodings/compact_lang_det/win/cld_google.h
102
+ - encodings/compact_lang_det/win/cld_htmlutils.h
103
+ - encodings/compact_lang_det/win/cld_htmlutils_google3.cc
104
+ - encodings/compact_lang_det/win/cld_htmlutils_windows.cc
105
+ - encodings/compact_lang_det/win/cld_logging.h
106
+ - encodings/compact_lang_det/win/cld_macros.h
107
+ - encodings/compact_lang_det/win/cld_strtoint.h
108
+ - encodings/compact_lang_det/win/cld_unicodetext.cc
109
+ - encodings/compact_lang_det/win/cld_unicodetext.h
110
+ - encodings/compact_lang_det/win/cld_unilib.h
111
+ - encodings/compact_lang_det/win/cld_unilib_google3.cc
112
+ - encodings/compact_lang_det/win/cld_unilib_windows.cc
113
+ - encodings/compact_lang_det/win/cld_utf.h
114
+ - encodings/compact_lang_det/win/cld_utf8statetable.cc
115
+ - encodings/compact_lang_det/win/cld_utf8statetable.h
116
+ - encodings/compact_lang_det/win/cld_utf8utils.h
117
+ - encodings/compact_lang_det/win/cld_utf8utils_google3.cc
118
+ - encodings/compact_lang_det/win/cld_utf8utils_windows.cc
119
+ - encodings/compact_lang_det/win/normalizedunicodetext.cc
120
+ - encodings/compact_lang_det/win/normalizedunicodetext.h
121
+ - encodings/internal/encodings.cc
122
+ - encodings/lang_enc.h
123
+ - encodings/proto/encodings.pb.h
124
+ - encodings/public/encodings.h
125
+ - ext/cld/extconf.rb
126
+ - languages/internal/#languages.cc#
127
+ - languages/internal/languages.cc
128
+ - languages/proto/languages.pb.h
129
+ - languages/public/languages.h
130
+ - lib/cld.rb
131
+ - test/test.rb
132
+ - thunk.cc
133
+ - Manifest
134
+ homepage: http://github.com/jtoy/cld
135
+ licenses: []
136
+
137
+ post_install_message:
138
+ rdoc_options:
139
+ - --line-numbers
140
+ - --inline-source
141
+ - --title
142
+ - Cld
143
+ - --main
144
+ - README.rdoc
145
+ require_paths:
146
+ - lib
147
+ - ext
148
+ required_ruby_version: !ruby/object:Gem::Requirement
149
+ none: false
150
+ requirements:
151
+ - - ">="
152
+ - !ruby/object:Gem::Version
153
+ version: "0"
154
+ required_rubygems_version: !ruby/object:Gem::Requirement
155
+ none: false
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: "1.2"
160
+ requirements: []
161
+
162
+ rubyforge_project: cld
163
+ rubygems_version: 1.8.6.1
164
+ signing_key:
165
+ specification_version: 3
166
+ summary: Compact Language Detection from chrome
167
+ test_files: []
168
+