language_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,88 @@
1
+ # encoding: utf-8
2
+
3
+ require '_helper'
4
+ require 'csv'
5
+
6
+ class LanguageDetectionTest < Test::Unit::TestCase
7
+
8
+ context "Language detection" do
9
+
10
+ should "be able to convert result from native call to Hashr instance" do
11
+ result = LanguageDetection.language_detection("this is some text", false)
12
+ parsed_result = LanguageDetection.parse_result(result)
13
+
14
+ assert_kind_of LanguageDetection::Language, result
15
+ assert_kind_of Hashr, parsed_result
16
+
17
+ assert_equal "ENGLISH", parsed_result.name
18
+ assert_nil parsed_result.non_existing_property
19
+ end
20
+
21
+ should "convert details from FFI pointer to Hashr instance" do
22
+ language = LanguageDetection.perform("this is some text")
23
+
24
+ assert_kind_of Array, language.details
25
+ assert_kind_of Hashr, language.details.first
26
+ assert_equal "ENGLISH", language.details.first.name
27
+ assert_equal 65, language.details.first.percent
28
+ end
29
+
30
+ should "recognize languages in testing data" do
31
+ CSV.foreach(File.expand_path("../fixtures/languages.csv", __FILE__), :quote_char => '"') do |row|
32
+ assert_equal row[0], LanguageDetection.perform(row[1]).code
33
+ end
34
+ end
35
+
36
+ end
37
+
38
+ context "When LanguageDetection module is included" do
39
+ class Article
40
+ include LanguageDetection
41
+
42
+ attr_accessor :title, :content
43
+
44
+ def initialize(params = {})
45
+ @title = params[:title]
46
+ @content = params[:content]
47
+ end
48
+
49
+ def to_s
50
+ "#{title}\n#{content}"
51
+ end
52
+ end
53
+
54
+ setup do
55
+ @article = Article.new :title => "Web development that doesn't hurt", :content => "Tens of thousands of Rails applications are already live. People are using Rails in the tiniest part-time operations to the biggest companies."
56
+ end
57
+
58
+ should "provide Model#language instance method" do
59
+ assert @article.respond_to?(:language)
60
+ end
61
+
62
+ should "call LanguageDetection.perform with Model#to_s as parameter when calling Model#language" do
63
+ LanguageDetection.expects(:perform).with("#{@article.title}\n#{@article.content}", false)
64
+
65
+ @article.language
66
+ end
67
+
68
+ should "return detected language" do
69
+ language = @article.language
70
+ assert_equal "ENGLISH", language.name
71
+ assert_equal true, language.reliable
72
+ assert_equal 100, language.details.first.percent
73
+ end
74
+
75
+ end
76
+
77
+ context "Include LanguageDetection to string" do
78
+
79
+ should "have String#language method" do
80
+ assert ! "some string".respond_to?(:language)
81
+ require 'language_detection/string'
82
+ assert "some string".respond_to?(:language)
83
+ end
84
+
85
+ end
86
+
87
+
88
+ end
metadata ADDED
@@ -0,0 +1,250 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: language_detection
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Vojtech Hyza
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ffi
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: hashr
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rake
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: shoulda
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: mocha
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: turn
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Language detection
111
+ email:
112
+ - vhyza@vhyza.eu
113
+ executables: []
114
+ extensions:
115
+ - ext/cld/extconf.rb
116
+ extra_rdoc_files: []
117
+ files:
118
+ - .gitignore
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - ext/cld/Makefile
124
+ - ext/cld/base/basictypes.h
125
+ - ext/cld/base/build_config.h
126
+ - ext/cld/base/casts.h
127
+ - ext/cld/base/commandlineflags.h
128
+ - ext/cld/base/crash.h
129
+ - ext/cld/base/dynamic_annotations.h
130
+ - ext/cld/base/global_strip_options.h
131
+ - ext/cld/base/log_severity.h
132
+ - ext/cld/base/logging.h
133
+ - ext/cld/base/macros.h
134
+ - ext/cld/base/port.h
135
+ - ext/cld/base/scoped_ptr.h
136
+ - ext/cld/base/stl_decl.h
137
+ - ext/cld/base/stl_decl_msvc.h
138
+ - ext/cld/base/string_util.h
139
+ - ext/cld/base/strtoint.h
140
+ - ext/cld/base/template_util.h
141
+ - ext/cld/base/type_traits.h
142
+ - ext/cld/base/vlog_is_on.h
143
+ - ext/cld/cld.so
144
+ - ext/cld/encodings/compact_lang_det/cldutil.cc
145
+ - ext/cld/encodings/compact_lang_det/cldutil.h
146
+ - ext/cld/encodings/compact_lang_det/cldutil_dbg.h
147
+ - ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc
148
+ - ext/cld/encodings/compact_lang_det/compact_lang_det.cc
149
+ - ext/cld/encodings/compact_lang_det/compact_lang_det.h
150
+ - ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc
151
+ - ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h
152
+ - ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc
153
+ - ext/cld/encodings/compact_lang_det/compile.cmd
154
+ - ext/cld/encodings/compact_lang_det/ext_lang_enc.cc
155
+ - ext/cld/encodings/compact_lang_det/ext_lang_enc.h
156
+ - ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc
157
+ - ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc
158
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc
159
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc
160
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc
161
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc
162
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc
163
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h
164
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc
165
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc
166
+ - ext/cld/encodings/compact_lang_det/getonescriptspan.cc
167
+ - ext/cld/encodings/compact_lang_det/getonescriptspan.h
168
+ - ext/cld/encodings/compact_lang_det/letterscript_enum.cc
169
+ - ext/cld/encodings/compact_lang_det/letterscript_enum.h
170
+ - ext/cld/encodings/compact_lang_det/subsetsequence.cc
171
+ - ext/cld/encodings/compact_lang_det/subsetsequence.h
172
+ - ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc
173
+ - ext/cld/encodings/compact_lang_det/tote.cc
174
+ - ext/cld/encodings/compact_lang_det/tote.h
175
+ - ext/cld/encodings/compact_lang_det/unittest_data.h
176
+ - ext/cld/encodings/compact_lang_det/utf8propjustletter.h
177
+ - ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h
178
+ - ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h
179
+ - ext/cld/encodings/compact_lang_det/win/cld_basictypes.h
180
+ - ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h
181
+ - ext/cld/encodings/compact_lang_det/win/cld_google.h
182
+ - ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h
183
+ - ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc
184
+ - ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc
185
+ - ext/cld/encodings/compact_lang_det/win/cld_logging.h
186
+ - ext/cld/encodings/compact_lang_det/win/cld_macros.h
187
+ - ext/cld/encodings/compact_lang_det/win/cld_strtoint.h
188
+ - ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
189
+ - ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h
190
+ - ext/cld/encodings/compact_lang_det/win/cld_unilib.h
191
+ - ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc
192
+ - ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc
193
+ - ext/cld/encodings/compact_lang_det/win/cld_utf.h
194
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc
195
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h
196
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h
197
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc
198
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc
199
+ - ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc
200
+ - ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h
201
+ - ext/cld/encodings/internal/encodings.cc
202
+ - ext/cld/encodings/lang_enc.h
203
+ - ext/cld/encodings/proto/encodings.pb.h
204
+ - ext/cld/encodings/public/encodings.h
205
+ - ext/cld/extconf.rb
206
+ - ext/cld/language_detection.cc
207
+ - ext/cld/languages/internal/languages.cc
208
+ - ext/cld/languages/proto/languages.pb.h
209
+ - ext/cld/languages/public/languages.h
210
+ - language_detection.gemspec
211
+ - lib/language_detection.rb
212
+ - lib/language_detection/string.rb
213
+ - lib/language_detection/version.rb
214
+ - test/_helper.rb
215
+ - test/fixtures/languages.csv
216
+ - test/language_detection_test.rb
217
+ homepage: ''
218
+ licenses: []
219
+ post_install_message:
220
+ rdoc_options: []
221
+ require_paths:
222
+ - lib
223
+ required_ruby_version: !ruby/object:Gem::Requirement
224
+ none: false
225
+ requirements:
226
+ - - ! '>='
227
+ - !ruby/object:Gem::Version
228
+ version: '0'
229
+ segments:
230
+ - 0
231
+ hash: 301210449373780646
232
+ required_rubygems_version: !ruby/object:Gem::Requirement
233
+ none: false
234
+ requirements:
235
+ - - ! '>='
236
+ - !ruby/object:Gem::Version
237
+ version: '0'
238
+ segments:
239
+ - 0
240
+ hash: 301210449373780646
241
+ requirements: []
242
+ rubyforge_project:
243
+ rubygems_version: 1.8.24
244
+ signing_key:
245
+ specification_version: 3
246
+ summary: Wrapped Chrome's compact language detector
247
+ test_files:
248
+ - test/_helper.rb
249
+ - test/fixtures/languages.csv
250
+ - test/language_detection_test.rb