language_detection 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. data/.gitignore +19 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +85 -0
  5. data/Rakefile +11 -0
  6. data/ext/cld/Makefile +34 -0
  7. data/ext/cld/base/basictypes.h +348 -0
  8. data/ext/cld/base/build_config.h +124 -0
  9. data/ext/cld/base/casts.h +156 -0
  10. data/ext/cld/base/commandlineflags.h +443 -0
  11. data/ext/cld/base/crash.h +41 -0
  12. data/ext/cld/base/dynamic_annotations.h +358 -0
  13. data/ext/cld/base/global_strip_options.h +59 -0
  14. data/ext/cld/base/log_severity.h +46 -0
  15. data/ext/cld/base/logging.h +1403 -0
  16. data/ext/cld/base/macros.h +243 -0
  17. data/ext/cld/base/port.h +54 -0
  18. data/ext/cld/base/scoped_ptr.h +428 -0
  19. data/ext/cld/base/stl_decl.h +0 -0
  20. data/ext/cld/base/stl_decl_msvc.h +107 -0
  21. data/ext/cld/base/string_util.h +29 -0
  22. data/ext/cld/base/strtoint.h +93 -0
  23. data/ext/cld/base/template_util.h +96 -0
  24. data/ext/cld/base/type_traits.h +198 -0
  25. data/ext/cld/base/vlog_is_on.h +143 -0
  26. data/ext/cld/cld.so +0 -0
  27. data/ext/cld/encodings/compact_lang_det/cldutil.cc +905 -0
  28. data/ext/cld/encodings/compact_lang_det/cldutil.h +1205 -0
  29. data/ext/cld/encodings/compact_lang_det/cldutil_dbg.h +76 -0
  30. data/ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc +76 -0
  31. data/ext/cld/encodings/compact_lang_det/compact_lang_det.cc +62 -0
  32. data/ext/cld/encodings/compact_lang_det/compact_lang_det.h +145 -0
  33. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc +2574 -0
  34. data/ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h +173 -0
  35. data/ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc +406 -0
  36. data/ext/cld/encodings/compact_lang_det/compile.cmd +1 -0
  37. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.cc +545 -0
  38. data/ext/cld/encodings/compact_lang_det/ext_lang_enc.h +119 -0
  39. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc +380 -0
  40. data/ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc +382 -0
  41. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc +49 -0
  42. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc +7119 -0
  43. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc +61 -0
  44. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc +1263 -0
  45. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc +53 -0
  46. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h +10 -0
  47. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc +50 -0
  48. data/ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc +70935 -0
  49. data/ext/cld/encodings/compact_lang_det/getonescriptspan.cc +570 -0
  50. data/ext/cld/encodings/compact_lang_det/getonescriptspan.h +131 -0
  51. data/ext/cld/encodings/compact_lang_det/letterscript_enum.cc +117 -0
  52. data/ext/cld/encodings/compact_lang_det/letterscript_enum.h +99 -0
  53. data/ext/cld/encodings/compact_lang_det/subsetsequence.cc +259 -0
  54. data/ext/cld/encodings/compact_lang_det/subsetsequence.h +44 -0
  55. data/ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc +99 -0
  56. data/ext/cld/encodings/compact_lang_det/tote.cc +299 -0
  57. data/ext/cld/encodings/compact_lang_det/tote.h +89 -0
  58. data/ext/cld/encodings/compact_lang_det/unittest_data.h +193 -0
  59. data/ext/cld/encodings/compact_lang_det/utf8propjustletter.h +1162 -0
  60. data/ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h +1222 -0
  61. data/ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h +1185 -0
  62. data/ext/cld/encodings/compact_lang_det/win/cld_basictypes.h +10 -0
  63. data/ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h +28 -0
  64. data/ext/cld/encodings/compact_lang_det/win/cld_google.h +18 -0
  65. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h +13 -0
  66. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc +32 -0
  67. data/ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc +29 -0
  68. data/ext/cld/encodings/compact_lang_det/win/cld_logging.h +21 -0
  69. data/ext/cld/encodings/compact_lang_det/win/cld_macros.h +19 -0
  70. data/ext/cld/encodings/compact_lang_det/win/cld_strtoint.h +26 -0
  71. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +84 -0
  72. data/ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h +40 -0
  73. data/ext/cld/encodings/compact_lang_det/win/cld_unilib.h +15 -0
  74. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc +18 -0
  75. data/ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc +29 -0
  76. data/ext/cld/encodings/compact_lang_det/win/cld_utf.h +24 -0
  77. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc +224 -0
  78. data/ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h +141 -0
  79. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h +22 -0
  80. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc +18 -0
  81. data/ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc +17 -0
  82. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc +172 -0
  83. data/ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h +67 -0
  84. data/ext/cld/encodings/internal/encodings.cc +12 -0
  85. data/ext/cld/encodings/lang_enc.h +254 -0
  86. data/ext/cld/encodings/proto/encodings.pb.h +169 -0
  87. data/ext/cld/encodings/public/encodings.h +301 -0
  88. data/ext/cld/extconf.rb +1 -0
  89. data/ext/cld/language_detection.cc +88 -0
  90. data/ext/cld/languages/internal/languages.cc +337 -0
  91. data/ext/cld/languages/proto/languages.pb.h +179 -0
  92. data/ext/cld/languages/public/languages.h +379 -0
  93. data/language_detection.gemspec +28 -0
  94. data/lib/language_detection/string.rb +1 -0
  95. data/lib/language_detection/version.rb +3 -0
  96. data/lib/language_detection.rb +54 -0
  97. data/test/_helper.rb +15 -0
  98. data/test/fixtures/languages.csv +80 -0
  99. data/test/language_detection_test.rb +88 -0
  100. metadata +250 -0
@@ -0,0 +1,88 @@
# encoding: utf-8

require '_helper'
require 'csv'

# Test suite for the LanguageDetection wrapper.
#
# Covers three areas:
#   * the module-level API (language_detection, parse_result, perform),
#   * the #language instance method gained by classes that include the module,
#   * the optional String extension loaded from 'language_detection/string'.
class LanguageDetectionTest < Test::Unit::TestCase

  context "Language detection" do

    should "be able to convert result from native call to Hashr instance" do
      result        = LanguageDetection.language_detection("this is some text", false)
      parsed_result = LanguageDetection.parse_result(result)

      assert_kind_of LanguageDetection::Language, result
      assert_kind_of Hashr, parsed_result

      assert_equal "ENGLISH", parsed_result.name
      # Unknown keys are expected to read as nil instead of raising.
      assert_nil parsed_result.non_existing_property
    end

    should "convert details from FFI pointer to Hashr instance" do
      language = LanguageDetection.perform("this is some text")
      top_hit  = language.details.first

      assert_kind_of Array, language.details
      assert_kind_of Hashr, top_hit
      assert_equal "ENGLISH", top_hit.name
      # NOTE(review): the exact percentage presumably depends on the bundled
      # detector data -- confirm before changing the native extension.
      assert_equal 65, top_hit.percent
    end

    should "recognize languages in testing data" do
      fixture_path = File.expand_path("../fixtures/languages.csv", __FILE__)

      # Each row is read as: column 0 = expected language code,
      # column 1 = sample text handed to the detector.
      CSV.foreach(fixture_path, :quote_char => '"') do |row|
        expected_code = row[0]
        sample_text   = row[1]

        # The message pinpoints the offending fixture row; without it a
        # failure only reports two mismatching codes.
        assert_equal expected_code, LanguageDetection.perform(sample_text).code,
                     "Unexpected language code for fixture text #{sample_text.inspect}"
      end
    end

  end

  context "When LanguageDetection module is included" do
    # Minimal model used to exercise the mixin. The tests below expect
    # #language to hand the result of #to_s to LanguageDetection.perform.
    class Article
      include LanguageDetection

      attr_accessor :title, :content

      def initialize(params = {})
        @title   = params[:title]
        @content = params[:content]
      end

      def to_s
        "#{title}\n#{content}"
      end
    end

    setup do
      @article = Article.new :title => "Web development that doesn't hurt", :content => "Tens of thousands of Rails applications are already live. People are using Rails in the tiniest part-time operations to the biggest companies."
    end

    should "provide Model#language instance method" do
      assert_respond_to @article, :language
    end

    should "call LanguageDetection.perform with Model#to_s as parameter when calling Model#language" do
      # The expected text is spelled out by hand (not via @article.to_s) so the
      # expectation stays independent of Article#to_s itself.
      LanguageDetection.expects(:perform).with("#{@article.title}\n#{@article.content}", false)

      @article.language
    end

    should "return detected language" do
      language = @article.language

      assert_equal "ENGLISH", language.name
      assert_equal true, language.reliable
      assert_equal 100, language.details.first.percent
    end

  end

  context "Include LanguageDetection to string" do

    should "have String#language method" do
      # Order-sensitive: the first assertion only holds while
      # 'language_detection/string' has not been required in this process yet.
      assert ! "some string".respond_to?(:language),
             "String#language should not exist before requiring 'language_detection/string'"
      require 'language_detection/string'
      assert_respond_to "some string", :language
    end

  end


end
metadata ADDED
@@ -0,0 +1,250 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: language_detection
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Vojtech Hyza
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-10-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ffi
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: hashr
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rake
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: shoulda
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: mocha
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: turn
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Language detection
111
+ email:
112
+ - vhyza@vhyza.eu
113
+ executables: []
114
+ extensions:
115
+ - ext/cld/extconf.rb
116
+ extra_rdoc_files: []
117
+ files:
118
+ - .gitignore
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - ext/cld/Makefile
124
+ - ext/cld/base/basictypes.h
125
+ - ext/cld/base/build_config.h
126
+ - ext/cld/base/casts.h
127
+ - ext/cld/base/commandlineflags.h
128
+ - ext/cld/base/crash.h
129
+ - ext/cld/base/dynamic_annotations.h
130
+ - ext/cld/base/global_strip_options.h
131
+ - ext/cld/base/log_severity.h
132
+ - ext/cld/base/logging.h
133
+ - ext/cld/base/macros.h
134
+ - ext/cld/base/port.h
135
+ - ext/cld/base/scoped_ptr.h
136
+ - ext/cld/base/stl_decl.h
137
+ - ext/cld/base/stl_decl_msvc.h
138
+ - ext/cld/base/string_util.h
139
+ - ext/cld/base/strtoint.h
140
+ - ext/cld/base/template_util.h
141
+ - ext/cld/base/type_traits.h
142
+ - ext/cld/base/vlog_is_on.h
143
+ - ext/cld/cld.so
144
+ - ext/cld/encodings/compact_lang_det/cldutil.cc
145
+ - ext/cld/encodings/compact_lang_det/cldutil.h
146
+ - ext/cld/encodings/compact_lang_det/cldutil_dbg.h
147
+ - ext/cld/encodings/compact_lang_det/cldutil_dbg_empty.cc
148
+ - ext/cld/encodings/compact_lang_det/compact_lang_det.cc
149
+ - ext/cld/encodings/compact_lang_det/compact_lang_det.h
150
+ - ext/cld/encodings/compact_lang_det/compact_lang_det_impl.cc
151
+ - ext/cld/encodings/compact_lang_det/compact_lang_det_impl.h
152
+ - ext/cld/encodings/compact_lang_det/compact_lang_det_unittest_small.cc
153
+ - ext/cld/encodings/compact_lang_det/compile.cmd
154
+ - ext/cld/encodings/compact_lang_det/ext_lang_enc.cc
155
+ - ext/cld/encodings/compact_lang_det/ext_lang_enc.h
156
+ - ext/cld/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc
157
+ - ext/cld/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc
158
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc
159
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc
160
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc
161
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc
162
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc
163
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h
164
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc
165
+ - ext/cld/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc
166
+ - ext/cld/encodings/compact_lang_det/getonescriptspan.cc
167
+ - ext/cld/encodings/compact_lang_det/getonescriptspan.h
168
+ - ext/cld/encodings/compact_lang_det/letterscript_enum.cc
169
+ - ext/cld/encodings/compact_lang_det/letterscript_enum.h
170
+ - ext/cld/encodings/compact_lang_det/subsetsequence.cc
171
+ - ext/cld/encodings/compact_lang_det/subsetsequence.h
172
+ - ext/cld/encodings/compact_lang_det/subsetsequence_unittest.cc
173
+ - ext/cld/encodings/compact_lang_det/tote.cc
174
+ - ext/cld/encodings/compact_lang_det/tote.h
175
+ - ext/cld/encodings/compact_lang_det/unittest_data.h
176
+ - ext/cld/encodings/compact_lang_det/utf8propjustletter.h
177
+ - ext/cld/encodings/compact_lang_det/utf8propletterscriptnum.h
178
+ - ext/cld/encodings/compact_lang_det/utf8scannotjustletterspecial.h
179
+ - ext/cld/encodings/compact_lang_det/win/cld_basictypes.h
180
+ - ext/cld/encodings/compact_lang_det/win/cld_commandlineflags.h
181
+ - ext/cld/encodings/compact_lang_det/win/cld_google.h
182
+ - ext/cld/encodings/compact_lang_det/win/cld_htmlutils.h
183
+ - ext/cld/encodings/compact_lang_det/win/cld_htmlutils_google3.cc
184
+ - ext/cld/encodings/compact_lang_det/win/cld_htmlutils_windows.cc
185
+ - ext/cld/encodings/compact_lang_det/win/cld_logging.h
186
+ - ext/cld/encodings/compact_lang_det/win/cld_macros.h
187
+ - ext/cld/encodings/compact_lang_det/win/cld_strtoint.h
188
+ - ext/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
189
+ - ext/cld/encodings/compact_lang_det/win/cld_unicodetext.h
190
+ - ext/cld/encodings/compact_lang_det/win/cld_unilib.h
191
+ - ext/cld/encodings/compact_lang_det/win/cld_unilib_google3.cc
192
+ - ext/cld/encodings/compact_lang_det/win/cld_unilib_windows.cc
193
+ - ext/cld/encodings/compact_lang_det/win/cld_utf.h
194
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.cc
195
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8statetable.h
196
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8utils.h
197
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8utils_google3.cc
198
+ - ext/cld/encodings/compact_lang_det/win/cld_utf8utils_windows.cc
199
+ - ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.cc
200
+ - ext/cld/encodings/compact_lang_det/win/normalizedunicodetext.h
201
+ - ext/cld/encodings/internal/encodings.cc
202
+ - ext/cld/encodings/lang_enc.h
203
+ - ext/cld/encodings/proto/encodings.pb.h
204
+ - ext/cld/encodings/public/encodings.h
205
+ - ext/cld/extconf.rb
206
+ - ext/cld/language_detection.cc
207
+ - ext/cld/languages/internal/languages.cc
208
+ - ext/cld/languages/proto/languages.pb.h
209
+ - ext/cld/languages/public/languages.h
210
+ - language_detection.gemspec
211
+ - lib/language_detection.rb
212
+ - lib/language_detection/string.rb
213
+ - lib/language_detection/version.rb
214
+ - test/_helper.rb
215
+ - test/fixtures/languages.csv
216
+ - test/language_detection_test.rb
217
+ homepage: ''
218
+ licenses: []
219
+ post_install_message:
220
+ rdoc_options: []
221
+ require_paths:
222
+ - lib
223
+ required_ruby_version: !ruby/object:Gem::Requirement
224
+ none: false
225
+ requirements:
226
+ - - ! '>='
227
+ - !ruby/object:Gem::Version
228
+ version: '0'
229
+ segments:
230
+ - 0
231
+ hash: 301210449373780646
232
+ required_rubygems_version: !ruby/object:Gem::Requirement
233
+ none: false
234
+ requirements:
235
+ - - ! '>='
236
+ - !ruby/object:Gem::Version
237
+ version: '0'
238
+ segments:
239
+ - 0
240
+ hash: 301210449373780646
241
+ requirements: []
242
+ rubyforge_project:
243
+ rubygems_version: 1.8.24
244
+ signing_key:
245
+ specification_version: 3
246
+ summary: Wrapped Chrome's compact language detector
247
+ test_files:
248
+ - test/_helper.rb
249
+ - test/fixtures/languages.csv
250
+ - test/language_detection_test.rb