langusta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. data/.document +5 -0
  2. data/Gemfile +11 -0
  3. data/Gemfile.lock +32 -0
  4. data/LICENSE.txt +13 -0
  5. data/README.rdoc +34 -0
  6. data/Rakefile +55 -0
  7. data/VERSION +1 -0
  8. data/bin/langusta +5 -0
  9. data/data/messages.properties +128 -0
  10. data/data/uppercase.bin +0 -0
  11. data/langusta.gemspec +210 -0
  12. data/lib/langusta.rb +36 -0
  13. data/lib/langusta/command.rb +78 -0
  14. data/lib/langusta/detector.rb +197 -0
  15. data/lib/langusta/detector_factory.rb +46 -0
  16. data/lib/langusta/java_property_reader.rb +35 -0
  17. data/lib/langusta/lang_profile.rb +80 -0
  18. data/lib/langusta/language.rb +14 -0
  19. data/lib/langusta/language_detection_facade.rb +24 -0
  20. data/lib/langusta/n_gram.rb +116 -0
  21. data/lib/langusta/regex_helper.rb +15 -0
  22. data/lib/langusta/tag_extractor.rb +39 -0
  23. data/lib/langusta/ucs2_string.rb +70 -0
  24. data/lib/langusta/unicode_block.rb +56 -0
  25. data/profiles/af +1 -0
  26. data/profiles/ar +1 -0
  27. data/profiles/bg +1 -0
  28. data/profiles/bn +1 -0
  29. data/profiles/cs +1 -0
  30. data/profiles/da +1 -0
  31. data/profiles/de +1 -0
  32. data/profiles/el +1 -0
  33. data/profiles/en +1 -0
  34. data/profiles/es +1 -0
  35. data/profiles/fa +1 -0
  36. data/profiles/fi +1 -0
  37. data/profiles/fr +1 -0
  38. data/profiles/gu +1 -0
  39. data/profiles/he +1 -0
  40. data/profiles/hi +1 -0
  41. data/profiles/hr +1 -0
  42. data/profiles/hu +1 -0
  43. data/profiles/id +1 -0
  44. data/profiles/it +1 -0
  45. data/profiles/ja +1 -0
  46. data/profiles/kn +1 -0
  47. data/profiles/ko +1 -0
  48. data/profiles/mk +1 -0
  49. data/profiles/ml +1 -0
  50. data/profiles/mr +1 -0
  51. data/profiles/ne +1 -0
  52. data/profiles/nl +1 -0
  53. data/profiles/no +1 -0
  54. data/profiles/pa +1 -0
  55. data/profiles/pl +1 -0
  56. data/profiles/pt +1 -0
  57. data/profiles/ro +1 -0
  58. data/profiles/ru +1 -0
  59. data/profiles/sk +1 -0
  60. data/profiles/so +1 -0
  61. data/profiles/sq +1 -0
  62. data/profiles/sv +1 -0
  63. data/profiles/sw +1 -0
  64. data/profiles/ta +1 -0
  65. data/profiles/te +1 -0
  66. data/profiles/th +1 -0
  67. data/profiles/tl +1 -0
  68. data/profiles/tr +1 -0
  69. data/profiles/uk +1 -0
  70. data/profiles/ur +1 -0
  71. data/profiles/vi +1 -0
  72. data/profiles/zh-cn +1 -0
  73. data/profiles/zh-tw +1 -0
  74. data/test/helper.rb +20 -0
  75. data/test/quality/test_falsified.rb +33 -0
  76. data/test/test_command.rb +34 -0
  77. data/test/test_data/af +1 -0
  78. data/test/test_data/ar +1 -0
  79. data/test/test_data/bg +32 -0
  80. data/test/test_data/bn +9 -0
  81. data/test/test_data/cs +9 -0
  82. data/test/test_data/da +14 -0
  83. data/test/test_data/de +4 -0
  84. data/test/test_data/el +7 -0
  85. data/test/test_data/en +26 -0
  86. data/test/test_data/es +4 -0
  87. data/test/test_data/fa +21 -0
  88. data/test/test_data/fi +8 -0
  89. data/test/test_data/fr +13 -0
  90. data/test/test_data/gu +3 -0
  91. data/test/test_data/he +20 -0
  92. data/test/test_data/hi +1 -0
  93. data/test/test_data/hr +16 -0
  94. data/test/test_data/hu +6 -0
  95. data/test/test_data/id +2 -0
  96. data/test/test_data/it +3 -0
  97. data/test/test_data/ja +34 -0
  98. data/test/test_data/kn +14 -0
  99. data/test/test_data/ko +2 -0
  100. data/test/test_data/mk +3 -0
  101. data/test/test_data/ml +1 -0
  102. data/test/test_data/mr +3 -0
  103. data/test/test_data/ne +2 -0
  104. data/test/test_data/nl +1 -0
  105. data/test/test_data/no +3 -0
  106. data/test/test_data/pa +1 -0
  107. data/test/test_data/pl +23 -0
  108. data/test/test_data/pt +2 -0
  109. data/test/test_data/ro +2 -0
  110. data/test/test_data/ru +1 -0
  111. data/test/test_data/sk +2 -0
  112. data/test/test_data/so +4 -0
  113. data/test/test_data/sq +4 -0
  114. data/test/test_data/sv +3 -0
  115. data/test/test_data/sw +6 -0
  116. data/test/test_data/ta +1 -0
  117. data/test/test_data/te +2 -0
  118. data/test/test_data/th +3 -0
  119. data/test/test_data/tl +1 -0
  120. data/test/test_data/tr +2 -0
  121. data/test/test_data/uk +3 -0
  122. data/test/test_data/ur +1 -0
  123. data/test/test_data/vi +2 -0
  124. data/test/test_data/zh-tw +3 -0
  125. data/test/test_detector.rb +52 -0
  126. data/test/test_detector_factory.rb +16 -0
  127. data/test/test_java_property_reader.rb +8 -0
  128. data/test/test_lang_profile.rb +79 -0
  129. data/test/test_language.rb +15 -0
  130. data/test/test_language_detection_facade.rb +9 -0
  131. data/test/test_langusta.rb +25 -0
  132. data/test/test_n_gram.rb +103 -0
  133. data/test/test_tag_extractor.rb +71 -0
  134. data/test/test_ucs2_string.rb +9 -0
  135. data/test/test_unicode_block.rb +9 -0
  136. metadata +320 -0
@@ -0,0 +1,9 @@
1
+ require 'test/helper'
2
+
3
+ class UCS2StringTest < Test::Unit::TestCase
4
+ def test_invalid_unicode_sequences_raise_an_error
5
+ assert_raises(Iconv::IllegalSequence) do
6
+ UCS2String.from_utf8("\xc0")
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ require 'test/helper'
2
+
3
+ class UnicodeBlockTest < Test::Unit::TestCase
4
+ def test_upper_case
5
+ ["\x00\x47", "\x01\x10", "\x01\x64", "\x03\xd5", "\x04\xa2", "\x10\xc3", "\x21\x60", "\xa7\x60"].each do |cp|
6
+ assert(Langusta::UnicodeBlock.is_upper_case?(cp))
7
+ end
8
+ end
9
+ end
metadata ADDED
@@ -0,0 +1,320 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: langusta
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Jan Szumiec
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-04-08 00:00:00 +02:00
19
+ default_executable: langusta
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ type: :runtime
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - "="
27
+ - !ruby/object:Gem::Version
28
+ hash: 19
29
+ segments:
30
+ - 1
31
+ - 1
32
+ - 0
33
+ version: 1.1.0
34
+ name: oniguruma
35
+ version_requirements: *id001
36
+ prerelease: false
37
+ - !ruby/object:Gem::Dependency
38
+ type: :runtime
39
+ requirement: &id002 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - "="
43
+ - !ruby/object:Gem::Version
44
+ hash: 59
45
+ segments:
46
+ - 0
47
+ - 8
48
+ - 2
49
+ version: 0.8.2
50
+ name: yajl-ruby
51
+ version_requirements: *id002
52
+ prerelease: false
53
+ - !ruby/object:Gem::Dependency
54
+ type: :development
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ~>
59
+ - !ruby/object:Gem::Version
60
+ hash: 23
61
+ segments:
62
+ - 1
63
+ - 0
64
+ - 0
65
+ version: 1.0.0
66
+ name: bundler
67
+ version_requirements: *id003
68
+ prerelease: false
69
+ - !ruby/object:Gem::Dependency
70
+ type: :development
71
+ requirement: &id004 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ hash: 7
77
+ segments:
78
+ - 1
79
+ - 5
80
+ - 2
81
+ version: 1.5.2
82
+ name: jeweler
83
+ version_requirements: *id004
84
+ prerelease: false
85
+ - !ruby/object:Gem::Dependency
86
+ type: :development
87
+ requirement: &id005 !ruby/object:Gem::Requirement
88
+ none: false
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ hash: 3
93
+ segments:
94
+ - 0
95
+ version: "0"
96
+ name: rcov
97
+ version_requirements: *id005
98
+ prerelease: false
99
+ - !ruby/object:Gem::Dependency
100
+ type: :development
101
+ requirement: &id006 !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ hash: 3
107
+ segments:
108
+ - 0
109
+ version: "0"
110
+ name: mocha
111
+ version_requirements: *id006
112
+ prerelease: false
113
+ - !ruby/object:Gem::Dependency
114
+ type: :development
115
+ requirement: &id007 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ hash: 3
121
+ segments:
122
+ - 0
123
+ version: "0"
124
+ name: ruby-debug
125
+ version_requirements: *id007
126
+ prerelease: false
127
+ description: Uses naive bayesian filter.
128
+ email: jan.szumiec@gmail.com
129
+ executables:
130
+ - langusta
131
+ extensions: []
132
+
133
+ extra_rdoc_files:
134
+ - LICENSE.txt
135
+ - README.rdoc
136
+ files:
137
+ - .document
138
+ - Gemfile
139
+ - Gemfile.lock
140
+ - LICENSE.txt
141
+ - README.rdoc
142
+ - Rakefile
143
+ - VERSION
144
+ - bin/langusta
145
+ - data/messages.properties
146
+ - data/uppercase.bin
147
+ - langusta.gemspec
148
+ - lib/langusta.rb
149
+ - lib/langusta/command.rb
150
+ - lib/langusta/detector.rb
151
+ - lib/langusta/detector_factory.rb
152
+ - lib/langusta/java_property_reader.rb
153
+ - lib/langusta/lang_profile.rb
154
+ - lib/langusta/language.rb
155
+ - lib/langusta/language_detection_facade.rb
156
+ - lib/langusta/n_gram.rb
157
+ - lib/langusta/regex_helper.rb
158
+ - lib/langusta/tag_extractor.rb
159
+ - lib/langusta/ucs2_string.rb
160
+ - lib/langusta/unicode_block.rb
161
+ - profiles/af
162
+ - profiles/ar
163
+ - profiles/bg
164
+ - profiles/bn
165
+ - profiles/cs
166
+ - profiles/da
167
+ - profiles/de
168
+ - profiles/el
169
+ - profiles/en
170
+ - profiles/es
171
+ - profiles/fa
172
+ - profiles/fi
173
+ - profiles/fr
174
+ - profiles/gu
175
+ - profiles/he
176
+ - profiles/hi
177
+ - profiles/hr
178
+ - profiles/hu
179
+ - profiles/id
180
+ - profiles/it
181
+ - profiles/ja
182
+ - profiles/kn
183
+ - profiles/ko
184
+ - profiles/mk
185
+ - profiles/ml
186
+ - profiles/mr
187
+ - profiles/ne
188
+ - profiles/nl
189
+ - profiles/no
190
+ - profiles/pa
191
+ - profiles/pl
192
+ - profiles/pt
193
+ - profiles/ro
194
+ - profiles/ru
195
+ - profiles/sk
196
+ - profiles/so
197
+ - profiles/sq
198
+ - profiles/sv
199
+ - profiles/sw
200
+ - profiles/ta
201
+ - profiles/te
202
+ - profiles/th
203
+ - profiles/tl
204
+ - profiles/tr
205
+ - profiles/uk
206
+ - profiles/ur
207
+ - profiles/vi
208
+ - profiles/zh-cn
209
+ - profiles/zh-tw
210
+ - test/helper.rb
211
+ - test/quality/test_falsified.rb
212
+ - test/test_command.rb
213
+ - test/test_data/af
214
+ - test/test_data/ar
215
+ - test/test_data/bg
216
+ - test/test_data/bn
217
+ - test/test_data/cs
218
+ - test/test_data/da
219
+ - test/test_data/de
220
+ - test/test_data/el
221
+ - test/test_data/en
222
+ - test/test_data/es
223
+ - test/test_data/fa
224
+ - test/test_data/fi
225
+ - test/test_data/fr
226
+ - test/test_data/gu
227
+ - test/test_data/he
228
+ - test/test_data/hi
229
+ - test/test_data/hr
230
+ - test/test_data/hu
231
+ - test/test_data/id
232
+ - test/test_data/it
233
+ - test/test_data/ja
234
+ - test/test_data/kn
235
+ - test/test_data/ko
236
+ - test/test_data/mk
237
+ - test/test_data/ml
238
+ - test/test_data/mr
239
+ - test/test_data/ne
240
+ - test/test_data/nl
241
+ - test/test_data/no
242
+ - test/test_data/pa
243
+ - test/test_data/pl
244
+ - test/test_data/pt
245
+ - test/test_data/ro
246
+ - test/test_data/ru
247
+ - test/test_data/sk
248
+ - test/test_data/so
249
+ - test/test_data/sq
250
+ - test/test_data/sv
251
+ - test/test_data/sw
252
+ - test/test_data/ta
253
+ - test/test_data/te
254
+ - test/test_data/th
255
+ - test/test_data/tl
256
+ - test/test_data/tr
257
+ - test/test_data/uk
258
+ - test/test_data/ur
259
+ - test/test_data/vi
260
+ - test/test_data/zh-tw
261
+ - test/test_detector.rb
262
+ - test/test_detector_factory.rb
263
+ - test/test_java_property_reader.rb
264
+ - test/test_lang_profile.rb
265
+ - test/test_language.rb
266
+ - test/test_language_detection_facade.rb
267
+ - test/test_langusta.rb
268
+ - test/test_n_gram.rb
269
+ - test/test_tag_extractor.rb
270
+ - test/test_ucs2_string.rb
271
+ - test/test_unicode_block.rb
272
+ has_rdoc: true
273
+ homepage: http://github.com/jasiek/langusta
274
+ licenses:
275
+ - Apache 2.0
276
+ post_install_message:
277
+ rdoc_options: []
278
+
279
+ require_paths:
280
+ - lib
281
+ required_ruby_version: !ruby/object:Gem::Requirement
282
+ none: false
283
+ requirements:
284
+ - - ">="
285
+ - !ruby/object:Gem::Version
286
+ hash: 3
287
+ segments:
288
+ - 0
289
+ version: "0"
290
+ required_rubygems_version: !ruby/object:Gem::Requirement
291
+ none: false
292
+ requirements:
293
+ - - ">="
294
+ - !ruby/object:Gem::Version
295
+ hash: 3
296
+ segments:
297
+ - 0
298
+ version: "0"
299
+ requirements: []
300
+
301
+ rubyforge_project:
302
+ rubygems_version: 1.5.1
303
+ signing_key:
304
+ specification_version: 3
305
+ summary: Language detection library based on http://code.google.com/p/language-detection/.
306
+ test_files:
307
+ - test/helper.rb
308
+ - test/quality/test_falsified.rb
309
+ - test/test_command.rb
310
+ - test/test_detector.rb
311
+ - test/test_detector_factory.rb
312
+ - test/test_java_property_reader.rb
313
+ - test/test_lang_profile.rb
314
+ - test/test_language.rb
315
+ - test/test_language_detection_facade.rb
316
+ - test/test_langusta.rb
317
+ - test/test_n_gram.rb
318
+ - test/test_tag_extractor.rb
319
+ - test/test_ucs2_string.rb
320
+ - test/test_unicode_block.rb