langusta 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +7 -0
- data/Gemfile +10 -7
- data/Gemfile.lock +12 -16
- data/{README.rdoc → README.md} +27 -10
- data/Rakefile +3 -10
- data/VERSION +1 -1
- data/langusta.gemspec +23 -47
- data/lib/langusta.rb +36 -10
- data/lib/langusta/codepoints.rb +19 -0
- data/lib/langusta/command.rb +3 -3
- data/lib/langusta/detector.rb +16 -13
- data/lib/langusta/detector_factory.rb +11 -5
- data/lib/langusta/guard.rb +22 -0
- data/lib/langusta/inspector.rb +7 -0
- data/lib/langusta/java_property_reader.rb +2 -3
- data/lib/langusta/lang_profile.rb +12 -18
- data/lib/langusta/language_detection_facade.rb +2 -2
- data/lib/langusta/n_gram.rb +20 -25
- data/lib/langusta/regex_helper.rb +15 -10
- data/lib/langusta/tag_extractor.rb +5 -5
- data/lib/langusta/unicode_block.rb +34 -34
- data/test/helper.rb +12 -3
- data/test/quality/test_falsified.rb +3 -3
- data/test/test_command.rb +1 -0
- data/test/test_detector.rb +18 -17
- data/test/test_detector_factory.rb +17 -5
- data/test/test_java_property_reader.rb +2 -1
- data/test/test_lang_profile.rb +37 -31
- data/test/test_language.rb +1 -0
- data/test/test_language_detection_facade.rb +1 -1
- data/test/test_langusta.rb +6 -6
- data/test/test_n_gram.rb +87 -75
- data/test/test_tag_extractor.rb +19 -18
- data/test/test_unicode_block.rb +2 -1
- metadata +54 -156
- data/lib/langusta/ucs2_string.rb +0 -70
- data/test/test_ucs2_string.rb +0 -9
data/test/test_tag_extractor.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'test/helper'
|
2
3
|
|
3
4
|
class TagExtractorTest < Test::Unit::TestCase
|
@@ -6,22 +7,22 @@ class TagExtractorTest < Test::Unit::TestCase
|
|
6
7
|
assert_nil(extractor.target)
|
7
8
|
assert_equal(0, extractor.threshold)
|
8
9
|
|
9
|
-
extractor2 = TagExtractor.new(
|
10
|
-
assert_equal(
|
10
|
+
extractor2 = TagExtractor.new(str2cp("abstract"), 10)
|
11
|
+
assert_equal(str2cp("abstract"), extractor2.target)
|
11
12
|
assert_equal(10, extractor2.threshold)
|
12
13
|
end
|
13
14
|
|
14
15
|
def test_set_tag
|
15
16
|
extractor = TagExtractor.new(nil, 0)
|
16
|
-
extractor.tag =
|
17
|
-
assert_equal(
|
17
|
+
extractor.tag = str2cp("")
|
18
|
+
assert_equal(str2cp(""), extractor.tag)
|
18
19
|
extractor.tag = nil
|
19
20
|
assert_nil(extractor.tag)
|
20
21
|
end
|
21
22
|
|
22
23
|
def test_add
|
23
24
|
extractor = TagExtractor.new(nil, 0)
|
24
|
-
extractor.add(
|
25
|
+
extractor.add(str2cp(""))
|
25
26
|
extractor.add(nil)
|
26
27
|
end
|
27
28
|
|
@@ -32,13 +33,13 @@ class TagExtractorTest < Test::Unit::TestCase
|
|
32
33
|
end
|
33
34
|
|
34
35
|
def test_normal_scenario
|
35
|
-
extractor = TagExtractor.new(
|
36
|
+
extractor = TagExtractor.new(str2cp("abstract"), 10)
|
36
37
|
assert_equal(0, extractor.count)
|
37
38
|
|
38
39
|
profile = LangProfile.new("en")
|
39
40
|
# normal
|
40
|
-
extractor.tag =
|
41
|
-
extractor.add(
|
41
|
+
extractor.tag = str2cp("abstract")
|
42
|
+
extractor.add(str2cp("This is a sample text."))
|
42
43
|
extractor.close_tag(profile)
|
43
44
|
assert_equal(1, extractor.count)
|
44
45
|
assert_equal(17, profile.n_words[0])
|
@@ -46,26 +47,26 @@ class TagExtractorTest < Test::Unit::TestCase
|
|
46
47
|
assert_equal(17, profile.n_words[2])
|
47
48
|
|
48
49
|
# too short
|
49
|
-
extractor.tag =
|
50
|
-
extractor.add(
|
50
|
+
extractor.tag = str2cp("abstract")
|
51
|
+
extractor.add(str2cp("sample"))
|
51
52
|
extractor.close_tag(profile)
|
52
53
|
assert_equal(1, extractor.count)
|
53
54
|
|
54
55
|
# other tags
|
55
|
-
extractor.tag =
|
56
|
-
extractor.add(
|
56
|
+
extractor.tag = str2cp("div")
|
57
|
+
extractor.add(str2cp("This is a sample text which is enough long."))
|
57
58
|
extractor.close_tag(profile)
|
58
59
|
assert_equal(1, extractor.count)
|
59
60
|
end
|
60
61
|
|
61
62
|
def test_clear
|
62
|
-
extractor = TagExtractor.new(
|
63
|
-
extractor.tag =
|
64
|
-
extractor.add(
|
65
|
-
assert_equal(
|
66
|
-
assert_equal(
|
63
|
+
extractor = TagExtractor.new(str2cp("abstract"), 10)
|
64
|
+
extractor.tag = str2cp("abstract")
|
65
|
+
extractor.add(str2cp("This is a sample text."))
|
66
|
+
assert_equal(str2cp("This is a sample text."), extractor.buffer)
|
67
|
+
assert_equal(str2cp("abstract"), extractor.tag)
|
67
68
|
extractor.clear
|
68
|
-
assert_equal(
|
69
|
+
assert_equal(str2cp(""), extractor.buffer)
|
69
70
|
assert_nil(extractor.tag)
|
70
71
|
end
|
71
72
|
end
|
data/test/test_unicode_block.rb
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
require 'test/helper'
|
2
3
|
|
3
4
|
class UnicodeBlockTest < Test::Unit::TestCase
|
4
5
|
def test_upper_case
|
5
|
-
[
|
6
|
+
[0x0047, 0x0110, 0x0164, 0x03d5, 0x04a2, 0x10c3, 0x2160, 0xa760].each do |cp|
|
6
7
|
assert(Langusta::UnicodeBlock.is_upper_case?(cp))
|
7
8
|
end
|
8
9
|
end
|
metadata
CHANGED
@@ -1,144 +1,64 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: langusta
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 1
|
9
|
-
- 1
|
10
|
-
version: 0.1.1
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Jan Szumiec
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
type: :runtime
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
none: false
|
25
|
-
requirements:
|
26
|
-
- - "="
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
hash: 19
|
29
|
-
segments:
|
30
|
-
- 1
|
31
|
-
- 1
|
32
|
-
- 0
|
33
|
-
version: 1.1.0
|
34
|
-
name: oniguruma
|
35
|
-
version_requirements: *id001
|
36
|
-
prerelease: false
|
37
|
-
- !ruby/object:Gem::Dependency
|
38
|
-
type: :runtime
|
39
|
-
requirement: &id002 !ruby/object:Gem::Requirement
|
12
|
+
date: 2012-03-04 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: yajl-ruby
|
16
|
+
requirement: &2152186680 !ruby/object:Gem::Requirement
|
40
17
|
none: false
|
41
|
-
requirements:
|
42
|
-
- -
|
43
|
-
- !ruby/object:Gem::Version
|
44
|
-
hash: 59
|
45
|
-
segments:
|
46
|
-
- 0
|
47
|
-
- 8
|
48
|
-
- 2
|
18
|
+
requirements:
|
19
|
+
- - =
|
20
|
+
- !ruby/object:Gem::Version
|
49
21
|
version: 0.8.2
|
50
|
-
|
51
|
-
version_requirements: *id002
|
22
|
+
type: :runtime
|
52
23
|
prerelease: false
|
53
|
-
|
54
|
-
|
55
|
-
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
-
none: false
|
57
|
-
requirements:
|
58
|
-
- - ~>
|
59
|
-
- !ruby/object:Gem::Version
|
60
|
-
hash: 23
|
61
|
-
segments:
|
62
|
-
- 1
|
63
|
-
- 0
|
64
|
-
- 0
|
65
|
-
version: 1.0.0
|
24
|
+
version_requirements: *2152186680
|
25
|
+
- !ruby/object:Gem::Dependency
|
66
26
|
name: bundler
|
67
|
-
|
68
|
-
prerelease: false
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
type: :development
|
71
|
-
requirement: &id004 !ruby/object:Gem::Requirement
|
72
|
-
none: false
|
73
|
-
requirements:
|
74
|
-
- - ~>
|
75
|
-
- !ruby/object:Gem::Version
|
76
|
-
hash: 7
|
77
|
-
segments:
|
78
|
-
- 1
|
79
|
-
- 5
|
80
|
-
- 2
|
81
|
-
version: 1.5.2
|
82
|
-
name: jeweler
|
83
|
-
version_requirements: *id004
|
84
|
-
prerelease: false
|
85
|
-
- !ruby/object:Gem::Dependency
|
86
|
-
type: :development
|
87
|
-
requirement: &id005 !ruby/object:Gem::Requirement
|
27
|
+
requirement: &2152184160 !ruby/object:Gem::Requirement
|
88
28
|
none: false
|
89
|
-
requirements:
|
90
|
-
- -
|
91
|
-
- !ruby/object:Gem::Version
|
92
|
-
|
93
|
-
|
94
|
-
- 0
|
95
|
-
version: "0"
|
96
|
-
name: rcov
|
97
|
-
version_requirements: *id005
|
98
|
-
prerelease: false
|
99
|
-
- !ruby/object:Gem::Dependency
|
100
|
-
type: :development
|
101
|
-
requirement: &id006 !ruby/object:Gem::Requirement
|
102
|
-
none: false
|
103
|
-
requirements:
|
104
|
-
- - ">="
|
105
|
-
- !ruby/object:Gem::Version
|
106
|
-
hash: 3
|
107
|
-
segments:
|
108
|
-
- 0
|
109
|
-
version: "0"
|
110
|
-
name: mocha
|
111
|
-
version_requirements: *id006
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
112
34
|
prerelease: false
|
113
|
-
|
114
|
-
|
115
|
-
|
35
|
+
version_requirements: *2152184160
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: jeweler
|
38
|
+
requirement: &2152182040 !ruby/object:Gem::Requirement
|
116
39
|
none: false
|
117
|
-
requirements:
|
118
|
-
- -
|
119
|
-
- !ruby/object:Gem::Version
|
120
|
-
|
121
|
-
|
122
|
-
- 0
|
123
|
-
version: "0"
|
124
|
-
name: ruby-debug
|
125
|
-
version_requirements: *id007
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
126
45
|
prerelease: false
|
46
|
+
version_requirements: *2152182040
|
127
47
|
description: Highly accurate language detection library, uses naive bayesian filter.
|
128
48
|
email: jan.szumiec@gmail.com
|
129
|
-
executables:
|
49
|
+
executables:
|
130
50
|
- langusta
|
131
51
|
extensions: []
|
132
|
-
|
133
|
-
extra_rdoc_files:
|
52
|
+
extra_rdoc_files:
|
134
53
|
- LICENSE.txt
|
135
|
-
- README.
|
136
|
-
files:
|
54
|
+
- README.md
|
55
|
+
files:
|
137
56
|
- .document
|
57
|
+
- .travis.yml
|
138
58
|
- Gemfile
|
139
59
|
- Gemfile.lock
|
140
60
|
- LICENSE.txt
|
141
|
-
- README.
|
61
|
+
- README.md
|
142
62
|
- Rakefile
|
143
63
|
- VERSION
|
144
64
|
- bin/langusta
|
@@ -146,9 +66,12 @@ files:
|
|
146
66
|
- data/uppercase.bin
|
147
67
|
- langusta.gemspec
|
148
68
|
- lib/langusta.rb
|
69
|
+
- lib/langusta/codepoints.rb
|
149
70
|
- lib/langusta/command.rb
|
150
71
|
- lib/langusta/detector.rb
|
151
72
|
- lib/langusta/detector_factory.rb
|
73
|
+
- lib/langusta/guard.rb
|
74
|
+
- lib/langusta/inspector.rb
|
152
75
|
- lib/langusta/java_property_reader.rb
|
153
76
|
- lib/langusta/lang_profile.rb
|
154
77
|
- lib/langusta/language.rb
|
@@ -156,7 +79,6 @@ files:
|
|
156
79
|
- lib/langusta/n_gram.rb
|
157
80
|
- lib/langusta/regex_helper.rb
|
158
81
|
- lib/langusta/tag_extractor.rb
|
159
|
-
- lib/langusta/ucs2_string.rb
|
160
82
|
- lib/langusta/unicode_block.rb
|
161
83
|
- profiles/af
|
162
84
|
- profiles/ar
|
@@ -267,54 +189,30 @@ files:
|
|
267
189
|
- test/test_langusta.rb
|
268
190
|
- test/test_n_gram.rb
|
269
191
|
- test/test_tag_extractor.rb
|
270
|
-
- test/test_ucs2_string.rb
|
271
192
|
- test/test_unicode_block.rb
|
272
|
-
has_rdoc: true
|
273
193
|
homepage: http://github.com/jasiek/langusta
|
274
|
-
licenses:
|
194
|
+
licenses:
|
275
195
|
- Apache 2.0
|
276
196
|
post_install_message:
|
277
197
|
rdoc_options: []
|
278
|
-
|
279
|
-
require_paths:
|
198
|
+
require_paths:
|
280
199
|
- lib
|
281
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
200
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
282
201
|
none: false
|
283
|
-
requirements:
|
284
|
-
- -
|
285
|
-
- !ruby/object:Gem::Version
|
286
|
-
|
287
|
-
|
288
|
-
- 0
|
289
|
-
version: "0"
|
290
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
202
|
+
requirements:
|
203
|
+
- - ! '>='
|
204
|
+
- !ruby/object:Gem::Version
|
205
|
+
version: '0'
|
206
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
291
207
|
none: false
|
292
|
-
requirements:
|
293
|
-
- -
|
294
|
-
- !ruby/object:Gem::Version
|
295
|
-
|
296
|
-
segments:
|
297
|
-
- 0
|
298
|
-
version: "0"
|
208
|
+
requirements:
|
209
|
+
- - ! '>='
|
210
|
+
- !ruby/object:Gem::Version
|
211
|
+
version: '0'
|
299
212
|
requirements: []
|
300
|
-
|
301
213
|
rubyforge_project:
|
302
|
-
rubygems_version: 1.
|
214
|
+
rubygems_version: 1.8.17
|
303
215
|
signing_key:
|
304
216
|
specification_version: 3
|
305
217
|
summary: Language detection library based on http://code.google.com/p/language-detection/.
|
306
|
-
test_files:
|
307
|
-
- test/helper.rb
|
308
|
-
- test/quality/test_falsified.rb
|
309
|
-
- test/test_command.rb
|
310
|
-
- test/test_detector.rb
|
311
|
-
- test/test_detector_factory.rb
|
312
|
-
- test/test_java_property_reader.rb
|
313
|
-
- test/test_lang_profile.rb
|
314
|
-
- test/test_language.rb
|
315
|
-
- test/test_language_detection_facade.rb
|
316
|
-
- test/test_langusta.rb
|
317
|
-
- test/test_n_gram.rb
|
318
|
-
- test/test_tag_extractor.rb
|
319
|
-
- test/test_ucs2_string.rb
|
320
|
-
- test/test_unicode_block.rb
|
218
|
+
test_files: []
|
data/lib/langusta/ucs2_string.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
module Langusta
|
2
|
-
class UCS2String
|
3
|
-
include Enumerable
|
4
|
-
UTF8_TO_UCS2BE_ICONV = Iconv.new('ucs-2be', 'utf-8')
|
5
|
-
UCS2BE_TO_UCS2BE_ICONV = Iconv.new('ucs-2be', 'ucs-2be')
|
6
|
-
|
7
|
-
attr_reader :underlying
|
8
|
-
|
9
|
-
def self.from_utf8(utf8_string)
|
10
|
-
self.new(UTF8_TO_UCS2BE_ICONV.iconv(utf8_string))
|
11
|
-
end
|
12
|
-
|
13
|
-
def initialize(underlying)
|
14
|
-
@underlying = UCS2BE_TO_UCS2BE_ICONV.iconv(underlying)
|
15
|
-
end
|
16
|
-
|
17
|
-
def [](index)
|
18
|
-
@underlying[index / 2, 2]
|
19
|
-
end
|
20
|
-
|
21
|
-
def gsub!(oregexp, subst)
|
22
|
-
oregexp.gsub!(@underlying, subst)
|
23
|
-
self
|
24
|
-
end
|
25
|
-
|
26
|
-
def map(&blk)
|
27
|
-
mapped = []
|
28
|
-
each_char do |char|
|
29
|
-
mapped << blk.call(char)
|
30
|
-
end
|
31
|
-
return UCS2String.new(mapped.join)
|
32
|
-
end
|
33
|
-
|
34
|
-
def hash
|
35
|
-
@underlying.hash
|
36
|
-
end
|
37
|
-
|
38
|
-
def <<(ucs2string)
|
39
|
-
case ucs2string
|
40
|
-
when UCS2String
|
41
|
-
@underlying += ucs2string.underlying
|
42
|
-
when String
|
43
|
-
@underlying += ucs2string
|
44
|
-
else
|
45
|
-
raise TypeError
|
46
|
-
end
|
47
|
-
self
|
48
|
-
end
|
49
|
-
|
50
|
-
def each_char(&blk)
|
51
|
-
(0..(@underlying.length - 2)).step(2) do |index|
|
52
|
-
blk.call(@underlying[index, 2])
|
53
|
-
end
|
54
|
-
end
|
55
|
-
alias :each :each_char
|
56
|
-
|
57
|
-
def eql?(other)
|
58
|
-
other.is_a?(UCS2String) && self.underlying.eql?(other.underlying)
|
59
|
-
end
|
60
|
-
|
61
|
-
def ==(other)
|
62
|
-
self.underlying == other.underlying
|
63
|
-
end
|
64
|
-
|
65
|
-
def size
|
66
|
-
@underlying.size / 2
|
67
|
-
end
|
68
|
-
alias :length :size
|
69
|
-
end
|
70
|
-
end
|