langusta 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require 'test/helper'
2
3
 
3
4
  class TagExtractorTest < Test::Unit::TestCase
@@ -6,22 +7,22 @@ class TagExtractorTest < Test::Unit::TestCase
6
7
  assert_nil(extractor.target)
7
8
  assert_equal(0, extractor.threshold)
8
9
 
9
- extractor2 = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
10
- assert_equal(UCS2String.from_utf8("abstract"), extractor2.target)
10
+ extractor2 = TagExtractor.new(str2cp("abstract"), 10)
11
+ assert_equal(str2cp("abstract"), extractor2.target)
11
12
  assert_equal(10, extractor2.threshold)
12
13
  end
13
14
 
14
15
  def test_set_tag
15
16
  extractor = TagExtractor.new(nil, 0)
16
- extractor.tag = UCS2String.from_utf8("")
17
- assert_equal(UCS2String.from_utf8(""), extractor.tag)
17
+ extractor.tag = str2cp("")
18
+ assert_equal(str2cp(""), extractor.tag)
18
19
  extractor.tag = nil
19
20
  assert_nil(extractor.tag)
20
21
  end
21
22
 
22
23
  def test_add
23
24
  extractor = TagExtractor.new(nil, 0)
24
- extractor.add(UCS2String.from_utf8(""))
25
+ extractor.add(str2cp(""))
25
26
  extractor.add(nil)
26
27
  end
27
28
 
@@ -32,13 +33,13 @@ class TagExtractorTest < Test::Unit::TestCase
32
33
  end
33
34
 
34
35
  def test_normal_scenario
35
- extractor = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
36
+ extractor = TagExtractor.new(str2cp("abstract"), 10)
36
37
  assert_equal(0, extractor.count)
37
38
 
38
39
  profile = LangProfile.new("en")
39
40
  # normal
40
- extractor.tag = UCS2String.from_utf8("abstract")
41
- extractor.add(UCS2String.from_utf8("This is a sample text."))
41
+ extractor.tag = str2cp("abstract")
42
+ extractor.add(str2cp("This is a sample text."))
42
43
  extractor.close_tag(profile)
43
44
  assert_equal(1, extractor.count)
44
45
  assert_equal(17, profile.n_words[0])
@@ -46,26 +47,26 @@ class TagExtractorTest < Test::Unit::TestCase
46
47
  assert_equal(17, profile.n_words[2])
47
48
 
48
49
  # too short
49
- extractor.tag = UCS2String.from_utf8("abstract")
50
- extractor.add(UCS2String.from_utf8("sample"))
50
+ extractor.tag = str2cp("abstract")
51
+ extractor.add(str2cp("sample"))
51
52
  extractor.close_tag(profile)
52
53
  assert_equal(1, extractor.count)
53
54
 
54
55
  # other tags
55
- extractor.tag = UCS2String.from_utf8("div")
56
- extractor.add(UCS2String.from_utf8("This is a sample text which is enough long."))
56
+ extractor.tag = str2cp("div")
57
+ extractor.add(str2cp("This is a sample text which is enough long."))
57
58
  extractor.close_tag(profile)
58
59
  assert_equal(1, extractor.count)
59
60
  end
60
61
 
61
62
  def test_clear
62
- extractor = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
63
- extractor.tag = UCS2String.from_utf8("abstract")
64
- extractor.add(UCS2String.from_utf8("This is a sample text."))
65
- assert_equal(UCS2String.from_utf8("This is a sample text."), extractor.buffer)
66
- assert_equal(UCS2String.from_utf8("abstract"), extractor.tag)
63
+ extractor = TagExtractor.new(str2cp("abstract"), 10)
64
+ extractor.tag = str2cp("abstract")
65
+ extractor.add(str2cp("This is a sample text."))
66
+ assert_equal(str2cp("This is a sample text."), extractor.buffer)
67
+ assert_equal(str2cp("abstract"), extractor.tag)
67
68
  extractor.clear
68
- assert_equal(UCS2String.from_utf8(""), extractor.buffer)
69
+ assert_equal(str2cp(""), extractor.buffer)
69
70
  assert_nil(extractor.tag)
70
71
  end
71
72
  end
@@ -1,8 +1,9 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require 'test/helper'
2
3
 
3
4
  class UnicodeBlockTest < Test::Unit::TestCase
4
5
  def test_upper_case
5
- ["\x00\x47", "\x01\x10", "\x01\x64", "\x03\xd5", "\x04\xa2", "\x10\xc3", "\x21\x60", "\xa7\x60"].each do |cp|
6
+ [0x0047, 0x0110, 0x0164, 0x03d5, 0x04a2, 0x10c3, 0x2160, 0xa760].each do |cp|
6
7
  assert(Langusta::UnicodeBlock.is_upper_case?(cp))
7
8
  end
8
9
  end
metadata CHANGED
@@ -1,144 +1,64 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: langusta
3
- version: !ruby/object:Gem::Version
4
- hash: 25
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 1
9
- - 1
10
- version: 0.1.1
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Jan Szumiec
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2011-04-10 00:00:00 +02:00
19
- default_executable: langusta
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
22
- type: :runtime
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - "="
27
- - !ruby/object:Gem::Version
28
- hash: 19
29
- segments:
30
- - 1
31
- - 1
32
- - 0
33
- version: 1.1.0
34
- name: oniguruma
35
- version_requirements: *id001
36
- prerelease: false
37
- - !ruby/object:Gem::Dependency
38
- type: :runtime
39
- requirement: &id002 !ruby/object:Gem::Requirement
12
+ date: 2012-03-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: yajl-ruby
16
+ requirement: &2152186680 !ruby/object:Gem::Requirement
40
17
  none: false
41
- requirements:
42
- - - "="
43
- - !ruby/object:Gem::Version
44
- hash: 59
45
- segments:
46
- - 0
47
- - 8
48
- - 2
18
+ requirements:
19
+ - - =
20
+ - !ruby/object:Gem::Version
49
21
  version: 0.8.2
50
- name: yajl-ruby
51
- version_requirements: *id002
22
+ type: :runtime
52
23
  prerelease: false
53
- - !ruby/object:Gem::Dependency
54
- type: :development
55
- requirement: &id003 !ruby/object:Gem::Requirement
56
- none: false
57
- requirements:
58
- - - ~>
59
- - !ruby/object:Gem::Version
60
- hash: 23
61
- segments:
62
- - 1
63
- - 0
64
- - 0
65
- version: 1.0.0
24
+ version_requirements: *2152186680
25
+ - !ruby/object:Gem::Dependency
66
26
  name: bundler
67
- version_requirements: *id003
68
- prerelease: false
69
- - !ruby/object:Gem::Dependency
70
- type: :development
71
- requirement: &id004 !ruby/object:Gem::Requirement
72
- none: false
73
- requirements:
74
- - - ~>
75
- - !ruby/object:Gem::Version
76
- hash: 7
77
- segments:
78
- - 1
79
- - 5
80
- - 2
81
- version: 1.5.2
82
- name: jeweler
83
- version_requirements: *id004
84
- prerelease: false
85
- - !ruby/object:Gem::Dependency
86
- type: :development
87
- requirement: &id005 !ruby/object:Gem::Requirement
27
+ requirement: &2152184160 !ruby/object:Gem::Requirement
88
28
  none: false
89
- requirements:
90
- - - ">="
91
- - !ruby/object:Gem::Version
92
- hash: 3
93
- segments:
94
- - 0
95
- version: "0"
96
- name: rcov
97
- version_requirements: *id005
98
- prerelease: false
99
- - !ruby/object:Gem::Dependency
100
- type: :development
101
- requirement: &id006 !ruby/object:Gem::Requirement
102
- none: false
103
- requirements:
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- hash: 3
107
- segments:
108
- - 0
109
- version: "0"
110
- name: mocha
111
- version_requirements: *id006
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
112
34
  prerelease: false
113
- - !ruby/object:Gem::Dependency
114
- type: :development
115
- requirement: &id007 !ruby/object:Gem::Requirement
35
+ version_requirements: *2152184160
36
+ - !ruby/object:Gem::Dependency
37
+ name: jeweler
38
+ requirement: &2152182040 !ruby/object:Gem::Requirement
116
39
  none: false
117
- requirements:
118
- - - ">="
119
- - !ruby/object:Gem::Version
120
- hash: 3
121
- segments:
122
- - 0
123
- version: "0"
124
- name: ruby-debug
125
- version_requirements: *id007
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
126
45
  prerelease: false
46
+ version_requirements: *2152182040
127
47
  description: Highly accurate language detection library, uses naive bayesian filter.
128
48
  email: jan.szumiec@gmail.com
129
- executables:
49
+ executables:
130
50
  - langusta
131
51
  extensions: []
132
-
133
- extra_rdoc_files:
52
+ extra_rdoc_files:
134
53
  - LICENSE.txt
135
- - README.rdoc
136
- files:
54
+ - README.md
55
+ files:
137
56
  - .document
57
+ - .travis.yml
138
58
  - Gemfile
139
59
  - Gemfile.lock
140
60
  - LICENSE.txt
141
- - README.rdoc
61
+ - README.md
142
62
  - Rakefile
143
63
  - VERSION
144
64
  - bin/langusta
@@ -146,9 +66,12 @@ files:
146
66
  - data/uppercase.bin
147
67
  - langusta.gemspec
148
68
  - lib/langusta.rb
69
+ - lib/langusta/codepoints.rb
149
70
  - lib/langusta/command.rb
150
71
  - lib/langusta/detector.rb
151
72
  - lib/langusta/detector_factory.rb
73
+ - lib/langusta/guard.rb
74
+ - lib/langusta/inspector.rb
152
75
  - lib/langusta/java_property_reader.rb
153
76
  - lib/langusta/lang_profile.rb
154
77
  - lib/langusta/language.rb
@@ -156,7 +79,6 @@ files:
156
79
  - lib/langusta/n_gram.rb
157
80
  - lib/langusta/regex_helper.rb
158
81
  - lib/langusta/tag_extractor.rb
159
- - lib/langusta/ucs2_string.rb
160
82
  - lib/langusta/unicode_block.rb
161
83
  - profiles/af
162
84
  - profiles/ar
@@ -267,54 +189,30 @@ files:
267
189
  - test/test_langusta.rb
268
190
  - test/test_n_gram.rb
269
191
  - test/test_tag_extractor.rb
270
- - test/test_ucs2_string.rb
271
192
  - test/test_unicode_block.rb
272
- has_rdoc: true
273
193
  homepage: http://github.com/jasiek/langusta
274
- licenses:
194
+ licenses:
275
195
  - Apache 2.0
276
196
  post_install_message:
277
197
  rdoc_options: []
278
-
279
- require_paths:
198
+ require_paths:
280
199
  - lib
281
- required_ruby_version: !ruby/object:Gem::Requirement
200
+ required_ruby_version: !ruby/object:Gem::Requirement
282
201
  none: false
283
- requirements:
284
- - - ">="
285
- - !ruby/object:Gem::Version
286
- hash: 3
287
- segments:
288
- - 0
289
- version: "0"
290
- required_rubygems_version: !ruby/object:Gem::Requirement
202
+ requirements:
203
+ - - ! '>='
204
+ - !ruby/object:Gem::Version
205
+ version: '0'
206
+ required_rubygems_version: !ruby/object:Gem::Requirement
291
207
  none: false
292
- requirements:
293
- - - ">="
294
- - !ruby/object:Gem::Version
295
- hash: 3
296
- segments:
297
- - 0
298
- version: "0"
208
+ requirements:
209
+ - - ! '>='
210
+ - !ruby/object:Gem::Version
211
+ version: '0'
299
212
  requirements: []
300
-
301
213
  rubyforge_project:
302
- rubygems_version: 1.5.1
214
+ rubygems_version: 1.8.17
303
215
  signing_key:
304
216
  specification_version: 3
305
217
  summary: Language detection library based on http://code.google.com/p/language-detection/.
306
- test_files:
307
- - test/helper.rb
308
- - test/quality/test_falsified.rb
309
- - test/test_command.rb
310
- - test/test_detector.rb
311
- - test/test_detector_factory.rb
312
- - test/test_java_property_reader.rb
313
- - test/test_lang_profile.rb
314
- - test/test_language.rb
315
- - test/test_language_detection_facade.rb
316
- - test/test_langusta.rb
317
- - test/test_n_gram.rb
318
- - test/test_tag_extractor.rb
319
- - test/test_ucs2_string.rb
320
- - test/test_unicode_block.rb
218
+ test_files: []
@@ -1,70 +0,0 @@
1
- module Langusta
2
- class UCS2String
3
- include Enumerable
4
- UTF8_TO_UCS2BE_ICONV = Iconv.new('ucs-2be', 'utf-8')
5
- UCS2BE_TO_UCS2BE_ICONV = Iconv.new('ucs-2be', 'ucs-2be')
6
-
7
- attr_reader :underlying
8
-
9
- def self.from_utf8(utf8_string)
10
- self.new(UTF8_TO_UCS2BE_ICONV.iconv(utf8_string))
11
- end
12
-
13
- def initialize(underlying)
14
- @underlying = UCS2BE_TO_UCS2BE_ICONV.iconv(underlying)
15
- end
16
-
17
- def [](index)
18
- @underlying[index / 2, 2]
19
- end
20
-
21
- def gsub!(oregexp, subst)
22
- oregexp.gsub!(@underlying, subst)
23
- self
24
- end
25
-
26
- def map(&blk)
27
- mapped = []
28
- each_char do |char|
29
- mapped << blk.call(char)
30
- end
31
- return UCS2String.new(mapped.join)
32
- end
33
-
34
- def hash
35
- @underlying.hash
36
- end
37
-
38
- def <<(ucs2string)
39
- case ucs2string
40
- when UCS2String
41
- @underlying += ucs2string.underlying
42
- when String
43
- @underlying += ucs2string
44
- else
45
- raise TypeError
46
- end
47
- self
48
- end
49
-
50
- def each_char(&blk)
51
- (0..(@underlying.length - 2)).step(2) do |index|
52
- blk.call(@underlying[index, 2])
53
- end
54
- end
55
- alias :each :each_char
56
-
57
- def eql?(other)
58
- other.is_a?(UCS2String) && self.underlying.eql?(other.underlying)
59
- end
60
-
61
- def ==(other)
62
- self.underlying == other.underlying
63
- end
64
-
65
- def size
66
- @underlying.size / 2
67
- end
68
- alias :length :size
69
- end
70
- end
@@ -1,9 +0,0 @@
1
- require 'test/helper'
2
-
3
- class UCS2StringTest < Test::Unit::TestCase
4
- def test_invalid_unicode_sequences_raise_an_error
5
- assert_raises(Iconv::IllegalSequence) do
6
- UCS2String.from_utf8("\xc0")
7
- end
8
- end
9
- end