langusta 0.1.1 → 0.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require 'test/helper'
2
3
 
3
4
  class TagExtractorTest < Test::Unit::TestCase
@@ -6,22 +7,22 @@ class TagExtractorTest < Test::Unit::TestCase
6
7
  assert_nil(extractor.target)
7
8
  assert_equal(0, extractor.threshold)
8
9
 
9
- extractor2 = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
10
- assert_equal(UCS2String.from_utf8("abstract"), extractor2.target)
10
+ extractor2 = TagExtractor.new(str2cp("abstract"), 10)
11
+ assert_equal(str2cp("abstract"), extractor2.target)
11
12
  assert_equal(10, extractor2.threshold)
12
13
  end
13
14
 
14
15
  def test_set_tag
15
16
  extractor = TagExtractor.new(nil, 0)
16
- extractor.tag = UCS2String.from_utf8("")
17
- assert_equal(UCS2String.from_utf8(""), extractor.tag)
17
+ extractor.tag = str2cp("")
18
+ assert_equal(str2cp(""), extractor.tag)
18
19
  extractor.tag = nil
19
20
  assert_nil(extractor.tag)
20
21
  end
21
22
 
22
23
  def test_add
23
24
  extractor = TagExtractor.new(nil, 0)
24
- extractor.add(UCS2String.from_utf8(""))
25
+ extractor.add(str2cp(""))
25
26
  extractor.add(nil)
26
27
  end
27
28
 
@@ -32,13 +33,13 @@ class TagExtractorTest < Test::Unit::TestCase
32
33
  end
33
34
 
34
35
  def test_normal_scenario
35
- extractor = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
36
+ extractor = TagExtractor.new(str2cp("abstract"), 10)
36
37
  assert_equal(0, extractor.count)
37
38
 
38
39
  profile = LangProfile.new("en")
39
40
  # normal
40
- extractor.tag = UCS2String.from_utf8("abstract")
41
- extractor.add(UCS2String.from_utf8("This is a sample text."))
41
+ extractor.tag = str2cp("abstract")
42
+ extractor.add(str2cp("This is a sample text."))
42
43
  extractor.close_tag(profile)
43
44
  assert_equal(1, extractor.count)
44
45
  assert_equal(17, profile.n_words[0])
@@ -46,26 +47,26 @@ class TagExtractorTest < Test::Unit::TestCase
46
47
  assert_equal(17, profile.n_words[2])
47
48
 
48
49
  # too short
49
- extractor.tag = UCS2String.from_utf8("abstract")
50
- extractor.add(UCS2String.from_utf8("sample"))
50
+ extractor.tag = str2cp("abstract")
51
+ extractor.add(str2cp("sample"))
51
52
  extractor.close_tag(profile)
52
53
  assert_equal(1, extractor.count)
53
54
 
54
55
  # other tags
55
- extractor.tag = UCS2String.from_utf8("div")
56
- extractor.add(UCS2String.from_utf8("This is a sample text which is enough long."))
56
+ extractor.tag = str2cp("div")
57
+ extractor.add(str2cp("This is a sample text which is enough long."))
57
58
  extractor.close_tag(profile)
58
59
  assert_equal(1, extractor.count)
59
60
  end
60
61
 
61
62
  def test_clear
62
- extractor = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
63
- extractor.tag = UCS2String.from_utf8("abstract")
64
- extractor.add(UCS2String.from_utf8("This is a sample text."))
65
- assert_equal(UCS2String.from_utf8("This is a sample text."), extractor.buffer)
66
- assert_equal(UCS2String.from_utf8("abstract"), extractor.tag)
63
+ extractor = TagExtractor.new(str2cp("abstract"), 10)
64
+ extractor.tag = str2cp("abstract")
65
+ extractor.add(str2cp("This is a sample text."))
66
+ assert_equal(str2cp("This is a sample text."), extractor.buffer)
67
+ assert_equal(str2cp("abstract"), extractor.tag)
67
68
  extractor.clear
68
- assert_equal(UCS2String.from_utf8(""), extractor.buffer)
69
+ assert_equal(str2cp(""), extractor.buffer)
69
70
  assert_nil(extractor.tag)
70
71
  end
71
72
  end
@@ -1,8 +1,9 @@
1
+ # -*- coding: utf-8 -*-
1
2
  require 'test/helper'
2
3
 
3
4
  class UnicodeBlockTest < Test::Unit::TestCase
4
5
  def test_upper_case
5
- ["\x00\x47", "\x01\x10", "\x01\x64", "\x03\xd5", "\x04\xa2", "\x10\xc3", "\x21\x60", "\xa7\x60"].each do |cp|
6
+ [0x0047, 0x0110, 0x0164, 0x03d5, 0x04a2, 0x10c3, 0x2160, 0xa760].each do |cp|
6
7
  assert(Langusta::UnicodeBlock.is_upper_case?(cp))
7
8
  end
8
9
  end
metadata CHANGED
@@ -1,144 +1,64 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: langusta
3
- version: !ruby/object:Gem::Version
4
- hash: 25
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 1
9
- - 1
10
- version: 0.1.1
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Jan Szumiec
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2011-04-10 00:00:00 +02:00
19
- default_executable: langusta
20
- dependencies:
21
- - !ruby/object:Gem::Dependency
22
- type: :runtime
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - "="
27
- - !ruby/object:Gem::Version
28
- hash: 19
29
- segments:
30
- - 1
31
- - 1
32
- - 0
33
- version: 1.1.0
34
- name: oniguruma
35
- version_requirements: *id001
36
- prerelease: false
37
- - !ruby/object:Gem::Dependency
38
- type: :runtime
39
- requirement: &id002 !ruby/object:Gem::Requirement
12
+ date: 2012-03-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: yajl-ruby
16
+ requirement: &2152186680 !ruby/object:Gem::Requirement
40
17
  none: false
41
- requirements:
42
- - - "="
43
- - !ruby/object:Gem::Version
44
- hash: 59
45
- segments:
46
- - 0
47
- - 8
48
- - 2
18
+ requirements:
19
+ - - =
20
+ - !ruby/object:Gem::Version
49
21
  version: 0.8.2
50
- name: yajl-ruby
51
- version_requirements: *id002
22
+ type: :runtime
52
23
  prerelease: false
53
- - !ruby/object:Gem::Dependency
54
- type: :development
55
- requirement: &id003 !ruby/object:Gem::Requirement
56
- none: false
57
- requirements:
58
- - - ~>
59
- - !ruby/object:Gem::Version
60
- hash: 23
61
- segments:
62
- - 1
63
- - 0
64
- - 0
65
- version: 1.0.0
24
+ version_requirements: *2152186680
25
+ - !ruby/object:Gem::Dependency
66
26
  name: bundler
67
- version_requirements: *id003
68
- prerelease: false
69
- - !ruby/object:Gem::Dependency
70
- type: :development
71
- requirement: &id004 !ruby/object:Gem::Requirement
72
- none: false
73
- requirements:
74
- - - ~>
75
- - !ruby/object:Gem::Version
76
- hash: 7
77
- segments:
78
- - 1
79
- - 5
80
- - 2
81
- version: 1.5.2
82
- name: jeweler
83
- version_requirements: *id004
84
- prerelease: false
85
- - !ruby/object:Gem::Dependency
86
- type: :development
87
- requirement: &id005 !ruby/object:Gem::Requirement
27
+ requirement: &2152184160 !ruby/object:Gem::Requirement
88
28
  none: false
89
- requirements:
90
- - - ">="
91
- - !ruby/object:Gem::Version
92
- hash: 3
93
- segments:
94
- - 0
95
- version: "0"
96
- name: rcov
97
- version_requirements: *id005
98
- prerelease: false
99
- - !ruby/object:Gem::Dependency
100
- type: :development
101
- requirement: &id006 !ruby/object:Gem::Requirement
102
- none: false
103
- requirements:
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- hash: 3
107
- segments:
108
- - 0
109
- version: "0"
110
- name: mocha
111
- version_requirements: *id006
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
112
34
  prerelease: false
113
- - !ruby/object:Gem::Dependency
114
- type: :development
115
- requirement: &id007 !ruby/object:Gem::Requirement
35
+ version_requirements: *2152184160
36
+ - !ruby/object:Gem::Dependency
37
+ name: jeweler
38
+ requirement: &2152182040 !ruby/object:Gem::Requirement
116
39
  none: false
117
- requirements:
118
- - - ">="
119
- - !ruby/object:Gem::Version
120
- hash: 3
121
- segments:
122
- - 0
123
- version: "0"
124
- name: ruby-debug
125
- version_requirements: *id007
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
126
45
  prerelease: false
46
+ version_requirements: *2152182040
127
47
  description: Highly accurate language detection library, uses naive bayesian filter.
128
48
  email: jan.szumiec@gmail.com
129
- executables:
49
+ executables:
130
50
  - langusta
131
51
  extensions: []
132
-
133
- extra_rdoc_files:
52
+ extra_rdoc_files:
134
53
  - LICENSE.txt
135
- - README.rdoc
136
- files:
54
+ - README.md
55
+ files:
137
56
  - .document
57
+ - .travis.yml
138
58
  - Gemfile
139
59
  - Gemfile.lock
140
60
  - LICENSE.txt
141
- - README.rdoc
61
+ - README.md
142
62
  - Rakefile
143
63
  - VERSION
144
64
  - bin/langusta
@@ -146,9 +66,12 @@ files:
146
66
  - data/uppercase.bin
147
67
  - langusta.gemspec
148
68
  - lib/langusta.rb
69
+ - lib/langusta/codepoints.rb
149
70
  - lib/langusta/command.rb
150
71
  - lib/langusta/detector.rb
151
72
  - lib/langusta/detector_factory.rb
73
+ - lib/langusta/guard.rb
74
+ - lib/langusta/inspector.rb
152
75
  - lib/langusta/java_property_reader.rb
153
76
  - lib/langusta/lang_profile.rb
154
77
  - lib/langusta/language.rb
@@ -156,7 +79,6 @@ files:
156
79
  - lib/langusta/n_gram.rb
157
80
  - lib/langusta/regex_helper.rb
158
81
  - lib/langusta/tag_extractor.rb
159
- - lib/langusta/ucs2_string.rb
160
82
  - lib/langusta/unicode_block.rb
161
83
  - profiles/af
162
84
  - profiles/ar
@@ -267,54 +189,30 @@ files:
267
189
  - test/test_langusta.rb
268
190
  - test/test_n_gram.rb
269
191
  - test/test_tag_extractor.rb
270
- - test/test_ucs2_string.rb
271
192
  - test/test_unicode_block.rb
272
- has_rdoc: true
273
193
  homepage: http://github.com/jasiek/langusta
274
- licenses:
194
+ licenses:
275
195
  - Apache 2.0
276
196
  post_install_message:
277
197
  rdoc_options: []
278
-
279
- require_paths:
198
+ require_paths:
280
199
  - lib
281
- required_ruby_version: !ruby/object:Gem::Requirement
200
+ required_ruby_version: !ruby/object:Gem::Requirement
282
201
  none: false
283
- requirements:
284
- - - ">="
285
- - !ruby/object:Gem::Version
286
- hash: 3
287
- segments:
288
- - 0
289
- version: "0"
290
- required_rubygems_version: !ruby/object:Gem::Requirement
202
+ requirements:
203
+ - - ! '>='
204
+ - !ruby/object:Gem::Version
205
+ version: '0'
206
+ required_rubygems_version: !ruby/object:Gem::Requirement
291
207
  none: false
292
- requirements:
293
- - - ">="
294
- - !ruby/object:Gem::Version
295
- hash: 3
296
- segments:
297
- - 0
298
- version: "0"
208
+ requirements:
209
+ - - ! '>='
210
+ - !ruby/object:Gem::Version
211
+ version: '0'
299
212
  requirements: []
300
-
301
213
  rubyforge_project:
302
- rubygems_version: 1.5.1
214
+ rubygems_version: 1.8.17
303
215
  signing_key:
304
216
  specification_version: 3
305
217
  summary: Language detection library based on http://code.google.com/p/language-detection/.
306
- test_files:
307
- - test/helper.rb
308
- - test/quality/test_falsified.rb
309
- - test/test_command.rb
310
- - test/test_detector.rb
311
- - test/test_detector_factory.rb
312
- - test/test_java_property_reader.rb
313
- - test/test_lang_profile.rb
314
- - test/test_language.rb
315
- - test/test_language_detection_facade.rb
316
- - test/test_langusta.rb
317
- - test/test_n_gram.rb
318
- - test/test_tag_extractor.rb
319
- - test/test_ucs2_string.rb
320
- - test/test_unicode_block.rb
218
+ test_files: []
@@ -1,70 +0,0 @@
1
- module Langusta
2
- class UCS2String
3
- include Enumerable
4
- UTF8_TO_UCS2BE_ICONV = Iconv.new('ucs-2be', 'utf-8')
5
- UCS2BE_TO_UCS2BE_ICONV = Iconv.new('ucs-2be', 'ucs-2be')
6
-
7
- attr_reader :underlying
8
-
9
- def self.from_utf8(utf8_string)
10
- self.new(UTF8_TO_UCS2BE_ICONV.iconv(utf8_string))
11
- end
12
-
13
- def initialize(underlying)
14
- @underlying = UCS2BE_TO_UCS2BE_ICONV.iconv(underlying)
15
- end
16
-
17
- def [](index)
18
- @underlying[index / 2, 2]
19
- end
20
-
21
- def gsub!(oregexp, subst)
22
- oregexp.gsub!(@underlying, subst)
23
- self
24
- end
25
-
26
- def map(&blk)
27
- mapped = []
28
- each_char do |char|
29
- mapped << blk.call(char)
30
- end
31
- return UCS2String.new(mapped.join)
32
- end
33
-
34
- def hash
35
- @underlying.hash
36
- end
37
-
38
- def <<(ucs2string)
39
- case ucs2string
40
- when UCS2String
41
- @underlying += ucs2string.underlying
42
- when String
43
- @underlying += ucs2string
44
- else
45
- raise TypeError
46
- end
47
- self
48
- end
49
-
50
- def each_char(&blk)
51
- (0..(@underlying.length - 2)).step(2) do |index|
52
- blk.call(@underlying[index, 2])
53
- end
54
- end
55
- alias :each :each_char
56
-
57
- def eql?(other)
58
- other.is_a?(UCS2String) && self.underlying.eql?(other.underlying)
59
- end
60
-
61
- def ==(other)
62
- self.underlying == other.underlying
63
- end
64
-
65
- def size
66
- @underlying.size / 2
67
- end
68
- alias :length :size
69
- end
70
- end
@@ -1,9 +0,0 @@
1
- require 'test/helper'
2
-
3
- class UCS2StringTest < Test::Unit::TestCase
4
- def test_invalid_unicode_sequences_raise_an_error
5
- assert_raises(Iconv::IllegalSequence) do
6
- UCS2String.from_utf8("\xc0")
7
- end
8
- end
9
- end