bimyou_segmenter 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bimyou_segmenter +6 -2
- data/lib/bimyou_segmenter/aozora_model.rb +11 -4
- data/lib/bimyou_segmenter/version.rb +1 -1
- data/test/bimyou_segmenter_test.rb +12 -0
- metadata +34 -49
data/bin/bimyou_segmenter
CHANGED
@@ -68,7 +68,9 @@ if (ARGV.size == 0)
|
|
68
68
|
s = NKF.nkf(nkf_option, s)
|
69
69
|
end
|
70
70
|
begin
|
71
|
-
|
71
|
+
unless (s.empty?)
|
72
|
+
puts s
|
73
|
+
end
|
72
74
|
rescue Errno::EPIPE
|
73
75
|
exit 0
|
74
76
|
end
|
@@ -84,7 +86,9 @@ else
|
|
84
86
|
if (nkf_option)
|
85
87
|
s = NKF.nkf(nkf_option, s)
|
86
88
|
end
|
87
|
-
|
89
|
+
unless (s.empty?)
|
90
|
+
puts s
|
91
|
+
end
|
88
92
|
end
|
89
93
|
end
|
90
94
|
end
|
@@ -16,6 +16,7 @@ module BimyouSegmenter
|
|
16
16
|
end
|
17
17
|
|
18
18
|
white_space = options[:white_space].nil? ? false : options[:white_space]
|
19
|
+
symbol = options[:symbol].nil? ? true : options[:symbol]
|
19
20
|
|
20
21
|
wakachi = []
|
21
22
|
text_chars = text.chars.to_a
|
@@ -155,17 +156,23 @@ module BimyouSegmenter
|
|
155
156
|
end
|
156
157
|
end
|
157
158
|
wakachi << word
|
158
|
-
|
159
|
-
wakachi
|
160
|
-
else
|
161
|
-
wakachi.reject{|v| v.match(WHITE_SPACE) }
|
159
|
+
unless (white_space)
|
160
|
+
wakachi = wakachi.reject{|v| v.match(WHITE_SPACE) }
|
162
161
|
end
|
162
|
+
unless (symbol)
|
163
|
+
wakachi = wakachi.reject{|v| v.match(SYMBOL) }
|
164
|
+
end
|
165
|
+
wakachi
|
163
166
|
end
|
164
167
|
|
165
168
|
private
|
166
169
|
KANJI = Regexp.new('[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
|
167
170
|
[0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
|
168
171
|
[0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']')
|
172
|
+
SYMBOL = Regexp.new('^[^々〇' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
|
173
|
+
[0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
|
174
|
+
[0x20000].pack('U') + '-' + [0x2FFFF].pack('U') +
|
175
|
+
'\s ぁ-ゞァ-ヾa-zA-Za-zA-Z0-90-9]+$')
|
169
176
|
WHITE_SPACE = /[\s ]/
|
170
177
|
|
171
178
|
def self.char_types(chars)
|
@@ -33,6 +33,18 @@ class BimyouSegmenterTest < Test::Unit::TestCase
|
|
33
33
|
assert_equal tokens[2], "\r"
|
34
34
|
assert_equal tokens[3], "\n"
|
35
35
|
assert_equal tokens[4], " "
|
36
|
+
|
37
|
+
tokens = BimyouSegmenter.segment("「私はトマトです。」", :symbol => false)
|
38
|
+
assert_equal tokens.size, 4
|
39
|
+
assert_equal tokens[0], "私"
|
40
|
+
assert_equal tokens[1], "は"
|
41
|
+
assert_equal tokens[2], "トマト"
|
42
|
+
assert_equal tokens[3], "です"
|
43
|
+
|
44
|
+
tokens = BimyouSegmenter.segment("hello. world? = !", :symbol => false)
|
45
|
+
assert_equal tokens.size, 2
|
46
|
+
assert_equal tokens[0], "hello"
|
47
|
+
assert_equal tokens[1], "world"
|
36
48
|
|
37
49
|
assert_equal BimyouSegmenter.segment("").size, 0
|
38
50
|
assert_equal BimyouSegmenter.segment(nil).size, 0
|
metadata
CHANGED
@@ -1,44 +1,35 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: bimyou_segmenter
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 1
|
7
|
-
- 1
|
8
|
-
- 1
|
9
|
-
version: 1.1.1
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.2.0
|
5
|
+
prerelease:
|
10
6
|
platform: ruby
|
11
|
-
authors:
|
7
|
+
authors:
|
12
8
|
- nagadomi
|
13
9
|
autorequire:
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dependencies:
|
20
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-06-01 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
21
15
|
name: test-unit
|
22
|
-
|
23
|
-
|
24
|
-
requirements:
|
25
|
-
- -
|
26
|
-
- !ruby/object:Gem::Version
|
27
|
-
|
28
|
-
- 0
|
29
|
-
version: "0"
|
16
|
+
requirement: &5715420 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
30
22
|
type: :development
|
31
|
-
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *5715420
|
32
25
|
description: Japanese Word Segmenter
|
33
|
-
email:
|
26
|
+
email:
|
34
27
|
- nagadomi@nurs.or.jp
|
35
|
-
executables:
|
28
|
+
executables:
|
36
29
|
- bimyou_segmenter
|
37
30
|
extensions: []
|
38
|
-
|
39
31
|
extra_rdoc_files: []
|
40
|
-
|
41
|
-
files:
|
32
|
+
files:
|
42
33
|
- .gitignore
|
43
34
|
- Changelog
|
44
35
|
- Gemfile
|
@@ -52,36 +43,30 @@ files:
|
|
52
43
|
- lib/bimyou_segmenter/version.rb
|
53
44
|
- test/bimyou_segmenter_test.rb
|
54
45
|
- test/test_helper.rb
|
55
|
-
has_rdoc: true
|
56
46
|
homepage: http://github.com/nagadomi/bimyou_segmenter
|
57
47
|
licenses: []
|
58
|
-
|
59
48
|
post_install_message:
|
60
49
|
rdoc_options: []
|
61
|
-
|
62
|
-
require_paths:
|
50
|
+
require_paths:
|
63
51
|
- lib
|
64
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
requirements:
|
73
|
-
- -
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
|
76
|
-
- 0
|
77
|
-
version: "0"
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ! '>='
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ! '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
78
64
|
requirements: []
|
79
|
-
|
80
65
|
rubyforge_project:
|
81
|
-
rubygems_version: 1.
|
66
|
+
rubygems_version: 1.8.10
|
82
67
|
signing_key:
|
83
68
|
specification_version: 3
|
84
69
|
summary: Japanese Word Segmenter
|
85
|
-
test_files:
|
70
|
+
test_files:
|
86
71
|
- test/bimyou_segmenter_test.rb
|
87
72
|
- test/test_helper.rb
|