tiny_segmenter 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +1 -2
- data/README.md +3 -1
- data/Rakefile +8 -0
- data/lib/tiny_segmenter.rb +6 -2
- data/lib/tiny_segmenter/version.rb +1 -1
- data/spec/tiny_segmenter_spec.rb +31 -10
- data/tiny_segmenter.gemspec +1 -1
- metadata +13 -17
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ed996e5195f9e29609e110d175c3d002875e5496
|
4
|
+
data.tar.gz: 00405bd85ee522fc3d0b7271cf46fe06bca298a8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e5e0455da693e7eba988f3a921d05c13db13756d9658b0481643963d35367cc207ea02a61f45e02be288ba0a4315b202fc95e6c3bbf88b2fb97b74df6afcd8bd
|
7
|
+
data.tar.gz: b3d581557d7097c012bb565ee736265892b686f0b7b4fa286c07409e785da2664cb46f4e7971417d530a277ed8cb302df632c91fe7b85d106ef0d2b20b01a24c
|
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text.
|
1
|
+
Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text. Ruby 1.9 or higher required.
|
2
|
+
|
3
|
+
[Build Status](http://travis-ci.org/6/tiny_segmenter)
|
2
4
|
|
3
5
|
### Install
|
4
6
|
|
data/Rakefile
ADDED
data/lib/tiny_segmenter.rb
CHANGED
@@ -3,6 +3,9 @@ require "tiny_segmenter/version"
|
|
3
3
|
require "tiny_segmenter/segmentation_model"
|
4
4
|
|
5
5
|
class TinySegmenter
|
6
|
+
WhitespaceOnlyRegex = Regexp.compile("^[ ]+$")
|
7
|
+
PunctuationRegex = Regexp.compile("^[-–—―.。・()()[]{}{}【】⟨⟩、、,,،…‥〽「」『』〜~!!::??\"'|__“”‘’;/⁄/«»]+$")
|
8
|
+
|
6
9
|
def initialize
|
7
10
|
@chartype = []
|
8
11
|
@model = SegmentationModel.new
|
@@ -20,7 +23,7 @@ class TinySegmenter
|
|
20
23
|
end
|
21
24
|
end
|
22
25
|
|
23
|
-
def segment(text)
|
26
|
+
def segment(text, options = {})
|
24
27
|
return [] if text.nil? || text.strip.empty?
|
25
28
|
text = text.strip
|
26
29
|
result = []
|
@@ -28,7 +31,8 @@ class TinySegmenter
|
|
28
31
|
ctypes = %w[O O O]
|
29
32
|
text.split(//).each do |char|
|
30
33
|
char.strip!
|
31
|
-
next if char.empty?
|
34
|
+
next if char.empty? || char.match(WhitespaceOnlyRegex)
|
35
|
+
next if options[:ignore_punctuation] && char.match(PunctuationRegex)
|
32
36
|
segments << char
|
33
37
|
ctypes << ctype(char)
|
34
38
|
end
|
data/spec/tiny_segmenter_spec.rb
CHANGED
@@ -4,20 +4,41 @@ require 'spec_helper'
|
|
4
4
|
describe TinySegmenter do
|
5
5
|
subject{ TinySegmenter.new }
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
describe "#segment" do
|
8
|
+
it "tokenizes Japanese text fairly accurately" do
|
9
|
+
subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should == \
|
10
|
+
["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
|
11
|
+
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
13
|
+
it "removes any whitespace-only or empty tokens" do
|
14
|
+
subject.segment("書かれた 極めて コンパクト").should_not include("", " ", nil)
|
15
|
+
end
|
16
|
+
|
17
|
+
it "removes full-width space (U+3000) tokens" do
|
18
|
+
sentence = "すてき! 男性が歌う「夢やぶれて」もいいね。"
|
19
|
+
full_width_space = " "
|
20
|
+
sentence.should include(full_width_space)
|
21
|
+
subject.segment(sentence).should_not include (full_width_space)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "tokenizes interspersed non-Japanese words correctly" do
|
25
|
+
subject.segment("TinySegmenterはRubyだけで").should == ["TinySegmenter", "は", "Ruby", "だけ", "で"]
|
26
|
+
end
|
27
|
+
|
28
|
+
context "with ignore_punctuation option not set" do
|
29
|
+
it "includes punctuation-only tokens" do
|
30
|
+
subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...").should include("。", "!", "?", "、", "「", "」", "...")
|
31
|
+
end
|
32
|
+
end
|
15
33
|
|
16
|
-
|
17
|
-
|
34
|
+
context "with ignore_punctuation option set" do
|
35
|
+
it "removes all punctuation-only tokens" do
|
36
|
+
subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true).should_not include("。", "!", "?", "、", "「", "」", "...")
|
37
|
+
end
|
38
|
+
end
|
18
39
|
end
|
19
40
|
|
20
41
|
it "has a version" do
|
21
|
-
TinySegmenter::VERSION.
|
42
|
+
TinySegmenter::VERSION.should be_kind_of(String)
|
22
43
|
end
|
23
44
|
end
|
data/tiny_segmenter.gemspec
CHANGED
@@ -5,7 +5,7 @@ require 'tiny_segmenter/version'
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = 'tiny_segmenter'
|
7
7
|
s.version = TinySegmenter::VERSION
|
8
|
-
s.date = '
|
8
|
+
s.date = '2013-03-30'
|
9
9
|
s.summary = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
|
10
10
|
s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
|
11
11
|
s.authors = ["Peter Graham"]
|
metadata
CHANGED
@@ -1,46 +1,41 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiny_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.4
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Peter Graham
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2013-03-30 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rake
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '0'
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - '>='
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: rspec
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - '>='
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0'
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - '>='
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0'
|
46
41
|
description: Ruby port of TinySegmenter.js for tokenizing Japanese text.
|
@@ -52,8 +47,10 @@ extra_rdoc_files: []
|
|
52
47
|
files:
|
53
48
|
- .gitignore
|
54
49
|
- .rspec
|
50
|
+
- .travis.yml
|
55
51
|
- Gemfile
|
56
52
|
- README.md
|
53
|
+
- Rakefile
|
57
54
|
- lib/tiny_segmenter.rb
|
58
55
|
- lib/tiny_segmenter/segmentation_model.rb
|
59
56
|
- lib/tiny_segmenter/version.rb
|
@@ -62,27 +59,26 @@ files:
|
|
62
59
|
- tiny_segmenter.gemspec
|
63
60
|
homepage: http://github.com/6/tiny_segmenter
|
64
61
|
licenses: []
|
62
|
+
metadata: {}
|
65
63
|
post_install_message:
|
66
64
|
rdoc_options: []
|
67
65
|
require_paths:
|
68
66
|
- lib
|
69
67
|
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
-
none: false
|
71
68
|
requirements:
|
72
|
-
- -
|
69
|
+
- - '>='
|
73
70
|
- !ruby/object:Gem::Version
|
74
71
|
version: '0'
|
75
72
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
-
none: false
|
77
73
|
requirements:
|
78
|
-
- -
|
74
|
+
- - '>='
|
79
75
|
- !ruby/object:Gem::Version
|
80
76
|
version: '0'
|
81
77
|
requirements: []
|
82
78
|
rubyforge_project:
|
83
|
-
rubygems_version:
|
79
|
+
rubygems_version: 2.0.0
|
84
80
|
signing_key:
|
85
|
-
specification_version:
|
81
|
+
specification_version: 4
|
86
82
|
summary: Ruby port of TinySegmenter.js for tokenizing Japanese text.
|
87
83
|
test_files:
|
88
84
|
- spec/spec_helper.rb
|