tiny_segmenter 0.0.2 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +1 -2
- data/README.md +3 -1
- data/Rakefile +8 -0
- data/lib/tiny_segmenter.rb +6 -2
- data/lib/tiny_segmenter/version.rb +1 -1
- data/spec/tiny_segmenter_spec.rb +31 -10
- data/tiny_segmenter.gemspec +1 -1
- metadata +13 -17
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ed996e5195f9e29609e110d175c3d002875e5496
|
4
|
+
data.tar.gz: 00405bd85ee522fc3d0b7271cf46fe06bca298a8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e5e0455da693e7eba988f3a921d05c13db13756d9658b0481643963d35367cc207ea02a61f45e02be288ba0a4315b202fc95e6c3bbf88b2fb97b74df6afcd8bd
|
7
|
+
data.tar.gz: b3d581557d7097c012bb565ee736265892b686f0b7b4fa286c07409e785da2664cb46f4e7971417d530a277ed8cb302df632c91fe7b85d106ef0d2b20b01a24c
|
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text.
|
1
|
+
Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text. Ruby 1.9 or higher required.
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/6/tiny_segmenter.png?branch=master)](http://travis-ci.org/6/tiny_segmenter)
|
2
4
|
|
3
5
|
### Install
|
4
6
|
|
data/Rakefile
ADDED
data/lib/tiny_segmenter.rb
CHANGED
@@ -3,6 +3,9 @@ require "tiny_segmenter/version"
|
|
3
3
|
require "tiny_segmenter/segmentation_model"
|
4
4
|
|
5
5
|
class TinySegmenter
|
6
|
+
WhitespaceOnlyRegex = Regexp.compile("^[ ]+$")
|
7
|
+
PunctuationRegex = Regexp.compile("^[-–—―.。・()()[]{}{}【】⟨⟩、、,,،…‥〽「」『』〜~!!::??\"'|__“”‘’;/⁄/«»]+$")
|
8
|
+
|
6
9
|
def initialize
|
7
10
|
@chartype = []
|
8
11
|
@model = SegmentationModel.new
|
@@ -20,7 +23,7 @@ class TinySegmenter
|
|
20
23
|
end
|
21
24
|
end
|
22
25
|
|
23
|
-
def segment(text)
|
26
|
+
def segment(text, options = {})
|
24
27
|
return [] if text.nil? || text.strip.empty?
|
25
28
|
text = text.strip
|
26
29
|
result = []
|
@@ -28,7 +31,8 @@ class TinySegmenter
|
|
28
31
|
ctypes = %w[O O O]
|
29
32
|
text.split(//).each do |char|
|
30
33
|
char.strip!
|
31
|
-
next if char.empty?
|
34
|
+
next if char.empty? || char.match(WhitespaceOnlyRegex)
|
35
|
+
next if options[:ignore_punctuation] && char.match(PunctuationRegex)
|
32
36
|
segments << char
|
33
37
|
ctypes << ctype(char)
|
34
38
|
end
|
data/spec/tiny_segmenter_spec.rb
CHANGED
@@ -4,20 +4,41 @@ require 'spec_helper'
|
|
4
4
|
describe TinySegmenter do
|
5
5
|
subject{ TinySegmenter.new }
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
describe "#segment" do
|
8
|
+
it "tokenizes Japanese text fairly accurately" do
|
9
|
+
subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should == \
|
10
|
+
["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
|
11
|
+
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
13
|
+
it "removes any whitespace-only or empty tokens" do
|
14
|
+
subject.segment("書かれた 極めて コンパクト").should_not include("", " ", nil)
|
15
|
+
end
|
16
|
+
|
17
|
+
it "removes full-width space (U+3000) tokens" do
|
18
|
+
sentence = "すてき! 男性が歌う「夢やぶれて」もいいね。"
|
19
|
+
full_width_space = " "
|
20
|
+
sentence.should include(full_width_space)
|
21
|
+
subject.segment(sentence).should_not include (full_width_space)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "tokenizes interspersed non-Japanese words correctly" do
|
25
|
+
subject.segment("TinySegmenterはRubyだけで").should == ["TinySegmenter", "は", "Ruby", "だけ", "で"]
|
26
|
+
end
|
27
|
+
|
28
|
+
context "with ignore_punctuation option not set" do
|
29
|
+
it "includes punctuation-only tokens" do
|
30
|
+
subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...").should include("。", "!", "?", "、", "「", "」", "...")
|
31
|
+
end
|
32
|
+
end
|
15
33
|
|
16
|
-
|
17
|
-
|
34
|
+
context "with ignore_punctuation option set" do
|
35
|
+
it "removes all punctuation-only tokens" do
|
36
|
+
subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true).should_not include("。", "!", "?", "、", "「", "」", "...")
|
37
|
+
end
|
38
|
+
end
|
18
39
|
end
|
19
40
|
|
20
41
|
it "has a version" do
|
21
|
-
TinySegmenter::VERSION.
|
42
|
+
TinySegmenter::VERSION.should be_kind_of(String)
|
22
43
|
end
|
23
44
|
end
|
data/tiny_segmenter.gemspec
CHANGED
@@ -5,7 +5,7 @@ require 'tiny_segmenter/version'
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = 'tiny_segmenter'
|
7
7
|
s.version = TinySegmenter::VERSION
|
8
|
-
s.date = '
|
8
|
+
s.date = '2013-03-30'
|
9
9
|
s.summary = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
|
10
10
|
s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
|
11
11
|
s.authors = ["Peter Graham"]
|
metadata
CHANGED
@@ -1,46 +1,41 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiny_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.4
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Peter Graham
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2013-03-30 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: rake
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '0'
|
22
20
|
type: :development
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - '>='
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: rspec
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - '>='
|
36
32
|
- !ruby/object:Gem::Version
|
37
33
|
version: '0'
|
38
34
|
type: :development
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - '>='
|
44
39
|
- !ruby/object:Gem::Version
|
45
40
|
version: '0'
|
46
41
|
description: Ruby port of TinySegmenter.js for tokenizing Japanese text.
|
@@ -52,8 +47,10 @@ extra_rdoc_files: []
|
|
52
47
|
files:
|
53
48
|
- .gitignore
|
54
49
|
- .rspec
|
50
|
+
- .travis.yml
|
55
51
|
- Gemfile
|
56
52
|
- README.md
|
53
|
+
- Rakefile
|
57
54
|
- lib/tiny_segmenter.rb
|
58
55
|
- lib/tiny_segmenter/segmentation_model.rb
|
59
56
|
- lib/tiny_segmenter/version.rb
|
@@ -62,27 +59,26 @@ files:
|
|
62
59
|
- tiny_segmenter.gemspec
|
63
60
|
homepage: http://github.com/6/tiny_segmenter
|
64
61
|
licenses: []
|
62
|
+
metadata: {}
|
65
63
|
post_install_message:
|
66
64
|
rdoc_options: []
|
67
65
|
require_paths:
|
68
66
|
- lib
|
69
67
|
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
-
none: false
|
71
68
|
requirements:
|
72
|
-
- -
|
69
|
+
- - '>='
|
73
70
|
- !ruby/object:Gem::Version
|
74
71
|
version: '0'
|
75
72
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
-
none: false
|
77
73
|
requirements:
|
78
|
-
- -
|
74
|
+
- - '>='
|
79
75
|
- !ruby/object:Gem::Version
|
80
76
|
version: '0'
|
81
77
|
requirements: []
|
82
78
|
rubyforge_project:
|
83
|
-
rubygems_version:
|
79
|
+
rubygems_version: 2.0.0
|
84
80
|
signing_key:
|
85
|
-
specification_version:
|
81
|
+
specification_version: 4
|
86
82
|
summary: Ruby port of TinySegmenter.js for tokenizing Japanese text.
|
87
83
|
test_files:
|
88
84
|
- spec/spec_helper.rb
|