tiny_segmenter 0.0.2 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ed996e5195f9e29609e110d175c3d002875e5496
4
+ data.tar.gz: 00405bd85ee522fc3d0b7271cf46fe06bca298a8
5
+ SHA512:
6
+ metadata.gz: e5e0455da693e7eba988f3a921d05c13db13756d9658b0481643963d35367cc207ea02a61f45e02be288ba0a4315b202fc95e6c3bbf88b2fb97b74df6afcd8bd
7
+ data.tar.gz: b3d581557d7097c012bb565ee736265892b686f0b7b4fa286c07409e785da2664cb46f4e7971417d530a277ed8cb302df632c91fe7b85d106ef0d2b20b01a24c
data/.gitignore CHANGED
@@ -1,2 +1,4 @@
1
1
  *.gem
2
2
  .DS_Store
3
+ Gemfile.lock
4
+ .rvmrc
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - 2.0.0
6
+ - jruby-19mode # JRuby in 1.9 mode
7
+ - jruby-20mode
data/Gemfile CHANGED
@@ -1,4 +1,3 @@
1
- source "http://rubygems.org"
1
+ source "https://rubygems.org"
2
2
 
3
- # Specify your gem's dependencies in method_decorators.gemspec
4
3
  gemspec
data/README.md CHANGED
@@ -1,4 +1,6 @@
1
- Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text.
1
+ Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text. Ruby 1.9 or higher required.
2
+
3
+ [![Build Status](https://secure.travis-ci.org/6/tiny_segmenter.png?branch=master)](http://travis-ci.org/6/tiny_segmenter)
2
4
 
3
5
  ### Install
4
6
 
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task :default => :spec
data/lib/tiny_segmenter.rb CHANGED
@@ -3,6 +3,9 @@ require "tiny_segmenter/version"
3
3
  require "tiny_segmenter/segmentation_model"
4
4
 
5
5
  class TinySegmenter
6
+ WhitespaceOnlyRegex = Regexp.compile("^[  ]+$")
7
+ PunctuationRegex = Regexp.compile("^[-–—―.。・()()[]{}{}【】⟨⟩、、,,،…‥〽「」『』〜~!!::??\"'|__“”‘’;/⁄/«»]+$")
8
+
6
9
  def initialize
7
10
  @chartype = []
8
11
  @model = SegmentationModel.new
@@ -20,7 +23,7 @@ class TinySegmenter
20
23
  end
21
24
  end
22
25
 
23
- def segment(text)
26
+ def segment(text, options = {})
24
27
  return [] if text.nil? || text.strip.empty?
25
28
  text = text.strip
26
29
  result = []
@@ -28,7 +31,8 @@ class TinySegmenter
28
31
  ctypes = %w[O O O]
29
32
  text.split(//).each do |char|
30
33
  char.strip!
31
- next if char.empty?
34
+ next if char.empty? || char.match(WhitespaceOnlyRegex)
35
+ next if options[:ignore_punctuation] && char.match(PunctuationRegex)
32
36
  segments << char
33
37
  ctypes << ctype(char)
34
38
  end
data/lib/tiny_segmenter/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class TinySegmenter
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -4,20 +4,41 @@ require 'spec_helper'
4
4
  describe TinySegmenter do
5
5
  subject{ TinySegmenter.new }
6
6
 
7
- it "tokenizes Japanese text fairly accurately" do
8
- subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should == \
9
- ["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
10
- end
7
+ describe "#segment" do
8
+ it "tokenizes Japanese text fairly accurately" do
9
+ subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should == \
10
+ ["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
11
+ end
11
12
 
12
- it "removes any whitespace-only or empty tokens" do
13
- subject.segment("書かれた 極めて コンパクト").should_not include("", " ")
14
- end
13
+ it "removes any whitespace-only or empty tokens" do
14
+ subject.segment("書かれた 極めて コンパクト").should_not include("", " ", nil)
15
+ end
16
+
17
+ it "removes full-width space (U+3000) tokens" do
18
+ sentence = "すてき! 男性が歌う「夢やぶれて」もいいね。"
19
+ full_width_space = " "
20
+ sentence.should include(full_width_space)
21
+ subject.segment(sentence).should_not include (full_width_space)
22
+ end
23
+
24
+ it "tokenizes interspersed non-Japanese words correctly" do
25
+ subject.segment("TinySegmenterはRubyだけで").should == ["TinySegmenter", "は", "Ruby", "だけ", "で"]
26
+ end
27
+
28
+ context "with ignore_punctuation option not set" do
29
+ it "includes punctuation-only tokens" do
30
+ subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...").should include("。", "!", "?", "、", "「", "」", "...")
31
+ end
32
+ end
15
33
 
16
- it "tokenizes interspersed non-Japanese words correctly" do
17
- subject.segment("TinySegmenterはRubyだけで").should == ["TinySegmenter", "は", "Ruby", "だけ", "で"]
34
+ context "with ignore_punctuation option set" do
35
+ it "removes all punctuation-only tokens" do
36
+ subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true).should_not include("。", "!", "?", "、", "「", "」", "...")
37
+ end
38
+ end
18
39
  end
19
40
 
20
41
  it "has a version" do
21
- TinySegmenter::VERSION.should_not be_empty
42
+ TinySegmenter::VERSION.should be_kind_of(String)
22
43
  end
23
44
  end
data/tiny_segmenter.gemspec CHANGED
@@ -5,7 +5,7 @@ require 'tiny_segmenter/version'
5
5
  Gem::Specification.new do |s|
6
6
  s.name = 'tiny_segmenter'
7
7
  s.version = TinySegmenter::VERSION
8
- s.date = '2012-08-27'
8
+ s.date = '2013-03-30'
9
9
  s.summary = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
10
10
  s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
11
11
  s.authors = ["Peter Graham"]
metadata CHANGED
@@ -1,46 +1,41 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiny_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
5
- prerelease:
4
+ version: 0.0.4
6
5
  platform: ruby
7
6
  authors:
8
7
  - Peter Graham
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-08-27 00:00:00.000000000 Z
11
+ date: 2013-03-30 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rake
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ! '>='
17
+ - - '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
22
20
  type: :development
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ! '>='
24
+ - - '>='
28
25
  - !ruby/object:Gem::Version
29
26
  version: '0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: rspec
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '>='
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '>='
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  description: Ruby port of TinySegmenter.js for tokenizing Japanese text.
@@ -52,8 +47,10 @@ extra_rdoc_files: []
52
47
  files:
53
48
  - .gitignore
54
49
  - .rspec
50
+ - .travis.yml
55
51
  - Gemfile
56
52
  - README.md
53
+ - Rakefile
57
54
  - lib/tiny_segmenter.rb
58
55
  - lib/tiny_segmenter/segmentation_model.rb
59
56
  - lib/tiny_segmenter/version.rb
@@ -62,27 +59,26 @@ files:
62
59
  - tiny_segmenter.gemspec
63
60
  homepage: http://github.com/6/tiny_segmenter
64
61
  licenses: []
62
+ metadata: {}
65
63
  post_install_message:
66
64
  rdoc_options: []
67
65
  require_paths:
68
66
  - lib
69
67
  required_ruby_version: !ruby/object:Gem::Requirement
70
- none: false
71
68
  requirements:
72
- - - ! '>='
69
+ - - '>='
73
70
  - !ruby/object:Gem::Version
74
71
  version: '0'
75
72
  required_rubygems_version: !ruby/object:Gem::Requirement
76
- none: false
77
73
  requirements:
78
- - - ! '>='
74
+ - - '>='
79
75
  - !ruby/object:Gem::Version
80
76
  version: '0'
81
77
  requirements: []
82
78
  rubyforge_project:
83
- rubygems_version: 1.8.21
79
+ rubygems_version: 2.0.0
84
80
  signing_key:
85
- specification_version: 3
81
+ specification_version: 4
86
82
  summary: Ruby port of TinySegmenter.js for tokenizing Japanese text.
87
83
  test_files:
88
84
  - spec/spec_helper.rb