tiny_segmenter 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ed996e5195f9e29609e110d175c3d002875e5496
4
+ data.tar.gz: 00405bd85ee522fc3d0b7271cf46fe06bca298a8
5
+ SHA512:
6
+ metadata.gz: e5e0455da693e7eba988f3a921d05c13db13756d9658b0481643963d35367cc207ea02a61f45e02be288ba0a4315b202fc95e6c3bbf88b2fb97b74df6afcd8bd
7
+ data.tar.gz: b3d581557d7097c012bb565ee736265892b686f0b7b4fa286c07409e785da2664cb46f4e7971417d530a277ed8cb302df632c91fe7b85d106ef0d2b20b01a24c
data/.gitignore CHANGED
@@ -1,2 +1,4 @@
1
1
  *.gem
2
2
  .DS_Store
3
+ Gemfile.lock
4
+ .rvmrc
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.2
4
+ - 1.9.3
5
+ - 2.0.0
6
+ - jruby-19mode # JRuby in 1.9 mode
7
+ - jruby-20mode
data/Gemfile CHANGED
@@ -1,4 +1,3 @@
1
- source "http://rubygems.org"
1
+ source "https://rubygems.org"
2
2
 
3
- # Specify your gem's dependencies in method_decorators.gemspec
4
3
  gemspec
data/README.md CHANGED
@@ -1,4 +1,6 @@
1
- Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text.
1
+ Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text. Ruby 1.9 or higher required.
2
+
3
+ [![Build Status](https://secure.travis-ci.org/6/tiny_segmenter.png?branch=master)](http://travis-ci.org/6/tiny_segmenter)
2
4
 
3
5
  ### Install
4
6
 
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task :default => :spec
data/lib/tiny_segmenter.rb CHANGED
@@ -3,6 +3,9 @@ require "tiny_segmenter/version"
3
3
  require "tiny_segmenter/segmentation_model"
4
4
 
5
5
  class TinySegmenter
6
+ WhitespaceOnlyRegex = Regexp.compile("^[  ]+$")
7
+ PunctuationRegex = Regexp.compile("^[-–—―.。・()()[]{}{}【】⟨⟩、、,,،…‥〽「」『』〜~!!::??\"'|__“”‘’;/⁄/«»]+$")
8
+
6
9
  def initialize
7
10
  @chartype = []
8
11
  @model = SegmentationModel.new
@@ -20,7 +23,7 @@ class TinySegmenter
20
23
  end
21
24
  end
22
25
 
23
- def segment(text)
26
+ def segment(text, options = {})
24
27
  return [] if text.nil? || text.strip.empty?
25
28
  text = text.strip
26
29
  result = []
@@ -28,7 +31,8 @@ class TinySegmenter
28
31
  ctypes = %w[O O O]
29
32
  text.split(//).each do |char|
30
33
  char.strip!
31
- next if char.empty?
34
+ next if char.empty? || char.match(WhitespaceOnlyRegex)
35
+ next if options[:ignore_punctuation] && char.match(PunctuationRegex)
32
36
  segments << char
33
37
  ctypes << ctype(char)
34
38
  end
data/lib/tiny_segmenter/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class TinySegmenter
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -4,20 +4,41 @@ require 'spec_helper'
4
4
  describe TinySegmenter do
5
5
  subject{ TinySegmenter.new }
6
6
 
7
- it "tokenizes Japanese text fairly accurately" do
8
- subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should == \
9
- ["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
10
- end
7
+ describe "#segment" do
8
+ it "tokenizes Japanese text fairly accurately" do
9
+ subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should == \
10
+ ["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
11
+ end
11
12
 
12
- it "removes any whitespace-only or empty tokens" do
13
- subject.segment("書かれた 極めて コンパクト").should_not include("", " ")
14
- end
13
+ it "removes any whitespace-only or empty tokens" do
14
+ subject.segment("書かれた 極めて コンパクト").should_not include("", " ", nil)
15
+ end
16
+
17
+ it "removes full-width space (U+3000) tokens" do
18
+ sentence = "すてき! 男性が歌う「夢やぶれて」もいいね。"
19
+ full_width_space = " "
20
+ sentence.should include(full_width_space)
21
+ subject.segment(sentence).should_not include (full_width_space)
22
+ end
23
+
24
+ it "tokenizes interspersed non-Japanese words correctly" do
25
+ subject.segment("TinySegmenterはRubyだけで").should == ["TinySegmenter", "は", "Ruby", "だけ", "で"]
26
+ end
27
+
28
+ context "with ignore_punctuation option not set" do
29
+ it "includes punctuation-only tokens" do
30
+ subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...").should include("。", "!", "?", "、", "「", "」", "...")
31
+ end
32
+ end
15
33
 
16
- it "tokenizes interspersed non-Japanese words correctly" do
17
- subject.segment("TinySegmenterはRubyだけで").should == ["TinySegmenter", "は", "Ruby", "だけ", "で"]
34
+ context "with ignore_punctuation option set" do
35
+ it "removes all punctuation-only tokens" do
36
+ subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true).should_not include("。", "!", "?", "、", "「", "」", "...")
37
+ end
38
+ end
18
39
  end
19
40
 
20
41
  it "has a version" do
21
- TinySegmenter::VERSION.should_not be_empty
42
+ TinySegmenter::VERSION.should be_kind_of(String)
22
43
  end
23
44
  end
data/tiny_segmenter.gemspec CHANGED
@@ -5,7 +5,7 @@ require 'tiny_segmenter/version'
5
5
  Gem::Specification.new do |s|
6
6
  s.name = 'tiny_segmenter'
7
7
  s.version = TinySegmenter::VERSION
8
- s.date = '2012-08-27'
8
+ s.date = '2013-03-30'
9
9
  s.summary = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
10
10
  s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
11
11
  s.authors = ["Peter Graham"]
metadata CHANGED
@@ -1,46 +1,41 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiny_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
5
- prerelease:
4
+ version: 0.0.4
6
5
  platform: ruby
7
6
  authors:
8
7
  - Peter Graham
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-08-27 00:00:00.000000000 Z
11
+ date: 2013-03-30 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rake
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ! '>='
17
+ - - '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
22
20
  type: :development
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ! '>='
24
+ - - '>='
28
25
  - !ruby/object:Gem::Version
29
26
  version: '0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: rspec
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '>='
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '>='
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  description: Ruby port of TinySegmenter.js for tokenizing Japanese text.
@@ -52,8 +47,10 @@ extra_rdoc_files: []
52
47
  files:
53
48
  - .gitignore
54
49
  - .rspec
50
+ - .travis.yml
55
51
  - Gemfile
56
52
  - README.md
53
+ - Rakefile
57
54
  - lib/tiny_segmenter.rb
58
55
  - lib/tiny_segmenter/segmentation_model.rb
59
56
  - lib/tiny_segmenter/version.rb
@@ -62,27 +59,26 @@ files:
62
59
  - tiny_segmenter.gemspec
63
60
  homepage: http://github.com/6/tiny_segmenter
64
61
  licenses: []
62
+ metadata: {}
65
63
  post_install_message:
66
64
  rdoc_options: []
67
65
  require_paths:
68
66
  - lib
69
67
  required_ruby_version: !ruby/object:Gem::Requirement
70
- none: false
71
68
  requirements:
72
- - - ! '>='
69
+ - - '>='
73
70
  - !ruby/object:Gem::Version
74
71
  version: '0'
75
72
  required_rubygems_version: !ruby/object:Gem::Requirement
76
- none: false
77
73
  requirements:
78
- - - ! '>='
74
+ - - '>='
79
75
  - !ruby/object:Gem::Version
80
76
  version: '0'
81
77
  requirements: []
82
78
  rubyforge_project:
83
- rubygems_version: 1.8.21
79
+ rubygems_version: 2.0.0
84
80
  signing_key:
85
- specification_version: 3
81
+ specification_version: 4
86
82
  summary: Ruby port of TinySegmenter.js for tokenizing Japanese text.
87
83
  test_files:
88
84
  - spec/spec_helper.rb