greeb 0.2.2.pre1 → 0.2.2.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bff91e4559c7b7d1f83ef83212455d4b4b282351
4
- data.tar.gz: 867bb10aeb676a2608b701dd57c5a68b215eeaa9
3
+ metadata.gz: 921db15cfa86d9a5e6bf0e80058d349d708d1750
4
+ data.tar.gz: 994418e21e9e55ecb42b2329b59efce999281df9
5
5
  SHA512:
6
- metadata.gz: f116b3773ac59a02e17c6cece192cb5d8cc21ae976e2e78778752c602ef87b4e6cfe9c383e4b09150b11e9c4dbc08788b27a34834c15023d315b08d08e922d26
7
- data.tar.gz: 2532a596959945be6793265fc062f9c89d7676df2a28a535e1662b8bf4d848ac66f35e3bf299282dc53e3ab72ce9efe76382079ada9476856da14361ce19df53
6
+ metadata.gz: df570f263f22cfb2682ab39e758c6cb25a03c88d96bdcca15c79173d2dd937e28406a7baf90162882810973564d8dccb9aaf0962bb577c623ef1eceb3c6f56e4
7
+ data.tar.gz: 425859becdf31d2dc68552ba2f788675098a218063e627ec5f6b46c3a2a003fe685f75f03385289f482166c01e4e02ce5e7172ba7aa0289ae28f4fdc57645f4b
@@ -1,5 +1,4 @@
1
1
  language: ruby
2
- bundler_args: --without development
3
2
  rvm:
4
3
  - 2.0.0
5
4
  - jruby-19mode
data/Gemfile CHANGED
@@ -5,7 +5,7 @@ source 'https://rubygems.org'
5
5
  gemspec
6
6
 
7
7
  group :development do
8
- gem 'simplecov'
8
+ gem 'sdoc'
9
9
  end
10
10
 
11
11
  group :test do
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # Greeb
2
2
  Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
3
- that is based on regular expressions.
3
+ that is based on regular expressions. API documentation is available at
4
+ <https://dmchk.github.com/greeb>.
4
5
 
5
6
  ## Installation
6
7
  Add this line to your application's Gemfile:
@@ -175,11 +176,11 @@ There are several entity types at the tokenization stage: `:letter`,
175
176
  4. Push to the branch (`git push origin my-new-feature`);
176
177
  5. Create new Pull Request.
177
178
 
178
- ## Build Status [<img src="https://secure.travis-ci.org/ustalov/greeb.png"/>](http://travis-ci.org/ustalov/greeb)
179
+ ## Build Status [<img src="https://secure.travis-ci.org/dmchk/greeb.png"/>](http://travis-ci.org/dmchk/greeb)
179
180
 
180
- ## Dependency Status [<img src="https://gemnasium.com/ustalov/greeb.png"/>](https://gemnasium.com/ustalov/greeb)
181
+ ## Dependency Status [<img src="https://gemnasium.com/dmchk/greeb.png"/>](https://gemnasium.com/dmchk/greeb)
181
182
 
182
- ## Code Climate [<img src="https://codeclimate.com/github/ustalov/greeb.png"/>](https://codeclimate.com/github/ustalov/greeb)
183
+ ## Code Climate [<img src="https://codeclimate.com/github/dmchk/greeb.png"/>](https://codeclimate.com/github/dmchk/greeb)
183
184
 
184
185
  ## Copyright
185
186
 
data/bin/greeb CHANGED
@@ -9,5 +9,5 @@ require 'greeb'
9
9
  text = STDIN.read.tap(&:chomp!)
10
10
 
11
11
  Greeb[text].each do |entity|
12
- puts text[entity.from...entity.to] unless entity.type == :separ
12
+ puts text[entity.from...entity.to] unless [:space, :break].include? entity.type
13
13
  end
@@ -5,9 +5,9 @@
5
5
  #
6
6
  class Greeb::Segmentator
7
7
  # Sentence does not start from the separator character, line break
8
- # character, and punctuation characters.
8
+ # character, punctuation characters, and spaces.
9
9
  #
10
- SENTENCE_DOES_NOT_START = [:separ, :break, :punct, :spunct]
10
+ SENTENCE_AINT_START = [:separ, :break, :punct, :spunct, :space]
11
11
 
12
12
  attr_reader :tokens
13
13
 
@@ -64,7 +64,7 @@ class Greeb::Segmentator
64
64
  collection = []
65
65
 
66
66
  rest = tokens.inject(sample.dup) do |entity, token|
67
- next entity if sentence_does_not_start? entity, token
67
+ next entity if sentence_aint_start? entity, token
68
68
  entity.from = token.from unless entity.from
69
69
  next entity if entity.to and entity.to > token.to
70
70
 
@@ -72,7 +72,7 @@ class Greeb::Segmentator
72
72
  entity.to = find_forward(tokens, token).to
73
73
  collection << entity
74
74
  entity = sample.dup
75
- elsif :separ != token.type
75
+ elsif ![:separ, :space].include? token.type
76
76
  entity.to = token.to
77
77
  end
78
78
 
@@ -95,8 +95,8 @@ class Greeb::Segmentator
95
95
  #
96
96
  # @return true or false.
97
97
  #
98
- def sentence_does_not_start?(entity, token)
99
- !entity.from and SENTENCE_DOES_NOT_START.include? token.type
98
+ def sentence_aint_start?(entity, token)
99
+ !entity.from and SENTENCE_AINT_START.include? token.type
100
100
  end
101
101
 
102
102
  # Find a forwarding token that has another type.
@@ -31,7 +31,11 @@ module Greeb::Tokenizer
31
31
 
32
32
  # In-subsentence separator (e.g. "*" or "=").
33
33
  #
34
- SEPARATORS = /[ \p{Nl}\p{No}\p{Pd}\p{Pc}\p{Po}\p{Sm}\p{So}\p{Sc}\p{Z}]+/u
34
+ SEPARATORS = /[\p{Nl}\p{No}\p{Pd}\p{Pc}\p{Po}\p{Sm}\p{So}\p{Sc}\p{Zl}\p{Zp}]+/u
35
+
36
+ # Spaces (e.g. " " or &nbsp;).
37
+ #
38
+ SPACES = /[\p{Zs}]+/u
35
39
 
36
40
  # Line breaks.
37
41
  #
@@ -68,7 +72,7 @@ module Greeb::Tokenizer
68
72
  # @return [Array<String>] split characters.
69
73
  #
70
74
  def split(token)
71
- token.scan(/((.|\n)\2*)/).map(&:first)
75
+ token.scan(/((.|\n)\2*)/).map!(&:first)
72
76
  end
73
77
 
74
78
  protected
@@ -86,6 +90,7 @@ module Greeb::Tokenizer
86
90
  split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
87
91
  split_parse! scanner, tokens, PUNCTUATIONS, :punct or
88
92
  split_parse! scanner, tokens, SEPARATORS, :separ or
93
+ split_parse! scanner, tokens, SPACES, :space or
89
94
  split_parse! scanner, tokens, BREAKS, :break or
90
95
  parse! scanner, tokens, RESIDUALS, :residual
91
96
  end
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.2.pre1'
8
+ VERSION = '0.2.2.rc1'
9
9
  end
@@ -11,7 +11,7 @@ module Greeb
11
11
  it 'should tokenize text when input is given' do
12
12
  Greeb['Hello guys!'].must_equal(
13
13
  [Entity.new(0, 5, :letter),
14
- Entity.new(5, 6, :separ),
14
+ Entity.new(5, 6, :space),
15
15
  Entity.new(6, 10, :letter),
16
16
  Entity.new(10, 11, :punct)]
17
17
  )
@@ -20,9 +20,9 @@ module Greeb
20
20
  it 'should extract URLs' do
21
21
  Greeb['Hello http://nlpub.ru guys!'].must_equal(
22
22
  [Entity.new(0, 5, :letter),
23
- Entity.new(5, 6, :separ),
23
+ Entity.new(5, 6, :space),
24
24
  Entity.new(6, 21, :url),
25
- Entity.new(21, 22, :separ),
25
+ Entity.new(21, 22, :space),
26
26
  Entity.new(22, 26, :letter),
27
27
  Entity.new(26, 27, :punct)]
28
28
  )
@@ -31,9 +31,9 @@ module Greeb
31
31
  it 'should extract e-mails' do
32
32
  Greeb['Hello example@example.com guys!'].must_equal(
33
33
  [Entity.new(0, 5, :letter),
34
- Entity.new(5, 6, :separ),
34
+ Entity.new(5, 6, :space),
35
35
  Entity.new(6, 25, :email),
36
- Entity.new(25, 26, :separ),
36
+ Entity.new(25, 26, :space),
37
37
  Entity.new(26, 30, :letter),
38
38
  Entity.new(30, 31, :punct)]
39
39
  )
@@ -84,11 +84,11 @@ module Greeb
84
84
  ],
85
85
  Entity.new(7, 22, :sentence) => [
86
86
  Entity.new(7, 8, :letter),
87
- Entity.new(8, 9, :separ),
87
+ Entity.new(8, 9, :space),
88
88
  Entity.new(9, 11, :letter),
89
- Entity.new(11, 12, :separ),
89
+ Entity.new(11, 12, :space),
90
90
  Entity.new(12, 14, :letter),
91
- Entity.new(14, 15, :separ),
91
+ Entity.new(14, 15, :space),
92
92
  Entity.new(15, 21, :letter),
93
93
  Entity.new(21, 22, :punct)
94
94
  ]
@@ -6,13 +6,6 @@ gem 'minitest'
6
6
  require 'minitest/autorun'
7
7
  require 'minitest/hell'
8
8
 
9
- unless 'true' == ENV['TRAVIS']
10
- require 'simplecov'
11
- SimpleCov.start do
12
- add_filter '/spec/'
13
- end
14
- end
15
-
16
9
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
17
10
  require 'greeb'
18
11
 
@@ -35,11 +35,11 @@ module Greeb
35
35
  Tokenizer.tokenize('Hello, I am 18').must_equal(
36
36
  [Entity.new(0, 5, :letter),
37
37
  Entity.new(5, 6, :spunct),
38
- Entity.new(6, 7, :separ),
38
+ Entity.new(6, 7, :space),
39
39
  Entity.new(7, 8, :letter),
40
- Entity.new(8, 9, :separ),
40
+ Entity.new(8, 9, :space),
41
41
  Entity.new(9, 11, :letter),
42
- Entity.new(11, 12, :separ),
42
+ Entity.new(11, 12, :space),
43
43
  Entity.new(12, 14, :integer)]
44
44
  )
45
45
  end
@@ -67,13 +67,13 @@ module Greeb
67
67
  Tokenizer.tokenize('Братишка, я тебе покушать принёс!').must_equal(
68
68
  [Entity.new(0, 8, :letter),
69
69
  Entity.new(8, 9, :spunct),
70
- Entity.new(9, 10, :separ),
70
+ Entity.new(9, 10, :space),
71
71
  Entity.new(10, 11, :letter),
72
- Entity.new(11, 12, :separ),
72
+ Entity.new(11, 12, :space),
73
73
  Entity.new(12, 16, :letter),
74
- Entity.new(16, 17, :separ),
74
+ Entity.new(16, 17, :space),
75
75
  Entity.new(17, 25, :letter),
76
- Entity.new(25, 26, :separ),
76
+ Entity.new(25, 26, :space),
77
77
  Entity.new(26, 32, :letter),
78
78
  Entity.new(32, 33, :punct)]
79
79
  )
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2.pre1
4
+ version: 0.2.2.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-23 00:00:00.000000000 Z
11
+ date: 2013-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -34,9 +34,7 @@ extensions: []
34
34
  extra_rdoc_files: []
35
35
  files:
36
36
  - .gitignore
37
- - .rubocop.yml
38
37
  - .travis.yml
39
- - .yardopts
40
38
  - Gemfile
41
39
  - LICENSE
42
40
  - README.md
@@ -1,3 +0,0 @@
1
- # Don't use sprintf instead of %
2
- FavorSprintf:
3
- Enabled: false
data/.yardopts DELETED
@@ -1,6 +0,0 @@
1
- --protected
2
- --no-private
3
- -m markdown
4
- -
5
- README.md
6
- LICENSE