greeb 0.2.2.pre1 → 0.2.2.rc1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bff91e4559c7b7d1f83ef83212455d4b4b282351
4
- data.tar.gz: 867bb10aeb676a2608b701dd57c5a68b215eeaa9
3
+ metadata.gz: 921db15cfa86d9a5e6bf0e80058d349d708d1750
4
+ data.tar.gz: 994418e21e9e55ecb42b2329b59efce999281df9
5
5
  SHA512:
6
- metadata.gz: f116b3773ac59a02e17c6cece192cb5d8cc21ae976e2e78778752c602ef87b4e6cfe9c383e4b09150b11e9c4dbc08788b27a34834c15023d315b08d08e922d26
7
- data.tar.gz: 2532a596959945be6793265fc062f9c89d7676df2a28a535e1662b8bf4d848ac66f35e3bf299282dc53e3ab72ce9efe76382079ada9476856da14361ce19df53
6
+ metadata.gz: df570f263f22cfb2682ab39e758c6cb25a03c88d96bdcca15c79173d2dd937e28406a7baf90162882810973564d8dccb9aaf0962bb577c623ef1eceb3c6f56e4
7
+ data.tar.gz: 425859becdf31d2dc68552ba2f788675098a218063e627ec5f6b46c3a2a003fe685f75f03385289f482166c01e4e02ce5e7172ba7aa0289ae28f4fdc57645f4b
@@ -1,5 +1,4 @@
1
1
  language: ruby
2
- bundler_args: --without development
3
2
  rvm:
4
3
  - 2.0.0
5
4
  - jruby-19mode
data/Gemfile CHANGED
@@ -5,7 +5,7 @@ source 'https://rubygems.org'
5
5
  gemspec
6
6
 
7
7
  group :development do
8
- gem 'simplecov'
8
+ gem 'sdoc'
9
9
  end
10
10
 
11
11
  group :test do
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # Greeb
2
2
  Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
3
- that is based on regular expressions.
3
+ that is based on regular expressions. API documentation is available at
4
+ <https://dmchk.github.com/greeb>.
4
5
 
5
6
  ## Installation
6
7
  Add this line to your application's Gemfile:
@@ -175,11 +176,11 @@ There are several entity types at the tokenization stage: `:letter`,
175
176
  4. Push to the branch (`git push origin my-new-feature`);
176
177
  5. Create new Pull Request.
177
178
 
178
- ## Build Status [<img src="https://secure.travis-ci.org/ustalov/greeb.png"/>](http://travis-ci.org/ustalov/greeb)
179
+ ## Build Status [<img src="https://secure.travis-ci.org/dmchk/greeb.png"/>](http://travis-ci.org/dmchk/greeb)
179
180
 
180
- ## Dependency Status [<img src="https://gemnasium.com/ustalov/greeb.png"/>](https://gemnasium.com/ustalov/greeb)
181
+ ## Dependency Status [<img src="https://gemnasium.com/dmchk/greeb.png"/>](https://gemnasium.com/dmchk/greeb)
181
182
 
182
- ## Code Climate [<img src="https://codeclimate.com/github/ustalov/greeb.png"/>](https://codeclimate.com/github/ustalov/greeb)
183
+ ## Code Climate [<img src="https://codeclimate.com/github/dmchk/greeb.png"/>](https://codeclimate.com/github/dmchk/greeb)
183
184
 
184
185
  ## Copyright
185
186
 
data/bin/greeb CHANGED
@@ -9,5 +9,5 @@ require 'greeb'
9
9
  text = STDIN.read.tap(&:chomp!)
10
10
 
11
11
  Greeb[text].each do |entity|
12
- puts text[entity.from...entity.to] unless entity.type == :separ
12
+ puts text[entity.from...entity.to] unless [:space, :break].include? entity.type
13
13
  end
@@ -5,9 +5,9 @@
5
5
  #
6
6
  class Greeb::Segmentator
7
7
  # Sentence does not start from the separator character, line break
8
- # character, and punctuation characters.
8
+ # character, punctuation characters, and spaces.
9
9
  #
10
- SENTENCE_DOES_NOT_START = [:separ, :break, :punct, :spunct]
10
+ SENTENCE_AINT_START = [:separ, :break, :punct, :spunct, :space]
11
11
 
12
12
  attr_reader :tokens
13
13
 
@@ -64,7 +64,7 @@ class Greeb::Segmentator
64
64
  collection = []
65
65
 
66
66
  rest = tokens.inject(sample.dup) do |entity, token|
67
- next entity if sentence_does_not_start? entity, token
67
+ next entity if sentence_aint_start? entity, token
68
68
  entity.from = token.from unless entity.from
69
69
  next entity if entity.to and entity.to > token.to
70
70
 
@@ -72,7 +72,7 @@ class Greeb::Segmentator
72
72
  entity.to = find_forward(tokens, token).to
73
73
  collection << entity
74
74
  entity = sample.dup
75
- elsif :separ != token.type
75
+ elsif ![:separ, :space].include? token.type
76
76
  entity.to = token.to
77
77
  end
78
78
 
@@ -95,8 +95,8 @@ class Greeb::Segmentator
95
95
  #
96
96
  # @return true or false.
97
97
  #
98
- def sentence_does_not_start?(entity, token)
99
- !entity.from and SENTENCE_DOES_NOT_START.include? token.type
98
+ def sentence_aint_start?(entity, token)
99
+ !entity.from and SENTENCE_AINT_START.include? token.type
100
100
  end
101
101
 
102
102
  # Find a forwarding token that has another type.
@@ -31,7 +31,11 @@ module Greeb::Tokenizer
31
31
 
32
32
  # In-subsentence separator (i.e.: "*" or "=").
33
33
  #
34
- SEPARATORS = /[ \p{Nl}\p{No}\p{Pd}\p{Pc}\p{Po}\p{Sm}\p{So}\p{Sc}\p{Z}]+/u
34
+ SEPARATORS = /[\p{Nl}\p{No}\p{Pd}\p{Pc}\p{Po}\p{Sm}\p{So}\p{Sc}\p{Zl}\p{Zp}]+/u
35
+
36
+ # Spaces (i.e.: " " or &nbsp;).
37
+ #
38
+ SPACES = /[\p{Zs}]+/u
35
39
 
36
40
  # Line breaks.
37
41
  #
@@ -68,7 +72,7 @@ module Greeb::Tokenizer
68
72
  # @return [Array<String>] split characters.
69
73
  #
70
74
  def split(token)
71
- token.scan(/((.|\n)\2*)/).map(&:first)
75
+ token.scan(/((.|\n)\2*)/).map!(&:first)
72
76
  end
73
77
 
74
78
  protected
@@ -86,6 +90,7 @@ module Greeb::Tokenizer
86
90
  split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
87
91
  split_parse! scanner, tokens, PUNCTUATIONS, :punct or
88
92
  split_parse! scanner, tokens, SEPARATORS, :separ or
93
+ split_parse! scanner, tokens, SPACES, :space or
89
94
  split_parse! scanner, tokens, BREAKS, :break or
90
95
  parse! scanner, tokens, RESIDUALS, :residual
91
96
  end
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.2.2.pre1'
8
+ VERSION = '0.2.2.rc1'
9
9
  end
@@ -11,7 +11,7 @@ module Greeb
11
11
  it 'should tokenize text when input is given' do
12
12
  Greeb['Hello guys!'].must_equal(
13
13
  [Entity.new(0, 5, :letter),
14
- Entity.new(5, 6, :separ),
14
+ Entity.new(5, 6, :space),
15
15
  Entity.new(6, 10, :letter),
16
16
  Entity.new(10, 11, :punct)]
17
17
  )
@@ -20,9 +20,9 @@ module Greeb
20
20
  it 'should extract URLs' do
21
21
  Greeb['Hello http://nlpub.ru guys!'].must_equal(
22
22
  [Entity.new(0, 5, :letter),
23
- Entity.new(5, 6, :separ),
23
+ Entity.new(5, 6, :space),
24
24
  Entity.new(6, 21, :url),
25
- Entity.new(21, 22, :separ),
25
+ Entity.new(21, 22, :space),
26
26
  Entity.new(22, 26, :letter),
27
27
  Entity.new(26, 27, :punct)]
28
28
  )
@@ -31,9 +31,9 @@ module Greeb
31
31
  it 'should extract e-mails' do
32
32
  Greeb['Hello example@example.com guys!'].must_equal(
33
33
  [Entity.new(0, 5, :letter),
34
- Entity.new(5, 6, :separ),
34
+ Entity.new(5, 6, :space),
35
35
  Entity.new(6, 25, :email),
36
- Entity.new(25, 26, :separ),
36
+ Entity.new(25, 26, :space),
37
37
  Entity.new(26, 30, :letter),
38
38
  Entity.new(30, 31, :punct)]
39
39
  )
@@ -84,11 +84,11 @@ module Greeb
84
84
  ],
85
85
  Entity.new(7, 22, :sentence) => [
86
86
  Entity.new(7, 8, :letter),
87
- Entity.new(8, 9, :separ),
87
+ Entity.new(8, 9, :space),
88
88
  Entity.new(9, 11, :letter),
89
- Entity.new(11, 12, :separ),
89
+ Entity.new(11, 12, :space),
90
90
  Entity.new(12, 14, :letter),
91
- Entity.new(14, 15, :separ),
91
+ Entity.new(14, 15, :space),
92
92
  Entity.new(15, 21, :letter),
93
93
  Entity.new(21, 22, :punct)
94
94
  ]
@@ -6,13 +6,6 @@ gem 'minitest'
6
6
  require 'minitest/autorun'
7
7
  require 'minitest/hell'
8
8
 
9
- unless 'true' == ENV['TRAVIS']
10
- require 'simplecov'
11
- SimpleCov.start do
12
- add_filter '/spec/'
13
- end
14
- end
15
-
16
9
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
17
10
  require 'greeb'
18
11
 
@@ -35,11 +35,11 @@ module Greeb
35
35
  Tokenizer.tokenize('Hello, I am 18').must_equal(
36
36
  [Entity.new(0, 5, :letter),
37
37
  Entity.new(5, 6, :spunct),
38
- Entity.new(6, 7, :separ),
38
+ Entity.new(6, 7, :space),
39
39
  Entity.new(7, 8, :letter),
40
- Entity.new(8, 9, :separ),
40
+ Entity.new(8, 9, :space),
41
41
  Entity.new(9, 11, :letter),
42
- Entity.new(11, 12, :separ),
42
+ Entity.new(11, 12, :space),
43
43
  Entity.new(12, 14, :integer)]
44
44
  )
45
45
  end
@@ -67,13 +67,13 @@ module Greeb
67
67
  Tokenizer.tokenize('Братишка, я тебе покушать принёс!').must_equal(
68
68
  [Entity.new(0, 8, :letter),
69
69
  Entity.new(8, 9, :spunct),
70
- Entity.new(9, 10, :separ),
70
+ Entity.new(9, 10, :space),
71
71
  Entity.new(10, 11, :letter),
72
- Entity.new(11, 12, :separ),
72
+ Entity.new(11, 12, :space),
73
73
  Entity.new(12, 16, :letter),
74
- Entity.new(16, 17, :separ),
74
+ Entity.new(16, 17, :space),
75
75
  Entity.new(17, 25, :letter),
76
- Entity.new(25, 26, :separ),
76
+ Entity.new(25, 26, :space),
77
77
  Entity.new(26, 32, :letter),
78
78
  Entity.new(32, 33, :punct)]
79
79
  )
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2.pre1
4
+ version: 0.2.2.rc1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Ustalov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-23 00:00:00.000000000 Z
11
+ date: 2013-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -34,9 +34,7 @@ extensions: []
34
34
  extra_rdoc_files: []
35
35
  files:
36
36
  - .gitignore
37
- - .rubocop.yml
38
37
  - .travis.yml
39
- - .yardopts
40
38
  - Gemfile
41
39
  - LICENSE
42
40
  - README.md
@@ -1,3 +0,0 @@
1
- # Don't use sprintf instead of %
2
- FavorSprintf:
3
- Enabled: false
data/.yardopts DELETED
@@ -1,6 +0,0 @@
1
- --protected
2
- --no-private
3
- -m markdown
4
- -
5
- README.md
6
- LICENSE