greeb 0.2.2.pre1 → 0.2.2.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/Gemfile +1 -1
- data/README.md +5 -4
- data/bin/greeb +1 -1
- data/lib/greeb/segmentator.rb +6 -6
- data/lib/greeb/tokenizer.rb +7 -2
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +5 -5
- data/spec/segmentator_spec.rb +3 -3
- data/spec/spec_helper.rb +0 -7
- data/spec/tokenizer_spec.rb +7 -7
- metadata +2 -4
- data/.rubocop.yml +0 -3
- data/.yardopts +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 921db15cfa86d9a5e6bf0e80058d349d708d1750
|
4
|
+
data.tar.gz: 994418e21e9e55ecb42b2329b59efce999281df9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: df570f263f22cfb2682ab39e758c6cb25a03c88d96bdcca15c79173d2dd937e28406a7baf90162882810973564d8dccb9aaf0962bb577c623ef1eceb3c6f56e4
|
7
|
+
data.tar.gz: 425859becdf31d2dc68552ba2f788675098a218063e627ec5f6b46c3a2a003fe685f75f03385289f482166c01e4e02ce5e7172ba7aa0289ae28f4fdc57645f4b
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Greeb
|
2
2
|
Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
|
3
|
-
that is based on regular expressions.
|
3
|
+
that is based on regular expressions. API documentation is available at
|
4
|
+
<https://dmchk.github.com/greeb>.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
Add this line to your application's Gemfile:
|
@@ -175,11 +176,11 @@ There are several entity types at the tokenization stage: `:letter`,
|
|
175
176
|
4. Push to the branch (`git push origin my-new-feature`);
|
176
177
|
5. Create new Pull Request.
|
177
178
|
|
178
|
-
## Build Status [<img src="https://secure.travis-ci.org/
|
179
|
+
## Build Status [<img src="https://secure.travis-ci.org/dmchk/greeb.png"/>](http://travis-ci.org/dmchk/greeb)
|
179
180
|
|
180
|
-
## Dependency Status [<img src="https://gemnasium.com/
|
181
|
+
## Dependency Status [<img src="https://gemnasium.com/dmchk/greeb.png"/>](https://gemnasium.com/dmchk/greeb)
|
181
182
|
|
182
|
-
## Code Climate [<img src="https://codeclimate.com/github/
|
183
|
+
## Code Climate [<img src="https://codeclimate.com/github/dmchk/greeb.png"/>](https://codeclimate.com/github/dmchk/greeb)
|
183
184
|
|
184
185
|
## Copyright
|
185
186
|
|
data/bin/greeb
CHANGED
data/lib/greeb/segmentator.rb
CHANGED
@@ -5,9 +5,9 @@
|
|
5
5
|
#
|
6
6
|
class Greeb::Segmentator
|
7
7
|
# Sentence does not start from the separator charater, line break
|
8
|
-
# character,
|
8
|
+
# character, punctuation characters, and spaces.
|
9
9
|
#
|
10
|
-
|
10
|
+
SENTENCE_AINT_START = [:separ, :break, :punct, :spunct, :space]
|
11
11
|
|
12
12
|
attr_reader :tokens
|
13
13
|
|
@@ -64,7 +64,7 @@ class Greeb::Segmentator
|
|
64
64
|
collection = []
|
65
65
|
|
66
66
|
rest = tokens.inject(sample.dup) do |entity, token|
|
67
|
-
next entity if
|
67
|
+
next entity if sentence_aint_start? entity, token
|
68
68
|
entity.from = token.from unless entity.from
|
69
69
|
next entity if entity.to and entity.to > token.to
|
70
70
|
|
@@ -72,7 +72,7 @@ class Greeb::Segmentator
|
|
72
72
|
entity.to = find_forward(tokens, token).to
|
73
73
|
collection << entity
|
74
74
|
entity = sample.dup
|
75
|
-
elsif :separ
|
75
|
+
elsif ![:separ, :space].include? token.type
|
76
76
|
entity.to = token.to
|
77
77
|
end
|
78
78
|
|
@@ -95,8 +95,8 @@ class Greeb::Segmentator
|
|
95
95
|
#
|
96
96
|
# @return true or false.
|
97
97
|
#
|
98
|
-
def
|
99
|
-
!entity.from and
|
98
|
+
def sentence_aint_start?(entity, token)
|
99
|
+
!entity.from and SENTENCE_AINT_START.include? token.type
|
100
100
|
end
|
101
101
|
|
102
102
|
# Find a forwarding token that has another type.
|
data/lib/greeb/tokenizer.rb
CHANGED
@@ -31,7 +31,11 @@ module Greeb::Tokenizer
|
|
31
31
|
|
32
32
|
# In-subsentence seprator (i.e.: "*" or "=").
|
33
33
|
#
|
34
|
-
SEPARATORS = /[
|
34
|
+
SEPARATORS = /[\p{Nl}\p{No}\p{Pd}\p{Pc}\p{Po}\p{Sm}\p{So}\p{Sc}\p{Zl}\p{Zp}]+/u
|
35
|
+
|
36
|
+
# Spaces (i.e.: " " or &nbsp;).
|
37
|
+
#
|
38
|
+
SPACES = /[\p{Zs}]+/u
|
35
39
|
|
36
40
|
# Line breaks.
|
37
41
|
#
|
@@ -68,7 +72,7 @@ module Greeb::Tokenizer
|
|
68
72
|
# @return [Array<String>] splitted characters.
|
69
73
|
#
|
70
74
|
def split(token)
|
71
|
-
token.scan(/((.|\n)\2*)/).map(&:first)
|
75
|
+
token.scan(/((.|\n)\2*)/).map!(&:first)
|
72
76
|
end
|
73
77
|
|
74
78
|
protected
|
@@ -86,6 +90,7 @@ module Greeb::Tokenizer
|
|
86
90
|
split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
|
87
91
|
split_parse! scanner, tokens, PUNCTUATIONS, :punct or
|
88
92
|
split_parse! scanner, tokens, SEPARATORS, :separ or
|
93
|
+
split_parse! scanner, tokens, SPACES, :space or
|
89
94
|
split_parse! scanner, tokens, BREAKS, :break or
|
90
95
|
parse! scanner, tokens, RESIDUALS, :residual
|
91
96
|
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/core_spec.rb
CHANGED
@@ -11,7 +11,7 @@ module Greeb
|
|
11
11
|
it 'should tokenize text when input is given' do
|
12
12
|
Greeb['Hello guys!'].must_equal(
|
13
13
|
[Entity.new(0, 5, :letter),
|
14
|
-
Entity.new(5, 6, :
|
14
|
+
Entity.new(5, 6, :space),
|
15
15
|
Entity.new(6, 10, :letter),
|
16
16
|
Entity.new(10, 11, :punct)]
|
17
17
|
)
|
@@ -20,9 +20,9 @@ module Greeb
|
|
20
20
|
it 'should extract URLs' do
|
21
21
|
Greeb['Hello http://nlpub.ru guys!'].must_equal(
|
22
22
|
[Entity.new(0, 5, :letter),
|
23
|
-
Entity.new(5, 6, :
|
23
|
+
Entity.new(5, 6, :space),
|
24
24
|
Entity.new(6, 21, :url),
|
25
|
-
Entity.new(21, 22, :
|
25
|
+
Entity.new(21, 22, :space),
|
26
26
|
Entity.new(22, 26, :letter),
|
27
27
|
Entity.new(26, 27, :punct)]
|
28
28
|
)
|
@@ -31,9 +31,9 @@ module Greeb
|
|
31
31
|
it 'should extract e-mails' do
|
32
32
|
Greeb['Hello example@example.com guys!'].must_equal(
|
33
33
|
[Entity.new(0, 5, :letter),
|
34
|
-
Entity.new(5, 6, :
|
34
|
+
Entity.new(5, 6, :space),
|
35
35
|
Entity.new(6, 25, :email),
|
36
|
-
Entity.new(25, 26, :
|
36
|
+
Entity.new(25, 26, :space),
|
37
37
|
Entity.new(26, 30, :letter),
|
38
38
|
Entity.new(30, 31, :punct)]
|
39
39
|
)
|
data/spec/segmentator_spec.rb
CHANGED
@@ -84,11 +84,11 @@ module Greeb
|
|
84
84
|
],
|
85
85
|
Entity.new(7, 22, :sentence) => [
|
86
86
|
Entity.new(7, 8, :letter),
|
87
|
-
Entity.new(8, 9, :
|
87
|
+
Entity.new(8, 9, :space),
|
88
88
|
Entity.new(9, 11, :letter),
|
89
|
-
Entity.new(11, 12, :
|
89
|
+
Entity.new(11, 12, :space),
|
90
90
|
Entity.new(12, 14, :letter),
|
91
|
-
Entity.new(14, 15, :
|
91
|
+
Entity.new(14, 15, :space),
|
92
92
|
Entity.new(15, 21, :letter),
|
93
93
|
Entity.new(21, 22, :punct)
|
94
94
|
]
|
data/spec/spec_helper.rb
CHANGED
@@ -6,13 +6,6 @@ gem 'minitest'
|
|
6
6
|
require 'minitest/autorun'
|
7
7
|
require 'minitest/hell'
|
8
8
|
|
9
|
-
unless 'true' == ENV['TRAVIS']
|
10
|
-
require 'simplecov'
|
11
|
-
SimpleCov.start do
|
12
|
-
add_filter '/spec/'
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
9
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
17
10
|
require 'greeb'
|
18
11
|
|
data/spec/tokenizer_spec.rb
CHANGED
@@ -35,11 +35,11 @@ module Greeb
|
|
35
35
|
Tokenizer.tokenize('Hello, I am 18').must_equal(
|
36
36
|
[Entity.new(0, 5, :letter),
|
37
37
|
Entity.new(5, 6, :spunct),
|
38
|
-
Entity.new(6, 7, :
|
38
|
+
Entity.new(6, 7, :space),
|
39
39
|
Entity.new(7, 8, :letter),
|
40
|
-
Entity.new(8, 9, :
|
40
|
+
Entity.new(8, 9, :space),
|
41
41
|
Entity.new(9, 11, :letter),
|
42
|
-
Entity.new(11, 12, :
|
42
|
+
Entity.new(11, 12, :space),
|
43
43
|
Entity.new(12, 14, :integer)]
|
44
44
|
)
|
45
45
|
end
|
@@ -67,13 +67,13 @@ module Greeb
|
|
67
67
|
Tokenizer.tokenize('Братишка, я тебе покушать принёс!').must_equal(
|
68
68
|
[Entity.new(0, 8, :letter),
|
69
69
|
Entity.new(8, 9, :spunct),
|
70
|
-
Entity.new(9, 10, :
|
70
|
+
Entity.new(9, 10, :space),
|
71
71
|
Entity.new(10, 11, :letter),
|
72
|
-
Entity.new(11, 12, :
|
72
|
+
Entity.new(11, 12, :space),
|
73
73
|
Entity.new(12, 16, :letter),
|
74
|
-
Entity.new(16, 17, :
|
74
|
+
Entity.new(16, 17, :space),
|
75
75
|
Entity.new(17, 25, :letter),
|
76
|
-
Entity.new(25, 26, :
|
76
|
+
Entity.new(25, 26, :space),
|
77
77
|
Entity.new(26, 32, :letter),
|
78
78
|
Entity.new(32, 33, :punct)]
|
79
79
|
)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2.
|
4
|
+
version: 0.2.2.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: minitest
|
@@ -34,9 +34,7 @@ extensions: []
|
|
34
34
|
extra_rdoc_files: []
|
35
35
|
files:
|
36
36
|
- .gitignore
|
37
|
-
- .rubocop.yml
|
38
37
|
- .travis.yml
|
39
|
-
- .yardopts
|
40
38
|
- Gemfile
|
41
39
|
- LICENSE
|
42
40
|
- README.md
|
data/.rubocop.yml
DELETED