greeb 0.2.2.pre1 → 0.2.2.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/Gemfile +1 -1
- data/README.md +5 -4
- data/bin/greeb +1 -1
- data/lib/greeb/segmentator.rb +6 -6
- data/lib/greeb/tokenizer.rb +7 -2
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +5 -5
- data/spec/segmentator_spec.rb +3 -3
- data/spec/spec_helper.rb +0 -7
- data/spec/tokenizer_spec.rb +7 -7
- metadata +2 -4
- data/.rubocop.yml +0 -3
- data/.yardopts +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 921db15cfa86d9a5e6bf0e80058d349d708d1750
|
4
|
+
data.tar.gz: 994418e21e9e55ecb42b2329b59efce999281df9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: df570f263f22cfb2682ab39e758c6cb25a03c88d96bdcca15c79173d2dd937e28406a7baf90162882810973564d8dccb9aaf0962bb577c623ef1eceb3c6f56e4
|
7
|
+
data.tar.gz: 425859becdf31d2dc68552ba2f788675098a218063e627ec5f6b46c3a2a003fe685f75f03385289f482166c01e4e02ce5e7172ba7aa0289ae28f4fdc57645f4b
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Greeb
|
2
2
|
Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
|
3
|
-
that is based on regular expressions.
|
3
|
+
that is based on regular expressions. API documentation is available at
|
4
|
+
<https://dmchk.github.com/greeb>.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
Add this line to your application's Gemfile:
|
@@ -175,11 +176,11 @@ There are several entity types at the tokenization stage: `:letter`,
|
|
175
176
|
4. Push to the branch (`git push origin my-new-feature`);
|
176
177
|
5. Create new Pull Request.
|
177
178
|
|
178
|
-
## Build Status [<img src="https://secure.travis-ci.org/
|
179
|
+
## Build Status [<img src="https://secure.travis-ci.org/dmchk/greeb.png"/>](http://travis-ci.org/dmchk/greeb)
|
179
180
|
|
180
|
-
## Dependency Status [<img src="https://gemnasium.com/
|
181
|
+
## Dependency Status [<img src="https://gemnasium.com/dmchk/greeb.png"/>](https://gemnasium.com/dmchk/greeb)
|
181
182
|
|
182
|
-
## Code Climate [<img src="https://codeclimate.com/github/
|
183
|
+
## Code Climate [<img src="https://codeclimate.com/github/dmchk/greeb.png"/>](https://codeclimate.com/github/dmchk/greeb)
|
183
184
|
|
184
185
|
## Copyright
|
185
186
|
|
data/bin/greeb
CHANGED
data/lib/greeb/segmentator.rb
CHANGED
@@ -5,9 +5,9 @@
|
|
5
5
|
#
|
6
6
|
class Greeb::Segmentator
|
7
7
|
# Sentence does not start from the separator character, line break
|
8
|
-
# character,
|
8
|
+
# character, punctuation characters, and spaces.
|
9
9
|
#
|
10
|
-
|
10
|
+
SENTENCE_AINT_START = [:separ, :break, :punct, :spunct, :space]
|
11
11
|
|
12
12
|
attr_reader :tokens
|
13
13
|
|
@@ -64,7 +64,7 @@ class Greeb::Segmentator
|
|
64
64
|
collection = []
|
65
65
|
|
66
66
|
rest = tokens.inject(sample.dup) do |entity, token|
|
67
|
-
next entity if
|
67
|
+
next entity if sentence_aint_start? entity, token
|
68
68
|
entity.from = token.from unless entity.from
|
69
69
|
next entity if entity.to and entity.to > token.to
|
70
70
|
|
@@ -72,7 +72,7 @@ class Greeb::Segmentator
|
|
72
72
|
entity.to = find_forward(tokens, token).to
|
73
73
|
collection << entity
|
74
74
|
entity = sample.dup
|
75
|
-
elsif :separ
|
75
|
+
elsif ![:separ, :space].include? token.type
|
76
76
|
entity.to = token.to
|
77
77
|
end
|
78
78
|
|
@@ -95,8 +95,8 @@ class Greeb::Segmentator
|
|
95
95
|
#
|
96
96
|
# @return true or false.
|
97
97
|
#
|
98
|
-
def
|
99
|
-
!entity.from and
|
98
|
+
def sentence_aint_start?(entity, token)
|
99
|
+
!entity.from and SENTENCE_AINT_START.include? token.type
|
100
100
|
end
|
101
101
|
|
102
102
|
# Find a forwarding token that has another type.
|
data/lib/greeb/tokenizer.rb
CHANGED
@@ -31,7 +31,11 @@ module Greeb::Tokenizer
|
|
31
31
|
|
32
32
|
# In-subsentence separator (i.e.: "*" or "=").
|
33
33
|
#
|
34
|
-
SEPARATORS = /[
|
34
|
+
SEPARATORS = /[\p{Nl}\p{No}\p{Pd}\p{Pc}\p{Po}\p{Sm}\p{So}\p{Sc}\p{Zl}\p{Zp}]+/u
|
35
|
+
|
36
|
+
# Spaces (i.e.: " " or &nbsp;).
|
37
|
+
#
|
38
|
+
SPACES = /[\p{Zs}]+/u
|
35
39
|
|
36
40
|
# Line breaks.
|
37
41
|
#
|
@@ -68,7 +72,7 @@ module Greeb::Tokenizer
|
|
68
72
|
# @return [Array<String>] split characters.
|
69
73
|
#
|
70
74
|
def split(token)
|
71
|
-
token.scan(/((.|\n)\2*)/).map(&:first)
|
75
|
+
token.scan(/((.|\n)\2*)/).map!(&:first)
|
72
76
|
end
|
73
77
|
|
74
78
|
protected
|
@@ -86,6 +90,7 @@ module Greeb::Tokenizer
|
|
86
90
|
split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
|
87
91
|
split_parse! scanner, tokens, PUNCTUATIONS, :punct or
|
88
92
|
split_parse! scanner, tokens, SEPARATORS, :separ or
|
93
|
+
split_parse! scanner, tokens, SPACES, :space or
|
89
94
|
split_parse! scanner, tokens, BREAKS, :break or
|
90
95
|
parse! scanner, tokens, RESIDUALS, :residual
|
91
96
|
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/core_spec.rb
CHANGED
@@ -11,7 +11,7 @@ module Greeb
|
|
11
11
|
it 'should tokenize text when input is given' do
|
12
12
|
Greeb['Hello guys!'].must_equal(
|
13
13
|
[Entity.new(0, 5, :letter),
|
14
|
-
Entity.new(5, 6, :
|
14
|
+
Entity.new(5, 6, :space),
|
15
15
|
Entity.new(6, 10, :letter),
|
16
16
|
Entity.new(10, 11, :punct)]
|
17
17
|
)
|
@@ -20,9 +20,9 @@ module Greeb
|
|
20
20
|
it 'should extract URLs' do
|
21
21
|
Greeb['Hello http://nlpub.ru guys!'].must_equal(
|
22
22
|
[Entity.new(0, 5, :letter),
|
23
|
-
Entity.new(5, 6, :
|
23
|
+
Entity.new(5, 6, :space),
|
24
24
|
Entity.new(6, 21, :url),
|
25
|
-
Entity.new(21, 22, :
|
25
|
+
Entity.new(21, 22, :space),
|
26
26
|
Entity.new(22, 26, :letter),
|
27
27
|
Entity.new(26, 27, :punct)]
|
28
28
|
)
|
@@ -31,9 +31,9 @@ module Greeb
|
|
31
31
|
it 'should extract e-mails' do
|
32
32
|
Greeb['Hello example@example.com guys!'].must_equal(
|
33
33
|
[Entity.new(0, 5, :letter),
|
34
|
-
Entity.new(5, 6, :
|
34
|
+
Entity.new(5, 6, :space),
|
35
35
|
Entity.new(6, 25, :email),
|
36
|
-
Entity.new(25, 26, :
|
36
|
+
Entity.new(25, 26, :space),
|
37
37
|
Entity.new(26, 30, :letter),
|
38
38
|
Entity.new(30, 31, :punct)]
|
39
39
|
)
|
data/spec/segmentator_spec.rb
CHANGED
@@ -84,11 +84,11 @@ module Greeb
|
|
84
84
|
],
|
85
85
|
Entity.new(7, 22, :sentence) => [
|
86
86
|
Entity.new(7, 8, :letter),
|
87
|
-
Entity.new(8, 9, :
|
87
|
+
Entity.new(8, 9, :space),
|
88
88
|
Entity.new(9, 11, :letter),
|
89
|
-
Entity.new(11, 12, :
|
89
|
+
Entity.new(11, 12, :space),
|
90
90
|
Entity.new(12, 14, :letter),
|
91
|
-
Entity.new(14, 15, :
|
91
|
+
Entity.new(14, 15, :space),
|
92
92
|
Entity.new(15, 21, :letter),
|
93
93
|
Entity.new(21, 22, :punct)
|
94
94
|
]
|
data/spec/spec_helper.rb
CHANGED
@@ -6,13 +6,6 @@ gem 'minitest'
|
|
6
6
|
require 'minitest/autorun'
|
7
7
|
require 'minitest/hell'
|
8
8
|
|
9
|
-
unless 'true' == ENV['TRAVIS']
|
10
|
-
require 'simplecov'
|
11
|
-
SimpleCov.start do
|
12
|
-
add_filter '/spec/'
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
9
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
17
10
|
require 'greeb'
|
18
11
|
|
data/spec/tokenizer_spec.rb
CHANGED
@@ -35,11 +35,11 @@ module Greeb
|
|
35
35
|
Tokenizer.tokenize('Hello, I am 18').must_equal(
|
36
36
|
[Entity.new(0, 5, :letter),
|
37
37
|
Entity.new(5, 6, :spunct),
|
38
|
-
Entity.new(6, 7, :
|
38
|
+
Entity.new(6, 7, :space),
|
39
39
|
Entity.new(7, 8, :letter),
|
40
|
-
Entity.new(8, 9, :
|
40
|
+
Entity.new(8, 9, :space),
|
41
41
|
Entity.new(9, 11, :letter),
|
42
|
-
Entity.new(11, 12, :
|
42
|
+
Entity.new(11, 12, :space),
|
43
43
|
Entity.new(12, 14, :integer)]
|
44
44
|
)
|
45
45
|
end
|
@@ -67,13 +67,13 @@ module Greeb
|
|
67
67
|
Tokenizer.tokenize('Братишка, я тебе покушать принёс!').must_equal(
|
68
68
|
[Entity.new(0, 8, :letter),
|
69
69
|
Entity.new(8, 9, :spunct),
|
70
|
-
Entity.new(9, 10, :
|
70
|
+
Entity.new(9, 10, :space),
|
71
71
|
Entity.new(10, 11, :letter),
|
72
|
-
Entity.new(11, 12, :
|
72
|
+
Entity.new(11, 12, :space),
|
73
73
|
Entity.new(12, 16, :letter),
|
74
|
-
Entity.new(16, 17, :
|
74
|
+
Entity.new(16, 17, :space),
|
75
75
|
Entity.new(17, 25, :letter),
|
76
|
-
Entity.new(25, 26, :
|
76
|
+
Entity.new(25, 26, :space),
|
77
77
|
Entity.new(26, 32, :letter),
|
78
78
|
Entity.new(32, 33, :punct)]
|
79
79
|
)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2.
|
4
|
+
version: 0.2.2.rc1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: minitest
|
@@ -34,9 +34,7 @@ extensions: []
|
|
34
34
|
extra_rdoc_files: []
|
35
35
|
files:
|
36
36
|
- .gitignore
|
37
|
-
- .rubocop.yml
|
38
37
|
- .travis.yml
|
39
|
-
- .yardopts
|
40
38
|
- Gemfile
|
41
39
|
- LICENSE
|
42
40
|
- README.md
|
data/.rubocop.yml
DELETED