greeb 0.1.0.rc4 → 0.1.0.rc6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +1 -0
- data/README.md +9 -14
- data/greeb.gemspec +4 -4
- data/lib/greeb.rb +3 -3
- data/lib/greeb/segmentator.rb +3 -3
- data/lib/greeb/tokenizer.rb +30 -14
- data/lib/greeb/version.rb +1 -1
- data/spec/segmentator_spec.rb +8 -8
- data/spec/tokenizer_spec.rb +33 -33
- metadata +25 -24
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -27,8 +27,8 @@ Greeb can help you to solve simple text processing problems:
|
|
27
27
|
```ruby
|
28
28
|
pp Greeb::Tokenizer.new('Hello!').tokens
|
29
29
|
=begin
|
30
|
-
#<
|
31
|
-
#<struct Greeb::Entity from=5, to=6, type=:punct>
|
30
|
+
[#<struct Greeb::Entity from=0, to=5, type=:letter>,
|
31
|
+
#<struct Greeb::Entity from=5, to=6, type=:punct>]
|
32
32
|
=end
|
33
33
|
```
|
34
34
|
|
@@ -43,7 +43,7 @@ EOF
|
|
43
43
|
|
44
44
|
pp Greeb::Tokenizer.new(text).tokens
|
45
45
|
=begin
|
46
|
-
#<
|
46
|
+
[#<struct Greeb::Entity from=0, to=5, type=:letter>,
|
47
47
|
#<struct Greeb::Entity from=5, to=6, type=:punct>,
|
48
48
|
#<struct Greeb::Entity from=6, to=7, type=:separ>,
|
49
49
|
#<struct Greeb::Entity from=7, to=8, type=:letter>,
|
@@ -70,7 +70,7 @@ pp Greeb::Tokenizer.new(text).tokens
|
|
70
70
|
#<struct Greeb::Entity from=59, to=60, type=:separ>,
|
71
71
|
#<struct Greeb::Entity from=60, to=63, type=:letter>,
|
72
72
|
#<struct Greeb::Entity from=63, to=64, type=:punct>,
|
73
|
-
#<struct Greeb::Entity from=64, to=65, type=:break>
|
73
|
+
#<struct Greeb::Entity from=64, to=65, type=:break>]
|
74
74
|
=end
|
75
75
|
```
|
76
76
|
|
@@ -82,8 +82,8 @@ text = 'Hello! How are you?'
|
|
82
82
|
tokenizer = Greeb::Tokenizer.new(text)
|
83
83
|
pp Greeb::Segmentator.new(tokenizer).sentences
|
84
84
|
=begin
|
85
|
-
#<
|
86
|
-
#<struct Greeb::Entity from=7, to=19, type=:sentence>
|
85
|
+
[#<struct Greeb::Entity from=0, to=6, type=:sentence>,
|
86
|
+
#<struct Greeb::Entity from=7, to=19, type=:sentence>]
|
87
87
|
=end
|
88
88
|
```
|
89
89
|
|
@@ -111,9 +111,9 @@ pp segmentator.extract(*sentences)
|
|
111
111
|
|
112
112
|
## Tokens
|
113
113
|
|
114
|
-
Greeb operates with entities, tuples of
|
115
|
-
|
116
|
-
and
|
114
|
+
Greeb operates with entities, tuples of *(from, to, kind)*, where
|
115
|
+
*from* is the beginning of the entity, *to* is the end of the entity,
|
116
|
+
and *kind* is the type of the entity.
|
117
117
|
|
118
118
|
There are several entity types: `:letter`, `:float`, `:integer`,
|
119
119
|
`:separ`, `:punct` (for punctuation), `:spunct` (for in-sentence
|
@@ -132,11 +132,6 @@ systematic and awesome.
|
|
132
132
|
|
133
133
|
## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
|
134
134
|
|
135
|
-
If you're using [Rubinius](http://rubini.us) please note that it has the
|
136
|
-
incompatible `StringScanner` implementation. More information can be
|
137
|
-
provided under the following link:
|
138
|
-
<https://github.com/rubinius/rubinius/issues/1808>.
|
139
|
-
|
140
135
|
## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
|
141
136
|
|
142
137
|
## Copyright
|
data/greeb.gemspec
CHANGED
@@ -6,12 +6,12 @@ Gem::Specification.new do |s|
|
|
6
6
|
s.name = 'greeb'
|
7
7
|
s.version = Greeb::VERSION
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
|
-
s.authors = ['Dmitry
|
9
|
+
s.authors = ['Dmitry Ustalov']
|
10
10
|
s.email = ['dmitry@eveel.ru']
|
11
11
|
s.homepage = 'https://github.com/eveel/greeb'
|
12
|
-
s.summary = 'Greeb is a simple regexp-based tokenizer.'
|
13
|
-
s.description = 'Greeb is a simple yet awesome
|
14
|
-
'written in Ruby.'
|
12
|
+
s.summary = 'Greeb is a simple Unicode-aware regexp-based tokenizer.'
|
13
|
+
s.description = 'Greeb is a simple yet awesome and Unicode-aware ' \
|
14
|
+
'regexp-based tokenizer, written in Ruby.'
|
15
15
|
|
16
16
|
s.rubyforge_project = 'greeb'
|
17
17
|
|
data/lib/greeb.rb
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
require 'greeb/version'
|
4
4
|
|
5
|
-
# Greeb operates with entities, tuples of
|
6
|
-
#
|
7
|
-
# and
|
5
|
+
# Greeb operates with entities, tuples of *(from, to, kind)*, where
|
6
|
+
# *from* is the beginning of the entity, *to* is the end of the entity,
|
7
|
+
# and *kind* is the type of the entity.
|
8
8
|
#
|
9
9
|
# There are several entity types: `:letter`, `:float`, `:integer`,
|
10
10
|
# `:separ` for separators, `:punct` for punctuation characters,
|
data/lib/greeb/segmentator.rb
CHANGED
@@ -26,7 +26,7 @@ class Greeb::Segmentator
|
|
26
26
|
|
27
27
|
# Sentences memoization method.
|
28
28
|
#
|
29
|
-
# @return [
|
29
|
+
# @return [Array<Greeb::Entity>] a set of sentences.
|
30
30
|
#
|
31
31
|
def sentences
|
32
32
|
detect_sentences! unless @sentences
|
@@ -35,7 +35,7 @@ class Greeb::Segmentator
|
|
35
35
|
|
36
36
|
# Subsentences memoization method.
|
37
37
|
#
|
38
|
-
# @return [
|
38
|
+
# @return [Array<Greeb::Entity>] a set of subsentences.
|
39
39
|
#
|
40
40
|
def subsentences
|
41
41
|
detect_subsentences! unless @subsentences
|
@@ -79,7 +79,7 @@ class Greeb::Segmentator
|
|
79
79
|
# @return [nil] nothing.
|
80
80
|
#
|
81
81
|
def detect_sentences!
|
82
|
-
@sentences =
|
82
|
+
@sentences = []
|
83
83
|
|
84
84
|
rest = tokens.inject(new_sentence) do |sentence, token|
|
85
85
|
if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
|
data/lib/greeb/tokenizer.rb
CHANGED
@@ -1,13 +1,29 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
require 'set'
|
4
|
-
|
5
3
|
# Greeb's tokenization facilities. Use 'em with love.
|
6
4
|
#
|
7
5
|
class Greeb::Tokenizer
|
6
|
+
# This runtime error appears when {Greeb::Tokenizer} tries to recognize
|
7
|
+
# an unknown character.
|
8
|
+
#
|
9
|
+
class UnknownEntity < RuntimeError
|
10
|
+
attr_reader :text, :pos
|
11
|
+
|
12
|
+
# @private
|
13
|
+
def initialize(text, pos)
|
14
|
+
@text, @pos = text, pos
|
15
|
+
end
|
16
|
+
|
17
|
+
# Generate the real error message.
|
18
|
+
#
|
19
|
+
def to_s
|
20
|
+
'Could not recognize character "%s" @ %d' % [text[pos], pos]
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
8
24
|
# English and Russian letters.
|
9
25
|
#
|
10
|
-
LETTERS = /[
|
26
|
+
LETTERS = /[\p{L}]+/u
|
11
27
|
|
12
28
|
# Floating point values.
|
13
29
|
#
|
@@ -17,21 +33,21 @@ class Greeb::Tokenizer
|
|
17
33
|
#
|
18
34
|
INTEGERS = /\d+/u
|
19
35
|
|
20
|
-
# In-
|
36
|
+
# In-sentence punctuation character (i.e.: "," or "-").
|
21
37
|
#
|
22
|
-
|
38
|
+
SENTENCE_PUNCTUATIONS = /(\,|\-|:|;|\p{Ps}|\p{Pi}|\p{Pf}|\p{Pe})+/u
|
23
39
|
|
24
40
|
# Punctuation character (i.e.: "." or "!").
|
25
41
|
#
|
26
|
-
PUNCTUATIONS = /(\.|\!|\?)+/u
|
42
|
+
PUNCTUATIONS = /[(\.|\!|\?)]+/u
|
27
43
|
|
28
|
-
# In-
|
44
|
+
# In-subsentence separator (i.e.: "*" or "=").
|
29
45
|
#
|
30
|
-
|
46
|
+
SEPARATORS = /[ \p{Sm}\p{Pc}\p{Po}\p{Pd}]+/u
|
31
47
|
|
32
48
|
# Line breaks.
|
33
49
|
#
|
34
|
-
BREAKS =
|
50
|
+
BREAKS = /(\r\n|\n|\r)+/u
|
35
51
|
|
36
52
|
attr_reader :text, :scanner
|
37
53
|
protected :scanner
|
@@ -46,7 +62,7 @@ class Greeb::Tokenizer
|
|
46
62
|
|
47
63
|
# Tokens memoization method.
|
48
64
|
#
|
49
|
-
# @return [
|
65
|
+
# @return [Array<Greeb::Entity>] a set of tokens.
|
50
66
|
#
|
51
67
|
def tokens
|
52
68
|
tokenize! unless @tokens
|
@@ -61,7 +77,7 @@ class Greeb::Tokenizer
|
|
61
77
|
#
|
62
78
|
def tokenize!
|
63
79
|
@scanner = Greeb::StringScanner.new(text)
|
64
|
-
@tokens =
|
80
|
+
@tokens = []
|
65
81
|
while !scanner.eos?
|
66
82
|
parse! LETTERS, :letter or
|
67
83
|
parse! FLOATS, :float or
|
@@ -70,7 +86,7 @@ class Greeb::Tokenizer
|
|
70
86
|
split_parse! PUNCTUATIONS, :punct or
|
71
87
|
split_parse! SEPARATORS, :separ or
|
72
88
|
split_parse! BREAKS, :break or
|
73
|
-
raise
|
89
|
+
raise UnknownEntity.new(text, scanner.char_pos)
|
74
90
|
end
|
75
91
|
ensure
|
76
92
|
scanner.terminate
|
@@ -83,7 +99,7 @@ class Greeb::Tokenizer
|
|
83
99
|
# @param type [Symbol] a symbol that represents the necessary token
|
84
100
|
# type.
|
85
101
|
#
|
86
|
-
# @return [
|
102
|
+
# @return [Array<Greeb::Entity>] the modified set of extracted tokens.
|
87
103
|
#
|
88
104
|
def parse! pattern, type
|
89
105
|
return false unless token = scanner.scan(pattern)
|
@@ -101,7 +117,7 @@ class Greeb::Tokenizer
|
|
101
117
|
# @param type [Symbol] a symbol that represents the necessary token
|
102
118
|
# type.
|
103
119
|
#
|
104
|
-
# @return [
|
120
|
+
# @return [Array<Greeb::Entity>] the modified set of extracted tokens.
|
105
121
|
#
|
106
122
|
def split_parse! pattern, type
|
107
123
|
return false unless token = scanner.scan(pattern)
|
data/lib/greeb/version.rb
CHANGED
data/spec/segmentator_spec.rb
CHANGED
@@ -10,12 +10,12 @@ module Greeb
|
|
10
10
|
subject { Segmentator.new(@tokenizer) }
|
11
11
|
|
12
12
|
it 'can be initialized either with Tokenizer' do
|
13
|
-
subject.tokens.must_be_kind_of
|
13
|
+
subject.tokens.must_be_kind_of Array
|
14
14
|
end
|
15
15
|
|
16
16
|
it 'can be initialized either with a set of tokens' do
|
17
17
|
subject = Segmentator.new(@tokenizer.tokens)
|
18
|
-
subject.tokens.must_be_kind_of
|
18
|
+
subject.tokens.must_be_kind_of Array
|
19
19
|
end
|
20
20
|
|
21
21
|
it 'should has @tokens ivar' do
|
@@ -30,7 +30,7 @@ module Greeb
|
|
30
30
|
|
31
31
|
it 'should be segmented' do
|
32
32
|
subject.must_equal(
|
33
|
-
|
33
|
+
[Entity.new(0, 22, :sentence)]
|
34
34
|
)
|
35
35
|
end
|
36
36
|
end
|
@@ -42,7 +42,7 @@ module Greeb
|
|
42
42
|
|
43
43
|
it 'should be segmented' do
|
44
44
|
subject.must_equal(
|
45
|
-
|
45
|
+
[Entity.new(0, 21, :sentence)]
|
46
46
|
)
|
47
47
|
end
|
48
48
|
end
|
@@ -54,7 +54,7 @@ module Greeb
|
|
54
54
|
|
55
55
|
it 'should be segmented' do
|
56
56
|
subject.must_equal(
|
57
|
-
|
57
|
+
[Entity.new(6, 27, :sentence)]
|
58
58
|
)
|
59
59
|
end
|
60
60
|
end
|
@@ -66,8 +66,8 @@ module Greeb
|
|
66
66
|
|
67
67
|
it 'should be segmented' do
|
68
68
|
subject.must_equal(
|
69
|
-
|
70
|
-
|
69
|
+
[Entity.new(0, 6, :sentence),
|
70
|
+
Entity.new(7, 22, :sentence)]
|
71
71
|
)
|
72
72
|
end
|
73
73
|
end
|
@@ -79,7 +79,7 @@ module Greeb
|
|
79
79
|
|
80
80
|
it 'should be segmented' do
|
81
81
|
subject.must_equal(
|
82
|
-
|
82
|
+
[Entity.new(2, 17, :sentence)]
|
83
83
|
)
|
84
84
|
end
|
85
85
|
end
|
data/spec/tokenizer_spec.rb
CHANGED
@@ -32,75 +32,75 @@ module Greeb
|
|
32
32
|
end
|
33
33
|
|
34
34
|
it 'should has the tokens set' do
|
35
|
-
subject.tokens.must_be_kind_of
|
35
|
+
subject.tokens.must_be_kind_of Array
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
39
|
describe 'tokenization facilities' do
|
40
40
|
it 'can handle words' do
|
41
41
|
Tokenizer.new('hello').tokens.must_equal(
|
42
|
-
|
42
|
+
[Entity.new(0, 5, :letter)]
|
43
43
|
)
|
44
44
|
end
|
45
45
|
|
46
46
|
it 'can handle floats' do
|
47
47
|
Tokenizer.new('14.88').tokens.must_equal(
|
48
|
-
|
48
|
+
[Entity.new(0, 5, :float)]
|
49
49
|
)
|
50
50
|
end
|
51
51
|
|
52
52
|
it 'can handle integers' do
|
53
53
|
Tokenizer.new('1337').tokens.must_equal(
|
54
|
-
|
54
|
+
[Entity.new(0, 4, :integer)]
|
55
55
|
)
|
56
56
|
end
|
57
57
|
|
58
58
|
it 'can handle words and integers' do
|
59
59
|
Tokenizer.new('Hello, I am 18').tokens.must_equal(
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
[Entity.new(0, 5, :letter),
|
61
|
+
Entity.new(5, 6, :spunct),
|
62
|
+
Entity.new(6, 7, :separ),
|
63
|
+
Entity.new(7, 8, :letter),
|
64
|
+
Entity.new(8, 9, :separ),
|
65
|
+
Entity.new(9, 11, :letter),
|
66
|
+
Entity.new(11, 12, :separ),
|
67
|
+
Entity.new(12, 14, :integer)]
|
68
68
|
)
|
69
69
|
end
|
70
70
|
|
71
71
|
it 'can handle multi-line paragraphs' do
|
72
72
|
Tokenizer.new("Brateeshka..!\n\nPrines!").tokens.must_equal(
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
[Entity.new(0, 10, :letter),
|
74
|
+
Entity.new(10, 12, :punct),
|
75
|
+
Entity.new(12, 13, :punct),
|
76
|
+
Entity.new(13, 15, :break),
|
77
|
+
Entity.new(15, 21, :letter),
|
78
|
+
Entity.new(21, 22, :punct)]
|
79
79
|
)
|
80
80
|
end
|
81
81
|
|
82
82
|
it 'can handle separated integers' do
|
83
83
|
Tokenizer.new('228/359').tokens.must_equal(
|
84
|
-
|
85
|
-
|
86
|
-
|
84
|
+
[Entity.new(0, 3, :integer),
|
85
|
+
Entity.new(3, 4, :separ),
|
86
|
+
Entity.new(4, 7, :integer)]
|
87
87
|
)
|
88
88
|
end
|
89
89
|
|
90
90
|
it 'can deal with Russian language' do
|
91
91
|
Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
92
|
+
[Entity.new(0, 8, :letter),
|
93
|
+
Entity.new(8, 9, :spunct),
|
94
|
+
Entity.new(9, 10, :separ),
|
95
|
+
Entity.new(10, 11, :letter),
|
96
|
+
Entity.new(11, 12, :separ),
|
97
|
+
Entity.new(12, 16, :letter),
|
98
|
+
Entity.new(16, 17, :separ),
|
99
|
+
Entity.new(17, 25, :letter),
|
100
|
+
Entity.new(25, 26, :separ),
|
101
|
+
Entity.new(26, 32, :letter),
|
102
|
+
Entity.new(32, 33, :punct)]
|
103
|
+
)
|
104
104
|
end
|
105
105
|
end
|
106
106
|
end
|
metadata
CHANGED
@@ -1,81 +1,82 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0.
|
4
|
+
version: 0.1.0.rc6
|
5
5
|
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
|
-
- Dmitry
|
8
|
+
- Dmitry Ustalov
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name: rake
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '0'
|
20
|
+
none: false
|
22
21
|
type: :development
|
23
|
-
|
22
|
+
name: rake
|
24
23
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
24
|
requirements:
|
27
25
|
- - ! '>='
|
28
26
|
- !ruby/object:Gem::Version
|
29
27
|
version: '0'
|
28
|
+
none: false
|
29
|
+
prerelease: false
|
30
30
|
- !ruby/object:Gem::Dependency
|
31
|
-
name: minitest
|
32
31
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
32
|
requirements:
|
35
33
|
- - ! '>='
|
36
34
|
- !ruby/object:Gem::Version
|
37
35
|
version: '2.11'
|
36
|
+
none: false
|
38
37
|
type: :development
|
39
|
-
|
38
|
+
name: minitest
|
40
39
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
40
|
requirements:
|
43
41
|
- - ! '>='
|
44
42
|
- !ruby/object:Gem::Version
|
45
43
|
version: '2.11'
|
44
|
+
none: false
|
45
|
+
prerelease: false
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
|
-
name: simplecov
|
48
47
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
48
|
requirements:
|
51
49
|
- - ! '>='
|
52
50
|
- !ruby/object:Gem::Version
|
53
51
|
version: '0'
|
52
|
+
none: false
|
54
53
|
type: :development
|
55
|
-
|
54
|
+
name: simplecov
|
56
55
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
56
|
requirements:
|
59
57
|
- - ! '>='
|
60
58
|
- !ruby/object:Gem::Version
|
61
59
|
version: '0'
|
60
|
+
none: false
|
61
|
+
prerelease: false
|
62
62
|
- !ruby/object:Gem::Dependency
|
63
|
-
name: yard
|
64
63
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
64
|
requirements:
|
67
65
|
- - ! '>='
|
68
66
|
- !ruby/object:Gem::Version
|
69
67
|
version: '0'
|
68
|
+
none: false
|
70
69
|
type: :development
|
71
|
-
|
70
|
+
name: yard
|
72
71
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
72
|
requirements:
|
75
73
|
- - ! '>='
|
76
74
|
- !ruby/object:Gem::Version
|
77
75
|
version: '0'
|
78
|
-
|
76
|
+
none: false
|
77
|
+
prerelease: false
|
78
|
+
description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
|
79
|
+
written in Ruby.
|
79
80
|
email:
|
80
81
|
- dmitry@eveel.ru
|
81
82
|
executables: []
|
@@ -105,26 +106,26 @@ rdoc_options: []
|
|
105
106
|
require_paths:
|
106
107
|
- lib
|
107
108
|
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
-
none: false
|
109
109
|
requirements:
|
110
110
|
- - ! '>='
|
111
111
|
- !ruby/object:Gem::Version
|
112
|
+
hash: 2757695902770698935
|
112
113
|
version: '0'
|
113
114
|
segments:
|
114
115
|
- 0
|
115
|
-
hash: 1130932854600612903
|
116
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
116
|
none: false
|
117
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
118
|
requirements:
|
119
119
|
- - ! '>'
|
120
120
|
- !ruby/object:Gem::Version
|
121
121
|
version: 1.3.1
|
122
|
+
none: false
|
122
123
|
requirements: []
|
123
124
|
rubyforge_project: greeb
|
124
125
|
rubygems_version: 1.8.24
|
125
126
|
signing_key:
|
126
127
|
specification_version: 3
|
127
|
-
summary: Greeb is a simple regexp-based tokenizer.
|
128
|
+
summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
|
128
129
|
test_files:
|
129
130
|
- spec/segmentator_spec.rb
|
130
131
|
- spec/spec_helper.rb
|