greeb 0.1.0.rc7 → 0.1.0.rc8
- data/README.md +9 -12
- data/lib/greeb.rb +18 -0
- data/lib/greeb/segmentator.rb +76 -81
- data/lib/greeb/tokenizer.rb +57 -88
- data/lib/greeb/version.rb +1 -1
- data/spec/segmentator_spec.rb +30 -47
- data/spec/tokenizer_spec.rb +9 -33
- metadata +3 -3
data/README.md
CHANGED
@@ -1,8 +1,8 @@
 Greeb
 =====
 
-Greeb is a simple yet awesome
-expressions.
+Greeb is a simple yet awesome and Unicode-aware text segmentator
+that is based on regular expressions.
 
 ## Installation
 
@@ -25,7 +25,7 @@ Or install it yourself as:
 Greeb can help you to solve simple text processing problems:
 
 ```ruby
-pp Greeb::Tokenizer.
+pp Greeb::Tokenizer.tokenize('Hello!')
 =begin
 [#<struct Greeb::Entity from=0, to=5, type=:letter>,
  #<struct Greeb::Entity from=5, to=6, type=:punct>]
@@ -41,7 +41,7 @@ Hello! I am 18! My favourite number is 133.7...
 What about you?
 EOF
 
-pp Greeb::Tokenizer.
+pp Greeb::Tokenizer.tokenize(text)
 =begin
 [#<struct Greeb::Entity from=0, to=5, type=:letter>,
  #<struct Greeb::Entity from=5, to=6, type=:punct>,
@@ -79,8 +79,8 @@ such as sentence detection tasks:
 
 ```ruby
 text = 'Hello! How are you?'
-
-pp Greeb::Segmentator.new(
+tokens = Greeb::Tokenizer.tokenize(text)
+pp Greeb::Segmentator.new(tokens).sentences
 =begin
 [#<struct Greeb::Entity from=0, to=6, type=:sentence>,
  #<struct Greeb::Entity from=7, to=19, type=:sentence>]
@@ -92,9 +92,9 @@ segmentator:
 
 ```ruby
 text = 'Hello! How are you?'
-
-sentences = Greeb::Segmentator.new(
-pp segmentator.extract(
+tokens = Greeb::Tokenizer.tokenize(text)
+sentences = Greeb::Segmentator.new(tokens).sentences
+pp segmentator.extract(sentences)
 =begin
 {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
   [#<struct Greeb::Entity from=0, to=5, type=:letter>,
@@ -127,9 +127,6 @@ punctuation), and `:break`.
 4. Push to the branch (`git push origin my-new-feature`);
 5. Create new Pull Request.
 
-I highly recommend you to use git flow to make development process much
-systematic and awesome.
-
 ## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
 
 ## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
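Taken together, the README changes above describe the new rc8 pipeline: tokenize first, then hand the resulting token array to the segmentator. A minimal end-to-end sketch of that pipeline (the output shapes are those claimed by the README excerpts above):

```ruby
require 'greeb'
require 'pp'

text = 'Hello! How are you?'

# rc8 exposes tokenization as a module-level call that returns
# a plain Array of Greeb::Entity structs.
tokens = Greeb::Tokenizer.tokenize(text)

# The segmentator is now built from that token array, not from
# a tokenizer instance.
segmentator = Greeb::Segmentator.new(tokens)

# Sentence boundaries, as entities of type :sentence.
sentences = segmentator.sentences

# extract now takes the sentence set explicitly and returns a Hash
# mapping each sentence entity to the tokens it covers.
pp segmentator.extract(sentences)
```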
data/lib/greeb.rb
CHANGED
@@ -22,6 +22,24 @@ class Greeb::Entity < Struct.new(:from, :to, :type)
   end
 end
 
+# This runtime error appears when {Greeb::Tokenizer} or
+# {Greeb::Segmentator} tries to recognize unknown character.
+#
+class Greeb::UnknownEntity < RuntimeError
+  attr_reader :text, :pos
+
+  # @private
+  def initialize(text, pos)
+    @text, @pos = text, pos
+  end
+
+  # Generate the real error message.
+  #
+  def to_s
+    'Could not recognize character "%s" @ %d' % [text[pos], pos]
+  end
+end
+
 require 'greeb/strscan'
 require 'greeb/tokenizer'
 require 'greeb/segmentator'
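With `Greeb::UnknownEntity` promoted to the top level of the library (it previously lived inside the tokenizer, as the tokenizer.rb diff below shows), callers can rescue it directly. A minimal sketch, using only the attributes the class defines:

```ruby
require 'greeb'

begin
  tokens = Greeb::Tokenizer.tokenize('Hello!')
rescue Greeb::UnknownEntity => e
  # #to_s builds the message from the exposed attributes:
  #   'Could not recognize character "%s" @ %d' % [e.text[e.pos], e.pos]
  warn e.to_s
  warn "offending position: #{e.pos}"
end
```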
data/lib/greeb/segmentator.rb
CHANGED
@@ -13,15 +13,10 @@ class Greeb::Segmentator
 
   # Create a new instance of {Greeb::Segmentator}.
   #
-  # @param
-  #
-
-
-    @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
-                tokenizer_or_tokens.tokens
-              else
-                tokenizer_or_tokens
-              end
+  # @param tokens [Array<Greeb::Entity>] tokens from [Greeb::Tokenizer].
+  #
+  def initialize tokens
+    @tokens = tokens
   end
 
   # Sentences memoization method.
@@ -49,7 +44,7 @@ class Greeb::Segmentator
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
   #   sentences as keys and tokens arrays as values.
   #
-  def extract
+  def extract sentences
     Hash[
       sentences.map do |s|
         [s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
@@ -64,7 +59,7 @@ class Greeb::Segmentator
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
   #   sentences as keys and subsentences arrays as values.
   #
-  def subextract
+  def subextract sentences
     Hash[
       sentences.map do |s|
         [s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
@@ -73,90 +68,90 @@ class Greeb::Segmentator
   end
 
   protected
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    end
-
-      sentence
+  # Implementation of the sentence detection method. This method
+  # changes the `@sentences` ivar.
+  #
+  # @return [nil] nothing.
+  #
+  def detect_sentences!
+    @sentences = []
+
+    rest = tokens.inject(new_sentence) do |sentence, token|
+      if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
+        next sentence
+      end
+
+      sentence.from = token.from unless sentence.from
+
+      next sentence if sentence.to and sentence.to > token.to
+
+      if :punct == token.type
+        sentence.to = tokens.
+          select { |t| t.from >= token.from }.
+          inject(token) { |r, t| break r if t.type != token.type; t }.
+          to
+
+        @sentences << sentence
+        sentence = new_sentence
+      elsif :separ != token.type
+        sentence.to = token.to
       end
 
-
+      sentence
     end
 
-
-
-    #
-    # @return [nil] nothing.
-    #
-    def detect_subsentences!
-      @subsentences = SortedSet.new
+    nil.tap { @sentences << rest if rest.from and rest.to }
+  end
 
-
-
-
-
+  # Implementation of the subsentence detection method. This method
+  # changes the `@subsentences` ivar.
+  #
+  # @return [nil] nothing.
+  #
+  def detect_subsentences!
+    @subsentences = SortedSet.new
 
-
+    rest = tokens.inject(new_subsentence) do |subsentence, token|
+      if !subsentence.from and SENTENCE_DOESNT_START.include?(token.type)
+        next subsentence
+      end
 
-
+      subsentence.from = token.from unless subsentence.from
 
-
-      subsentence.to = tokens.
-        select { |t| t.from >= token.from }.
-        inject(token) { |r, t| break r if t.type != token.type; t }.
-        to
+      next subsentence if subsentence.to and subsentence.to > token.to
 
-
-
-
-
-
+      if [:punct, :spunct].include? token.type
+        subsentence.to = tokens.
+          select { |t| t.from >= token.from }.
+          inject(token) { |r, t| break r if t.type != token.type; t }.
+          to
 
-      subsentence
+        @subsentences << subsentence
+        subsentence = new_subsentence
+      elsif :separ != token.type
+        subsentence.to = token.to
       end
 
-
+      subsentence
     end
 
+    nil.tap { @subsentences << rest if rest.from and rest.to }
+  end
+
   private
-
-
-
-
-
-
-
+  # Create a new instance of {Greeb::Entity} with `:sentence` type.
+  #
+  # @return [Greeb::Entity] a new entity instance.
+  #
+  def new_sentence
+    Greeb::Entity.new(nil, nil, :sentence)
+  end
 
-
-
-
-
-
-
-
+  # Create a new instance of {Greeb::Entity} with `:subsentence` type.
+  #
+  # @return [Greeb::Entity] a new entity instance.
+  #
+  def new_subsentence
+    Greeb::Entity.new(nil, nil, :subsentence)
+  end
 end
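The practical effect of the segmentator rewrite: the constructor takes a plain token array, and `extract`/`subextract` are no longer zero-argument, since both now receive the sentence set explicitly. A short sketch under the rc8 semantics shown above (the hash shapes match the spec expectations further down):

```ruby
require 'greeb'

tokens = Greeb::Tokenizer.tokenize('Hello, I am JC Denton.')
segmentator = Greeb::Segmentator.new(tokens)

# Memoized sentence detection over the token array.
sentences = segmentator.sentences

# Sentence entity => tokens whose spans fall inside it.
tokens_by_sentence = segmentator.extract(sentences)

# Sentence entity => subsentences, split on :punct and :spunct runs.
subsentences_by_sentence = segmentator.subextract(sentences)
```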
data/lib/greeb/tokenizer.rb
CHANGED
@@ -5,24 +5,9 @@
 # Unicode character categories been obtained from
 # <http://www.fileformat.info/info/unicode/category/index.htm>.
 #
-
-#
-
-#
-class UnknownEntity < RuntimeError
-  attr_reader :text, :pos
-
-  # @private
-  def initialize(text, pos)
-    @text, @pos = text, pos
-  end
-
-  # Generate the real error message.
-  #
-  def to_s
-    'Could not recognize character "%s" @ %d' % [text[pos], pos]
-  end
-end
+module Greeb::Tokenizer
+  # http://www.youtube.com/watch?v=eF1lU-CrQfc
+  extend self
 
   # English and Russian letters.
   #
@@ -56,83 +41,67 @@ class Greeb::Tokenizer
   #
   RESIDUALS = /[\p{C}\p{M}\p{Sk}]+/u
 
-
-  protected :scanner
-
-  # Create a new instance of {Greeb::Tokenizer}.
+  # Perform the tokenization process.
   #
-  # @
+  # @return [Array<Greeb::Entity>] a set of tokens.
   #
-  def
-
+  def tokenize text
+    scanner = Greeb::StringScanner.new(text)
+    tokens = []
+    while !scanner.eos?
+      parse! scanner, tokens, LETTERS, :letter or
+      parse! scanner, tokens, FLOATS, :float or
+      parse! scanner, tokens, INTEGERS, :integer or
+      split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
+      split_parse! scanner, tokens, PUNCTUATIONS, :punct or
+      split_parse! scanner, tokens, SEPARATORS, :separ or
+      split_parse! scanner, tokens, BREAKS, :break or
+      parse! scanner, tokens, RESIDUALS, :residual or
+      raise Greeb::UnknownEntity.new(text, scanner.char_pos)
+    end
+    tokens
+  ensure
+    scanner.terminate
   end
 
-
+  private
+  # Try to parse one small piece of text that is covered by pattern
+  # of necessary type.
   #
-  # @
+  # @param scanner [Greeb::StringScanner] string scanner.
+  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param pattern [Regexp] a regular expression to extract the token.
+  # @param type [Symbol] a symbol that represents the necessary token
+  #   type.
   #
-
-
-
+  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  #
+  def parse! scanner, tokens, pattern, type
+    return false unless token = scanner.scan(pattern)
+    position = scanner.char_pos
+    tokens << Greeb::Entity.new(position - token.length,
+                                position,
+                                type)
   end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      raise UnknownEntity.new(text, scanner.char_pos)
-    end
-  ensure
-    scanner.terminate
-  end
-
-  # Try to parse one small piece of text that is covered by pattern
-  # of necessary type.
-  #
-  # @param pattern [Regexp] a regular expression to extract the token.
-  # @param type [Symbol] a symbol that represents the necessary token
-  #   type.
-  #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
-  #
-  def parse! pattern, type
-    return false unless token = scanner.scan(pattern)
-    position = scanner.char_pos
-    @tokens << Greeb::Entity.new(position - token.length,
-                                 position,
-                                 type)
-  end
-
-  # Try to parse one small piece of text that is covered by pattern
-  # of necessary type. This method performs grouping of the same
-  # characters.
-  #
-  # @param pattern [Regexp] a regular expression to extract the token.
-  # @param type [Symbol] a symbol that represents the necessary token
-  #   type.
-  #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
-  #
-  def split_parse! pattern, type
-    return false unless token = scanner.scan(pattern)
-    position = scanner.char_pos - token.length
-    token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
-      @tokens << Greeb::Entity.new(before, before + s.length, type)
-      before + s.length
-    end
+  # Try to parse one small piece of text that is covered by pattern
+  # of necessary type. This method performs grouping of the same
+  # characters.
+  #
+  # @param scanner [Greeb::StringScanner] string scanner.
+  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param pattern [Regexp] a regular expression to extract the token.
+  # @param type [Symbol] a symbol that represents the necessary token
+  #   type.
+  #
+  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  #
+  def split_parse! scanner, tokens, pattern, type
+    return false unless token = scanner.scan(pattern)
+    position = scanner.char_pos - token.length
+    token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
+      tokens << Greeb::Entity.new(before, before + s.length, type)
+      before + s.length
     end
+  end
 end
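The tokenizer change is the breaking one for callers: `Greeb::Tokenizer` is now a stateless module (`extend self`) instead of a class that kept `@text`, `@tokens`, and a scanner, so the scanner and result array are threaded through `parse!`/`split_parse!` as arguments. A migration sketch; the rc7 form is inferred from the old spec lines removed in data/spec/tokenizer_spec.rb below:

```ruby
require 'greeb'

# rc7 (old, inferred): instantiate, then read the memoized tokens.
# tokens = Greeb::Tokenizer.new('Hello!').tokens

# rc8 (new): a single stateless module call returning the token array.
tokens = Greeb::Tokenizer.tokenize('Hello!')
```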
data/lib/greeb/version.rb
CHANGED
data/spec/segmentator_spec.rb
CHANGED
@@ -5,16 +5,11 @@ require File.expand_path('../spec_helper', __FILE__)
 module Greeb
   describe Segmentator do
     describe 'initialization' do
-
+      let(:tokens) { Tokenizer.tokenize('Vodka') }
 
-      subject { Segmentator.new(
+      subject { Segmentator.new(tokens) }
 
-      it '
-        subject.tokens.must_be_kind_of Array
-      end
-
-      it 'can be initialized either with a set of tokens' do
-        subject = Segmentator.new(@tokenizer.tokens)
+      it 'is initialized either with set of tokens' do
         subject.tokens.must_be_kind_of Array
       end
 
@@ -24,75 +19,64 @@ module Greeb
     end
 
     describe 'a simple sentence' do
-
+      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
 
-      subject { Segmentator.new(
+      subject { Segmentator.new(tokens).sentences }
 
       it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(0, 22, :sentence)]
-        )
+        subject.must_equal([Entity.new(0, 22, :sentence)])
       end
     end
 
     describe 'a simple sentence without punctuation' do
-
+      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
 
-      subject { Segmentator.new(
+      subject { Segmentator.new(tokens).sentences }
 
       it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(0, 21, :sentence)]
-        )
+        subject.must_equal([Entity.new(0, 21, :sentence)])
       end
     end
 
     describe 'a simple sentence with trailing whitespaces' do
-
+      let(:tokens) { Tokenizer.tokenize(' Hello, I am JC Denton ') }
 
-      subject { Segmentator.new(
+      subject { Segmentator.new(tokens).sentences }
 
       it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(6, 27, :sentence)]
-        )
+        subject.must_equal([Entity.new(6, 27, :sentence)])
      end
    end
 
    describe 'two simple sentences' do
-
+      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
 
-      subject { Segmentator.new(
+      subject { Segmentator.new(tokens).sentences }
 
      it 'should be segmented' do
-        subject.must_equal(
-
-          Entity.new(7, 22, :sentence)]
-        )
+        subject.must_equal([Entity.new(0, 6, :sentence),
+                            Entity.new(7, 22, :sentence)])
      end
    end
 
    describe 'one wrong character and one simple sentence' do
-
+      let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
 
-      subject { Segmentator.new(
+      subject { Segmentator.new(tokens).sentences }
 
      it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(2, 17, :sentence)]
-        )
+        subject.must_equal([Entity.new(2, 17, :sentence)])
      end
    end
 
-    describe '
-
+    describe 'sentence extractor' do
+      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+      let(:segmentator) { Segmentator.new(tokens) }
 
-      subject {
-
-      let(:sentences) { subject.sentences }
+      subject { segmentator.extract(segmentator.sentences) }
 
      it 'should be extracted' do
-        subject.
+        subject.must_equal(
          Entity.new(0, 6, :sentence) => [
            Entity.new(0, 5, :letter),
            Entity.new(5, 6, :punct)
@@ -107,24 +91,23 @@ module Greeb
           Entity.new(15, 21, :letter),
           Entity.new(21, 22, :punct)
         ]
-
+        )
      end
    end
 
    describe 'subsentence extractor' do
-
-
-      subject { Segmentator.new(@tokenizer) }
+      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+      let(:segmentator) { Segmentator.new(tokens) }
 
-
+      subject { segmentator.subextract(segmentator.sentences) }
 
      it 'should extract subsentences' do
-        subject.
+        subject.must_equal(
          Entity.new(0, 22, :sentence) => [
            Entity.new(0, 6, :subsentence),
            Entity.new(7, 22, :subsentence)
          ]
-
+        )
      end
    end
  end
data/spec/tokenizer_spec.rb
CHANGED
@@ -4,59 +4,35 @@ require File.expand_path('../spec_helper', __FILE__)
 
 module Greeb
   describe Tokenizer do
-    describe 'initialization' do
-      subject { Tokenizer.new('vodka') }
-
-      it 'should be initialized with a text' do
-        subject.text.must_equal 'vodka'
-      end
-
-      it 'should has the @text ivar' do
-        subject.instance_variable_get(:@text).must_equal 'vodka'
-      end
-
-      it 'should not has @tokens ivar' do
-        subject.instance_variable_get(:@tokens).must_be_nil
-      end
-    end
-
     describe 'after tokenization' do
-      subject { Tokenizer.
-
-      it 'should has the @tokens ivar' do
-        subject.instance_variable_get(:@tokens).wont_be_nil
-      end
-
-      it 'should has the @scanner ivar' do
-        subject.instance_variable_get(:@scanner).wont_be_nil
-      end
+      subject { Tokenizer.tokenize('vodka') }
 
      it 'should has the tokens set' do
-        subject.
+        subject.must_be_kind_of Array
      end
    end
 
    describe 'tokenization facilities' do
      it 'can handle words' do
-        Tokenizer.
+        Tokenizer.tokenize('hello').must_equal(
          [Entity.new(0, 5, :letter)]
        )
      end
 
      it 'can handle floats' do
-        Tokenizer.
+        Tokenizer.tokenize('14.88').must_equal(
          [Entity.new(0, 5, :float)]
        )
      end
 
      it 'can handle integers' do
-        Tokenizer.
+        Tokenizer.tokenize('1337').must_equal(
          [Entity.new(0, 4, :integer)]
        )
      end
 
      it 'can handle words and integers' do
-        Tokenizer.
+        Tokenizer.tokenize('Hello, I am 18').must_equal(
          [Entity.new(0, 5, :letter),
           Entity.new(5, 6, :spunct),
           Entity.new(6, 7, :separ),
@@ -69,7 +45,7 @@ module Greeb
      end
 
      it 'can handle multi-line paragraphs' do
-        Tokenizer.
+        Tokenizer.tokenize("Brateeshka..!\n\nPrines!").must_equal(
          [Entity.new(0, 10, :letter),
           Entity.new(10, 12, :punct),
           Entity.new(12, 13, :punct),
@@ -80,7 +56,7 @@ module Greeb
      end
 
      it 'can handle separated integers' do
-        Tokenizer.
+        Tokenizer.tokenize('228/359').must_equal(
          [Entity.new(0, 3, :integer),
           Entity.new(3, 4, :separ),
           Entity.new(4, 7, :integer)]
@@ -88,7 +64,7 @@ module Greeb
      end
 
      it 'can deal with Russian language' do
-        Tokenizer.
+        Tokenizer.tokenize('Братишка, я тебе покушать принёс!').must_equal(
          [Entity.new(0, 8, :letter),
           Entity.new(8, 9, :spunct),
           Entity.new(9, 10, :separ),
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.1.0.rc7
+  version: 0.1.0.rc8
 prerelease: 6
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-
+date: 2012-12-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -109,7 +109,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
-      hash:
+      hash: -482579338117024513
       version: '0'
       segments:
      - 0