greeb 0.2.2.rc1 → 0.2.2.rc2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +52 -52
- data/bin/greeb +2 -2
- data/lib/greeb.rb +2 -39
- data/lib/greeb/core.rb +13 -12
- data/lib/greeb/exceptions.rb +17 -0
- data/lib/greeb/parser.rb +20 -7
- data/lib/greeb/segmentator.rb +38 -40
- data/lib/greeb/span.rb +36 -0
- data/lib/greeb/tokenizer.rb +11 -11
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +31 -33
- data/spec/parser_spec.rb +42 -30
- data/spec/segmentator_spec.rb +81 -83
- data/spec/span_spec.rb +63 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/tokenizer_spec.rb +76 -78
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 043ee2da87958a027caf792058ae1e3e44cc9684
|
4
|
+
data.tar.gz: 8f4a99b26f706badb15fd9e9d5533dd162e090e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e950167615138975bc9873a729f2486eb506692fcdaefdd3aa828590d261da0d336e04e481c652036df892da305269a047ab076622c315bd17b3c015990dcba7
|
7
|
+
data.tar.gz: 7642fa3892694606792db842b0ea22a8ba13800b71a3e72eaff93d41ab0548f90a6881298aa290a61110941de350ef9941871e7497fda26b4d1e316e3688c50b
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Greeb
|
2
2
|
Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
|
3
3
|
that is based on regular expressions. API documentation is available at
|
4
|
-
<
|
4
|
+
<http://rubydoc.info/github/dmchk/greeb/master/frames>.
|
5
5
|
|
6
6
|
## Installation
|
7
7
|
Add this line to your application's Gemfile:
|
@@ -43,8 +43,8 @@ Greeb has a very convinient API that makes you happy.
|
|
43
43
|
```ruby
|
44
44
|
pp Greeb::Tokenizer.tokenize('Hello!')
|
45
45
|
=begin
|
46
|
-
[#<struct Greeb::
|
47
|
-
#<struct Greeb::
|
46
|
+
[#<struct Greeb::Span from=0, to=5, type=:letter>,
|
47
|
+
#<struct Greeb::Span from=5, to=6, type=:punct>]
|
48
48
|
=end
|
49
49
|
```
|
50
50
|
|
@@ -59,34 +59,34 @@ EOF
|
|
59
59
|
|
60
60
|
pp Greeb::Tokenizer.tokenize(text)
|
61
61
|
=begin
|
62
|
-
[#<struct Greeb::
|
63
|
-
#<struct Greeb::
|
64
|
-
#<struct Greeb::
|
65
|
-
#<struct Greeb::
|
66
|
-
#<struct Greeb::
|
67
|
-
#<struct Greeb::
|
68
|
-
#<struct Greeb::
|
69
|
-
#<struct Greeb::
|
70
|
-
#<struct Greeb::
|
71
|
-
#<struct Greeb::
|
72
|
-
#<struct Greeb::
|
73
|
-
#<struct Greeb::
|
74
|
-
#<struct Greeb::
|
75
|
-
#<struct Greeb::
|
76
|
-
#<struct Greeb::
|
77
|
-
#<struct Greeb::
|
78
|
-
#<struct Greeb::
|
79
|
-
#<struct Greeb::
|
80
|
-
#<struct Greeb::
|
81
|
-
#<struct Greeb::
|
82
|
-
#<struct Greeb::
|
83
|
-
#<struct Greeb::
|
84
|
-
#<struct Greeb::
|
85
|
-
#<struct Greeb::
|
86
|
-
#<struct Greeb::
|
87
|
-
#<struct Greeb::
|
88
|
-
#<struct Greeb::
|
89
|
-
#<struct Greeb::
|
62
|
+
[#<struct Greeb::Span from=0, to=5, type=:letter>,
|
63
|
+
#<struct Greeb::Span from=5, to=6, type=:punct>,
|
64
|
+
#<struct Greeb::Span from=6, to=7, type=:space>,
|
65
|
+
#<struct Greeb::Span from=7, to=8, type=:letter>,
|
66
|
+
#<struct Greeb::Span from=8, to=9, type=:space>,
|
67
|
+
#<struct Greeb::Span from=9, to=11, type=:letter>,
|
68
|
+
#<struct Greeb::Span from=11, to=12, type=:space>,
|
69
|
+
#<struct Greeb::Span from=12, to=14, type=:integer>,
|
70
|
+
#<struct Greeb::Span from=14, to=15, type=:punct>,
|
71
|
+
#<struct Greeb::Span from=15, to=16, type=:space>,
|
72
|
+
#<struct Greeb::Span from=16, to=18, type=:letter>,
|
73
|
+
#<struct Greeb::Span from=18, to=19, type=:space>,
|
74
|
+
#<struct Greeb::Span from=19, to=28, type=:letter>,
|
75
|
+
#<struct Greeb::Span from=28, to=29, type=:space>,
|
76
|
+
#<struct Greeb::Span from=29, to=35, type=:letter>,
|
77
|
+
#<struct Greeb::Span from=35, to=36, type=:space>,
|
78
|
+
#<struct Greeb::Span from=36, to=38, type=:letter>,
|
79
|
+
#<struct Greeb::Span from=38, to=39, type=:space>,
|
80
|
+
#<struct Greeb::Span from=39, to=44, type=:float>,
|
81
|
+
#<struct Greeb::Span from=44, to=47, type=:punct>,
|
82
|
+
#<struct Greeb::Span from=47, to=49, type=:break>,
|
83
|
+
#<struct Greeb::Span from=49, to=53, type=:letter>,
|
84
|
+
#<struct Greeb::Span from=53, to=54, type=:space>,
|
85
|
+
#<struct Greeb::Span from=54, to=59, type=:letter>,
|
86
|
+
#<struct Greeb::Span from=59, to=60, type=:space>,
|
87
|
+
#<struct Greeb::Span from=60, to=63, type=:letter>,
|
88
|
+
#<struct Greeb::Span from=63, to=64, type=:punct>,
|
89
|
+
#<struct Greeb::Span from=64, to=65, type=:break>]
|
90
90
|
=end
|
91
91
|
```
|
92
92
|
|
@@ -99,8 +99,8 @@ text = 'Hello! How are you?'
|
|
99
99
|
tokens = Greeb::Tokenizer.tokenize(text)
|
100
100
|
pp Greeb::Segmentator.new(tokens).sentences
|
101
101
|
=begin
|
102
|
-
[#<struct Greeb::
|
103
|
-
#<struct Greeb::
|
102
|
+
[#<struct Greeb::Span from=0, to=6, type=:sentence>,
|
103
|
+
#<struct Greeb::Span from=7, to=19, type=:sentence>]
|
104
104
|
=end
|
105
105
|
```
|
106
106
|
|
@@ -113,21 +113,21 @@ tokens = Greeb::Tokenizer.tokenize(text)
|
|
113
113
|
segmentator = Greeb::Segmentator.new(tokens)
|
114
114
|
pp segmentator.extract(segmentator.sentences)
|
115
115
|
=begin
|
116
|
-
{#<struct Greeb::
|
117
|
-
[#<struct Greeb::
|
118
|
-
#<struct Greeb::
|
119
|
-
#<struct Greeb::
|
120
|
-
[#<struct Greeb::
|
121
|
-
#<struct Greeb::
|
122
|
-
#<struct Greeb::
|
123
|
-
#<struct Greeb::
|
124
|
-
#<struct Greeb::
|
125
|
-
#<struct Greeb::
|
116
|
+
{#<struct Greeb::Span from=0, to=6, type=:sentence>=>
|
117
|
+
[#<struct Greeb::Span from=0, to=5, type=:letter>,
|
118
|
+
#<struct Greeb::Span from=5, to=6, type=:punct>],
|
119
|
+
#<struct Greeb::Span from=7, to=19, type=:sentence>=>
|
120
|
+
[#<struct Greeb::Span from=7, to=10, type=:letter>,
|
121
|
+
#<struct Greeb::Span from=10, to=11, type=:space>,
|
122
|
+
#<struct Greeb::Span from=11, to=14, type=:letter>,
|
123
|
+
#<struct Greeb::Span from=14, to=15, type=:space>,
|
124
|
+
#<struct Greeb::Span from=15, to=18, type=:letter>,
|
125
|
+
#<struct Greeb::Span from=18, to=19, type=:punct>]}
|
126
126
|
=end
|
127
127
|
```
|
128
128
|
|
129
129
|
### Parsing API
|
130
|
-
Texts are often include some special
|
130
|
+
Texts often include some special spans such as URLs and e-mail
|
131
131
|
addresses. Greeb can help you retrieve these strings.
|
132
132
|
|
133
133
|
#### URL and E-mail retrieval
|
@@ -136,12 +136,12 @@ text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
|
|
136
136
|
|
137
137
|
pp Greeb::Parser.urls(text).map { |e| [e, text[e.from...e.to]] }
|
138
138
|
=begin
|
139
|
-
[[#<struct Greeb::
|
139
|
+
[[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
|
140
140
|
=end
|
141
141
|
|
142
142
|
pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
|
143
143
|
=begin
|
144
|
-
[[#<struct Greeb::
|
144
|
+
[[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
|
145
145
|
=end
|
146
146
|
```
|
147
147
|
|
@@ -153,7 +153,7 @@ text = 'Hello, G.L.H.F. everyone!'
|
|
153
153
|
|
154
154
|
pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
|
155
155
|
=begin
|
156
|
-
[[#<struct Greeb::
|
156
|
+
[[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
|
157
157
|
=end
|
158
158
|
```
|
159
159
|
|
@@ -161,13 +161,13 @@ The algorithm is not so accurate, but still useful in many practical
|
|
161
161
|
situations.
|
162
162
|
|
163
163
|
## Tokens
|
164
|
-
Greeb operates with
|
165
|
-
*from* is a beginning of the
|
166
|
-
and *kind* is a type of the
|
164
|
+
Greeb operates with spans, tuples of *(from, to, kind)*, where
|
165
|
+
*from* is a beginning of the span, *to* is an ending of the span,
|
166
|
+
and *kind* is a type of the span.
|
167
167
|
|
168
|
-
There are several
|
168
|
+
There are several span types at the tokenization stage: `:letter`,
|
169
169
|
`:float`, `:integer`, `:separ`, `:punct` (for punctuation), `:spunct`
|
170
|
-
(for in-sentence punctuation), and `:break`.
|
170
|
+
(for in-sentence punctuation), `:space`, and `:break`.
|
171
171
|
|
172
172
|
## Contributing
|
173
173
|
1. Fork it;
|
data/bin/greeb
CHANGED
@@ -8,6 +8,6 @@ require 'greeb'
|
|
8
8
|
|
9
9
|
text = STDIN.read.tap(&:chomp!)
|
10
10
|
|
11
|
-
Greeb[text].each do |
|
12
|
-
puts text[
|
11
|
+
Greeb[text].each do |span|
|
12
|
+
puts text[span.from...span.to] unless [:space, :break].include? span.type
|
13
13
|
end
|
data/lib/greeb.rb
CHANGED
@@ -1,45 +1,8 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'greeb/version'
|
4
|
-
|
5
|
-
|
6
|
-
# *from* is a beginning of the entity, *to* is an ending of the entity,
|
7
|
-
# and *kind* is a type of the entity.
|
8
|
-
#
|
9
|
-
# There are several entity types: `:letter`, `:float`, `:integer`,
|
10
|
-
# `:separ` for separators, `:punct` for punctuation characters,
|
11
|
-
# `:spunct` for in-sentence punctuation characters, and
|
12
|
-
# `:break` for line endings.
|
13
|
-
#
|
14
|
-
class Greeb::Entity < Struct.new(:from, :to, :type)
|
15
|
-
# @private
|
16
|
-
def <=> other
|
17
|
-
if (comparison = self.from <=> other.from) == 0
|
18
|
-
self.to <=> other.to
|
19
|
-
else
|
20
|
-
comparison
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# This runtime error appears when {Greeb::Tokenizer} or
|
26
|
-
# {Greeb::Segmentator} tries to recognize unknown character.
|
27
|
-
#
|
28
|
-
class Greeb::UnknownEntity < RuntimeError
|
29
|
-
attr_reader :text, :pos
|
30
|
-
|
31
|
-
# @private
|
32
|
-
def initialize(text, pos)
|
33
|
-
@text, @pos = text, pos
|
34
|
-
end
|
35
|
-
|
36
|
-
# Generate the real error message.
|
37
|
-
#
|
38
|
-
def to_s
|
39
|
-
'Could not recognize character "%s" @ %d' % [text[pos], pos]
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
4
|
+
require 'greeb/exceptions'
|
5
|
+
require 'greeb/span'
|
43
6
|
require 'greeb/strscan'
|
44
7
|
require 'greeb/tokenizer'
|
45
8
|
require 'greeb/segmentator'
|
data/lib/greeb/core.rb
CHANGED
@@ -13,13 +13,13 @@ module Greeb::Core
|
|
13
13
|
#
|
14
14
|
# @param text [String] input text.
|
15
15
|
#
|
16
|
-
# @return [Array<Greeb::
|
16
|
+
# @return [Array<Greeb::Span>] a set of tokens.
|
17
17
|
#
|
18
|
-
def analyze
|
18
|
+
def analyze(text, helpers = HELPERS)
|
19
19
|
Greeb::Tokenizer.tokenize(text).tap do |tokens|
|
20
|
-
|
20
|
+
helpers.each do |helper|
|
21
21
|
Greeb::Parser.public_send(helper, text).each do |parsed|
|
22
|
-
|
22
|
+
extract_spans(tokens, parsed)
|
23
23
|
end
|
24
24
|
end
|
25
25
|
end
|
@@ -28,17 +28,18 @@ module Greeb::Core
|
|
28
28
|
alias_method :'[]', :analyze
|
29
29
|
|
30
30
|
protected
|
31
|
-
# Extact
|
31
|
+
# Extract spans of the specified type from the input spans set.
|
32
32
|
#
|
33
|
-
# @param
|
34
|
-
# @param
|
33
|
+
# @param spans [Array<Greeb::Span>] input spans set.
|
34
|
+
# @param span [Greeb::Span] span to be extracted.
|
35
35
|
#
|
36
|
-
# @return [Greeb::
|
36
|
+
# @return [Greeb::Span] span to be extracted.
|
37
37
|
#
|
38
|
-
def
|
39
|
-
from =
|
40
|
-
to =
|
41
|
-
|
38
|
+
def extract_spans(spans, span)
|
39
|
+
from = spans.index { |e| e.from == span.from }
|
40
|
+
to = spans.index { |e| e.to == span.to }
|
41
|
+
return unless from && to
|
42
|
+
spans[from..to] = span
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# This runtime error appears when {Greeb::Tokenizer} or
|
2
|
+
# {Greeb::Segmentator} tries to recognize unknown character.
|
3
|
+
#
|
4
|
+
class Greeb::UnknownEntity < RuntimeError
|
5
|
+
attr_reader :text, :pos
|
6
|
+
|
7
|
+
# @private
|
8
|
+
def initialize(text, pos)
|
9
|
+
@text, @pos = text, pos
|
10
|
+
end
|
11
|
+
|
12
|
+
# Generate the real error message.
|
13
|
+
#
|
14
|
+
def to_s
|
15
|
+
'Could not recognize character "%s" @ %d' % [text[pos], pos]
|
16
|
+
end
|
17
|
+
end
|
data/lib/greeb/parser.rb
CHANGED
@@ -16,12 +16,15 @@ module Greeb::Parser
|
|
16
16
|
# Another horrible pattern. Now for abbreviations.
|
17
17
|
ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
|
18
18
|
|
19
|
+
# This pattern matches anything that looks like HTML. Or not.
|
20
|
+
HTML = /<(.*?)>/i
|
21
|
+
|
19
22
|
# Recognize URLs in the input text. Actually, URL is obsolete standard
|
20
23
|
# and this code should be rewritten to use the URI concept.
|
21
24
|
#
|
22
25
|
# @param text [String] input text.
|
23
26
|
#
|
24
|
-
# @return [Array<Greeb::
|
27
|
+
# @return [Array<Greeb::Span>] found URLs.
|
25
28
|
#
|
26
29
|
def urls(text)
|
27
30
|
scan(text, URL, :url)
|
@@ -31,7 +34,7 @@ module Greeb::Parser
|
|
31
34
|
#
|
32
35
|
# @param text [String] input text.
|
33
36
|
#
|
34
|
-
# @return [Array<Greeb::
|
37
|
+
# @return [Array<Greeb::Span>] found e-mail addresses.
|
35
38
|
#
|
36
39
|
def emails(text)
|
37
40
|
scan(text, EMAIL, :email)
|
@@ -41,27 +44,37 @@ module Greeb::Parser
|
|
41
44
|
#
|
42
45
|
# @param text [String] input text.
|
43
46
|
#
|
44
|
-
# @return [Array<Greeb::
|
47
|
+
# @return [Array<Greeb::Span>] found abbreviations.
|
45
48
|
#
|
46
49
|
def abbrevs(text)
|
47
50
|
scan(text, ABBREV, :abbrev)
|
48
51
|
end
|
49
52
|
|
53
|
+
# Recognize HTML-alike entities in the input text.
|
54
|
+
#
|
55
|
+
# @param text [String] input text.
|
56
|
+
#
|
57
|
+
# @return [Array<Greeb::Span>] found HTML entities.
|
58
|
+
#
|
59
|
+
def html(text)
|
60
|
+
scan(text, HTML, :html)
|
61
|
+
end
|
62
|
+
|
50
63
|
private
|
51
|
-
# Implementation of regexp-based {Greeb::
|
64
|
+
# Implementation of regexp-based {Greeb::Span} scanner.
|
52
65
|
#
|
53
66
|
# @param text [String] input text.
|
54
67
|
# @param regexp [Regexp] regular expression to be used.
|
55
|
-
# @param type [Symbol] type field for the new {Greeb::
|
68
|
+
# @param type [Symbol] type field for the new {Greeb::Span} instances.
|
56
69
|
# @param offset [Fixnum] offset of the next match.
|
57
70
|
#
|
58
|
-
# @return [Array<Greeb::
|
71
|
+
# @return [Array<Greeb::Span>] found entities.
|
59
72
|
#
|
60
73
|
def scan(text, regexp, type, offset = 0)
|
61
74
|
Array.new.tap do |matches|
|
62
75
|
while text and md = text.match(regexp)
|
63
76
|
start, stop = md.offset(0)
|
64
|
-
matches << Greeb::
|
77
|
+
matches << Greeb::Span.new(offset + start, offset + stop, type)
|
65
78
|
text, offset = text[stop..-1], offset + stop
|
66
79
|
end
|
67
80
|
end
|
data/lib/greeb/segmentator.rb
CHANGED
@@ -13,7 +13,7 @@ class Greeb::Segmentator
|
|
13
13
|
|
14
14
|
# Create a new instance of {Greeb::Segmentator}.
|
15
15
|
#
|
16
|
-
# @param tokens [Array<Greeb::
|
16
|
+
# @param tokens [Array<Greeb::Span>] tokens from {Greeb::Tokenizer}.
|
17
17
|
#
|
18
18
|
def initialize(tokens)
|
19
19
|
@tokens = tokens
|
@@ -21,62 +21,60 @@ class Greeb::Segmentator
|
|
21
21
|
|
22
22
|
# Sentences memoization method.
|
23
23
|
#
|
24
|
-
# @return [Array<Greeb::
|
24
|
+
# @return [Array<Greeb::Span>] a set of sentences.
|
25
25
|
#
|
26
26
|
def sentences
|
27
|
-
@sentences ||=
|
27
|
+
@sentences ||= detect_spans(new_sentence, [:punct])
|
28
28
|
end
|
29
29
|
|
30
30
|
# Subsentences memoization method.
|
31
31
|
#
|
32
|
-
# @return [Array<Greeb::
|
32
|
+
# @return [Array<Greeb::Span>] a set of subsentences.
|
33
33
|
#
|
34
34
|
def subsentences
|
35
|
-
@subsentences ||=
|
35
|
+
@subsentences ||= detect_spans(new_subsentence, [:punct, :spunct])
|
36
36
|
end
|
37
37
|
|
38
38
|
# Extract tokens from the set of sentences.
|
39
39
|
#
|
40
|
-
# @param sentences [Array<Greeb::
|
40
|
+
# @param sentences [Array<Greeb::Span>] a list of sentences.
|
41
41
|
#
|
42
|
-
# @return [
|
42
|
+
# @return [Array<Greeb::Span, Array<Greeb::Span>>] an array of pairs,
|
43
43
|
# each holding a sentence and the array of tokens it contains.
|
44
44
|
#
|
45
45
|
def extract(sentences, collection = tokens)
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
end
|
50
|
-
]
|
46
|
+
sentences.map do |s|
|
47
|
+
[s, collection.select { |t| t.from >= s.from and t.to <= s.to }]
|
48
|
+
end
|
51
49
|
end
|
52
50
|
|
53
51
|
protected
|
54
|
-
# Implementation of the
|
52
|
+
# Implementation of the span detection method.
|
55
53
|
#
|
56
|
-
# @param sample [Greeb::
|
54
|
+
# @param sample [Greeb::Span] a sample of span to be cloned in the
|
57
55
|
# process.
|
58
56
|
# @param stop_marks [Array<Symbol>] an array that stores the
|
59
|
-
# correspondent stop marks of the necessary
|
57
|
+
# correspondent stop marks of the necessary spans.
|
60
58
|
#
|
61
|
-
# @return [Array<Greeb::
|
59
|
+
# @return [Array<Greeb::Span>] a set of entities.
|
62
60
|
#
|
63
|
-
def
|
61
|
+
def detect_spans(sample, stop_marks)
|
64
62
|
collection = []
|
65
63
|
|
66
|
-
rest = tokens.inject(sample.dup) do |
|
67
|
-
next
|
68
|
-
|
69
|
-
next
|
64
|
+
rest = tokens.inject(sample.dup) do |span, token|
|
65
|
+
next span if sentence_aint_start? span, token
|
66
|
+
span.from = token.from unless span.from
|
67
|
+
next span if span.to and span.to > token.to
|
70
68
|
|
71
69
|
if stop_marks.include? token.type
|
72
|
-
|
73
|
-
collection <<
|
74
|
-
|
70
|
+
span.to = find_forward(tokens, token).to
|
71
|
+
collection << span
|
72
|
+
span = sample.dup
|
75
73
|
elsif ![:separ, :space].include? token.type
|
76
|
-
|
74
|
+
span.to = token.to
|
77
75
|
end
|
78
76
|
|
79
|
-
|
77
|
+
span
|
80
78
|
end
|
81
79
|
|
82
80
|
if rest.from && rest.to
|
@@ -88,42 +86,42 @@ class Greeb::Segmentator
|
|
88
86
|
|
89
87
|
private
|
90
88
|
# Check the possibility of starting a new sentence by the specified
|
91
|
-
# pair of
|
89
|
+
# pair of span and token.
|
92
90
|
#
|
93
|
-
# @param
|
94
|
-
# @param token [Greeb::
|
91
|
+
# @param span [Greeb::Span] a span to be checked.
|
92
|
+
# @param token [Greeb::Span] a token to be checked.
|
95
93
|
#
|
96
94
|
# @return true or false.
|
97
95
|
#
|
98
|
-
def sentence_aint_start?(
|
99
|
-
!
|
96
|
+
def sentence_aint_start?(span, token)
|
97
|
+
!span.from and SENTENCE_AINT_START.include? token.type
|
100
98
|
end
|
101
99
|
|
102
100
|
# Find a forwarding token that has another type.
|
103
101
|
#
|
104
|
-
# @param collection [Array<Greeb::
|
105
|
-
# @param sample [Greeb::
|
102
|
+
# @param collection [Array<Greeb::Span>] array of possible tokens.
|
103
|
+
# @param sample [Greeb::Span] a token that is treated as a sample.
|
106
104
|
#
|
107
|
-
# @return [Greeb::
|
105
|
+
# @return [Greeb::Span] a forwarding token.
|
108
106
|
#
|
109
107
|
def find_forward(collection, sample)
|
110
108
|
collection.select { |t| t.from >= sample.from }.
|
111
109
|
inject(sample) { |r, t| t.type == sample.type ? t : (break r) }
|
112
110
|
end
|
113
111
|
|
114
|
-
# Create a new instance of {Greeb::
|
112
|
+
# Create a new instance of {Greeb::Span} with `:sentence` type.
|
115
113
|
#
|
116
|
-
# @return [Greeb::
|
114
|
+
# @return [Greeb::Span] a new span instance.
|
117
115
|
#
|
118
116
|
def new_sentence
|
119
|
-
Greeb::
|
117
|
+
Greeb::Span.new(nil, nil, :sentence)
|
120
118
|
end
|
121
119
|
|
122
|
-
# Create a new instance of {Greeb::
|
120
|
+
# Create a new instance of {Greeb::Span} with `:subsentence` type.
|
123
121
|
#
|
124
|
-
# @return [Greeb::
|
122
|
+
# @return [Greeb::Span] a new span instance.
|
125
123
|
#
|
126
124
|
def new_subsentence
|
127
|
-
Greeb::
|
125
|
+
Greeb::Span.new(nil, nil, :subsentence)
|
128
126
|
end
|
129
127
|
end
|