greeb 0.2.2.rc1 → 0.2.2.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +52 -52
- data/bin/greeb +2 -2
- data/lib/greeb.rb +2 -39
- data/lib/greeb/core.rb +13 -12
- data/lib/greeb/exceptions.rb +17 -0
- data/lib/greeb/parser.rb +20 -7
- data/lib/greeb/segmentator.rb +38 -40
- data/lib/greeb/span.rb +36 -0
- data/lib/greeb/tokenizer.rb +11 -11
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +31 -33
- data/spec/parser_spec.rb +42 -30
- data/spec/segmentator_spec.rb +81 -83
- data/spec/span_spec.rb +63 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/tokenizer_spec.rb +76 -78
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 043ee2da87958a027caf792058ae1e3e44cc9684
|
4
|
+
data.tar.gz: 8f4a99b26f706badb15fd9e9d5533dd162e090e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e950167615138975bc9873a729f2486eb506692fcdaefdd3aa828590d261da0d336e04e481c652036df892da305269a047ab076622c315bd17b3c015990dcba7
|
7
|
+
data.tar.gz: 7642fa3892694606792db842b0ea22a8ba13800b71a3e72eaff93d41ab0548f90a6881298aa290a61110941de350ef9941871e7497fda26b4d1e316e3688c50b
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Greeb
|
2
2
|
Greeb [grʲip] is a simple yet awesome and Unicode-aware text segmentator
|
3
3
|
that is based on regular expressions. API documentation is available at
|
4
|
-
<
|
4
|
+
<http://rubydoc.info/github/dmchk/greeb/master/frames>.
|
5
5
|
|
6
6
|
## Installation
|
7
7
|
Add this line to your application's Gemfile:
|
@@ -43,8 +43,8 @@ Greeb has a very convenient API that makes you happy.
|
|
43
43
|
```ruby
|
44
44
|
pp Greeb::Tokenizer.tokenize('Hello!')
|
45
45
|
=begin
|
46
|
-
[#<struct Greeb::
|
47
|
-
#<struct Greeb::
|
46
|
+
[#<struct Greeb::Span from=0, to=5, type=:letter>,
|
47
|
+
#<struct Greeb::Span from=5, to=6, type=:punct>]
|
48
48
|
=end
|
49
49
|
```
|
50
50
|
|
@@ -59,34 +59,34 @@ EOF
|
|
59
59
|
|
60
60
|
pp Greeb::Tokenizer.tokenize(text)
|
61
61
|
=begin
|
62
|
-
[#<struct Greeb::
|
63
|
-
#<struct Greeb::
|
64
|
-
#<struct Greeb::
|
65
|
-
#<struct Greeb::
|
66
|
-
#<struct Greeb::
|
67
|
-
#<struct Greeb::
|
68
|
-
#<struct Greeb::
|
69
|
-
#<struct Greeb::
|
70
|
-
#<struct Greeb::
|
71
|
-
#<struct Greeb::
|
72
|
-
#<struct Greeb::
|
73
|
-
#<struct Greeb::
|
74
|
-
#<struct Greeb::
|
75
|
-
#<struct Greeb::
|
76
|
-
#<struct Greeb::
|
77
|
-
#<struct Greeb::
|
78
|
-
#<struct Greeb::
|
79
|
-
#<struct Greeb::
|
80
|
-
#<struct Greeb::
|
81
|
-
#<struct Greeb::
|
82
|
-
#<struct Greeb::
|
83
|
-
#<struct Greeb::
|
84
|
-
#<struct Greeb::
|
85
|
-
#<struct Greeb::
|
86
|
-
#<struct Greeb::
|
87
|
-
#<struct Greeb::
|
88
|
-
#<struct Greeb::
|
89
|
-
#<struct Greeb::
|
62
|
+
[#<struct Greeb::Span from=0, to=5, type=:letter>,
|
63
|
+
#<struct Greeb::Span from=5, to=6, type=:punct>,
|
64
|
+
#<struct Greeb::Span from=6, to=7, type=:space>,
|
65
|
+
#<struct Greeb::Span from=7, to=8, type=:letter>,
|
66
|
+
#<struct Greeb::Span from=8, to=9, type=:space>,
|
67
|
+
#<struct Greeb::Span from=9, to=11, type=:letter>,
|
68
|
+
#<struct Greeb::Span from=11, to=12, type=:space>,
|
69
|
+
#<struct Greeb::Span from=12, to=14, type=:integer>,
|
70
|
+
#<struct Greeb::Span from=14, to=15, type=:punct>,
|
71
|
+
#<struct Greeb::Span from=15, to=16, type=:space>,
|
72
|
+
#<struct Greeb::Span from=16, to=18, type=:letter>,
|
73
|
+
#<struct Greeb::Span from=18, to=19, type=:space>,
|
74
|
+
#<struct Greeb::Span from=19, to=28, type=:letter>,
|
75
|
+
#<struct Greeb::Span from=28, to=29, type=:space>,
|
76
|
+
#<struct Greeb::Span from=29, to=35, type=:letter>,
|
77
|
+
#<struct Greeb::Span from=35, to=36, type=:space>,
|
78
|
+
#<struct Greeb::Span from=36, to=38, type=:letter>,
|
79
|
+
#<struct Greeb::Span from=38, to=39, type=:space>,
|
80
|
+
#<struct Greeb::Span from=39, to=44, type=:float>,
|
81
|
+
#<struct Greeb::Span from=44, to=47, type=:punct>,
|
82
|
+
#<struct Greeb::Span from=47, to=49, type=:break>,
|
83
|
+
#<struct Greeb::Span from=49, to=53, type=:letter>,
|
84
|
+
#<struct Greeb::Span from=53, to=54, type=:space>,
|
85
|
+
#<struct Greeb::Span from=54, to=59, type=:letter>,
|
86
|
+
#<struct Greeb::Span from=59, to=60, type=:space>,
|
87
|
+
#<struct Greeb::Span from=60, to=63, type=:letter>,
|
88
|
+
#<struct Greeb::Span from=63, to=64, type=:punct>,
|
89
|
+
#<struct Greeb::Span from=64, to=65, type=:break>]
|
90
90
|
=end
|
91
91
|
```
|
92
92
|
|
@@ -99,8 +99,8 @@ text = 'Hello! How are you?'
|
|
99
99
|
tokens = Greeb::Tokenizer.tokenize(text)
|
100
100
|
pp Greeb::Segmentator.new(tokens).sentences
|
101
101
|
=begin
|
102
|
-
[#<struct Greeb::
|
103
|
-
#<struct Greeb::
|
102
|
+
[#<struct Greeb::Span from=0, to=6, type=:sentence>,
|
103
|
+
#<struct Greeb::Span from=7, to=19, type=:sentence>]
|
104
104
|
=end
|
105
105
|
```
|
106
106
|
|
@@ -113,21 +113,21 @@ tokens = Greeb::Tokenizer.tokenize(text)
|
|
113
113
|
segmentator = Greeb::Segmentator.new(tokens)
|
114
114
|
pp segmentator.extract(segmentator.sentences)
|
115
115
|
=begin
|
116
|
-
{#<struct Greeb::
|
117
|
-
[#<struct Greeb::
|
118
|
-
#<struct Greeb::
|
119
|
-
#<struct Greeb::
|
120
|
-
[#<struct Greeb::
|
121
|
-
#<struct Greeb::
|
122
|
-
#<struct Greeb::
|
123
|
-
#<struct Greeb::
|
124
|
-
#<struct Greeb::
|
125
|
-
#<struct Greeb::
|
116
|
+
{#<struct Greeb::Span from=0, to=6, type=:sentence>=>
|
117
|
+
[#<struct Greeb::Span from=0, to=5, type=:letter>,
|
118
|
+
#<struct Greeb::Span from=5, to=6, type=:punct>],
|
119
|
+
#<struct Greeb::Span from=7, to=19, type=:sentence>=>
|
120
|
+
[#<struct Greeb::Span from=7, to=10, type=:letter>,
|
121
|
+
#<struct Greeb::Span from=10, to=11, type=:space>,
|
122
|
+
#<struct Greeb::Span from=11, to=14, type=:letter>,
|
123
|
+
#<struct Greeb::Span from=14, to=15, type=:space>,
|
124
|
+
#<struct Greeb::Span from=15, to=18, type=:letter>,
|
125
|
+
#<struct Greeb::Span from=18, to=19, type=:punct>]}
|
126
126
|
=end
|
127
127
|
```
|
128
128
|
|
129
129
|
### Parsing API
|
130
|
-
Texts are often include some special
|
130
|
+
Texts often include some special spans such as URLs and e-mail
|
131
131
|
addresses. Greeb can help you retrieve these strings.
|
132
132
|
|
133
133
|
#### URL and E-mail retrieval
|
@@ -136,12 +136,12 @@ text = 'My website is http://nlpub.ru and e-mail is example@example.com.'
|
|
136
136
|
|
137
137
|
pp Greeb::Parser.urls(text).map { |e| [e, text[e.from...e.to]] }
|
138
138
|
=begin
|
139
|
-
[[#<struct Greeb::
|
139
|
+
[[#<struct Greeb::Span from=14, to=29, type=:url>, "http://nlpub.ru"]]
|
140
140
|
=end
|
141
141
|
|
142
142
|
pp Greeb::Parser.emails(text).map { |e| [e, text[e.from...e.to]] }
|
143
143
|
=begin
|
144
|
-
[[#<struct Greeb::
|
144
|
+
[[#<struct Greeb::Span from=44, to=63, type=:email>, "example@example.com"]]
|
145
145
|
=end
|
146
146
|
```
|
147
147
|
|
@@ -153,7 +153,7 @@ text = 'Hello, G.L.H.F. everyone!'
|
|
153
153
|
|
154
154
|
pp Greeb::Parser.abbrevs(text).map { |e| [e, text[e.from...e.to]] }
|
155
155
|
=begin
|
156
|
-
[[#<struct Greeb::
|
156
|
+
[[#<struct Greeb::Span from=7, to=15, type=:abbrev>, "G.L.H.F."]]
|
157
157
|
=end
|
158
158
|
```
|
159
159
|
|
@@ -161,13 +161,13 @@ The algorithm is not so accurate, but still useful in many practical
|
|
161
161
|
situations.
|
162
162
|
|
163
163
|
## Tokens
|
164
|
-
Greeb operates with
|
165
|
-
*from* is a beginning of the
|
166
|
-
and *kind* is a type of the
|
164
|
+
Greeb operates with spans, tuples of *(from, to, kind)*, where
|
165
|
+
*from* is a beginning of the span, *to* is an ending of the span,
|
166
|
+
and *kind* is a type of the span.
|
167
167
|
|
168
|
-
There are several
|
168
|
+
There are several span types at the tokenization stage: `:letter`,
|
169
169
|
`:float`, `:integer`, `:separ`, `:punct` (for punctuation), `:spunct`
|
170
|
-
(for in-sentence punctuation), and `:break`.
|
170
|
+
(for in-sentence punctuation), `:space`, and `:break`.
|
171
171
|
|
172
172
|
## Contributing
|
173
173
|
1. Fork it;
|
data/bin/greeb
CHANGED
@@ -8,6 +8,6 @@ require 'greeb'
|
|
8
8
|
|
9
9
|
text = STDIN.read.tap(&:chomp!)
|
10
10
|
|
11
|
-
Greeb[text].each do |
|
12
|
-
puts text[
|
11
|
+
Greeb[text].each do |span|
|
12
|
+
puts text[span.from...span.to] unless [:space, :break].include? span.type
|
13
13
|
end
|
data/lib/greeb.rb
CHANGED
@@ -1,45 +1,8 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'greeb/version'
|
4
|
-
|
5
|
-
|
6
|
-
# *from* is a beginning of the entity, *to* is an ending of the entity,
|
7
|
-
# and *kind* is a type of the entity.
|
8
|
-
#
|
9
|
-
# There are several entity types: `:letter`, `:float`, `:integer`,
|
10
|
-
# `:separ` for separators, `:punct` for punctuation characters,
|
11
|
-
# `:spunct` for in-sentence punctuation characters, and
|
12
|
-
# `:break` for line endings.
|
13
|
-
#
|
14
|
-
class Greeb::Entity < Struct.new(:from, :to, :type)
|
15
|
-
# @private
|
16
|
-
def <=> other
|
17
|
-
if (comparison = self.from <=> other.from) == 0
|
18
|
-
self.to <=> other.to
|
19
|
-
else
|
20
|
-
comparison
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# This runtime error appears when {Greeb::Tokenizer} or
|
26
|
-
# {Greeb::Segmentator} tries to recognize unknown character.
|
27
|
-
#
|
28
|
-
class Greeb::UnknownEntity < RuntimeError
|
29
|
-
attr_reader :text, :pos
|
30
|
-
|
31
|
-
# @private
|
32
|
-
def initialize(text, pos)
|
33
|
-
@text, @pos = text, pos
|
34
|
-
end
|
35
|
-
|
36
|
-
# Generate the real error message.
|
37
|
-
#
|
38
|
-
def to_s
|
39
|
-
'Could not recognize character "%s" @ %d' % [text[pos], pos]
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
4
|
+
require 'greeb/exceptions'
|
5
|
+
require 'greeb/span'
|
43
6
|
require 'greeb/strscan'
|
44
7
|
require 'greeb/tokenizer'
|
45
8
|
require 'greeb/segmentator'
|
data/lib/greeb/core.rb
CHANGED
@@ -13,13 +13,13 @@ module Greeb::Core
|
|
13
13
|
#
|
14
14
|
# @param text [String] input text.
|
15
15
|
#
|
16
|
-
# @return [Array<Greeb::
|
16
|
+
# @return [Array<Greeb::Span>] a set of tokens.
|
17
17
|
#
|
18
|
-
def analyze
|
18
|
+
def analyze(text, helpers = HELPERS)
|
19
19
|
Greeb::Tokenizer.tokenize(text).tap do |tokens|
|
20
|
-
|
20
|
+
helpers.each do |helper|
|
21
21
|
Greeb::Parser.public_send(helper, text).each do |parsed|
|
22
|
-
|
22
|
+
extract_spans(tokens, parsed)
|
23
23
|
end
|
24
24
|
end
|
25
25
|
end
|
@@ -28,17 +28,18 @@ module Greeb::Core
|
|
28
28
|
alias_method :'[]', :analyze
|
29
29
|
|
30
30
|
protected
|
31
|
-
# Extact
|
31
|
+
# Extract spans of the specified type from the input spans set.
|
32
32
|
#
|
33
|
-
# @param
|
34
|
-
# @param
|
33
|
+
# @param spans [Array<Greeb::Span>] input spans set.
|
34
|
+
# @param span [Greeb::Span] span to be extracted.
|
35
35
|
#
|
36
|
-
# @return [Greeb::
|
36
|
+
# @return [Greeb::Span] span to be extracted.
|
37
37
|
#
|
38
|
-
def
|
39
|
-
from =
|
40
|
-
to =
|
41
|
-
|
38
|
+
def extract_spans(spans, span)
|
39
|
+
from = spans.index { |e| e.from == span.from }
|
40
|
+
to = spans.index { |e| e.to == span.to }
|
41
|
+
return unless from && to
|
42
|
+
spans[from..to] = span
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# This runtime error appears when {Greeb::Tokenizer} or
|
2
|
+
# {Greeb::Segmentator} tries to recognize unknown character.
|
3
|
+
#
|
4
|
+
class Greeb::UnknownEntity < RuntimeError
|
5
|
+
attr_reader :text, :pos
|
6
|
+
|
7
|
+
# @private
|
8
|
+
def initialize(text, pos)
|
9
|
+
@text, @pos = text, pos
|
10
|
+
end
|
11
|
+
|
12
|
+
# Generate the real error message.
|
13
|
+
#
|
14
|
+
def to_s
|
15
|
+
'Could not recognize character "%s" @ %d' % [text[pos], pos]
|
16
|
+
end
|
17
|
+
end
|
data/lib/greeb/parser.rb
CHANGED
@@ -16,12 +16,15 @@ module Greeb::Parser
|
|
16
16
|
# Another horrible pattern. Now for abbreviations.
|
17
17
|
ABBREV = /\b((-{0,1}\p{L}\.)*|(-{0,1}\p{L}\. )*)-{0,1}\p{L}\./i
|
18
18
|
|
19
|
+
# This pattern matches anything that looks like HTML. Or not.
|
20
|
+
HTML = /<(.*?)>/i
|
21
|
+
|
19
22
|
# Recognize URLs in the input text. Actually, URL is obsolete standard
|
20
23
|
# and this code should be rewritten to use the URI concept.
|
21
24
|
#
|
22
25
|
# @param text [String] input text.
|
23
26
|
#
|
24
|
-
# @return [Array<Greeb::
|
27
|
+
# @return [Array<Greeb::Span>] found URLs.
|
25
28
|
#
|
26
29
|
def urls(text)
|
27
30
|
scan(text, URL, :url)
|
@@ -31,7 +34,7 @@ module Greeb::Parser
|
|
31
34
|
#
|
32
35
|
# @param text [String] input text.
|
33
36
|
#
|
34
|
-
# @return [Array<Greeb::
|
37
|
+
# @return [Array<Greeb::Span>] found e-mail addresses.
|
35
38
|
#
|
36
39
|
def emails(text)
|
37
40
|
scan(text, EMAIL, :email)
|
@@ -41,27 +44,37 @@ module Greeb::Parser
|
|
41
44
|
#
|
42
45
|
# @param text [String] input text.
|
43
46
|
#
|
44
|
-
# @return [Array<Greeb::
|
47
|
+
# @return [Array<Greeb::Span>] found abbreviations.
|
45
48
|
#
|
46
49
|
def abbrevs(text)
|
47
50
|
scan(text, ABBREV, :abbrev)
|
48
51
|
end
|
49
52
|
|
53
|
+
# Recognize HTML-alike entities in the input text.
|
54
|
+
#
|
55
|
+
# @param text [String] input text.
|
56
|
+
#
|
57
|
+
# @return [Array<Greeb::Span>] found HTML entities.
|
58
|
+
#
|
59
|
+
def html(text)
|
60
|
+
scan(text, HTML, :html)
|
61
|
+
end
|
62
|
+
|
50
63
|
private
|
51
|
-
# Implementation of regexp-based {Greeb::
|
64
|
+
# Implementation of regexp-based {Greeb::Span} scanner.
|
52
65
|
#
|
53
66
|
# @param text [String] input text.
|
54
67
|
# @param regexp [Regexp] regular expression to be used.
|
55
|
-
# @param type [Symbol] type field for the new {Greeb::
|
68
|
+
# @param type [Symbol] type field for the new {Greeb::Span} instances.
|
56
69
|
# @param offset [Fixnum] offset of the next match.
|
57
70
|
#
|
58
|
-
# @return [Array<Greeb::
|
71
|
+
# @return [Array<Greeb::Span>] found entities.
|
59
72
|
#
|
60
73
|
def scan(text, regexp, type, offset = 0)
|
61
74
|
Array.new.tap do |matches|
|
62
75
|
while text and md = text.match(regexp)
|
63
76
|
start, stop = md.offset(0)
|
64
|
-
matches << Greeb::
|
77
|
+
matches << Greeb::Span.new(offset + start, offset + stop, type)
|
65
78
|
text, offset = text[stop..-1], offset + stop
|
66
79
|
end
|
67
80
|
end
|
data/lib/greeb/segmentator.rb
CHANGED
@@ -13,7 +13,7 @@ class Greeb::Segmentator
|
|
13
13
|
|
14
14
|
# Create a new instance of {Greeb::Segmentator}.
|
15
15
|
#
|
16
|
-
# @param tokens [Array<Greeb::
|
16
|
+
# @param tokens [Array<Greeb::Span>] tokens from [Greeb::Tokenizer].
|
17
17
|
#
|
18
18
|
def initialize(tokens)
|
19
19
|
@tokens = tokens
|
@@ -21,62 +21,60 @@ class Greeb::Segmentator
|
|
21
21
|
|
22
22
|
# Sentences memoization method.
|
23
23
|
#
|
24
|
-
# @return [Array<Greeb::
|
24
|
+
# @return [Array<Greeb::Span>] a set of sentences.
|
25
25
|
#
|
26
26
|
def sentences
|
27
|
-
@sentences ||=
|
27
|
+
@sentences ||= detect_spans(new_sentence, [:punct])
|
28
28
|
end
|
29
29
|
|
30
30
|
# Subsentences memoization method.
|
31
31
|
#
|
32
|
-
# @return [Array<Greeb::
|
32
|
+
# @return [Array<Greeb::Span>] a set of subsentences.
|
33
33
|
#
|
34
34
|
def subsentences
|
35
|
-
@subsentences ||=
|
35
|
+
@subsentences ||= detect_spans(new_subsentence, [:punct, :spunct])
|
36
36
|
end
|
37
37
|
|
38
38
|
# Extract tokens from the set of sentences.
|
39
39
|
#
|
40
|
-
# @param sentences [Array<Greeb::
|
40
|
+
# @param sentences [Array<Greeb::Span>] a list of sentences.
|
41
41
|
#
|
42
|
-
# @return [
|
42
|
+
# @return [Array<Greeb::Span, Array<Greeb::Span>>] an array of pairs,
|
43
43
|
# each holding a sentence and the array of tokens it contains.
|
44
44
|
#
|
45
45
|
def extract(sentences, collection = tokens)
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
end
|
50
|
-
]
|
46
|
+
sentences.map do |s|
|
47
|
+
[s, collection.select { |t| t.from >= s.from and t.to <= s.to }]
|
48
|
+
end
|
51
49
|
end
|
52
50
|
|
53
51
|
protected
|
54
|
-
# Implementation of the
|
52
|
+
# Implementation of the span detection method.
|
55
53
|
#
|
56
|
-
# @param sample [Greeb::
|
54
|
+
# @param sample [Greeb::Span] a sample of span to be cloned in the
|
57
55
|
# process.
|
58
56
|
# @param stop_marks [Array<Symbol>] an array that stores the
|
59
|
-
# correspondent stop marks of the necessary
|
57
|
+
# correspondent stop marks of the necessary spans.
|
60
58
|
#
|
61
|
-
# @return [Array<Greeb::
|
59
|
+
# @return [Array<Greeb::Span>] a set of entities.
|
62
60
|
#
|
63
|
-
def
|
61
|
+
def detect_spans(sample, stop_marks)
|
64
62
|
collection = []
|
65
63
|
|
66
|
-
rest = tokens.inject(sample.dup) do |
|
67
|
-
next
|
68
|
-
|
69
|
-
next
|
64
|
+
rest = tokens.inject(sample.dup) do |span, token|
|
65
|
+
next span if sentence_aint_start? span, token
|
66
|
+
span.from = token.from unless span.from
|
67
|
+
next span if span.to and span.to > token.to
|
70
68
|
|
71
69
|
if stop_marks.include? token.type
|
72
|
-
|
73
|
-
collection <<
|
74
|
-
|
70
|
+
span.to = find_forward(tokens, token).to
|
71
|
+
collection << span
|
72
|
+
span = sample.dup
|
75
73
|
elsif ![:separ, :space].include? token.type
|
76
|
-
|
74
|
+
span.to = token.to
|
77
75
|
end
|
78
76
|
|
79
|
-
|
77
|
+
span
|
80
78
|
end
|
81
79
|
|
82
80
|
if rest.from && rest.to
|
@@ -88,42 +86,42 @@ class Greeb::Segmentator
|
|
88
86
|
|
89
87
|
private
|
90
88
|
# Check the possibility of starting a new sentence by the specified
|
91
|
-
# pair of
|
89
|
+
# pair of span and token.
|
92
90
|
#
|
93
|
-
# @param
|
94
|
-
# @param token [Greeb::
|
91
|
+
# @param span [Greeb::Span] a span to be checked.
|
92
|
+
# @param token [Greeb::Span] a token to be checked.
|
95
93
|
#
|
96
94
|
# @return true or false.
|
97
95
|
#
|
98
|
-
def sentence_aint_start?(
|
99
|
-
!
|
96
|
+
def sentence_aint_start?(span, token)
|
97
|
+
!span.from and SENTENCE_AINT_START.include? token.type
|
100
98
|
end
|
101
99
|
|
102
100
|
# Find a forwarding token that has another type.
|
103
101
|
#
|
104
|
-
# @param collection [Array<Greeb::
|
105
|
-
# @param sample [Greeb::
|
102
|
+
# @param collection [Array<Greeb::Span>] array of possible tokens.
|
103
|
+
# @param sample [Greeb::Span] a token that is treated as a sample.
|
106
104
|
#
|
107
|
-
# @return [Greeb::
|
105
|
+
# @return [Greeb::Span] a forwarding token.
|
108
106
|
#
|
109
107
|
def find_forward(collection, sample)
|
110
108
|
collection.select { |t| t.from >= sample.from }.
|
111
109
|
inject(sample) { |r, t| t.type == sample.type ? t : (break r) }
|
112
110
|
end
|
113
111
|
|
114
|
-
# Create a new instance of {Greeb::
|
112
|
+
# Create a new instance of {Greeb::Span} with `:sentence` type.
|
115
113
|
#
|
116
|
-
# @return [Greeb::
|
114
|
+
# @return [Greeb::Span] a new span instance.
|
117
115
|
#
|
118
116
|
def new_sentence
|
119
|
-
Greeb::
|
117
|
+
Greeb::Span.new(nil, nil, :sentence)
|
120
118
|
end
|
121
119
|
|
122
|
-
# Create a new instance of {Greeb::
|
120
|
+
# Create a new instance of {Greeb::Span} with `:subsentence` type.
|
123
121
|
#
|
124
|
-
# @return [Greeb::
|
122
|
+
# @return [Greeb::Span] a new span instance.
|
125
123
|
#
|
126
124
|
def new_subsentence
|
127
|
-
Greeb::
|
125
|
+
Greeb::Span.new(nil, nil, :subsentence)
|
128
126
|
end
|
129
127
|
end
|