greeb 0.2.2.rc1 → 0.2.2.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +52 -52
- data/bin/greeb +2 -2
- data/lib/greeb.rb +2 -39
- data/lib/greeb/core.rb +13 -12
- data/lib/greeb/exceptions.rb +17 -0
- data/lib/greeb/parser.rb +20 -7
- data/lib/greeb/segmentator.rb +38 -40
- data/lib/greeb/span.rb +36 -0
- data/lib/greeb/tokenizer.rb +11 -11
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +31 -33
- data/spec/parser_spec.rb +42 -30
- data/spec/segmentator_spec.rb +81 -83
- data/spec/span_spec.rb +63 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/tokenizer_spec.rb +76 -78
- metadata +5 -1
data/lib/greeb/span.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Greeb operates on spans. A span is a tuple of *(from, to, type)*, where
|
2
|
+
# *from* is the beginning of the span, *to* is the end of the span,
|
3
|
+
# and *type* is the kind of the span.
|
4
|
+
#
|
5
|
+
# There are several span types: `:letter` for letters, `:float` for
|
6
|
+
# floating point decimals, `:integer` for numbers, `:separ` for separators,
|
7
|
+
# `:punct` for punctuation characters, `:spunct` for in-sentence punctuation
|
8
|
+
# characters, `:space` for spaces, and `:break` for line endings.
|
9
|
+
#
|
10
|
+
class Greeb::Span < Struct.new(:from, :to, :type)
|
11
|
+
# Create a derivative structure that is based on Greeb::Span
|
12
|
+
# members. Useful for integrating with Greeb.
|
13
|
+
#
|
14
|
+
# @param members [Array<Symbol>] additional members.
|
15
|
+
#
|
16
|
+
# @return [Struct] a new structure.
|
17
|
+
#
|
18
|
+
def self.derivate(*members)
|
19
|
+
Struct.new(*self.members, *members)
|
20
|
+
end
|
21
|
+
|
22
|
+
# @private
|
23
|
+
def <=> other
|
24
|
+
if (comparison = self.from <=> other.from) == 0
|
25
|
+
self.to <=> other.to
|
26
|
+
else
|
27
|
+
comparison
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# @private
|
32
|
+
def eql? other
|
33
|
+
return false unless type == other.type
|
34
|
+
(self <=> other) == 0
|
35
|
+
end
|
36
|
+
end
|
data/lib/greeb/tokenizer.rb
CHANGED
@@ -35,7 +35,7 @@ module Greeb::Tokenizer
|
|
35
35
|
|
36
36
|
# Spaces (e.g.: " " or &nbsp;).
|
37
37
|
#
|
38
|
-
SPACES = /[\p{Zs}]+/u
|
38
|
+
SPACES = /[\p{Zs}\t]+/u
|
39
39
|
|
40
40
|
# Line breaks.
|
41
41
|
#
|
@@ -47,14 +47,14 @@ module Greeb::Tokenizer
|
|
47
47
|
|
48
48
|
# Perform the tokenization process.
|
49
49
|
#
|
50
|
-
# @return [Array<Greeb::
|
50
|
+
# @return [Array<Greeb::Span>] a set of tokens.
|
51
51
|
#
|
52
52
|
def tokenize text
|
53
53
|
scanner = Greeb::StringScanner.new(text)
|
54
54
|
tokens = []
|
55
55
|
while !scanner.eos?
|
56
56
|
step scanner, tokens or
|
57
|
-
raise Greeb::
|
57
|
+
raise Greeb::UnknownSpan.new(text, scanner.char_pos)
|
58
58
|
end
|
59
59
|
tokens
|
60
60
|
ensure
|
@@ -79,9 +79,9 @@ module Greeb::Tokenizer
|
|
79
79
|
# One iteration of the tokenization process.
|
80
80
|
#
|
81
81
|
# @param scanner [Greeb::StringScanner] string scanner.
|
82
|
-
# @param tokens [Array<Greeb::
|
82
|
+
# @param tokens [Array<Greeb::Span>] result array.
|
83
83
|
#
|
84
|
-
# @return [Array<Greeb::
|
84
|
+
# @return [Array<Greeb::Span>] the modified set of extracted tokens.
|
85
85
|
#
|
86
86
|
def step scanner, tokens
|
87
87
|
parse! scanner, tokens, LETTERS, :letter or
|
@@ -99,17 +99,17 @@ module Greeb::Tokenizer
|
|
99
99
|
# of necessary type.
|
100
100
|
#
|
101
101
|
# @param scanner [Greeb::StringScanner] string scanner.
|
102
|
-
# @param tokens [Array<Greeb::
|
102
|
+
# @param tokens [Array<Greeb::Span>] result array.
|
103
103
|
# @param pattern [Regexp] a regular expression to extract the token.
|
104
104
|
# @param type [Symbol] a symbol that represents the necessary token
|
105
105
|
# type.
|
106
106
|
#
|
107
|
-
# @return [Array<Greeb::
|
107
|
+
# @return [Array<Greeb::Span>] the modified set of extracted tokens.
|
108
108
|
#
|
109
109
|
def parse! scanner, tokens, pattern, type
|
110
110
|
return false unless token = scanner.scan(pattern)
|
111
111
|
position = scanner.char_pos
|
112
|
-
tokens << Greeb::
|
112
|
+
tokens << Greeb::Span.new(position - token.length,
|
113
113
|
position,
|
114
114
|
type)
|
115
115
|
end
|
@@ -119,18 +119,18 @@ module Greeb::Tokenizer
|
|
119
119
|
# characters.
|
120
120
|
#
|
121
121
|
# @param scanner [Greeb::StringScanner] string scanner.
|
122
|
-
# @param tokens [Array<Greeb::
|
122
|
+
# @param tokens [Array<Greeb::Span>] result array.
|
123
123
|
# @param pattern [Regexp] a regular expression to extract the token.
|
124
124
|
# @param type [Symbol] a symbol that represents the necessary token
|
125
125
|
# type.
|
126
126
|
#
|
127
|
-
# @return [Array<Greeb::
|
127
|
+
# @return [Array<Greeb::Span>] the modified set of extracted tokens.
|
128
128
|
#
|
129
129
|
def split_parse! scanner, tokens, pattern, type
|
130
130
|
return false unless token = scanner.scan(pattern)
|
131
131
|
position = scanner.char_pos - token.length
|
132
132
|
split(token).inject(position) do |before, s|
|
133
|
-
tokens << Greeb::
|
133
|
+
tokens << Greeb::Span.new(before, before + s.length, type)
|
134
134
|
before + s.length
|
135
135
|
end
|
136
136
|
end
|
data/lib/greeb/version.rb
CHANGED
data/spec/core_spec.rb
CHANGED
@@ -2,41 +2,39 @@
|
|
2
2
|
|
3
3
|
require_relative 'spec_helper'
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
end
|
5
|
+
describe Greeb do
|
6
|
+
it 'should do nothing when ran without input' do
|
7
|
+
Greeb[''].must_be_empty
|
8
|
+
end
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
10
|
+
it 'should tokenize text when input is given' do
|
11
|
+
Greeb['Hello guys!'].must_equal(
|
12
|
+
[Span.new(0, 5, :letter),
|
13
|
+
Span.new(5, 6, :space),
|
14
|
+
Span.new(6, 10, :letter),
|
15
|
+
Span.new(10, 11, :punct)]
|
16
|
+
)
|
17
|
+
end
|
19
18
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
19
|
+
it 'should extract URLs' do
|
20
|
+
Greeb['Hello http://nlpub.ru guys!'].must_equal(
|
21
|
+
[Span.new(0, 5, :letter),
|
22
|
+
Span.new(5, 6, :space),
|
23
|
+
Span.new(6, 21, :url),
|
24
|
+
Span.new(21, 22, :space),
|
25
|
+
Span.new(22, 26, :letter),
|
26
|
+
Span.new(26, 27, :punct)]
|
27
|
+
)
|
28
|
+
end
|
30
29
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
30
|
+
it 'should extract e-mails' do
|
31
|
+
Greeb['Hello example@example.com guys!'].must_equal(
|
32
|
+
[Span.new(0, 5, :letter),
|
33
|
+
Span.new(5, 6, :space),
|
34
|
+
Span.new(6, 25, :email),
|
35
|
+
Span.new(25, 26, :space),
|
36
|
+
Span.new(26, 30, :letter),
|
37
|
+
Span.new(30, 31, :punct)]
|
38
|
+
)
|
41
39
|
end
|
42
40
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -2,45 +2,57 @@
|
|
2
2
|
|
3
3
|
require_relative 'spec_helper'
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
describe Parser do
|
6
|
+
let(:text) do
|
7
|
+
('Hello there! My name is <span class="name">Vasya B.</span> and ' \
|
8
|
+
'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
|
9
|
+
'example@example.com! It is available by URL: http://vasya.ru. ' \
|
10
|
+
'Also, <b>G.L.H.F.</b> everyone!').freeze
|
11
|
+
end
|
12
|
+
|
13
|
+
describe 'URL' do
|
14
|
+
subject { Parser.urls(text) }
|
15
|
+
|
16
|
+
it 'recognizes URLs' do
|
17
|
+
subject.must_equal(
|
18
|
+
[Span.new(92, 107, :url),
|
19
|
+
Span.new(171, 186, :url)]
|
20
|
+
)
|
11
21
|
end
|
22
|
+
end
|
12
23
|
|
13
|
-
|
14
|
-
|
24
|
+
describe 'EMAIL' do
|
25
|
+
subject { Parser.emails(text) }
|
15
26
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
)
|
21
|
-
end
|
27
|
+
it 'recognizes e-mails' do
|
28
|
+
subject.must_equal(
|
29
|
+
[Span.new(126, 145, :email)]
|
30
|
+
)
|
22
31
|
end
|
32
|
+
end
|
23
33
|
|
24
|
-
|
25
|
-
|
34
|
+
describe 'ABBREV' do
|
35
|
+
subject { Parser.abbrevs(text) }
|
26
36
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
37
|
+
it 'recognizes abbreviations' do
|
38
|
+
subject.must_equal(
|
39
|
+
[Span.new(49, 51, :abbrev),
|
40
|
+
Span.new(68, 77, :abbrev),
|
41
|
+
Span.new(197, 205, :abbrev)]
|
42
|
+
)
|
32
43
|
end
|
44
|
+
end
|
33
45
|
|
34
|
-
|
35
|
-
|
46
|
+
describe 'HTML' do
|
47
|
+
subject { Parser.html(text) }
|
36
48
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
49
|
+
it 'recognizes HTML entities' do
|
50
|
+
subject.must_equal(
|
51
|
+
[Span.new(24, 43, :html),
|
52
|
+
Span.new(51, 58, :html),
|
53
|
+
Span.new(194, 197, :html),
|
54
|
+
Span.new(205, 209, :html)]
|
55
|
+
)
|
44
56
|
end
|
45
57
|
end
|
46
58
|
end
|
data/spec/segmentator_spec.rb
CHANGED
@@ -2,116 +2,114 @@
|
|
2
2
|
|
3
3
|
require_relative 'spec_helper'
|
4
4
|
|
5
|
-
|
6
|
-
describe
|
7
|
-
|
8
|
-
let(:tokens) { Tokenizer.tokenize('Vodka') }
|
5
|
+
describe Segmentator do
|
6
|
+
describe 'initialization' do
|
7
|
+
let(:tokens) { Tokenizer.tokenize('Vodka') }
|
9
8
|
|
10
|
-
|
9
|
+
subject { Segmentator.new(tokens) }
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
it 'is initialized either with set of tokens' do
|
12
|
+
subject.tokens.must_be_kind_of Array
|
13
|
+
end
|
15
14
|
|
16
|
-
|
17
|
-
|
18
|
-
end
|
15
|
+
it 'should has @tokens ivar' do
|
16
|
+
subject.instance_variable_get(:@tokens).wont_be_nil
|
19
17
|
end
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
|
20
|
+
describe 'a simple sentence' do
|
21
|
+
let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
|
23
22
|
|
24
|
-
|
23
|
+
subject { Segmentator.new(tokens).sentences }
|
25
24
|
|
26
|
-
|
27
|
-
|
28
|
-
end
|
25
|
+
it 'should be segmented' do
|
26
|
+
subject.must_equal([Span.new(0, 22, :sentence)])
|
29
27
|
end
|
28
|
+
end
|
30
29
|
|
31
|
-
|
32
|
-
|
30
|
+
describe 'a simple sentence without punctuation' do
|
31
|
+
let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
|
33
32
|
|
34
|
-
|
33
|
+
subject { Segmentator.new(tokens).sentences }
|
35
34
|
|
36
|
-
|
37
|
-
|
38
|
-
end
|
35
|
+
it 'should be segmented' do
|
36
|
+
subject.must_equal([Span.new(0, 21, :sentence)])
|
39
37
|
end
|
38
|
+
end
|
40
39
|
|
41
|
-
|
42
|
-
|
40
|
+
describe 'a simple sentence with trailing whitespaces' do
|
41
|
+
let(:tokens) { Tokenizer.tokenize(' Hello, I am JC Denton ') }
|
43
42
|
|
44
|
-
|
43
|
+
subject { Segmentator.new(tokens).sentences }
|
45
44
|
|
46
|
-
|
47
|
-
|
48
|
-
end
|
45
|
+
it 'should be segmented' do
|
46
|
+
subject.must_equal([Span.new(6, 27, :sentence)])
|
49
47
|
end
|
48
|
+
end
|
50
49
|
|
51
|
-
|
52
|
-
|
50
|
+
describe 'two simple sentences' do
|
51
|
+
let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
|
53
52
|
|
54
|
-
|
53
|
+
subject { Segmentator.new(tokens).sentences }
|
55
54
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
end
|
55
|
+
it 'should be segmented' do
|
56
|
+
subject.must_equal([Span.new(0, 6, :sentence),
|
57
|
+
Span.new(7, 22, :sentence)])
|
60
58
|
end
|
59
|
+
end
|
61
60
|
|
62
|
-
|
63
|
-
|
61
|
+
describe 'one wrong character and one simple sentence' do
|
62
|
+
let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
|
64
63
|
|
65
|
-
|
64
|
+
subject { Segmentator.new(tokens).sentences }
|
66
65
|
|
67
|
-
|
68
|
-
|
69
|
-
end
|
66
|
+
it 'should be segmented' do
|
67
|
+
subject.must_equal([Span.new(2, 17, :sentence)])
|
70
68
|
end
|
69
|
+
end
|
71
70
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
end
|
71
|
+
describe 'sentence extractor' do
|
72
|
+
let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
|
73
|
+
let(:segmentator) { Segmentator.new(tokens) }
|
74
|
+
let(:sentences) { segmentator.sentences }
|
75
|
+
|
76
|
+
subject { segmentator.extract(sentences) }
|
77
|
+
|
78
|
+
it 'should be extracted' do
|
79
|
+
subject.must_equal([
|
80
|
+
[Span.new(0, 6, :sentence), [
|
81
|
+
Span.new(0, 5, :letter),
|
82
|
+
Span.new(5, 6, :punct)
|
83
|
+
]],
|
84
|
+
[Span.new(7, 22, :sentence), [
|
85
|
+
Span.new(7, 8, :letter),
|
86
|
+
Span.new(8, 9, :space),
|
87
|
+
Span.new(9, 11, :letter),
|
88
|
+
Span.new(11, 12, :space),
|
89
|
+
Span.new(12, 14, :letter),
|
90
|
+
Span.new(14, 15, :space),
|
91
|
+
Span.new(15, 21, :letter),
|
92
|
+
Span.new(21, 22, :punct)
|
93
|
+
]]
|
94
|
+
])
|
97
95
|
end
|
96
|
+
end
|
98
97
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
end
|
98
|
+
describe 'subsentence extractor' do
|
99
|
+
let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
|
100
|
+
let(:segmentator) { Segmentator.new(tokens) }
|
101
|
+
let(:sentences) { segmentator.sentences }
|
102
|
+
let(:subsentences) { segmentator.subsentences }
|
103
|
+
|
104
|
+
subject { segmentator.extract(sentences, subsentences) }
|
105
|
+
|
106
|
+
it 'should extract subsentences' do
|
107
|
+
subject.must_equal([
|
108
|
+
[Span.new(0, 22, :sentence), [
|
109
|
+
Span.new(0, 6, :subsentence),
|
110
|
+
Span.new(7, 22, :subsentence)
|
111
|
+
]]
|
112
|
+
])
|
115
113
|
end
|
116
114
|
end
|
117
115
|
end
|