greeb 0.2.2.rc1 → 0.2.2.rc2
- checksums.yaml +4 -4
- data/README.md +52 -52
- data/bin/greeb +2 -2
- data/lib/greeb.rb +2 -39
- data/lib/greeb/core.rb +13 -12
- data/lib/greeb/exceptions.rb +17 -0
- data/lib/greeb/parser.rb +20 -7
- data/lib/greeb/segmentator.rb +38 -40
- data/lib/greeb/span.rb +36 -0
- data/lib/greeb/tokenizer.rb +11 -11
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +31 -33
- data/spec/parser_spec.rb +42 -30
- data/spec/segmentator_spec.rb +81 -83
- data/spec/span_spec.rb +63 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/tokenizer_spec.rb +76 -78
- metadata +5 -1
data/lib/greeb/span.rb
ADDED
@@ -0,0 +1,36 @@
+# Greeb operates with spans. A span is a tuple of *(from, to, kind)*, where
+# *from* is the beginning of the span, *to* is the ending of the span,
+# and *kind* is the type of the span.
+#
+# There are several span types: `:letter` for letters, `:float` for
+# floating point decimals, `:integer` for numbers, `:separ` for separators,
+# `:punct` for punctuation characters, `:spunct` for in-sentence punctuation
+# characters, `:space` for spaces, and `:break` for line endings.
+#
+class Greeb::Span < Struct.new(:from, :to, :type)
+  # Create a derivative structure that is based on Greeb::Span
+  # members. Useful for integrating with Greeb.
+  #
+  # @param members [Array<Symbol>] additional members.
+  #
+  # @return [Struct] a new structure.
+  #
+  def self.derivate(*members)
+    Struct.new(*self.members, *members)
+  end
+
+  # @private
+  def <=> other
+    if (comparison = self.from <=> other.from) == 0
+      self.to <=> other.to
+    else
+      comparison
+    end
+  end
+
+  # @private
+  def eql? other
+    return false unless type == other.type
+    (self <=> other) == 0
+  end
+end
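The added class is just a comparable Struct. A minimal usage sketch (the `Annotated` name and its `:label` member are illustrative, not part of the gem):

    require 'greeb'

    # Spans are Structs, so members are positional.
    span = Greeb::Span.new(0, 5, :letter)
    span.to # => 5

    # Ordering is positional; eql? additionally requires matching types.
    Greeb::Span.new(0, 5, :letter).eql?(Greeb::Span.new(0, 5, :punct)) # => false

    # derivate appends extra members to the basic (from, to, type) triple.
    Annotated = Greeb::Span.derivate(:label)
    Annotated.new(0, 5, :letter, 'greeting')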
data/lib/greeb/tokenizer.rb
CHANGED
@@ -35,7 +35,7 @@ module Greeb::Tokenizer
 
   # Spaces (i.e.: " " or \t).
   #
-  SPACES = /[\p{Zs}]+/u
+  SPACES = /[\p{Zs}\t]+/u
 
   # Line breaks.
   #
@@ -47,14 +47,14 @@ module Greeb::Tokenizer
 
   # Perform the tokenization process.
   #
-  # @return [Array<Greeb::
+  # @return [Array<Greeb::Span>] a set of tokens.
   #
   def tokenize text
     scanner = Greeb::StringScanner.new(text)
     tokens = []
     while !scanner.eos?
       step scanner, tokens or
-        raise Greeb::
+        raise Greeb::UnknownSpan.new(text, scanner.char_pos)
     end
     tokens
   ensure
@@ -79,9 +79,9 @@ module Greeb::Tokenizer
   # One iteration of the tokenization process.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::
+  # @param tokens [Array<Greeb::Span>] result array.
   #
-  # @return [Array<Greeb::
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def step scanner, tokens
     parse! scanner, tokens, LETTERS, :letter or
@@ -99,17 +99,17 @@ module Greeb::Tokenizer
   # of necessary type.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::
+  # @param tokens [Array<Greeb::Span>] result array.
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
   #   type.
   #
-  # @return [Array<Greeb::
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos
-    tokens << Greeb::
+    tokens << Greeb::Span.new(position - token.length,
                               position,
                               type)
   end
@@ -119,18 +119,18 @@ module Greeb::Tokenizer
   # characters.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::
+  # @param tokens [Array<Greeb::Span>] result array.
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
   #   type.
   #
-  # @return [Array<Greeb::
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def split_parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos - token.length
     split(token).inject(position) do |before, s|
-      tokens << Greeb::
+      tokens << Greeb::Span.new(before, before + s.length, type)
       before + s.length
     end
   end
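Since SPACES now also matches the tab character, tab-separated input should tokenize into ordinary :space spans instead of raising the unknown-span error (a tab is not in the Unicode Zs category, so the old pattern skipped it). A sketch, with the result shown schematically:

    require 'greeb'

    # "\t" is now consumed by the widened SPACES pattern as a :space token.
    Greeb::Tokenizer.tokenize("Hello\tworld")
    # => spans (0, 5, :letter), (5, 6, :space), (6, 11, :letter)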
data/lib/greeb/version.rb
CHANGED
data/spec/core_spec.rb
CHANGED
@@ -2,41 +2,39 @@
 
 require_relative 'spec_helper'
 
+describe Greeb do
+  it 'should do nothing when ran without input' do
+    Greeb[''].must_be_empty
+  end
 
+  it 'should tokenize text when input is given' do
+    Greeb['Hello guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 10, :letter),
+       Span.new(10, 11, :punct)]
+    )
+  end
 
+  it 'should extract URLs' do
+    Greeb['Hello http://nlpub.ru guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 21, :url),
+       Span.new(21, 22, :space),
+       Span.new(22, 26, :letter),
+       Span.new(26, 27, :punct)]
+    )
+  end
 
+  it 'should extract e-mails' do
+    Greeb['Hello example@example.com guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 25, :email),
+       Span.new(25, 26, :space),
+       Span.new(26, 30, :letter),
+       Span.new(30, 31, :punct)]
+    )
   end
 end
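These expectations document the Greeb[] shorthand: one flat, ordered array of spans over the whole input, with recognized URLs and e-mails folded into single :url and :email spans. A sketch of consuming that output (variable names are illustrative):

    spans = Greeb['Hello http://nlpub.ru guys!']
    spans.map(&:type)
    # => [:letter, :space, :url, :space, :letter, :punct]

    # Span offsets index into the original string, so slicing recovers
    # the matched substring.
    text = 'Hello http://nlpub.ru guys!'
    spans.select { |s| s.type == :url }.map { |s| text[s.from...s.to] }
    # => ['http://nlpub.ru']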
data/spec/parser_spec.rb
CHANGED
@@ -2,45 +2,57 @@
 
 require_relative 'spec_helper'
 
+describe Parser do
+  let(:text) do
+    ('Hello there! My name is <span class="name">Vasya B.</span> and ' \
+     'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
+     'example@example.com! It is available by URL: http://vasya.ru. ' \
+     'Also, <b>G.L.H.F.</b> everyone!').freeze
+  end
+
+  describe 'URL' do
+    subject { Parser.urls(text) }
+
+    it 'recognizes URLs' do
+      subject.must_equal(
+        [Span.new(92, 107, :url),
+         Span.new(171, 186, :url)]
+      )
     end
+  end
 
+  describe 'EMAIL' do
+    subject { Parser.emails(text) }
 
+    it 'recognizes e-mails' do
+      subject.must_equal(
+        [Span.new(126, 145, :email)]
+      )
     end
+  end
 
+  describe 'ABBREV' do
+    subject { Parser.abbrevs(text) }
 
+    it 'recognizes abbreviations' do
+      subject.must_equal(
+        [Span.new(49, 51, :abbrev),
+         Span.new(68, 77, :abbrev),
+         Span.new(197, 205, :abbrev)]
+      )
     end
+  end
 
+  describe 'HTML' do
+    subject { Parser.html(text) }
 
+    it 'recognizes HTML entities' do
+      subject.must_equal(
+        [Span.new(24, 43, :html),
+         Span.new(51, 58, :html),
+         Span.new(194, 197, :html),
+         Span.new(205, 209, :html)]
+      )
     end
   end
 end
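The Parser rules scan the raw string independently of the tokenizer, and the offsets they return index straight into the input. A sketch against the fixture above (outputs follow the expectations in this spec):

    # Slicing by span boundaries recovers the matched substrings.
    Parser.urls(text).map { |s| text[s.from...s.to] }
    # => ['http://вася.рф/', 'http://vasya.ru']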
data/spec/segmentator_spec.rb
CHANGED
@@ -2,116 +2,114 @@
 
 require_relative 'spec_helper'
 
+describe Segmentator do
+  describe 'initialization' do
+    let(:tokens) { Tokenizer.tokenize('Vodka') }
 
+    subject { Segmentator.new(tokens) }
 
+    it 'is initialized either with set of tokens' do
+      subject.tokens.must_be_kind_of Array
+    end
 
+    it 'should has @tokens ivar' do
+      subject.instance_variable_get(:@tokens).wont_be_nil
    end
+  end
 
+  describe 'a simple sentence' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
 
+    subject { Segmentator.new(tokens).sentences }
 
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 22, :sentence)])
    end
+  end
 
+  describe 'a simple sentence without punctuation' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
 
+    subject { Segmentator.new(tokens).sentences }
 
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 21, :sentence)])
    end
+  end
 
+  describe 'a simple sentence with trailing whitespaces' do
+    let(:tokens) { Tokenizer.tokenize(' Hello, I am JC Denton ') }
 
+    subject { Segmentator.new(tokens).sentences }
 
+    it 'should be segmented' do
+      subject.must_equal([Span.new(6, 27, :sentence)])
    end
+  end
 
+  describe 'two simple sentences' do
+    let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
 
+    subject { Segmentator.new(tokens).sentences }
 
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 6, :sentence),
+                          Span.new(7, 22, :sentence)])
    end
+  end
 
+  describe 'one wrong character and one simple sentence' do
+    let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
 
+    subject { Segmentator.new(tokens).sentences }
 
+    it 'should be segmented' do
+      subject.must_equal([Span.new(2, 17, :sentence)])
    end
+  end
 
+  describe 'sentence extractor' do
+    let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+    let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+
+    subject { segmentator.extract(sentences) }
+
+    it 'should be extracted' do
+      subject.must_equal([
+        [Span.new(0, 6, :sentence), [
+          Span.new(0, 5, :letter),
+          Span.new(5, 6, :punct)
+        ]],
+        [Span.new(7, 22, :sentence), [
+          Span.new(7, 8, :letter),
+          Span.new(8, 9, :space),
+          Span.new(9, 11, :letter),
+          Span.new(11, 12, :space),
+          Span.new(12, 14, :letter),
+          Span.new(14, 15, :space),
+          Span.new(15, 21, :letter),
+          Span.new(21, 22, :punct)
+        ]]
+      ])
    end
+  end
 
+  describe 'subsentence extractor' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+    let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+    let(:subsentences) { segmentator.subsentences }
+
+    subject { segmentator.extract(sentences, subsentences) }
+
+    it 'should extract subsentences' do
+      subject.must_equal([
+        [Span.new(0, 22, :sentence), [
+          Span.new(0, 6, :subsentence),
+          Span.new(7, 22, :subsentence)
+        ]]
+      ])
    end
  end
end
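The two extractor groups pin down the shape of Segmentator#extract: an array of [outer_span, nested_spans] pairs, where the nesting is whatever span list follows the sentences. A sketch of walking that structure (block variable names are illustrative):

    tokens      = Greeb::Tokenizer.tokenize('Hello! I am JC Denton.')
    segmentator = Greeb::Segmentator.new(tokens)

    segmentator.extract(segmentator.sentences).each do |sentence, nested|
      # sentence is a Span of type :sentence; nested holds the token spans
      # that lie inside its (from, to) boundaries.
    end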