greeb 0.2.2.rc1 → 0.2.2.rc2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +52 -52
- data/bin/greeb +2 -2
- data/lib/greeb.rb +2 -39
- data/lib/greeb/core.rb +13 -12
- data/lib/greeb/exceptions.rb +17 -0
- data/lib/greeb/parser.rb +20 -7
- data/lib/greeb/segmentator.rb +38 -40
- data/lib/greeb/span.rb +36 -0
- data/lib/greeb/tokenizer.rb +11 -11
- data/lib/greeb/version.rb +1 -1
- data/spec/core_spec.rb +31 -33
- data/spec/parser_spec.rb +42 -30
- data/spec/segmentator_spec.rb +81 -83
- data/spec/span_spec.rb +63 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/tokenizer_spec.rb +76 -78
- metadata +5 -1
data/spec/span_spec.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require_relative 'spec_helper'
|
4
|
+
|
5
|
+
describe Span do
|
6
|
+
describe 'argumentless derivation' do
|
7
|
+
subject { Span.derivate }
|
8
|
+
|
9
|
+
it 'should produce valid members' do
|
10
|
+
subject.members.must_equal Span.members
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should produce a derived structure' do
|
14
|
+
struct = subject.new(1, 2, 3)
|
15
|
+
struct.from.must_equal 1
|
16
|
+
struct.to.must_equal 2
|
17
|
+
struct.type.must_equal 3
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe 'common derivation' do
|
22
|
+
subject { Span.derivate(:foo, :bar) }
|
23
|
+
|
24
|
+
it 'should produce valid members' do
|
25
|
+
subject.members.must_equal(Span.members + [:foo, :bar])
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should produce a derived structure' do
|
29
|
+
struct = subject.new(1, 2, 3, 4)
|
30
|
+
struct.from.must_equal 1
|
31
|
+
struct.to.must_equal 2
|
32
|
+
struct.type.must_equal 3
|
33
|
+
struct.foo.must_equal 4
|
34
|
+
|
35
|
+
struct.bar = 5
|
36
|
+
struct.bar.must_equal 5
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
describe 'comparison' do
|
41
|
+
it 'should be comparable when positions are different' do
|
42
|
+
(Span.new(1, 2) <=> Span.new(2, 3)).wont_equal 0
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'should not be comparable when positions are same' do
|
46
|
+
(Span.new(1, 2) <=> Span.new(1, 2)).must_equal 0
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe 'equality' do
|
51
|
+
it 'should be equal when positions are different' do
|
52
|
+
Span.new(1, 2).wont_equal Span.new(2, 3)
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should not be equal when positions are same' do
|
56
|
+
Span.new(1, 2).must_equal Span.new(1, 2)
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'should not be equal when positions are same and types vary' do
|
60
|
+
Span.new(1, 2, 3).wont_equal Span.new(1, 2, 4)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/tokenizer_spec.rb
CHANGED
@@ -2,97 +2,95 @@
|
|
2
2
|
|
3
3
|
require_relative 'spec_helper'
|
4
4
|
|
5
|
-
|
6
|
-
describe
|
7
|
-
|
8
|
-
subject { Tokenizer.tokenize('vodka') }
|
5
|
+
describe Tokenizer do
|
6
|
+
describe 'after tokenization' do
|
7
|
+
subject { Tokenizer.tokenize('vodka') }
|
9
8
|
|
10
|
-
|
11
|
-
|
12
|
-
end
|
9
|
+
it 'should has the tokens set' do
|
10
|
+
subject.must_be_kind_of Array
|
13
11
|
end
|
12
|
+
end
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
describe 'tokenization facilities' do
|
15
|
+
it 'can handle words' do
|
16
|
+
Tokenizer.tokenize('hello').must_equal(
|
17
|
+
[Span.new(0, 5, :letter)]
|
18
|
+
)
|
19
|
+
end
|
21
20
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
it 'can handle floats' do
|
22
|
+
Tokenizer.tokenize('14.88').must_equal(
|
23
|
+
[Span.new(0, 5, :float)]
|
24
|
+
)
|
25
|
+
end
|
27
26
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
27
|
+
it 'can handle integers' do
|
28
|
+
Tokenizer.tokenize('1337').must_equal(
|
29
|
+
[Span.new(0, 4, :integer)]
|
30
|
+
)
|
31
|
+
end
|
33
32
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
33
|
+
it 'can handle words and integers' do
|
34
|
+
Tokenizer.tokenize('Hello, I am 18').must_equal(
|
35
|
+
[Span.new(0, 5, :letter),
|
36
|
+
Span.new(5, 6, :spunct),
|
37
|
+
Span.new(6, 7, :space),
|
38
|
+
Span.new(7, 8, :letter),
|
39
|
+
Span.new(8, 9, :space),
|
40
|
+
Span.new(9, 11, :letter),
|
41
|
+
Span.new(11, 12, :space),
|
42
|
+
Span.new(12, 14, :integer)]
|
43
|
+
)
|
44
|
+
end
|
46
45
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
46
|
+
it 'can handle multi-line paragraphs' do
|
47
|
+
Tokenizer.tokenize("Brateeshka..!\n\nPrines!").must_equal(
|
48
|
+
[Span.new(0, 10, :letter),
|
49
|
+
Span.new(10, 12, :punct),
|
50
|
+
Span.new(12, 13, :punct),
|
51
|
+
Span.new(13, 15, :break),
|
52
|
+
Span.new(15, 21, :letter),
|
53
|
+
Span.new(21, 22, :punct)]
|
54
|
+
)
|
55
|
+
end
|
57
56
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
57
|
+
it 'can handle separated integers' do
|
58
|
+
Tokenizer.tokenize('228/359').must_equal(
|
59
|
+
[Span.new(0, 3, :integer),
|
60
|
+
Span.new(3, 4, :separ),
|
61
|
+
Span.new(4, 7, :integer)]
|
62
|
+
)
|
63
|
+
end
|
65
64
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
end
|
65
|
+
it 'can deal with Russian language' do
|
66
|
+
Tokenizer.tokenize('Братишка, я тебе покушать принёс!').must_equal(
|
67
|
+
[Span.new(0, 8, :letter),
|
68
|
+
Span.new(8, 9, :spunct),
|
69
|
+
Span.new(9, 10, :space),
|
70
|
+
Span.new(10, 11, :letter),
|
71
|
+
Span.new(11, 12, :space),
|
72
|
+
Span.new(12, 16, :letter),
|
73
|
+
Span.new(16, 17, :space),
|
74
|
+
Span.new(17, 25, :letter),
|
75
|
+
Span.new(25, 26, :space),
|
76
|
+
Span.new(26, 32, :letter),
|
77
|
+
Span.new(32, 33, :punct)]
|
78
|
+
)
|
81
79
|
end
|
80
|
+
end
|
82
81
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
82
|
+
describe '.split' do
|
83
|
+
it 'should split characters' do
|
84
|
+
Tokenizer.split('loh').must_equal %w(l o h)
|
85
|
+
end
|
87
86
|
|
88
|
-
|
89
|
-
|
90
|
-
|
87
|
+
it 'should combine duplicated characters' do
|
88
|
+
Tokenizer.split('foo').must_equal %w(f oo)
|
89
|
+
end
|
91
90
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
end
|
91
|
+
it 'should also deal with line breaks' do
|
92
|
+
Tokenizer.split("bar\n\nbaz").must_equal(
|
93
|
+
[*%w(b a r), "\n\n", *%w(b a z)])
|
96
94
|
end
|
97
95
|
end
|
98
96
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2.rc1
|
4
|
+
version: 0.2.2.rc2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Ustalov
|
@@ -43,8 +43,10 @@ files:
|
|
43
43
|
- greeb.gemspec
|
44
44
|
- lib/greeb.rb
|
45
45
|
- lib/greeb/core.rb
|
46
|
+
- lib/greeb/exceptions.rb
|
46
47
|
- lib/greeb/parser.rb
|
47
48
|
- lib/greeb/segmentator.rb
|
49
|
+
- lib/greeb/span.rb
|
48
50
|
- lib/greeb/strscan.rb
|
49
51
|
- lib/greeb/tokenizer.rb
|
50
52
|
- lib/greeb/version.rb
|
@@ -52,6 +54,7 @@ files:
|
|
52
54
|
- spec/core_spec.rb
|
53
55
|
- spec/parser_spec.rb
|
54
56
|
- spec/segmentator_spec.rb
|
57
|
+
- spec/span_spec.rb
|
55
58
|
- spec/spec_helper.rb
|
56
59
|
- spec/support/invoker.rb
|
57
60
|
- spec/tokenizer_spec.rb
|
@@ -83,6 +86,7 @@ test_files:
|
|
83
86
|
- spec/core_spec.rb
|
84
87
|
- spec/parser_spec.rb
|
85
88
|
- spec/segmentator_spec.rb
|
89
|
+
- spec/span_spec.rb
|
86
90
|
- spec/spec_helper.rb
|
87
91
|
- spec/support/invoker.rb
|
88
92
|
- spec/tokenizer_spec.rb
|