greeb 0.2.2.rc1 → 0.2.2.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
+# Greeb operates with spans. A span is a tuple of *(from, to, type)*, where
+# *from* is the beginning of the span, *to* is the ending of the span,
+# and *type* is the type of the span.
+#
+# There are several span types: `:letter` for letters, `:float` for
+# floating point decimals, `:integer` for integer numbers, `:separ` for
+# separators, `:punct` for punctuation, `:spunct` for in-sentence
+# punctuation, `:space` for spaces, and `:break` for line endings.
+#
+class Greeb::Span < Struct.new(:from, :to, :type)
+  # Create a derivative structure that is based on the Greeb::Span
+  # members. Useful for integrating with Greeb.
+  #
+  # @param members [Array<Symbol>] additional members.
+  #
+  # @return [Struct] a new structure.
+  #
+  def self.derivate(*members)
+    Struct.new(*self.members, *members)
+  end
+
+  # @private
+  def <=> other
+    if (comparison = self.from <=> other.from) == 0
+      self.to <=> other.to
+    else
+      comparison
+    end
+  end
+
+  # @private
+  def eql? other
+    return false unless type == other.type
+    (self <=> other) == 0
+  end
+end
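For orientation, here is a minimal sketch of how the Greeb::Span added above might be used once this release is installed. The derived `Token` struct and its `:text` member are hypothetical names for illustration:

    require 'greeb'

    # Spans sort by offsets through the <=> defined above; eql? additionally
    # compares types.
    spans = [Greeb::Span.new(5, 6, :punct), Greeb::Span.new(0, 5, :letter)]
    spans.sort.map(&:to_a)  # => [[0, 5, :letter], [5, 6, :punct]]

    # derivate appends extra members to Span's (from, to, type).
    Token = Greeb::Span.derivate(:text)
    Token.new(0, 5, :letter, 'Hello').text  # => "Hello"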
@@ -35,7 +35,7 @@ module Greeb::Tokenizer
 
   # Spaces (e.g., " " or &nbsp;).
   #
-  SPACES = /[\p{Zs}]+/u
+  SPACES = /[\p{Zs}\t]+/u
 
   # Line breaks.
   #
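The `\t` added to the character class means tab characters now tokenize as ordinary `:space` spans; tabs are not in the Zs category, so they presumably fell through every pattern before. A hedged sketch of the effect, using the tokenize method shown in the next hunk:

    require 'greeb'

    # The tab between the letters now comes back as a :space span.
    Greeb::Tokenizer.tokenize("a\tb").map(&:to_a)
    # => [[0, 1, :letter], [1, 2, :space], [2, 3, :letter]]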
@@ -47,14 +47,14 @@ module Greeb::Tokenizer
 
   # Perform the tokenization process.
   #
-  # @return [Array<Greeb::Entity>] a set of tokens.
+  # @return [Array<Greeb::Span>] a set of tokens.
   #
   def tokenize text
     scanner = Greeb::StringScanner.new(text)
     tokens = []
     while !scanner.eos?
       step scanner, tokens or
-        raise Greeb::UnknownEntity.new(text, scanner.char_pos)
+        raise Greeb::UnknownSpan.new(text, scanner.char_pos)
     end
     tokens
   ensure
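Callers that rescued the old exception class need the new name as well. A small sketch, assuming only what the hunk above shows (the exception is constructed with the input text and the failing character offset); `input.txt` is a hypothetical input file:

    require 'greeb'

    input = File.read('input.txt')

    begin
      tokens = Greeb::Tokenizer.tokenize(input)
    rescue Greeb::UnknownSpan => e
      # Raised when no tokenizer pattern matches at the current position.
      warn "tokenization failed: #{e.message}"
    end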
@@ -79,9 +79,9 @@ module Greeb::Tokenizer
   # One iteration of the tokenization process.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def step scanner, tokens
     parse! scanner, tokens, LETTERS, :letter or
@@ -99,17 +99,17 @@ module Greeb::Tokenizer
   # of necessary type.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
   #   type.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos
-    tokens << Greeb::Entity.new(position - token.length,
+    tokens << Greeb::Span.new(position - token.length,
                                 position,
                                 type)
   end
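The boundary arithmetic in parse! works because char_pos points just past a successful match, so the span starts at char_pos minus the token's character length. A rough stdlib equivalent (StringScanner#pos counts bytes, unlike Greeb::StringScanner#char_pos, so this sketch sticks to ASCII where the two agree):

    require 'strscan'

    scanner = StringScanner.new('Hello!')
    token = scanner.scan(/\p{L}+/)       # => "Hello"
    position = scanner.pos               # => 5, just past the match
    [position - token.length, position]  # => [0, 5], the :letter span boundaries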
@@ -119,18 +119,18 @@ module Greeb::Tokenizer
   # characters.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
   #   type.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def split_parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos - token.length
     split(token).inject(position) do |before, s|
-      tokens << Greeb::Entity.new(before, before + s.length, type)
+      tokens << Greeb::Span.new(before, before + s.length, type)
       before + s.length
     end
   end
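split_parse! scans a whole run at once and then emits one span per piece, threading the start offset through inject. The accumulation works like the sketch below, where the two-element array stands in for whatever split(token) returns (say, consecutive line breaks):

    tokens = []
    position = 10  # character offset where the run "\n\n" begins
    ["\n", "\n"].inject(position) do |before, s|
      tokens << [before, before + s.length, :break]
      before + s.length  # the next piece starts where this one ended
    end
    tokens  # => [[10, 11, :break], [11, 12, :break]]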
@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.2.2.rc1'
+  VERSION = '0.2.2.rc2'
 end
@@ -2,41 +2,39 @@
 
 require_relative 'spec_helper'
 
-module Greeb
-  describe Greeb do
-    it 'should do nothing when run without input' do
-      Greeb[''].must_be_empty
-    end
+describe Greeb do
+  it 'should do nothing when run without input' do
+    Greeb[''].must_be_empty
+  end
 
-    it 'should tokenize text when input is given' do
-      Greeb['Hello guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 10, :letter),
-         Entity.new(10, 11, :punct)]
-      )
-    end
+  it 'should tokenize text when input is given' do
+    Greeb['Hello guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 10, :letter),
+       Span.new(10, 11, :punct)]
+    )
+  end
 
-    it 'should extract URLs' do
-      Greeb['Hello http://nlpub.ru guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 21, :url),
-         Entity.new(21, 22, :space),
-         Entity.new(22, 26, :letter),
-         Entity.new(26, 27, :punct)]
-      )
-    end
+  it 'should extract URLs' do
+    Greeb['Hello http://nlpub.ru guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 21, :url),
+       Span.new(21, 22, :space),
+       Span.new(22, 26, :letter),
+       Span.new(26, 27, :punct)]
+    )
+  end
 
-    it 'should extract e-mails' do
-      Greeb['Hello example@example.com guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 25, :email),
-         Entity.new(25, 26, :space),
-         Entity.new(26, 30, :letter),
-         Entity.new(30, 31, :punct)]
-      )
-    end
+  it 'should extract e-mails' do
+    Greeb['Hello example@example.com guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 25, :email),
+       Span.new(25, 26, :space),
+       Span.new(26, 30, :letter),
+       Span.new(30, 31, :punct)]
+    )
   end
 end
@@ -2,45 +2,57 @@
 
 require_relative 'spec_helper'
 
-module Greeb
-  describe Parser do
-    let(:text) do
-      'Hello there! My name is Vasya B. and I am к.ф.-м.н. My website is ' \
-      'http://вася.рф/. And my e-mail is example@example.com! Also it is ' \
-      'available by URL: http://vasya.ru. Also, G.L.H.F. everyone!'
+describe Parser do
+  let(:text) do
+    ('Hello there! My name is <span class="name">Vasya B.</span> and ' \
+     'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
+     'example@example.com! It is available by URL: http://vasya.ru. ' \
+     'Also, <b>G.L.H.F.</b> everyone!').freeze
+  end
+
+  describe 'URL' do
+    subject { Parser.urls(text) }
+
+    it 'recognizes URLs' do
+      subject.must_equal(
+        [Span.new(92, 107, :url),
+         Span.new(171, 186, :url)]
+      )
     end
+  end
 
-    describe 'URL' do
-      subject { Parser.urls(text) }
+  describe 'EMAIL' do
+    subject { Parser.emails(text) }
 
-      it 'recognizes URLs' do
-        subject.must_equal(
-          [Entity.new(66, 81, :url),
-           Entity.new(150, 165, :url)]
-        )
-      end
+    it 'recognizes e-mails' do
+      subject.must_equal(
+        [Span.new(126, 145, :email)]
+      )
     end
+  end
 
-    describe 'EMAIL' do
-      subject { Parser.emails(text) }
+  describe 'ABBREV' do
+    subject { Parser.abbrevs(text) }
 
-      it 'recognizes e-mails' do
-        subject.must_equal(
-          [Entity.new(100, 119, :email)]
-        )
-      end
+    it 'recognizes abbreviations' do
+      subject.must_equal(
+        [Span.new(49, 51, :abbrev),
+         Span.new(68, 77, :abbrev),
+         Span.new(197, 205, :abbrev)]
+      )
     end
+  end
 
-    describe 'ABBREV' do
-      subject { Parser.abbrevs(text) }
+  describe 'HTML' do
+    subject { Parser.html(text) }
 
-      it 'recognizes abbreviations' do
-        subject.must_equal(
-          [Entity.new(30, 32, :abbrev),
-           Entity.new(42, 51, :abbrev),
-           Entity.new(173, 181, :abbrev)]
-        )
-      end
+    it 'recognizes HTML entities' do
+      subject.must_equal(
+        [Span.new(24, 43, :html),
+         Span.new(51, 58, :html),
+         Span.new(194, 197, :html),
+         Span.new(205, 209, :html)]
+      )
     end
   end
 end
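The HTML block exercises a Parser.html method that is new in this release. Going only by the spec above, usage presumably looks like the following; the offsets are computed by hand for the sample string, and whether the pattern also covers entities such as &amp; is not shown by the spec:

    require 'greeb'

    Greeb::Parser.html('<b>Hi!</b>').map(&:to_a)
    # => [[0, 3, :html], [6, 10, :html]], one span per tag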
@@ -2,116 +2,114 @@
 
 require_relative 'spec_helper'
 
-module Greeb
-  describe Segmentator do
-    describe 'initialization' do
-      let(:tokens) { Tokenizer.tokenize('Vodka') }
+describe Segmentator do
+  describe 'initialization' do
+    let(:tokens) { Tokenizer.tokenize('Vodka') }
 
-      subject { Segmentator.new(tokens) }
+    subject { Segmentator.new(tokens) }
 
-      it 'is initialized with a set of tokens' do
-        subject.tokens.must_be_kind_of Array
-      end
+    it 'is initialized with a set of tokens' do
+      subject.tokens.must_be_kind_of Array
+    end
 
-      it 'should have @tokens ivar' do
-        subject.instance_variable_get(:@tokens).wont_be_nil
-      end
+    it 'should have @tokens ivar' do
+      subject.instance_variable_get(:@tokens).wont_be_nil
     end
+  end
 
-    describe 'a simple sentence' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+  describe 'a simple sentence' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 22, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 22, :sentence)])
    end
+  end
 
-    describe 'a simple sentence without punctuation' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
+  describe 'a simple sentence without punctuation' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 21, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 21, :sentence)])
    end
+  end
 
-    describe 'a simple sentence with trailing whitespaces' do
-      let(:tokens) { Tokenizer.tokenize('      Hello, I am JC Denton      ') }
+  describe 'a simple sentence with trailing whitespaces' do
+    let(:tokens) { Tokenizer.tokenize('      Hello, I am JC Denton      ') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(6, 27, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(6, 27, :sentence)])
    end
+  end
 
-    describe 'two simple sentences' do
-      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+  describe 'two simple sentences' do
+    let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 6, :sentence),
-                            Entity.new(7, 22, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 6, :sentence),
+                          Span.new(7, 22, :sentence)])
    end
+  end
 
-    describe 'one wrong character and one simple sentence' do
-      let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
+  describe 'one wrong character and one simple sentence' do
+    let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(2, 17, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(2, 17, :sentence)])
    end
+  end
 
-    describe 'sentence extractor' do
-      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
-      let(:segmentator) { Segmentator.new(tokens) }
-      let(:sentences) { segmentator.sentences }
-
-      subject { segmentator.extract(sentences) }
-
-      it 'should be extracted' do
-        subject.must_equal(
-          Entity.new(0, 6, :sentence) => [
-            Entity.new(0, 5, :letter),
-            Entity.new(5, 6, :punct)
-          ],
-          Entity.new(7, 22, :sentence) => [
-            Entity.new(7, 8, :letter),
-            Entity.new(8, 9, :space),
-            Entity.new(9, 11, :letter),
-            Entity.new(11, 12, :space),
-            Entity.new(12, 14, :letter),
-            Entity.new(14, 15, :space),
-            Entity.new(15, 21, :letter),
-            Entity.new(21, 22, :punct)
-          ]
-        )
-      end
+  describe 'sentence extractor' do
+    let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+    let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+
+    subject { segmentator.extract(sentences) }
+
+    it 'should be extracted' do
+      subject.must_equal([
+        [Span.new(0, 6, :sentence), [
+          Span.new(0, 5, :letter),
+          Span.new(5, 6, :punct)
+        ]],
+        [Span.new(7, 22, :sentence), [
+          Span.new(7, 8, :letter),
+          Span.new(8, 9, :space),
+          Span.new(9, 11, :letter),
+          Span.new(11, 12, :space),
+          Span.new(12, 14, :letter),
+          Span.new(14, 15, :space),
+          Span.new(15, 21, :letter),
+          Span.new(21, 22, :punct)
+        ]]
+      ])
    end
+  end
 
-    describe 'subsentence extractor' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
-      let(:segmentator) { Segmentator.new(tokens) }
-      let(:sentences) { segmentator.sentences }
-      let(:subsentences) { segmentator.subsentences }
-
-      subject { segmentator.extract(sentences, subsentences) }
-
-      it 'should extract subsentences' do
-        subject.must_equal(
-          Entity.new(0, 22, :sentence) => [
-            Entity.new(0, 6, :subsentence),
-            Entity.new(7, 22, :subsentence)
-          ]
-        )
-      end
+  describe 'subsentence extractor' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+    let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+    let(:subsentences) { segmentator.subsentences }
+
+    subject { segmentator.extract(sentences, subsentences) }
+
+    it 'should extract subsentences' do
+      subject.must_equal([
+        [Span.new(0, 22, :sentence), [
+          Span.new(0, 6, :subsentence),
+          Span.new(7, 22, :subsentence)
+        ]]
+      ])
    end
   end
 end
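Beyond the Entity-to-Span rename, the extract assertions changed shape: the rc1 specs expected a Hash keyed by sentence span, while the rc2 specs expect an Array of [span, nested-spans] pairs, which keeps sentences in document order. A hedged sketch of iterating the new structure:

    require 'greeb'

    tokens = Greeb::Tokenizer.tokenize('Hello! Goodbye!')
    segmentator = Greeb::Segmentator.new(tokens)
    sentences = segmentator.sentences

    # Each element is a [sentence, children] pair, so block destructuring
    # keeps the old hash-like ergonomics.
    segmentator.extract(sentences).each do |sentence, children|
      puts "#{sentence.from}..#{sentence.to}: #{children.size} tokens"
    end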