greeb 0.2.2.rc1 → 0.2.2.rc2

@@ -0,0 +1,36 @@
+# Greeb operates with spans. A span is a tuple of *(from, to, type)*, where
+# *from* is the beginning of the span, *to* is the end of the span,
+# and *type* is the type of the span.
+#
+# There are several span types: `:letter` for letters, `:float` for
+# floating point decimals, `:integer` for integer numbers, `:separ` for
+# separators, `:punct` for punctuation characters, `:spunct` for in-sentence
+# punctuation characters, `:space` for spaces, and `:break` for line endings.
+#
+class Greeb::Span < Struct.new(:from, :to, :type)
+  # Create a derivative structure that is based on the Greeb::Span
+  # members. Useful for integrating with Greeb.
+  #
+  # @param members [Array<Symbol>] additional members.
+  #
+  # @return [Struct] a new structure.
+  #
+  def self.derivate(*members)
+    Struct.new(*self.members, *members)
+  end
+
+  # @private
+  def <=> other
+    if (comparison = self.from <=> other.from) == 0
+      self.to <=> other.to
+    else
+      comparison
+    end
+  end
+
+  # @private
+  def eql? other
+    return false unless type == other.type
+    (self <=> other) == 0
+  end
+end
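
The new Greeb::Span structure above can be exercised directly. A minimal sketch, assuming greeb 0.2.2.rc2 is installed; the `Token` constant is our own name, introduced purely for illustration:

    require 'greeb'

    # Spans order by `from`, then by `to`, exactly as <=> above defines.
    a = Greeb::Span.new(0, 5, :letter)
    b = Greeb::Span.new(5, 6, :punct)
    a <=> b                                # => -1

    # eql? additionally requires the types to match.
    a.eql?(Greeb::Span.new(0, 5, :letter)) # => true
    a.eql?(Greeb::Span.new(0, 5, :punct))  # => false

    # derivate builds a wider Struct whose leading members are still
    # (from, to, type), so extra fields can ride along with a span.
    Token = Greeb::Span.derivate(:text)    # hypothetical derived struct
    Token.new(0, 5, :letter, 'Hello')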
@@ -35,7 +35,7 @@ module Greeb::Tokenizer
 
   # Spaces (e.g. " " or &nbsp;).
   #
-  SPACES = /[\p{Zs}]+/u
+  SPACES = /[\p{Zs}\t]+/u
 
   # Line breaks.
   #
@@ -47,14 +47,14 @@ module Greeb::Tokenizer
 
   # Perform the tokenization process.
   #
-  # @return [Array<Greeb::Entity>] a set of tokens.
+  # @return [Array<Greeb::Span>] a set of tokens.
   #
   def tokenize text
     scanner = Greeb::StringScanner.new(text)
    tokens = []
     while !scanner.eos?
       step scanner, tokens or
-        raise Greeb::UnknownEntity.new(text, scanner.char_pos)
+        raise Greeb::UnknownSpan.new(text, scanner.char_pos)
     end
     tokens
   ensure
@@ -79,9 +79,9 @@ module Greeb::Tokenizer
   # One iteration of the tokenization process.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def step scanner, tokens
     parse! scanner, tokens, LETTERS, :letter or
@@ -99,17 +99,17 @@ module Greeb::Tokenizer
   # of necessary type.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
   #   type.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos
-    tokens << Greeb::Entity.new(position - token.length,
+    tokens << Greeb::Span.new(position - token.length,
                               position,
                               type)
   end
@@ -119,18 +119,18 @@ module Greeb::Tokenizer
   # characters.
   #
   # @param scanner [Greeb::StringScanner] string scanner.
-  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param tokens [Array<Greeb::Span>] result array.
   # @param pattern [Regexp] a regular expression to extract the token.
   # @param type [Symbol] a symbol that represents the necessary token
   #   type.
   #
-  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  # @return [Array<Greeb::Span>] the modified set of extracted tokens.
   #
   def split_parse! scanner, tokens, pattern, type
     return false unless token = scanner.scan(pattern)
     position = scanner.char_pos - token.length
     split(token).inject(position) do |before, s|
-      tokens << Greeb::Entity.new(before, before + s.length, type)
+      tokens << Greeb::Span.new(before, before + s.length, type)
       before + s.length
     end
   end
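
A practical consequence of the `SPACES` change above: a tab character is now matched by `SPACES` directly (a plain tab belongs to Unicode category Cc, not Zs, so the old `[\p{Zs}]` class did not cover it). A quick sketch of the expected behavior, again assuming the rc2 gem:

    require 'greeb'

    # "Hi\tthere" should yield letter(0, 2), space(2, 3), letter(3, 8),
    # with the tab coming back as an ordinary :space span.
    Greeb::Tokenizer.tokenize("Hi\tthere")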
@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.2.2.rc1'
+  VERSION = '0.2.2.rc2'
 end
@@ -2,41 +2,39 @@
 
 require_relative 'spec_helper'
 
-module Greeb
-  describe Greeb do
-    it 'should do nothing when run without input' do
-      Greeb[''].must_be_empty
-    end
+describe Greeb do
+  it 'should do nothing when run without input' do
+    Greeb[''].must_be_empty
+  end
 
-    it 'should tokenize text when input is given' do
-      Greeb['Hello guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 10, :letter),
-         Entity.new(10, 11, :punct)]
-      )
-    end
+  it 'should tokenize text when input is given' do
+    Greeb['Hello guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 10, :letter),
+       Span.new(10, 11, :punct)]
+    )
+  end
 
-    it 'should extract URLs' do
-      Greeb['Hello http://nlpub.ru guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 21, :url),
-         Entity.new(21, 22, :space),
-         Entity.new(22, 26, :letter),
-         Entity.new(26, 27, :punct)]
-      )
-    end
+  it 'should extract URLs' do
+    Greeb['Hello http://nlpub.ru guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 21, :url),
+       Span.new(21, 22, :space),
+       Span.new(22, 26, :letter),
+       Span.new(26, 27, :punct)]
+    )
+  end
 
-    it 'should extract e-mails' do
-      Greeb['Hello example@example.com guys!'].must_equal(
-        [Entity.new(0, 5, :letter),
-         Entity.new(5, 6, :space),
-         Entity.new(6, 25, :email),
-         Entity.new(25, 26, :space),
-         Entity.new(26, 30, :letter),
-         Entity.new(30, 31, :punct)]
-      )
-    end
+  it 'should extract e-mails' do
+    Greeb['Hello example@example.com guys!'].must_equal(
+      [Span.new(0, 5, :letter),
+       Span.new(5, 6, :space),
+       Span.new(6, 25, :email),
+       Span.new(25, 26, :space),
+       Span.new(26, 30, :letter),
+       Span.new(30, 31, :punct)]
+    )
   end
 end
@@ -2,45 +2,57 @@
 
 require_relative 'spec_helper'
 
-module Greeb
-  describe Parser do
-    let(:text) do
-      'Hello there! My name is Vasya B. and I am к.ф.-м.н. My website is ' \
-      'http://вася.рф/. And my e-mail is example@example.com! Also it is ' \
-      'available by URL: http://vasya.ru. Also, G.L.H.F. everyone!'
+describe Parser do
+  let(:text) do
+    ('Hello there! My name is <span class="name">Vasya B.</span> and ' \
+     'I am к.ф.-м.н. My website is http://вася.рф/. And my e-mail is ' \
+     'example@example.com! It is available by URL: http://vasya.ru. ' \
+     'Also, <b>G.L.H.F.</b> everyone!').freeze
+  end
+
+  describe 'URL' do
+    subject { Parser.urls(text) }
+
+    it 'recognizes URLs' do
+      subject.must_equal(
+        [Span.new(92, 107, :url),
+         Span.new(171, 186, :url)]
+      )
     end
+  end
 
-    describe 'URL' do
-      subject { Parser.urls(text) }
+  describe 'EMAIL' do
+    subject { Parser.emails(text) }
 
-      it 'recognizes URLs' do
-        subject.must_equal(
-          [Entity.new(66, 81, :url),
-           Entity.new(150, 165, :url)]
-        )
-      end
+    it 'recognizes e-mails' do
+      subject.must_equal(
+        [Span.new(126, 145, :email)]
+      )
     end
+  end
 
-    describe 'EMAIL' do
-      subject { Parser.emails(text) }
+  describe 'ABBREV' do
+    subject { Parser.abbrevs(text) }
 
-      it 'recognizes e-mails' do
-        subject.must_equal(
-          [Entity.new(100, 119, :email)]
-        )
-      end
+    it 'recognizes abbreviations' do
+      subject.must_equal(
+        [Span.new(49, 51, :abbrev),
+         Span.new(68, 77, :abbrev),
+         Span.new(197, 205, :abbrev)]
+      )
     end
+  end
 
-    describe 'ABBREV' do
-      subject { Parser.abbrevs(text) }
+  describe 'HTML' do
+    subject { Parser.html(text) }
 
-      it 'recognizes abbreviations' do
-        subject.must_equal(
-          [Entity.new(30, 32, :abbrev),
-           Entity.new(42, 51, :abbrev),
-           Entity.new(173, 181, :abbrev)]
-        )
-      end
+    it 'recognizes HTML entities' do
+      subject.must_equal(
+        [Span.new(24, 43, :html),
+         Span.new(51, 58, :html),
+         Span.new(194, 197, :html),
+         Span.new(205, 209, :html)]
+      )
     end
   end
 end
@@ -2,116 +2,114 @@
 
 require_relative 'spec_helper'
 
-module Greeb
-  describe Segmentator do
-    describe 'initialization' do
-      let(:tokens) { Tokenizer.tokenize('Vodka') }
+describe Segmentator do
+  describe 'initialization' do
+    let(:tokens) { Tokenizer.tokenize('Vodka') }
 
-      subject { Segmentator.new(tokens) }
+    subject { Segmentator.new(tokens) }
 
-      it 'is initialized with a set of tokens' do
-        subject.tokens.must_be_kind_of Array
-      end
+    it 'is initialized with a set of tokens' do
+      subject.tokens.must_be_kind_of Array
+    end
 
-      it 'should have a @tokens ivar' do
-        subject.instance_variable_get(:@tokens).wont_be_nil
-      end
+    it 'should have a @tokens ivar' do
+      subject.instance_variable_get(:@tokens).wont_be_nil
     end
+  end
 
-    describe 'a simple sentence' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+  describe 'a simple sentence' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 22, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 22, :sentence)])
    end
+  end
 
-    describe 'a simple sentence without punctuation' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
+  describe 'a simple sentence without punctuation' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 21, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 21, :sentence)])
    end
+  end
 
-    describe 'a simple sentence with trailing whitespaces' do
-      let(:tokens) { Tokenizer.tokenize(' Hello, I am JC Denton ') }
+  describe 'a simple sentence with trailing whitespaces' do
+    let(:tokens) { Tokenizer.tokenize(' Hello, I am JC Denton ') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(6, 27, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(6, 27, :sentence)])
    end
+  end
 
-    describe 'two simple sentences' do
-      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+  describe 'two simple sentences' do
+    let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(0, 6, :sentence),
-                            Entity.new(7, 22, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(0, 6, :sentence),
+                          Span.new(7, 22, :sentence)])
    end
+  end
 
-    describe 'one wrong character and one simple sentence' do
-      let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
+  describe 'one wrong character and one simple sentence' do
+    let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
 
-      subject { Segmentator.new(tokens).sentences }
+    subject { Segmentator.new(tokens).sentences }
 
-      it 'should be segmented' do
-        subject.must_equal([Entity.new(2, 17, :sentence)])
-      end
+    it 'should be segmented' do
+      subject.must_equal([Span.new(2, 17, :sentence)])
    end
+  end
 
-    describe 'sentence extractor' do
-      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
-      let(:segmentator) { Segmentator.new(tokens) }
-      let(:sentences) { segmentator.sentences }
-
-      subject { segmentator.extract(sentences) }
-
-      it 'should be extracted' do
-        subject.must_equal(
-          Entity.new(0, 6, :sentence) => [
-            Entity.new(0, 5, :letter),
-            Entity.new(5, 6, :punct)
-          ],
-          Entity.new(7, 22, :sentence) => [
-            Entity.new(7, 8, :letter),
-            Entity.new(8, 9, :space),
-            Entity.new(9, 11, :letter),
-            Entity.new(11, 12, :space),
-            Entity.new(12, 14, :letter),
-            Entity.new(14, 15, :space),
-            Entity.new(15, 21, :letter),
-            Entity.new(21, 22, :punct)
-          ]
-        )
-      end
+  describe 'sentence extractor' do
+    let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+    let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+
+    subject { segmentator.extract(sentences) }
+
+    it 'should be extracted' do
+      subject.must_equal([
+        [Span.new(0, 6, :sentence), [
+          Span.new(0, 5, :letter),
+          Span.new(5, 6, :punct)
+        ]],
+        [Span.new(7, 22, :sentence), [
+          Span.new(7, 8, :letter),
+          Span.new(8, 9, :space),
+          Span.new(9, 11, :letter),
+          Span.new(11, 12, :space),
+          Span.new(12, 14, :letter),
+          Span.new(14, 15, :space),
+          Span.new(15, 21, :letter),
+          Span.new(21, 22, :punct)
+        ]]
+      ])
     end
+  end
 
-    describe 'subsentence extractor' do
-      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
-      let(:segmentator) { Segmentator.new(tokens) }
-      let(:sentences) { segmentator.sentences }
-      let(:subsentences) { segmentator.subsentences }
-
-      subject { segmentator.extract(sentences, subsentences) }
-
-      it 'should extract subsentences' do
-        subject.must_equal(
-          Entity.new(0, 22, :sentence) => [
-            Entity.new(0, 6, :subsentence),
-            Entity.new(7, 22, :subsentence)
-          ]
-        )
-      end
+  describe 'subsentence extractor' do
+    let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+    let(:segmentator) { Segmentator.new(tokens) }
+    let(:sentences) { segmentator.sentences }
+    let(:subsentences) { segmentator.subsentences }
+
+    subject { segmentator.extract(sentences, subsentences) }
+
+    it 'should extract subsentences' do
+      subject.must_equal([
+        [Span.new(0, 22, :sentence), [
+          Span.new(0, 6, :subsentence),
+          Span.new(7, 22, :subsentence)
+        ]]
+      ])
    end
   end
 end