greeb 0.1.0.rc7 → 0.1.0.rc8

data/README.md CHANGED
@@ -1,8 +1,8 @@
 Greeb
 =====
 
-Greeb is a simple yet awesome text tokenizer that is based on regular
-expressions.
+Greeb is a simple yet awesome and Unicode-aware text segmentator
+that is based on regular expressions.
 
 ## Installation
 
@@ -25,7 +25,7 @@ Or install it yourself as:
 Greeb can help you to solve simple text processing problems:
 
 ```ruby
-pp Greeb::Tokenizer.new('Hello!').tokens
+pp Greeb::Tokenizer.tokenize('Hello!')
 =begin
 [#<struct Greeb::Entity from=0, to=5, type=:letter>,
  #<struct Greeb::Entity from=5, to=6, type=:punct>]
@@ -41,7 +41,7 @@ Hello! I am 18! My favourite number is 133.7...
 What about you?
 EOF
 
-pp Greeb::Tokenizer.new(text).tokens
+pp Greeb::Tokenizer.tokenize(text)
 =begin
 [#<struct Greeb::Entity from=0, to=5, type=:letter>,
  #<struct Greeb::Entity from=5, to=6, type=:punct>,
@@ -79,8 +79,8 @@ such as sentence detection tasks:
 
 ```ruby
 text = 'Hello! How are you?'
-tokenizer = Greeb::Tokenizer.new(text)
-pp Greeb::Segmentator.new(tokenizer).sentences
+tokens = Greeb::Tokenizer.tokenize(text)
+pp Greeb::Segmentator.new(tokens).sentences
 =begin
 [#<struct Greeb::Entity from=0, to=6, type=:sentence>,
  #<struct Greeb::Entity from=7, to=19, type=:sentence>]
@@ -92,9 +92,9 @@ segmentator:
 
 ```ruby
 text = 'Hello! How are you?'
-tokenizer = Greeb::Tokenizer.new(text)
-sentences = Greeb::Segmentator.new(tokenizer).sentences
-pp segmentator.extract(*sentences)
+tokens = Greeb::Tokenizer.tokenize(text)
+sentences = Greeb::Segmentator.new(tokens).sentences
+pp segmentator.extract(sentences)
 =begin
 {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
  [#<struct Greeb::Entity from=0, to=5, type=:letter>,
@@ -127,9 +127,6 @@ punctuation), and `:break`.
 4. Push to the branch (`git push origin my-new-feature`);
 5. Create new Pull Request.
 
-I highly recommend you to use git flow to make development process much
-systematic and awesome.
-
 ## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
 
 ## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
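The README changes above capture the headline API break in rc8: `Greeb::Tokenizer` is no longer instantiated; tokenization is a single module-level call. A minimal migration sketch (assuming rc8 is installed and loaded with `require 'greeb'`):

```ruby
require 'greeb'

# rc7: build a tokenizer object, then ask it for its tokens.
#   tokens = Greeb::Tokenizer.new('Hello!').tokens

# rc8: Tokenizer is a module; call tokenize directly.
tokens = Greeb::Tokenizer.tokenize('Hello!')

tokens.each do |entity|
  # Each Greeb::Entity is a Struct carrying character offsets and a type.
  puts format('%s: %d..%d', entity.type, entity.from, entity.to)
end
# letter: 0..5
# punct: 5..6
```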
data/lib/greeb.rb CHANGED
@@ -22,6 +22,24 @@ class Greeb::Entity < Struct.new(:from, :to, :type)
   end
 end
 
+# This runtime error appears when {Greeb::Tokenizer} or
+# {Greeb::Segmentator} tries to recognize unknown character.
+#
+class Greeb::UnknownEntity < RuntimeError
+  attr_reader :text, :pos
+
+  # @private
+  def initialize(text, pos)
+    @text, @pos = text, pos
+  end
+
+  # Generate the real error message.
+  #
+  def to_s
+    'Could not recognize character "%s" @ %d' % [text[pos], pos]
+  end
+end
+
 require 'greeb/strscan'
 require 'greeb/tokenizer'
 require 'greeb/segmentator'
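With this move, the error class is now the top-level `Greeb::UnknownEntity` rather than the old nested `Greeb::Tokenizer::UnknownEntity` (removed further below), so rescue clauses need the new constant. A hedged sketch of a call site after this change (the `user_input` variable is illustrative; most text tokenizes without raising):

```ruby
require 'greeb'

begin
  pp Greeb::Tokenizer.tokenize(user_input) # user_input: any String
rescue Greeb::UnknownEntity => e
  # The exception exposes the original text and the failing offset,
  # and #to_s renders 'Could not recognize character "x" @ n'.
  warn "tokenization failed: #{e} (char #{e.pos} of #{e.text.length})"
end
```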
data/lib/greeb/segmentator.rb CHANGED
@@ -13,15 +13,10 @@ class Greeb::Segmentator
 
   # Create a new instance of {Greeb::Segmentator}.
   #
-  # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
-  #   {Greeb::Tokenizer} or set of its results.
-  #
-  def initialize tokenizer_or_tokens
-    @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
-      tokenizer_or_tokens.tokens
-    else
-      tokenizer_or_tokens
-    end
+  # @param tokens [Array<Greeb::Entity>] tokens from [Greeb::Tokenizer].
+  #
+  def initialize tokens
+    @tokens = tokens
   end
 
   # Sentences memoization method.
@@ -49,7 +44,7 @@ class Greeb::Segmentator
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
   #   sentences as keys and tokens arrays as values.
   #
-  def extract *sentences
+  def extract sentences
     Hash[
       sentences.map do |s|
         [s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
@@ -64,7 +59,7 @@ class Greeb::Segmentator
   # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
   #   sentences as keys and subsentences arrays as values.
   #
-  def subextract *sentences
+  def subextract sentences
     Hash[
       sentences.map do |s|
         [s, subsentences.select { |ss| ss.from >= s.from and ss.to <= s.to }]
@@ -73,90 +68,90 @@ class Greeb::Segmentator
   end
 
   protected
-    # Implementation of the sentence detection method. This method
-    # changes the `@sentences` ivar.
-    #
-    # @return [nil] nothing.
-    #
-    def detect_sentences!
-      @sentences = []
-
-      rest = tokens.inject(new_sentence) do |sentence, token|
-        if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
-          next sentence
-        end
-
-        sentence.from = token.from unless sentence.from
-
-        next sentence if sentence.to and sentence.to > token.to
-
-        if :punct == token.type
-          sentence.to = tokens.
-            select { |t| t.from >= token.from }.
-            inject(token) { |r, t| break r if t.type != token.type; t }.
-            to
-
-          @sentences << sentence
-          sentence = new_sentence
-        elsif :separ != token.type
-          sentence.to = token.to
-        end
-
-        sentence
+  # Implementation of the sentence detection method. This method
+  # changes the `@sentences` ivar.
+  #
+  # @return [nil] nothing.
+  #
+  def detect_sentences!
+    @sentences = []
+
+    rest = tokens.inject(new_sentence) do |sentence, token|
+      if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
+        next sentence
+      end
+
+      sentence.from = token.from unless sentence.from
+
+      next sentence if sentence.to and sentence.to > token.to
+
+      if :punct == token.type
+        sentence.to = tokens.
+          select { |t| t.from >= token.from }.
+          inject(token) { |r, t| break r if t.type != token.type; t }.
+          to
+
+        @sentences << sentence
+        sentence = new_sentence
+      elsif :separ != token.type
+        sentence.to = token.to
       end
 
-      nil.tap { @sentences << rest if rest.from and rest.to }
+      sentence
     end
 
-    # Implementation of the subsentence detection method. This method
-    # changes the `@subsentences` ivar.
-    #
-    # @return [nil] nothing.
-    #
-    def detect_subsentences!
-      @subsentences = SortedSet.new
+    nil.tap { @sentences << rest if rest.from and rest.to }
+  end
 
-      rest = tokens.inject(new_subsentence) do |subsentence, token|
-        if !subsentence.from and SENTENCE_DOESNT_START.include?(token.type)
-          next subsentence
-        end
+  # Implementation of the subsentence detection method. This method
+  # changes the `@subsentences` ivar.
+  #
+  # @return [nil] nothing.
+  #
+  def detect_subsentences!
+    @subsentences = SortedSet.new
 
-        subsentence.from = token.from unless subsentence.from
+    rest = tokens.inject(new_subsentence) do |subsentence, token|
+      if !subsentence.from and SENTENCE_DOESNT_START.include?(token.type)
+        next subsentence
+      end
 
-        next subsentence if subsentence.to and subsentence.to > token.to
+      subsentence.from = token.from unless subsentence.from
 
-        if [:punct, :spunct].include? token.type
-          subsentence.to = tokens.
-            select { |t| t.from >= token.from }.
-            inject(token) { |r, t| break r if t.type != token.type; t }.
-            to
+      next subsentence if subsentence.to and subsentence.to > token.to
 
-          @subsentences << subsentence
-          subsentence = new_subsentence
-        elsif :separ != token.type
-          subsentence.to = token.to
-        end
+      if [:punct, :spunct].include? token.type
+        subsentence.to = tokens.
+          select { |t| t.from >= token.from }.
+          inject(token) { |r, t| break r if t.type != token.type; t }.
+          to
 
-        subsentence
+        @subsentences << subsentence
+        subsentence = new_subsentence
+      elsif :separ != token.type
+        subsentence.to = token.to
       end
 
-      nil.tap { @subsentences << rest if rest.from and rest.to }
+      subsentence
     end
 
+    nil.tap { @subsentences << rest if rest.from and rest.to }
+  end
+
   private
-    # Create a new instance of {Greeb::Entity} with `:sentence` type.
-    #
-    # @return [Greeb::Entity] a new entity instance.
-    #
-    def new_sentence
-      Greeb::Entity.new(nil, nil, :sentence)
-    end
+  # Create a new instance of {Greeb::Entity} with `:sentence` type.
+  #
+  # @return [Greeb::Entity] a new entity instance.
+  #
+  def new_sentence
+    Greeb::Entity.new(nil, nil, :sentence)
+  end
 
-    # Create a new instance of {Greeb::Entity} with `:subsentence` type.
-    #
-    # @return [Greeb::Entity] a new entity instance.
-    #
-    def new_subsentence
-      Greeb::Entity.new(nil, nil, :subsentence)
-    end
+  # Create a new instance of {Greeb::Entity} with `:subsentence` type.
+  #
+  # @return [Greeb::Entity] a new entity instance.
+  #
+  def new_subsentence
+    Greeb::Entity.new(nil, nil, :subsentence)
+  end
 end
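Alongside the constructor change, `extract` and `subextract` drop their splats: each now takes the sentence collection itself instead of an exploded argument list. A small before/after sketch of the call site, mirroring the README example above:

```ruby
require 'greeb'

tokens      = Greeb::Tokenizer.tokenize('Hello! How are you?')
segmentator = Greeb::Segmentator.new(tokens)
sentences   = segmentator.sentences

# rc7: the splat signature required exploding the collection.
#   segmentator.extract(*sentences)

# rc8: pass the collection as-is; the result is still a Hash mapping
# sentence entities to the tokens they cover.
pp segmentator.extract(sentences)
```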
data/lib/greeb/tokenizer.rb CHANGED
@@ -5,24 +5,9 @@
 # Unicode character categories been obtained from
 # <http://www.fileformat.info/info/unicode/category/index.htm>.
 #
-class Greeb::Tokenizer
-  # This runtime error appears when {Greeb::Tokenizer} tries to recognize
-  # unknown character.
-  #
-  class UnknownEntity < RuntimeError
-    attr_reader :text, :pos
-
-    # @private
-    def initialize(text, pos)
-      @text, @pos = text, pos
-    end
-
-    # Generate the real error message.
-    #
-    def to_s
-      'Could not recognize character "%s" @ %d' % [text[pos], pos]
-    end
-  end
+module Greeb::Tokenizer
+  # http://www.youtube.com/watch?v=eF1lU-CrQfc
+  extend self
 
   # English and Russian letters.
   #
@@ -56,83 +41,67 @@ class Greeb::Tokenizer
   #
   RESIDUALS = /[\p{C}\p{M}\p{Sk}]+/u
 
-  attr_reader :text, :scanner
-  protected :scanner
-
-  # Create a new instance of {Greeb::Tokenizer}.
+  # Perform the tokenization process.
   #
-  # @param text [String] text to be tokenized.
+  # @return [Array<Greeb::Entity>] a set of tokens.
   #
-  def initialize(text)
-    @text = text
+  def tokenize text
+    scanner = Greeb::StringScanner.new(text)
+    tokens = []
+    while !scanner.eos?
+      parse! scanner, tokens, LETTERS, :letter or
+      parse! scanner, tokens, FLOATS, :float or
+      parse! scanner, tokens, INTEGERS, :integer or
+      split_parse! scanner, tokens, SENTENCE_PUNCTUATIONS, :spunct or
+      split_parse! scanner, tokens, PUNCTUATIONS, :punct or
+      split_parse! scanner, tokens, SEPARATORS, :separ or
+      split_parse! scanner, tokens, BREAKS, :break or
+      parse! scanner, tokens, RESIDUALS, :residual or
+      raise Greeb::UnknownEntity.new(text, scanner.char_pos)
+    end
+    tokens
+  ensure
+    scanner.terminate
   end
 
-  # Tokens memoization method.
+  private
+  # Try to parse one small piece of text that is covered by pattern
+  # of necessary type.
   #
-  # @return [Array<Greeb::Entity>] a set of tokens.
+  # @param scanner [Greeb::StringScanner] string scanner.
+  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param pattern [Regexp] a regular expression to extract the token.
+  # @param type [Symbol] a symbol that represents the necessary token
+  #   type.
   #
-  def tokens
-    tokenize! unless @tokens
-    @tokens
+  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  #
+  def parse! scanner, tokens, pattern, type
+    return false unless token = scanner.scan(pattern)
+    position = scanner.char_pos
+    tokens << Greeb::Entity.new(position - token.length,
+                                position,
+                                type)
   end
 
-  protected
-    # Perform the tokenization process. This method modifies
-    # `@scanner` and `@tokens` instance variables.
-    #
-    # @return [nil] nothing unless exception is raised.
-    #
-    def tokenize!
-      @scanner = Greeb::StringScanner.new(text)
-      @tokens = []
-      while !scanner.eos?
-        parse! LETTERS, :letter or
-        parse! FLOATS, :float or
-        parse! INTEGERS, :integer or
-        split_parse! SENTENCE_PUNCTUATIONS, :spunct or
-        split_parse! PUNCTUATIONS, :punct or
-        split_parse! SEPARATORS, :separ or
-        split_parse! BREAKS, :break or
-        parse! RESIDUALS, :residual or
-        raise UnknownEntity.new(text, scanner.char_pos)
-      end
-    ensure
-      scanner.terminate
-    end
-
-    # Try to parse one small piece of text that is covered by pattern
-    # of necessary type.
-    #
-    # @param pattern [Regexp] a regular expression to extract the token.
-    # @param type [Symbol] a symbol that represents the necessary token
-    #   type.
-    #
-    # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
-    #
-    def parse! pattern, type
-      return false unless token = scanner.scan(pattern)
-      position = scanner.char_pos
-      @tokens << Greeb::Entity.new(position - token.length,
-                                   position,
-                                   type)
-    end
-
-    # Try to parse one small piece of text that is covered by pattern
-    # of necessary type. This method performs grouping of the same
-    # characters.
-    #
-    # @param pattern [Regexp] a regular expression to extract the token.
-    # @param type [Symbol] a symbol that represents the necessary token
-    #   type.
-    #
-    # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
-    #
-    def split_parse! pattern, type
-      return false unless token = scanner.scan(pattern)
-      position = scanner.char_pos - token.length
-      token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
-        @tokens << Greeb::Entity.new(before, before + s.length, type)
-        before + s.length
-      end
+  # Try to parse one small piece of text that is covered by pattern
+  # of necessary type. This method performs grouping of the same
+  # characters.
+  #
+  # @param scanner [Greeb::StringScanner] string scanner.
+  # @param tokens [Array<Greeb::Entity>] result array.
+  # @param pattern [Regexp] a regular expression to extract the token.
+  # @param type [Symbol] a symbol that represents the necessary token
+  #   type.
+  #
+  # @return [Array<Greeb::Entity>] the modified set of extracted tokens.
+  #
+  def split_parse! scanner, tokens, pattern, type
+    return false unless token = scanner.scan(pattern)
+    position = scanner.char_pos - token.length
+    token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
+      tokens << Greeb::Entity.new(before, before + s.length, type)
+      before + s.length
     end
+  end
 end
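The grouping that `split_parse!` performs is worth seeing in isolation: the backreference pattern `/((.|\n)\2*)/` slices a scanned span into maximal runs of one repeated character, which is why `Brateeshka..!` in the specs below yields a two-character `:punct` token followed by a one-character one. A standalone sketch of just that regex (plain Ruby, no Greeb required):

```ruby
# Group 2 captures a single character; \2* extends the match while the
# same character repeats, so each scan hit is one homogeneous run.
runs = "..!\n\n".scan(/((.|\n)\2*)/).map(&:first)
p runs # => ["..", "!", "\n\n"]
```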
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
 module Greeb
   # Version of Greeb.
   #
-  VERSION = '0.1.0.rc7'
+  VERSION = '0.1.0.rc8'
 end
data/spec/segmentator_spec.rb CHANGED
@@ -5,16 +5,11 @@ require File.expand_path('../spec_helper', __FILE__)
 module Greeb
   describe Segmentator do
     describe 'initialization' do
-      before { @tokenizer = Tokenizer.new('Vodka') }
+      let(:tokens) { Tokenizer.tokenize('Vodka') }
 
-      subject { Segmentator.new(@tokenizer) }
+      subject { Segmentator.new(tokens) }
 
-      it 'can be initialized either with Tokenizer' do
-        subject.tokens.must_be_kind_of Array
-      end
-
-      it 'can be initialized either with a set of tokens' do
-        subject = Segmentator.new(@tokenizer.tokens)
+      it 'is initialized either with set of tokens' do
         subject.tokens.must_be_kind_of Array
       end
 
@@ -24,75 +19,64 @@ module Greeb
     end
 
     describe 'a simple sentence' do
-      before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
+      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
 
-      subject { Segmentator.new(@tokenizer).sentences }
+      subject { Segmentator.new(tokens).sentences }
 
       it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(0, 22, :sentence)]
-        )
+        subject.must_equal([Entity.new(0, 22, :sentence)])
       end
     end
 
     describe 'a simple sentence without punctuation' do
-      before { @tokenizer = Tokenizer.new('Hello, I am JC Denton') }
+      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton') }
 
-      subject { Segmentator.new(@tokenizer).sentences }
+      subject { Segmentator.new(tokens).sentences }
 
       it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(0, 21, :sentence)]
-        )
+        subject.must_equal([Entity.new(0, 21, :sentence)])
      end
    end
 
     describe 'a simple sentence with trailing whitespaces' do
-      before { @tokenizer = Tokenizer.new(' Hello, I am JC Denton ') }
+      let(:tokens) { Tokenizer.tokenize(' Hello, I am JC Denton ') }
 
-      subject { Segmentator.new(@tokenizer).sentences }
+      subject { Segmentator.new(tokens).sentences }
 
       it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(6, 27, :sentence)]
-        )
+        subject.must_equal([Entity.new(6, 27, :sentence)])
       end
     end
 
     describe 'two simple sentences' do
-      before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
+      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
 
-      subject { Segmentator.new(@tokenizer).sentences }
+      subject { Segmentator.new(tokens).sentences }
 
       it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(0, 6, :sentence),
-           Entity.new(7, 22, :sentence)]
-        )
+        subject.must_equal([Entity.new(0, 6, :sentence),
+                            Entity.new(7, 22, :sentence)])
       end
     end
 
     describe 'one wrong character and one simple sentence' do
-      before { @tokenizer = Tokenizer.new('! I am JC Denton.') }
+      let(:tokens) { Tokenizer.tokenize('! I am JC Denton.') }
 
-      subject { Segmentator.new(@tokenizer).sentences }
+      subject { Segmentator.new(tokens).sentences }
 
       it 'should be segmented' do
-        subject.must_equal(
-          [Entity.new(2, 17, :sentence)]
-        )
+        subject.must_equal([Entity.new(2, 17, :sentence)])
      end
    end
 
-    describe 'token extractor' do
-      before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
+    describe 'sentence extractor' do
+      let(:tokens) { Tokenizer.tokenize('Hello! I am JC Denton.') }
+      let(:segmentator) { Segmentator.new(tokens) }
 
-      subject { Segmentator.new(@tokenizer) }
-
-      let(:sentences) { subject.sentences }
+      subject { segmentator.extract(segmentator.sentences) }
 
       it 'should be extracted' do
-        subject.extract(*sentences).must_equal({
+        subject.must_equal(
          Entity.new(0, 6, :sentence) => [
            Entity.new(0, 5, :letter),
            Entity.new(5, 6, :punct)
@@ -107,24 +91,23 @@ module Greeb
            Entity.new(15, 21, :letter),
            Entity.new(21, 22, :punct)
          ]
-        })
+        )
       end
     end
 
     describe 'subsentence extractor' do
-      before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
-
-      subject { Segmentator.new(@tokenizer) }
+      let(:tokens) { Tokenizer.tokenize('Hello, I am JC Denton.') }
+      let(:segmentator) { Segmentator.new(tokens) }
 
-      let(:sentences) { subject.sentences }
+      subject { segmentator.subextract(segmentator.sentences) }
 
       it 'should extract subsentences' do
-        subject.subextract(*sentences).must_equal({
+        subject.must_equal(
          Entity.new(0, 22, :sentence) => [
            Entity.new(0, 6, :subsentence),
            Entity.new(7, 22, :subsentence)
          ]
-        })
+        )
       end
     end
   end
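The spec refactoring swaps `before` blocks that stashed a tokenizer in an ivar for `let` bindings over plain token arrays; `let` defines a lazily evaluated helper that is memoized within each example. A minimal sketch of the pattern outside Greeb (assuming minitest/spec, which these specs appear to use):

```ruby
require 'minitest/autorun'

describe 'let-based setup' do
  # Evaluated on first call within an example, then memoized.
  let(:tokens) { [1, 2, 3] }

  it 'returns the same object within one example' do
    tokens.must_be_same_as tokens
  end
end
```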
data/spec/tokenizer_spec.rb CHANGED
@@ -4,59 +4,35 @@ require File.expand_path('../spec_helper', __FILE__)
 
 module Greeb
   describe Tokenizer do
-    describe 'initialization' do
-      subject { Tokenizer.new('vodka') }
-
-      it 'should be initialized with a text' do
-        subject.text.must_equal 'vodka'
-      end
-
-      it 'should has the @text ivar' do
-        subject.instance_variable_get(:@text).must_equal 'vodka'
-      end
-
-      it 'should not has @tokens ivar' do
-        subject.instance_variable_get(:@tokens).must_be_nil
-      end
-    end
-
     describe 'after tokenization' do
-      subject { Tokenizer.new('vodka').tap(&:tokens) }
-
-      it 'should has the @tokens ivar' do
-        subject.instance_variable_get(:@tokens).wont_be_nil
-      end
-
-      it 'should has the @scanner ivar' do
-        subject.instance_variable_get(:@scanner).wont_be_nil
-      end
+      subject { Tokenizer.tokenize('vodka') }
 
       it 'should has the tokens set' do
-        subject.tokens.must_be_kind_of Array
+        subject.must_be_kind_of Array
      end
    end
 
     describe 'tokenization facilities' do
       it 'can handle words' do
-        Tokenizer.new('hello').tokens.must_equal(
+        Tokenizer.tokenize('hello').must_equal(
          [Entity.new(0, 5, :letter)]
        )
      end
 
       it 'can handle floats' do
-        Tokenizer.new('14.88').tokens.must_equal(
+        Tokenizer.tokenize('14.88').must_equal(
          [Entity.new(0, 5, :float)]
        )
      end
 
       it 'can handle integers' do
-        Tokenizer.new('1337').tokens.must_equal(
+        Tokenizer.tokenize('1337').must_equal(
          [Entity.new(0, 4, :integer)]
        )
      end
 
       it 'can handle words and integers' do
-        Tokenizer.new('Hello, I am 18').tokens.must_equal(
+        Tokenizer.tokenize('Hello, I am 18').must_equal(
          [Entity.new(0, 5, :letter),
           Entity.new(5, 6, :spunct),
           Entity.new(6, 7, :separ),
@@ -69,7 +45,7 @@ module Greeb
       end
 
       it 'can handle multi-line paragraphs' do
-        Tokenizer.new("Brateeshka..!\n\nPrines!").tokens.must_equal(
+        Tokenizer.tokenize("Brateeshka..!\n\nPrines!").must_equal(
          [Entity.new(0, 10, :letter),
           Entity.new(10, 12, :punct),
           Entity.new(12, 13, :punct),
@@ -80,7 +56,7 @@ module Greeb
       end
 
       it 'can handle separated integers' do
-        Tokenizer.new('228/359').tokens.must_equal(
+        Tokenizer.tokenize('228/359').must_equal(
          [Entity.new(0, 3, :integer),
           Entity.new(3, 4, :separ),
           Entity.new(4, 7, :integer)]
@@ -88,7 +64,7 @@ module Greeb
       end
 
       it 'can deal with Russian language' do
-        Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
+        Tokenizer.tokenize('Братишка, я тебе покушать принёс!').must_equal(
          [Entity.new(0, 8, :letter),
           Entity.new(8, 9, :spunct),
           Entity.new(9, 10, :separ),
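These fixtures also make the offset semantics concrete: entities carry `from`/`to` character positions, not substrings, so recovering a token's text means slicing the original string. A hedged sketch using the `'228/359'` fixture above:

```ruby
require 'greeb'

text = '228/359'
Greeb::Tokenizer.tokenize(text).each do |t|
  # from/to are character offsets into the source string.
  puts "#{t.type}: #{text[t.from...t.to].inspect}"
end
# prints:
#   integer: "228"
#   separ: "/"
#   integer: "359"
```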
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: greeb
 version: !ruby/object:Gem::Version
-  version: 0.1.0.rc7
+  version: 0.1.0.rc8
 prerelease: 6
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-14 00:00:00.000000000 Z
+date: 2012-12-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -109,7 +109,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
-      hash: 2716089438708653231
+      hash: -482579338117024513
       version: '0'
       segments:
       - 0