greeb 0.1.0.rc4 → 0.1.0.rc6

Sign up to get free protection for your applications and to get access to all the features.
data/.travis.yml CHANGED
@@ -4,3 +4,4 @@ branches:
4
4
  - master
5
5
  rvm:
6
6
  - 1.9.3
7
+ - rbx-19mode
data/README.md CHANGED
@@ -27,8 +27,8 @@ Greeb can help you to solve simple text processing problems:
27
27
  ```ruby
28
28
  pp Greeb::Tokenizer.new('Hello!').tokens
29
29
  =begin
30
- #<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
31
- #<struct Greeb::Entity from=5, to=6, type=:punct>}>
30
+ [#<struct Greeb::Entity from=0, to=5, type=:letter>,
31
+ #<struct Greeb::Entity from=5, to=6, type=:punct>]
32
32
  =end
33
33
  ```
34
34
 
@@ -43,7 +43,7 @@ EOF
43
43
 
44
44
  pp Greeb::Tokenizer.new(text).tokens
45
45
  =begin
46
- #<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
46
+ [#<struct Greeb::Entity from=0, to=5, type=:letter>,
47
47
  #<struct Greeb::Entity from=5, to=6, type=:punct>,
48
48
  #<struct Greeb::Entity from=6, to=7, type=:separ>,
49
49
  #<struct Greeb::Entity from=7, to=8, type=:letter>,
@@ -70,7 +70,7 @@ pp Greeb::Tokenizer.new(text).tokens
70
70
  #<struct Greeb::Entity from=59, to=60, type=:separ>,
71
71
  #<struct Greeb::Entity from=60, to=63, type=:letter>,
72
72
  #<struct Greeb::Entity from=63, to=64, type=:punct>,
73
- #<struct Greeb::Entity from=64, to=65, type=:break>}>
73
+ #<struct Greeb::Entity from=64, to=65, type=:break>]
74
74
  =end
75
75
  ```
76
76
 
@@ -82,8 +82,8 @@ text = 'Hello! How are you?'
82
82
  tokenizer = Greeb::Tokenizer.new(text)
83
83
  pp Greeb::Segmentator.new(tokenizer).sentences
84
84
  =begin
85
- #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
86
- #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
85
+ [#<struct Greeb::Entity from=0, to=6, type=:sentence>,
86
+ #<struct Greeb::Entity from=7, to=19, type=:sentence>]
87
87
  =end
88
88
  ```
89
89
 
@@ -111,9 +111,9 @@ pp segmentator.extract(*sentences)
111
111
 
112
112
  ## Tokens
113
113
 
114
- Greeb operates with entities, tuples of `<from, to, type>`, where
115
- `from` is a beginning of the entity, `to` is an ending of the entity,
116
- and `type` is a type of the entity.
114
+ Greeb operates with entities, tuples of *(from, to, kind)*, where
115
+ *from* is the beginning of the entity, *to* is the end of the entity,
116
+ and *kind* is the type of the entity.
117
117
 
118
118
  There are several entity types: `:letter`, `:float`, `:integer`,
119
119
  `:separ`, `:punct` (for punctuation), `:spunct` (for in-sentence
@@ -132,11 +132,6 @@ systematic and awesome.
132
132
 
133
133
  ## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
134
134
 
135
- If you're using [Rubinius](http://rubini.us) please note that it has the
136
- incompatible `StringScanner` implementation. More information can be
137
- provided under the following link:
138
- <https://github.com/rubinius/rubinius/issues/1808>.
139
-
140
135
  ## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
141
136
 
142
137
  ## Copyright
data/greeb.gemspec CHANGED
@@ -6,12 +6,12 @@ Gem::Specification.new do |s|
6
6
  s.name = 'greeb'
7
7
  s.version = Greeb::VERSION
8
8
  s.platform = Gem::Platform::RUBY
9
- s.authors = ['Dmitry A. Ustalov']
9
+ s.authors = ['Dmitry Ustalov']
10
10
  s.email = ['dmitry@eveel.ru']
11
11
  s.homepage = 'https://github.com/eveel/greeb'
12
- s.summary = 'Greeb is a simple regexp-based tokenizer.'
13
- s.description = 'Greeb is a simple yet awesome regexp-based tokenizer, ' \
14
- 'written in Ruby.'
12
+ s.summary = 'Greeb is a simple Unicode-aware regexp-based tokenizer.'
13
+ s.description = 'Greeb is a simple yet awesome and Unicode-aware ' \
14
+ 'regexp-based tokenizer, written in Ruby.'
15
15
 
16
16
  s.rubyforge_project = 'greeb'
17
17
 
data/lib/greeb.rb CHANGED
@@ -2,9 +2,9 @@
2
2
 
3
3
  require 'greeb/version'
4
4
 
5
- # Greeb operates with entities, tuples of `<from, to, kind>`, where
6
- # `from` is a beginning of the entity, `to` is an ending of the entity,
7
- # and `kind` is a type of the entity.
5
+ # Greeb operates with entities, tuples of *(from, to, kind)*, where
6
+ # *from* is the beginning of the entity, *to* is the end of the entity,
7
+ # and *kind* is the type of the entity.
8
8
  #
9
9
  # There are several entity types: `:letter`, `:float`, `:integer`,
10
10
  # `:separ` for separators, `:punct` for punctuation characters,
@@ -26,7 +26,7 @@ class Greeb::Segmentator
26
26
 
27
27
  # Sentences memoization method.
28
28
  #
29
- # @return [Set<Greeb::Entity>] a set of sentences.
29
+ # @return [Array<Greeb::Entity>] an array of sentences.
30
30
  #
31
31
  def sentences
32
32
  detect_sentences! unless @sentences
@@ -35,7 +35,7 @@ class Greeb::Segmentator
35
35
 
36
36
  # Subsentences memoization method.
37
37
  #
38
- # @return [Set<Greeb::Entity>] a set of subsentences.
38
+ # @return [Array<Greeb::Entity>] an array of subsentences.
39
39
  #
40
40
  def subsentences
41
41
  detect_subsentences! unless @subsentences
@@ -79,7 +79,7 @@ class Greeb::Segmentator
79
79
  # @return [nil] nothing.
80
80
  #
81
81
  def detect_sentences!
82
- @sentences = SortedSet.new
82
+ @sentences = []
83
83
 
84
84
  rest = tokens.inject(new_sentence) do |sentence, token|
85
85
  if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
@@ -1,13 +1,29 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'set'
4
-
5
3
  # Greeb's tokenization facilities. Use 'em with love.
6
4
  #
7
5
  class Greeb::Tokenizer
6
+ # This runtime error is raised when {Greeb::Tokenizer} encounters a
7
+ # character it cannot recognize.
8
+ #
9
+ class UnknownEntity < RuntimeError
10
+ attr_reader :text, :pos
11
+
12
+ # @private
13
+ def initialize(text, pos)
14
+ @text, @pos = text, pos
15
+ end
16
+
17
+ # Generate the real error message.
18
+ #
19
+ def to_s
20
+ 'Could not recognize character "%s" @ %d' % [text[pos], pos]
21
+ end
22
+ end
23
+
8
24
  # Letters of any Unicode script.
9
25
  #
10
- LETTERS = /[A-Za-zА-Яа-яЁё]+/u
26
+ LETTERS = /[\p{L}]+/u
11
27
 
12
28
  # Floating point values.
13
29
  #
@@ -17,21 +33,21 @@ class Greeb::Tokenizer
17
33
  #
18
34
  INTEGERS = /\d+/u
19
35
 
20
- # In-subsentence seprator (i.e.: "*" or "=").
36
+ # In-sentence punctuation character (i.e.: "," or "-").
21
37
  #
22
- SEPARATORS = /[*=_\/\\ ]+/u
38
+ SENTENCE_PUNCTUATIONS = /(\,|\-|:|;|\p{Ps}|\p{Pi}|\p{Pf}|\p{Pe})+/u
23
39
 
24
40
  # Punctuation character (i.e.: "." or "!").
25
41
  #
26
- PUNCTUATIONS = /(\.|\!|\?)+/u
42
+ PUNCTUATIONS = /[(\.|\!|\?)]+/u
27
43
 
28
- # In-sentence punctuation character (i.e.: "," or "-").
44
+ # In-subsentence separator (i.e.: "*" or "=").
29
45
  #
30
- SENTENCE_PUNCTUATIONS = /(\,|\[|\]|\(|\)|\-|:|;)+/u
46
+ SEPARATORS = /[ \p{Sm}\p{Pc}\p{Po}\p{Pd}]+/u
31
47
 
32
48
  # Line breaks.
33
49
  #
34
- BREAKS = /\n+/u
50
+ BREAKS = /(\r\n|\n|\r)+/u
35
51
 
36
52
  attr_reader :text, :scanner
37
53
  protected :scanner
@@ -46,7 +62,7 @@ class Greeb::Tokenizer
46
62
 
47
63
  # Tokens memoization method.
48
64
  #
49
- # @return [Set<Greeb::Entity>] a set of tokens.
65
+ # @return [Array<Greeb::Entity>] an array of tokens.
50
66
  #
51
67
  def tokens
52
68
  tokenize! unless @tokens
@@ -61,7 +77,7 @@ class Greeb::Tokenizer
61
77
  #
62
78
  def tokenize!
63
79
  @scanner = Greeb::StringScanner.new(text)
64
- @tokens = SortedSet.new
80
+ @tokens = []
65
81
  while !scanner.eos?
66
82
  parse! LETTERS, :letter or
67
83
  parse! FLOATS, :float or
@@ -70,7 +86,7 @@ class Greeb::Tokenizer
70
86
  split_parse! PUNCTUATIONS, :punct or
71
87
  split_parse! SEPARATORS, :separ or
72
88
  split_parse! BREAKS, :break or
73
- raise @tokens.inspect
89
+ raise UnknownEntity.new(text, scanner.char_pos)
74
90
  end
75
91
  ensure
76
92
  scanner.terminate
@@ -83,7 +99,7 @@ class Greeb::Tokenizer
83
99
  # @param type [Symbol] a symbol that represents the necessary token
84
100
  # type.
85
101
  #
86
- # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
102
+ # @return [Array<Greeb::Entity>] the modified array of extracted tokens.
87
103
  #
88
104
  def parse! pattern, type
89
105
  return false unless token = scanner.scan(pattern)
@@ -101,7 +117,7 @@ class Greeb::Tokenizer
101
117
  # @param type [Symbol] a symbol that represents the necessary token
102
118
  # type.
103
119
  #
104
- # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
120
+ # @return [Array<Greeb::Entity>] the modified array of extracted tokens.
105
121
  #
106
122
  def split_parse! pattern, type
107
123
  return false unless token = scanner.scan(pattern)
data/lib/greeb/version.rb CHANGED
@@ -5,5 +5,5 @@
5
5
  module Greeb
6
6
  # Version of Greeb.
7
7
  #
8
- VERSION = '0.1.0.rc4'
8
+ VERSION = '0.1.0.rc6'
9
9
  end
@@ -10,12 +10,12 @@ module Greeb
10
10
  subject { Segmentator.new(@tokenizer) }
11
11
 
12
12
  it 'can be initialized either with Tokenizer' do
13
- subject.tokens.must_be_kind_of SortedSet
13
+ subject.tokens.must_be_kind_of Array
14
14
  end
15
15
 
16
16
  it 'can be initialized either with a set of tokens' do
17
17
  subject = Segmentator.new(@tokenizer.tokens)
18
- subject.tokens.must_be_kind_of SortedSet
18
+ subject.tokens.must_be_kind_of Array
19
19
  end
20
20
 
21
21
  it 'should has @tokens ivar' do
@@ -30,7 +30,7 @@ module Greeb
30
30
 
31
31
  it 'should be segmented' do
32
32
  subject.must_equal(
33
- SortedSet.new([Entity.new(0, 22, :sentence)])
33
+ [Entity.new(0, 22, :sentence)]
34
34
  )
35
35
  end
36
36
  end
@@ -42,7 +42,7 @@ module Greeb
42
42
 
43
43
  it 'should be segmented' do
44
44
  subject.must_equal(
45
- SortedSet.new([Entity.new(0, 21, :sentence)])
45
+ [Entity.new(0, 21, :sentence)]
46
46
  )
47
47
  end
48
48
  end
@@ -54,7 +54,7 @@ module Greeb
54
54
 
55
55
  it 'should be segmented' do
56
56
  subject.must_equal(
57
- SortedSet.new([Entity.new(6, 27, :sentence)])
57
+ [Entity.new(6, 27, :sentence)]
58
58
  )
59
59
  end
60
60
  end
@@ -66,8 +66,8 @@ module Greeb
66
66
 
67
67
  it 'should be segmented' do
68
68
  subject.must_equal(
69
- SortedSet.new([Entity.new(0, 6, :sentence),
70
- Entity.new(7, 22, :sentence)])
69
+ [Entity.new(0, 6, :sentence),
70
+ Entity.new(7, 22, :sentence)]
71
71
  )
72
72
  end
73
73
  end
@@ -79,7 +79,7 @@ module Greeb
79
79
 
80
80
  it 'should be segmented' do
81
81
  subject.must_equal(
82
- SortedSet.new([Entity.new(2, 17, :sentence)])
82
+ [Entity.new(2, 17, :sentence)]
83
83
  )
84
84
  end
85
85
  end
@@ -32,75 +32,75 @@ module Greeb
32
32
  end
33
33
 
34
34
  it 'should has the tokens set' do
35
- subject.tokens.must_be_kind_of SortedSet
35
+ subject.tokens.must_be_kind_of Array
36
36
  end
37
37
  end
38
38
 
39
39
  describe 'tokenization facilities' do
40
40
  it 'can handle words' do
41
41
  Tokenizer.new('hello').tokens.must_equal(
42
- SortedSet.new([Entity.new(0, 5, :letter)])
42
+ [Entity.new(0, 5, :letter)]
43
43
  )
44
44
  end
45
45
 
46
46
  it 'can handle floats' do
47
47
  Tokenizer.new('14.88').tokens.must_equal(
48
- SortedSet.new([Entity.new(0, 5, :float)])
48
+ [Entity.new(0, 5, :float)]
49
49
  )
50
50
  end
51
51
 
52
52
  it 'can handle integers' do
53
53
  Tokenizer.new('1337').tokens.must_equal(
54
- SortedSet.new([Entity.new(0, 4, :integer)])
54
+ [Entity.new(0, 4, :integer)]
55
55
  )
56
56
  end
57
57
 
58
58
  it 'can handle words and integers' do
59
59
  Tokenizer.new('Hello, I am 18').tokens.must_equal(
60
- SortedSet.new([Entity.new(0, 5, :letter),
61
- Entity.new(5, 6, :spunct),
62
- Entity.new(6, 7, :separ),
63
- Entity.new(7, 8, :letter),
64
- Entity.new(8, 9, :separ),
65
- Entity.new(9, 11, :letter),
66
- Entity.new(11, 12, :separ),
67
- Entity.new(12, 14, :integer)])
60
+ [Entity.new(0, 5, :letter),
61
+ Entity.new(5, 6, :spunct),
62
+ Entity.new(6, 7, :separ),
63
+ Entity.new(7, 8, :letter),
64
+ Entity.new(8, 9, :separ),
65
+ Entity.new(9, 11, :letter),
66
+ Entity.new(11, 12, :separ),
67
+ Entity.new(12, 14, :integer)]
68
68
  )
69
69
  end
70
70
 
71
71
  it 'can handle multi-line paragraphs' do
72
72
  Tokenizer.new("Brateeshka..!\n\nPrines!").tokens.must_equal(
73
- SortedSet.new([Entity.new(0, 10, :letter),
74
- Entity.new(10, 12, :punct),
75
- Entity.new(12, 13, :punct),
76
- Entity.new(13, 15, :break),
77
- Entity.new(15, 21, :letter),
78
- Entity.new(21, 22, :punct)])
73
+ [Entity.new(0, 10, :letter),
74
+ Entity.new(10, 12, :punct),
75
+ Entity.new(12, 13, :punct),
76
+ Entity.new(13, 15, :break),
77
+ Entity.new(15, 21, :letter),
78
+ Entity.new(21, 22, :punct)]
79
79
  )
80
80
  end
81
81
 
82
82
  it 'can handle separated integers' do
83
83
  Tokenizer.new('228/359').tokens.must_equal(
84
- SortedSet.new([Entity.new(0, 3, :integer),
85
- Entity.new(3, 4, :separ),
86
- Entity.new(4, 7, :integer)])
84
+ [Entity.new(0, 3, :integer),
85
+ Entity.new(3, 4, :separ),
86
+ Entity.new(4, 7, :integer)]
87
87
  )
88
88
  end
89
89
 
90
90
  it 'can deal with Russian language' do
91
91
  Tokenizer.new('Братишка, я тебе покушать принёс!').tokens.must_equal(
92
- SortedSet.new([Entity.new(0, 8, :letter),
93
- Entity.new(8, 9, :spunct),
94
- Entity.new(9, 10, :separ),
95
- Entity.new(10, 11, :letter),
96
- Entity.new(11, 12, :separ),
97
- Entity.new(12, 16, :letter),
98
- Entity.new(16, 17, :separ),
99
- Entity.new(17, 25, :letter),
100
- Entity.new(25, 26, :separ),
101
- Entity.new(26, 32, :letter),
102
- Entity.new(32, 33, :punct)])
103
- )
92
+ [Entity.new(0, 8, :letter),
93
+ Entity.new(8, 9, :spunct),
94
+ Entity.new(9, 10, :separ),
95
+ Entity.new(10, 11, :letter),
96
+ Entity.new(11, 12, :separ),
97
+ Entity.new(12, 16, :letter),
98
+ Entity.new(16, 17, :separ),
99
+ Entity.new(17, 25, :letter),
100
+ Entity.new(25, 26, :separ),
101
+ Entity.new(26, 32, :letter),
102
+ Entity.new(32, 33, :punct)]
103
+ )
104
104
  end
105
105
  end
106
106
  end
metadata CHANGED
@@ -1,81 +1,82 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0.rc4
4
+ version: 0.1.0.rc6
5
5
  prerelease: 6
6
6
  platform: ruby
7
7
  authors:
8
- - Dmitry A. Ustalov
8
+ - Dmitry Ustalov
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-20 00:00:00.000000000 Z
12
+ date: 2012-12-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: rake
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
21
19
  version: '0'
20
+ none: false
22
21
  type: :development
23
- prerelease: false
22
+ name: rake
24
23
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
24
  requirements:
27
25
  - - ! '>='
28
26
  - !ruby/object:Gem::Version
29
27
  version: '0'
28
+ none: false
29
+ prerelease: false
30
30
  - !ruby/object:Gem::Dependency
31
- name: minitest
32
31
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
32
  requirements:
35
33
  - - ! '>='
36
34
  - !ruby/object:Gem::Version
37
35
  version: '2.11'
36
+ none: false
38
37
  type: :development
39
- prerelease: false
38
+ name: minitest
40
39
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
40
  requirements:
43
41
  - - ! '>='
44
42
  - !ruby/object:Gem::Version
45
43
  version: '2.11'
44
+ none: false
45
+ prerelease: false
46
46
  - !ruby/object:Gem::Dependency
47
- name: simplecov
48
47
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
48
  requirements:
51
49
  - - ! '>='
52
50
  - !ruby/object:Gem::Version
53
51
  version: '0'
52
+ none: false
54
53
  type: :development
55
- prerelease: false
54
+ name: simplecov
56
55
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
56
  requirements:
59
57
  - - ! '>='
60
58
  - !ruby/object:Gem::Version
61
59
  version: '0'
60
+ none: false
61
+ prerelease: false
62
62
  - !ruby/object:Gem::Dependency
63
- name: yard
64
63
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
64
  requirements:
67
65
  - - ! '>='
68
66
  - !ruby/object:Gem::Version
69
67
  version: '0'
68
+ none: false
70
69
  type: :development
71
- prerelease: false
70
+ name: yard
72
71
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
72
  requirements:
75
73
  - - ! '>='
76
74
  - !ruby/object:Gem::Version
77
75
  version: '0'
78
- description: Greeb is a simple yet awesome regexp-based tokenizer, written in Ruby.
76
+ none: false
77
+ prerelease: false
78
+ description: Greeb is a simple yet awesome and Unicode-aware regexp-based tokenizer,
79
+ written in Ruby.
79
80
  email:
80
81
  - dmitry@eveel.ru
81
82
  executables: []
@@ -105,26 +106,26 @@ rdoc_options: []
105
106
  require_paths:
106
107
  - lib
107
108
  required_ruby_version: !ruby/object:Gem::Requirement
108
- none: false
109
109
  requirements:
110
110
  - - ! '>='
111
111
  - !ruby/object:Gem::Version
112
+ hash: 2757695902770698935
112
113
  version: '0'
113
114
  segments:
114
115
  - 0
115
- hash: 1130932854600612903
116
- required_rubygems_version: !ruby/object:Gem::Requirement
117
116
  none: false
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
118
  requirements:
119
119
  - - ! '>'
120
120
  - !ruby/object:Gem::Version
121
121
  version: 1.3.1
122
+ none: false
122
123
  requirements: []
123
124
  rubyforge_project: greeb
124
125
  rubygems_version: 1.8.24
125
126
  signing_key:
126
127
  specification_version: 3
127
- summary: Greeb is a simple regexp-based tokenizer.
128
+ summary: Greeb is a simple Unicode-aware regexp-based tokenizer.
128
129
  test_files:
129
130
  - spec/segmentator_spec.rb
130
131
  - spec/spec_helper.rb