greeb 0.0.2 → 0.1.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -27,6 +27,7 @@ nbproject
27
27
 
28
28
  ## BUNDLER
29
29
  .bundle
30
+ Gemfile.lock
30
31
 
31
32
  ## PROJECT::GENERAL
32
33
  coverage
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ branches:
2
+ only:
3
+ - develop
4
+ - master
5
+ rvm:
6
+ - 1.9.3
7
+ - rbx-19mode
data/.yardopts ADDED
@@ -0,0 +1,6 @@
1
+ --protected
2
+ --no-private
3
+ -m markdown
4
+ -
5
+ README.md
6
+ LICENSE
data/Gemfile CHANGED
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  source 'http://rubygems.org'
2
4
 
3
5
  gemspec
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010-2012 Dmitry A. Ustalov
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,140 @@
1
+ Greeb
2
+ =====
3
+
4
+ Greeb is a simple yet awesome text tokenizer that is based on regular
5
+ expressions.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'greeb'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install greeb
22
+
23
+ ## Usage
24
+
25
+ Greeb can help you to solve simple text processing problems:
26
+
27
+ ```ruby
28
+ pp Greeb::Tokenizer.new('Hello!').tokens
29
+ =begin
30
+ #<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
31
+ #<struct Greeb::Entity from=5, to=6, type=:punct>}>
32
+ =end
33
+ ```
34
+
35
+ It should be noted that it is possible to process much more complex texts:
36
+
37
+ ```ruby
38
+ text =<<-EOF
39
+ Hello! I am 18! My favourite number is 133.7...
40
+
41
+ What about you?
42
+ EOF
43
+
44
+ pp Greeb::Tokenizer.new(text).tokens
45
+ =begin
46
+ #<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
47
+ #<struct Greeb::Entity from=5, to=6, type=:punct>,
48
+ #<struct Greeb::Entity from=6, to=7, type=:separ>,
49
+ #<struct Greeb::Entity from=7, to=8, type=:letter>,
50
+ #<struct Greeb::Entity from=8, to=9, type=:separ>,
51
+ #<struct Greeb::Entity from=9, to=11, type=:letter>,
52
+ #<struct Greeb::Entity from=11, to=12, type=:separ>,
53
+ #<struct Greeb::Entity from=12, to=14, type=:integer>,
54
+ #<struct Greeb::Entity from=14, to=15, type=:punct>,
55
+ #<struct Greeb::Entity from=15, to=16, type=:separ>,
56
+ #<struct Greeb::Entity from=16, to=18, type=:letter>,
57
+ #<struct Greeb::Entity from=18, to=19, type=:separ>,
58
+ #<struct Greeb::Entity from=19, to=28, type=:letter>,
59
+ #<struct Greeb::Entity from=28, to=29, type=:separ>,
60
+ #<struct Greeb::Entity from=29, to=35, type=:letter>,
61
+ #<struct Greeb::Entity from=35, to=36, type=:separ>,
62
+ #<struct Greeb::Entity from=36, to=38, type=:letter>,
63
+ #<struct Greeb::Entity from=38, to=39, type=:separ>,
64
+ #<struct Greeb::Entity from=39, to=44, type=:float>,
65
+ #<struct Greeb::Entity from=44, to=47, type=:punct>,
66
+ #<struct Greeb::Entity from=47, to=49, type=:break>,
67
+ #<struct Greeb::Entity from=49, to=53, type=:letter>,
68
+ #<struct Greeb::Entity from=53, to=54, type=:separ>,
69
+ #<struct Greeb::Entity from=54, to=59, type=:letter>,
70
+ #<struct Greeb::Entity from=59, to=60, type=:separ>,
71
+ #<struct Greeb::Entity from=60, to=63, type=:letter>,
72
+ #<struct Greeb::Entity from=63, to=64, type=:punct>,
73
+ #<struct Greeb::Entity from=64, to=65, type=:break>}>
74
+ =end
75
+ ```
76
+
77
+ Also it can be used to solve the text segmentation problems
78
+ such as sentence detection tasks:
79
+
80
+ ```ruby
81
+ text = 'Hello! How are you?'
82
+ pp Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
83
+ =begin
84
+ #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
85
+ #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
86
+ =end
87
+ ```
88
+
89
+ It is possible to extract tokens that were processed by the text
90
+ segmentator:
91
+
92
+ ```ruby
93
+ text = 'Hello! How are you?'
94
+ segmentator = Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
95
+ sentences = segmentator.sentences
96
+ pp segmentator.extract(*sentences)
97
+ =begin
98
+ {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
99
+ [#<struct Greeb::Entity from=0, to=5, type=:letter>,
100
+ #<struct Greeb::Entity from=5, to=6, type=:punct>],
101
+ #<struct Greeb::Entity from=7, to=19, type=:sentence>=>
102
+ [#<struct Greeb::Entity from=7, to=10, type=:letter>,
103
+ #<struct Greeb::Entity from=10, to=11, type=:separ>,
104
+ #<struct Greeb::Entity from=11, to=14, type=:letter>,
105
+ #<struct Greeb::Entity from=14, to=15, type=:separ>,
106
+ #<struct Greeb::Entity from=15, to=18, type=:letter>,
107
+ #<struct Greeb::Entity from=18, to=19, type=:punct>]}
108
+ =end
109
+ ```
110
+
111
+ ## Tokens
112
+
113
+ Greeb operates with entities, tuples of `<from, to, type>`, where
114
+ `from` is a beginning of the entity, `to` is an ending of the entity,
115
+ and `type` is a type of the entity.
116
+
117
+ There are several entity types: `:letter`, `:float`, `:integer`,
118
+ `:separ`, `:punct` (for punctuation), `:spunct` (for in-sentence
119
+ punctuation), and `:break`.
120
+
121
+ ## Contributing
122
+
123
+ 1. Fork it;
124
+ 2. Create your feature branch (`git checkout -b my-new-feature`);
125
+ 3. Commit your changes (`git commit -am 'Added some feature'`);
126
+ 4. Push to the branch (`git push origin my-new-feature`);
127
+ 5. Create new Pull Request.
128
+
129
+ I highly recommend that you use git flow to make the development process more
130
+ systematic and awesome.
131
+
132
+ ## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
133
+
134
+ ## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
135
+
136
+ ## Copyright
137
+
138
+ Copyright (c) 2010-2012 [Dmitry A. Ustalov]. See LICENSE for details.
139
+
140
+ [Dmitry A. Ustalov]: http://eveel.ru
data/Rakefile CHANGED
@@ -1,12 +1,12 @@
1
+ #!/usr/bin/env rake
1
2
  # encoding: utf-8
2
3
 
3
- require 'bundler'
4
- Bundler::GemHelper.install_tasks
4
+ require 'bundler/gem_tasks'
5
5
 
6
- require 'rspec/core/rake_task'
7
- desc 'Run all examples'
8
- RSpec::Core::RakeTask.new(:spec) do |t|
9
- t.rspec_opts = %w[--color]
10
- end
6
+ task :default => :test
11
7
 
12
- task :default => :spec
8
+ require 'rake/testtask'
9
+ Rake::TestTask.new do |test|
10
+ test.pattern = 'spec/**/*_spec.rb'
11
+ test.verbose = true
12
+ end
data/greeb.gemspec CHANGED
@@ -1,25 +1,27 @@
1
1
  # encoding: utf-8
2
2
 
3
- $:.push File.expand_path('../lib', __FILE__)
4
- require 'greeb'
3
+ require File.expand_path('../lib/greeb/version', __FILE__)
5
4
 
6
5
  Gem::Specification.new do |s|
7
6
  s.name = 'greeb'
8
7
  s.version = Greeb::VERSION
9
8
  s.platform = Gem::Platform::RUBY
10
- s.authors = [ 'Dmitry A. Ustalov' ]
11
- s.email = [ 'dmitry@eveel.ru' ]
9
+ s.authors = ['Dmitry A. Ustalov']
10
+ s.email = ['dmitry@eveel.ru']
12
11
  s.homepage = 'https://github.com/eveel/greeb'
13
- s.summary = 'Greeb is a Graphematical Analyzer.'
14
- s.description = 'Greeb is awesome Graphematical Analyzer, ' \
12
+ s.summary = 'Greeb is a simple regexp-based tokenizer.'
13
+ s.description = 'Greeb is a simple yet awesome regexp-based tokenizer, ' \
15
14
  'written in Ruby.'
16
15
 
17
16
  s.rubyforge_project = 'greeb'
18
17
 
19
- s.add_dependency 'rspec', '~> 2.4.0'
18
+ s.add_development_dependency 'rake'
19
+ s.add_development_dependency 'minitest', '>= 2.11'
20
+ s.add_development_dependency 'simplecov'
21
+ s.add_development_dependency 'yard'
20
22
 
21
23
  s.files = `git ls-files`.split("\n")
22
24
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
23
25
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
24
- s.require_paths = [ 'lib' ]
26
+ s.require_paths = ['lib']
25
27
  end
data/lib/greeb.rb CHANGED
@@ -1,11 +1,25 @@
1
1
  # encoding: utf-8
2
2
 
3
- # Greeb is awesome Graphematical Analyzer.
4
- #
5
- module Greeb
6
- # Version of the Greeb.
7
- #
8
- VERSION = "0.0.2"
3
+ require 'greeb/version'
9
4
 
10
- require 'greeb/parser'
5
+ # Greeb operates with entities, tuples of `<from, to, kind>`, where
6
+ # `from` is a beginning of the entity, `to` is an ending of the entity,
7
+ # and `kind` is a type of the entity.
8
+ #
9
+ # There are several entity types: `:letter`, `:float`, `:integer`,
10
+ # `:separ` for separators, `:punct` for punctuation characters,
11
+ # `:spunct` for in-sentence punctuation characters, and
12
+ # `:break` for line endings.
13
+ #
14
+ class Greeb::Entity < Struct.new(:from, :to, :type)
15
+ def <=> other
16
+ if (comparison = self.from <=> other.from) == 0
17
+ self.to <=> other.to
18
+ else
19
+ comparison
20
+ end
21
+ end
11
22
  end
23
+
24
+ require 'greeb/tokenizer'
25
+ require 'greeb/segmentator'
@@ -0,0 +1,95 @@
1
+ # encoding: utf-8
2
+
3
+ # It is possible to perform simple sentence detection that is based
4
+ # on Greeb's tokenization.
5
+ #
6
+ class Greeb::Segmentator
7
+ # A sentence does not start with a separator character, line break
8
+ # character, and punctuation characters.
9
+ #
10
+ SENTENCE_DOESNT_START = [:separ, :break, :punct, :spunct]
11
+
12
+ attr_reader :tokens
13
+
14
+ # Create a new instance of {Greeb::Segmentator}.
15
+ #
16
+ # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
17
+ # Greeb::Tokenizer or set of its results.
18
+ #
19
+ def initialize tokenizer_or_tokens
20
+ @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
21
+ tokenizer_or_tokens.tokens
22
+ else
23
+ tokenizer_or_tokens
24
+ end
25
+ end
26
+
27
+ # Sentences memoization method.
28
+ #
29
+ # @return [Set<Greeb::Entity>] a set of sentences.
30
+ #
31
+ def sentences
32
+ detect_sentences! unless @sentences
33
+ @sentences
34
+ end
35
+
36
+ # Extract tokens from the set of sentences.
37
+ #
38
+ # @param sentences [Array<Greeb::Entity>] a list of sentences.
39
+ #
40
+ # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
41
+ # sentences as keys and tokens arrays as values.
42
+ #
43
+ def extract *sentences
44
+ Hash[
45
+ sentences.map do |s|
46
+ [s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
47
+ end
48
+ ]
49
+ end
50
+
51
+ protected
52
+ # Implementation of the sentence detection method. This method
53
+ # changes the `@sentences` ivar.
54
+ #
55
+ # @return [nil] nothing.
56
+ #
57
+ def detect_sentences!
58
+ @sentences = SortedSet.new
59
+
60
+ rest = tokens.inject(new_sentence) do |sentence, token|
61
+ if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
62
+ next sentence
63
+ end
64
+
65
+ sentence.from = token.from unless sentence.from
66
+
67
+ next sentence if sentence.to and sentence.to > token.to
68
+
69
+ if :punct == token.type
70
+ sentence.to = tokens.
71
+ select { |t| t.from >= token.from }.
72
+ inject(token) { |r, t| break r if t.type != token.type; t }.
73
+ to
74
+
75
+ @sentences << sentence
76
+ sentence = new_sentence
77
+ elsif :separ != token.type
78
+ sentence.to = token.to
79
+ end
80
+
81
+ sentence
82
+ end
83
+
84
+ nil.tap { @sentences << rest if rest.from and rest.to }
85
+ end
86
+
87
+ private
88
+ # Create a new instance of {Greeb::Entity} with `:sentence` type.
89
+ #
90
+ # @return [Greeb::Entity] a new entity instance.
91
+ #
92
+ def new_sentence
93
+ Greeb::Entity.new(nil, nil, :sentence)
94
+ end
95
+ end
@@ -0,0 +1,112 @@
1
+ # encoding: utf-8
2
+
3
+ require 'strscan'
4
+ require 'set'
5
+
6
+ # Greeb's tokenization facilities. Use 'em with love.
7
+ #
8
+ class Greeb::Tokenizer
9
+ # English and Russian letters.
10
+ #
11
+ LETTERS = /[A-Za-zА-Яа-яЁё]+/u
12
+
13
+ # Floating point values.
14
+ #
15
+ FLOATS = /(\d+)[.,](\d+)/u
16
+
17
+ # Integer values.
18
+ #
19
+ INTEGERS = /\d+/u
20
+
21
+ # In-subsentence separator (i.e.: "*" or "=").
22
+ #
23
+ SEPARATORS = /[*=_\/\\ ]+/u
24
+
25
+ # Punctuation character (i.e.: "." or "!").
26
+ #
27
+ PUNCTUATIONS = /(\.|\!|\?)+/u
28
+
29
+ # In-sentence punctuation character (i.e.: "," or "-").
30
+ #
31
+ SENTENCE_PUNCTUATIONS = /(\,|\[|\]|\(|\)|\-|:|;)+/u
32
+
33
+ # Line breaks.
34
+ #
35
+ BREAKS = /\n+/u
36
+
37
+ attr_reader :text, :scanner
38
+ protected :scanner
39
+
40
+ # Create a new instance of {Greeb::Tokenizer}.
41
+ #
42
+ # @param text [String] text to be tokenized.
43
+ #
44
+ def initialize(text)
45
+ @text = text
46
+ end
47
+
48
+ # Tokens memoization method.
49
+ #
50
+ # @return [Set<Greeb::Entity>] a set of tokens.
51
+ #
52
+ def tokens
53
+ tokenize! unless @tokens
54
+ @tokens
55
+ end
56
+
57
+ protected
58
+ # Perform the tokenization process. This method modifies
59
+ # `@scanner` and `@tokens` instance variables.
60
+ #
61
+ # @return [nil] nothing unless exception is raised.
62
+ #
63
+ def tokenize!
64
+ @scanner = StringScanner.new(text)
65
+ @tokens = SortedSet.new
66
+ while !scanner.eos?
67
+ parse! LETTERS, :letter or
68
+ parse! FLOATS, :float or
69
+ parse! INTEGERS, :integer or
70
+ split_parse! SENTENCE_PUNCTUATIONS, :spunct or
71
+ split_parse! PUNCTUATIONS, :punct or
72
+ split_parse! SEPARATORS, :separ or
73
+ split_parse! BREAKS, :break or
74
+ raise @tokens.inspect
75
+ end
76
+ ensure
77
+ scanner.terminate
78
+ end
79
+
80
+ # Try to parse one small piece of text that is covered by pattern
81
+ # of necessary type.
82
+ #
83
+ # @param pattern [Regexp] a regular expression to extract the token.
84
+ # @param type [Symbol] a symbol that represents the necessary token
85
+ # type.
86
+ #
87
+ # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
88
+ #
89
+ def parse! pattern, type
90
+ return false unless token = scanner.scan(pattern)
91
+ @tokens << Greeb::Entity.new(scanner.pos - token.length, scanner.pos, type)
92
+ end
93
+
94
+ # Try to parse one small piece of text that is covered by pattern
95
+ # of necessary type. This method performs grouping of the same
96
+ # characters.
97
+ #
98
+ # @param pattern [Regexp] a regular expression to extract the token.
99
+ # @param type [Symbol] a symbol that represents the necessary token
100
+ # type.
101
+ #
102
+ # @return [Set<Greeb::Entity>] the modified set of extracted tokens.
103
+ #
104
+ def split_parse! pattern, type
105
+ return false unless token = scanner.scan(pattern)
106
+ position = scanner.pos - token.length
107
+ token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
108
+ @tokens << Greeb::Entity.new(before, before + s.length, type)
109
+ before + s.length
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
+ # Greeb is a simple regexp-based tokenizer.
4
+ #
5
+ module Greeb
6
+ # Version of Greeb.
7
+ #
8
+ VERSION = '0.1.0.rc1'
9
+ end
@@ -0,0 +1,112 @@
1
+ # encoding: utf-8
2
+
3
+ require File.expand_path('../spec_helper', __FILE__)
4
+
5
+ module Greeb
6
+ describe Segmentator do
7
+ describe 'initialization' do
8
+ before { @tokenizer = Tokenizer.new('Vodka') }
9
+
10
+ subject { Segmentator.new(@tokenizer) }
11
+
12
+ it 'can be initialized either with Tokenizer' do
13
+ subject.tokens.must_be_kind_of SortedSet
14
+ end
15
+
16
+ it 'can be initialized either with a set of tokens' do
17
+ subject = Segmentator.new(@tokenizer.tokens)
18
+ subject.tokens.must_be_kind_of SortedSet
19
+ end
20
+
21
+ it 'should has @tokens ivar' do
22
+ subject.instance_variable_get(:@tokens).wont_be_nil
23
+ end
24
+ end
25
+
26
+ describe 'a simple sentence' do
27
+ before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
28
+
29
+ subject { Segmentator.new(@tokenizer).sentences }
30
+
31
+ it 'should be segmented' do
32
+ subject.must_equal(
33
+ SortedSet.new([Entity.new(0, 22, :sentence)])
34
+ )
35
+ end
36
+ end
37
+
38
+ describe 'a simple sentence without punctuation' do
39
+ before { @tokenizer = Tokenizer.new('Hello, I am JC Denton') }
40
+
41
+ subject { Segmentator.new(@tokenizer).sentences }
42
+
43
+ it 'should be segmented' do
44
+ subject.must_equal(
45
+ SortedSet.new([Entity.new(0, 21, :sentence)])
46
+ )
47
+ end
48
+ end
49
+
50
+ describe 'a simple sentence with trailing whitespaces' do
51
+ before { @tokenizer = Tokenizer.new(' Hello, I am JC Denton ') }
52
+
53
+ subject { Segmentator.new(@tokenizer).sentences }
54
+
55
+ it 'should be segmented' do
56
+ subject.must_equal(
57
+ SortedSet.new([Entity.new(6, 27, :sentence)])
58
+ )
59
+ end
60
+ end
61
+
62
+ describe 'two simple sentences' do
63
+ before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
64
+
65
+ subject { Segmentator.new(@tokenizer).sentences }
66
+
67
+ it 'should be segmented' do
68
+ subject.must_equal(
69
+ SortedSet.new([Entity.new(0, 6, :sentence),
70
+ Entity.new(7, 22, :sentence)])
71
+ )
72
+ end
73
+ end
74
+
75
+ describe 'one wrong character and one simple sentence' do
76
+ before { @tokenizer = Tokenizer.new('! I am JC Denton.') }
77
+
78
+ subject { Segmentator.new(@tokenizer).sentences }
79
+
80
+ it 'should be segmented' do
81
+ subject.must_equal(
82
+ SortedSet.new([Entity.new(2, 17, :sentence)])
83
+ )
84
+ end
85
+ end
86
+
87
+ describe 'token extractor' do
88
+ before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
89
+
90
+ subject { Segmentator.new(@tokenizer) }
91
+
92
+ it 'should be extracted' do
93
+ subject.extract(*subject.sentences).must_equal({
94
+ Entity.new(0, 6, :sentence) => [
95
+ Entity.new(0, 5, :letter),
96
+ Entity.new(5, 6, :punct)
97
+ ],
98
+ Entity.new(7, 22, :sentence) => [
99
+ Entity.new(7, 8, :letter),
100
+ Entity.new(8, 9, :separ),
101
+ Entity.new(9, 11, :letter),
102
+ Entity.new(11, 12, :separ),
103
+ Entity.new(12, 14, :letter),
104
+ Entity.new(14, 15, :separ),
105
+ Entity.new(15, 21, :letter),
106
+ Entity.new(21, 22, :punct)
107
+ ]
108
+ })
109
+ end
110
+ end
111
+ end
112
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,14 +1,20 @@
1
1
  # encoding: utf-8
2
2
 
3
- require File.expand_path('../../lib/greeb', __FILE__)
3
+ require 'rubygems'
4
4
 
5
- RSpec.configure do |c|
6
- c.mock_with :rspec
5
+ $:.unshift File.expand_path('../../lib', __FILE__)
6
+
7
+ if RUBY_VERSION == '1.8'
8
+ gem 'minitest'
7
9
  end
8
10
 
9
- RSpec::Matchers.define :be_parsed_as do |expected|
10
- match do |actual|
11
- tree = Greeb::Parser.new(actual).parse
12
- tree == expected
11
+ require 'minitest/autorun'
12
+
13
+ unless 'true' == ENV['TRAVIS']
14
+ require 'simplecov'
15
+ SimpleCov.start do
16
+ add_filter '/spec/'
13
17
  end
14
18
  end
19
+
20
+ require 'greeb'
@@ -0,0 +1,91 @@
1
+ # encoding: utf-8
2
+
3
+ require File.expand_path('../spec_helper', __FILE__)
4
+
5
+ module Greeb
6
+ describe Tokenizer do
7
+ describe 'initialization' do
8
+ subject { Tokenizer.new('vodka') }
9
+
10
+ it 'should be initialized with a text' do
11
+ subject.text.must_equal 'vodka'
12
+ end
13
+
14
+ it 'should has the @text ivar' do
15
+ subject.instance_variable_get(:@text).must_equal 'vodka'
16
+ end
17
+
18
+ it 'should not has @tokens ivar' do
19
+ subject.instance_variable_get(:@tokens).must_be_nil
20
+ end
21
+ end
22
+
23
+ describe 'after tokenization' do
24
+ subject { Tokenizer.new('vodka').tap(&:tokens) }
25
+
26
+ it 'should has the @tokens ivar' do
27
+ subject.instance_variable_get(:@tokens).wont_be_nil
28
+ end
29
+
30
+ it 'should has the @scanner ivar' do
31
+ subject.instance_variable_get(:@scanner).wont_be_nil
32
+ end
33
+
34
+ it 'should has the tokens set' do
35
+ subject.tokens.must_be_kind_of SortedSet
36
+ end
37
+ end
38
+
39
+ describe 'tokenization facilities' do
40
+ it 'can handle words' do
41
+ Tokenizer.new('hello').tokens.must_equal(
42
+ SortedSet.new([Entity.new(0, 5, :letter)])
43
+ )
44
+ end
45
+
46
+ it 'can handle floats' do
47
+ Tokenizer.new('14.88').tokens.must_equal(
48
+ SortedSet.new([Entity.new(0, 5, :float)])
49
+ )
50
+ end
51
+
52
+ it 'can handle integers' do
53
+ Tokenizer.new('1337').tokens.must_equal(
54
+ SortedSet.new([Entity.new(0, 4, :integer)])
55
+ )
56
+ end
57
+
58
+ it 'can handle words and integers' do
59
+ Tokenizer.new('Hello, I am 18').tokens.must_equal(
60
+ SortedSet.new([Entity.new(0, 5, :letter),
61
+ Entity.new(5, 6, :spunct),
62
+ Entity.new(6, 7, :separ),
63
+ Entity.new(7, 8, :letter),
64
+ Entity.new(8, 9, :separ),
65
+ Entity.new(9, 11, :letter),
66
+ Entity.new(11, 12, :separ),
67
+ Entity.new(12, 14, :integer)])
68
+ )
69
+ end
70
+
71
+ it 'can handle multi-line paragraphs' do
72
+ Tokenizer.new("Brateeshka..!\n\nPrines!").tokens.must_equal(
73
+ SortedSet.new([Entity.new(0, 10, :letter),
74
+ Entity.new(10, 12, :punct),
75
+ Entity.new(12, 13, :punct),
76
+ Entity.new(13, 15, :break),
77
+ Entity.new(15, 21, :letter),
78
+ Entity.new(21, 22, :punct)])
79
+ )
80
+ end
81
+
82
+ it 'can handle separated integers' do
83
+ Tokenizer.new('228/359').tokens.must_equal(
84
+ SortedSet.new([Entity.new(0, 3, :integer),
85
+ Entity.new(3, 4, :separ),
86
+ Entity.new(4, 7, :integer)])
87
+ )
88
+ end
89
+ end
90
+ end
91
+ end
metadata CHANGED
@@ -1,29 +1,81 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
5
- prerelease:
4
+ version: 0.1.0.rc1
5
+ prerelease: 6
6
6
  platform: ruby
7
7
  authors:
8
8
  - Dmitry A. Ustalov
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-02-20 00:00:00.000000000 +05:00
13
- default_executable:
12
+ date: 2012-07-08 00:00:00.000000000 Z
14
13
  dependencies:
15
14
  - !ruby/object:Gem::Dependency
16
- name: rspec
17
- requirement: &81165430 !ruby/object:Gem::Requirement
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
18
17
  none: false
19
18
  requirements:
20
- - - ~>
19
+ - - ! '>='
21
20
  - !ruby/object:Gem::Version
22
- version: 2.4.0
23
- type: :runtime
21
+ version: '0'
22
+ type: :development
24
23
  prerelease: false
25
- version_requirements: *81165430
26
- description: Greeb is awesome Graphematical Analyzer, written in Ruby.
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: minitest
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '2.11'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '2.11'
46
+ - !ruby/object:Gem::Dependency
47
+ name: simplecov
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Greeb is a simple yet awesome regexp-based tokenizer, written in Ruby.
27
79
  email:
28
80
  - dmitry@eveel.ru
29
81
  executables: []
@@ -31,19 +83,20 @@ extensions: []
31
83
  extra_rdoc_files: []
32
84
  files:
33
85
  - .gitignore
86
+ - .travis.yml
87
+ - .yardopts
34
88
  - Gemfile
35
- - Gemfile.lock
36
- - README
89
+ - LICENSE
90
+ - README.md
37
91
  - Rakefile
38
- - greeb-test.rb
39
92
  - greeb.gemspec
40
- - lib/enumerable.rb
41
93
  - lib/greeb.rb
42
- - lib/greeb/parser.rb
43
- - lib/meta_array.rb
44
- - spec/parser_spec.rb
94
+ - lib/greeb/segmentator.rb
95
+ - lib/greeb/tokenizer.rb
96
+ - lib/greeb/version.rb
97
+ - spec/segmentator_spec.rb
45
98
  - spec/spec_helper.rb
46
- has_rdoc: true
99
+ - spec/tokenizer_spec.rb
47
100
  homepage: https://github.com/eveel/greeb
48
101
  licenses: []
49
102
  post_install_message:
@@ -56,18 +109,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
56
109
  - - ! '>='
57
110
  - !ruby/object:Gem::Version
58
111
  version: '0'
112
+ segments:
113
+ - 0
114
+ hash: -4603914053803130942
59
115
  required_rubygems_version: !ruby/object:Gem::Requirement
60
116
  none: false
61
117
  requirements:
62
- - - ! '>='
118
+ - - ! '>'
63
119
  - !ruby/object:Gem::Version
64
- version: '0'
120
+ version: 1.3.1
65
121
  requirements: []
66
122
  rubyforge_project: greeb
67
- rubygems_version: 1.5.2
123
+ rubygems_version: 1.8.24
68
124
  signing_key:
69
125
  specification_version: 3
70
- summary: Greeb is a Graphematical Analyzer.
126
+ summary: Greeb is a simple regexp-based tokenizer.
71
127
  test_files:
72
- - spec/parser_spec.rb
128
+ - spec/segmentator_spec.rb
73
129
  - spec/spec_helper.rb
130
+ - spec/tokenizer_spec.rb
131
+ has_rdoc:
data/Gemfile.lock DELETED
@@ -1,24 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- greeb (0.0.2)
5
- rspec (~> 2.4.0)
6
-
7
- GEM
8
- remote: http://rubygems.org/
9
- specs:
10
- diff-lcs (1.1.2)
11
- rspec (2.4.0)
12
- rspec-core (~> 2.4.0)
13
- rspec-expectations (~> 2.4.0)
14
- rspec-mocks (~> 2.4.0)
15
- rspec-core (2.4.0)
16
- rspec-expectations (2.4.0)
17
- diff-lcs (~> 1.1.2)
18
- rspec-mocks (2.4.0)
19
-
20
- PLATFORMS
21
- ruby
22
-
23
- DEPENDENCIES
24
- greeb!
data/README DELETED
File without changes
data/greeb-test.rb DELETED
@@ -1,141 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # encoding: utf-8
3
-
4
- require 'rubygems'
5
- require 'graphviz'
6
-
7
- $:.unshift('./lib')
8
- require 'greeb'
9
-
10
- origin = <<-END
11
- - Сынок, чего это от тебя зигами пахнет,
12
- опять на Манежную площадь ходил?
13
-
14
- - Нет мама, я в метро ехал, там назиговано было!!
15
-
16
-
17
-
18
- Четырнадцать, восемьдесять восемь: 14/88.
19
- Вот так блять
20
- END
21
- origin.chomp!
22
-
23
- def identify(token)
24
- case token
25
- when Greeb::RU_LEX then 'RU_LEX'
26
- when Greeb::EN_LEX then 'EN_LEX'
27
- when Greeb::EOL then 'EOL'
28
- when Greeb::SEP then 'SEP'
29
- when Greeb::PUN then 'PUN'
30
- when Greeb::SPUN then 'SPUN'
31
- when Greeb::DIG then 'DIG'
32
- when Greeb::DIL then 'DIL'
33
- else
34
- '?!'
35
- end
36
- end
37
-
38
- greeb = Greeb::Parser.new(origin)
39
- text = greeb.tree
40
-
41
- g = GraphViz.new('graphematics', 'type' => 'graph')
42
-
43
- g.node[:color] = '#ddaa66'
44
- g.node[:style] = 'filled'
45
- g.node[:shape] = 'box'
46
- g.node[:penwidth] = '1'
47
- g.node[:fontname] = 'PT Sans'
48
- g.node[:fontsize] = '8'
49
- g.node[:fillcolor]= '#ffeecc'
50
- g.node[:fontcolor]= '#775500'
51
- g.node[:margin] = '0.0'
52
-
53
- g.edge[:color] = '#999999'
54
- g.edge[:weight] = '1'
55
- g.edge[:fontname] = 'PT Sans'
56
- g.edge[:fontcolor]= '#444444'
57
- g.edge[:fontsize] = '6'
58
- g.edge[:dir] = 'forward'
59
- g.edge[:arrowsize]= '0.5'
60
-
61
- bid = 'begin'
62
- g.add_node(bid).tap do |node|
63
- node.label = "Начало\nтекста"
64
- node.shape = 'ellipse'
65
- node.style = ''
66
- end
67
-
68
- eid = 'end'
69
- g.add_node(eid).tap do |node|
70
- node.label = "Конец\nтекста"
71
- node.shape = 'ellipse'
72
- node.style = ''
73
- end
74
-
75
- tree = text.map_with_index do |paragraph, i|
76
- pid = "p#{i}"
77
- sentences = paragraph.map_with_index do |sentence, j|
78
- sid = "#{pid}s#{j}"
79
- subsentences = sentence.map_with_index do |subsentence, k|
80
- ssid = "#{sid}ss#{k}"
81
- tokens = subsentence.map_with_index do |token, l|
82
- next if ' ' == token
83
- [ "#{ssid}t#{l}", token, l ]
84
- end
85
- tokens.delete(nil)
86
- [ ssid, tokens, k ]
87
- end
88
- [ sid, subsentences, j ]
89
- end
90
- [ pid, sentences, i ]
91
- end
92
-
93
- tree.each do |pid, paragraph, i|
94
- g.add_node(pid).tap do |node|
95
- node.label = "Абзац\n№#{i + 1}"
96
- node.shape = 'ellipse'
97
- end
98
- g.add_edge(bid, pid)
99
-
100
- paragraph.each do |sid, sentence, j|
101
- g.add_node(sid).tap do |node|
102
- node.label = "Предложение\n№#{j + 1}"
103
- node.shape = 'ellipse'
104
- end
105
- g.add_edge(pid, sid)
106
-
107
- sentence.each do |ssid, subsentence, k|
108
- g.add_node(ssid).tap do |node|
109
- node.label = "Подпредложение\n№#{k + 1}"
110
- node.shape = 'ellipse'
111
- end
112
- g.add_edge(sid, ssid)
113
-
114
- subsentence.each do |tid, token, l|
115
- g.add_node(tid).label = token
116
- g.add_edge(ssid, tid).label = identify(token)
117
- g.add_edge(tid, eid)
118
- end
119
-
120
- subsentence.each_cons(2) do |(tid1, token1, l1),
121
- (tid2, token2, l2)|
122
- g.add_edge(tid1, tid2).tap do |edge|
123
- edge.weight = 0.25
124
- edge.style = 'dashed'
125
- end
126
- end
127
- end
128
-
129
- sentence.each_cons(2) do |(ssid1, subsentence1, k1),
130
- (ssid2, subsentence2, k2)|
131
- tid1, token1, l1 = subsentence1.last
132
- tid2, token2, l2 = subsentence2.first
133
- g.add_edge(tid1, tid2).tap do |edge|
134
- edge.weight = 0.5
135
- edge.style = 'dashed'
136
- end
137
- end
138
- end
139
- end
140
-
141
- g.output(:output => 'png', :file => 'graph.png')
data/lib/enumerable.rb DELETED
@@ -1,10 +0,0 @@
1
- # encoding: utf-8
2
-
3
- # Enumerable module additions.
4
- #
5
- module Enumerable
6
- def collect_with_index(i = -1) # :nodoc:
7
- collect { |e| yield(e, i += 1) }
8
- end
9
- alias map_with_index collect_with_index
10
- end
data/lib/greeb/parser.rb DELETED
@@ -1,176 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'meta_array'
4
- require 'enumerable'
5
-
6
- # Graphematical Parser of the Greeb.
7
- # Use it with love.
8
- #
9
- class Greeb::Parser
10
- # Russian lexeme (i.e.: "хуй").
11
- #
12
- RUSSIAN_LEXEME = /^[А-Яа-яЁё]+$/u
13
-
14
- # English lexeme (i.e.: "foo").
15
- #
16
- ENGLISH_LEXEME = /^[A-Za-z]+$/u
17
-
18
- # End of Line sequence (i.e.: "\n").
19
- #
20
- END_OF_LINE = /^\n+$/u
21
-
22
- # In-subsentence seprator (i.e.: "*" or "\").
23
- #
24
- SEPARATOR = /^[*=_\/\\ ]$/u
25
-
26
- # Punctuation character (i.e.: "." or "!").
27
- #
28
- PUNCTUATION = /^(\.|\!|\?)$/u
29
-
30
- # In-sentence punctuation character (i.e.: "," or "-").
31
- #
32
- SENTENCE_PUNCTUATION = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
33
-
34
- # Digit (i.e.: "1337").
35
- #
36
- DIGIT = /^[0-9]+$/u
37
-
38
- # Digit-Letter complex (i.e.: "0xDEADBEEF").
39
- #
40
- DIGIT_LETTER = /^[А-Яа-яA-Za-z0-9Ёё]+$/u
41
-
42
- # Empty string (i.e.: "").
43
- #
44
- EMPTY = ''
45
-
46
- attr_accessor :text
47
- private :text=
48
-
49
- # Create a new instance of Greeb::Parser.
50
- #
51
- # ==== Parameters
52
- # text<String>:: Source text.
53
- #
54
- def initialize(text)
55
- self.text = text
56
- end
57
-
58
- # Perform the text parsing.
59
- #
60
- # ==== Returns
61
- # Array:: Tree of Graphematical Analysis of text.
62
- #
63
- def parse
64
- return @tree if @tree
65
-
66
- # parse tree
67
- tree = MetaArray.new
68
-
69
- # paragraph, sentence, subsentence
70
- p_id, s_id, ss_id = 0, 0, 0
71
-
72
- # current token
73
- token = ''
74
-
75
- # run FSM
76
- text.each_char do |c|
77
- case c
78
- when END_OF_LINE then begin
79
- case token
80
- when EMPTY then token << c
81
- when END_OF_LINE then begin
82
- token = ''
83
- p_id += 1
84
- s_id = 0
85
- ss_id = 0
86
- end
87
- else
88
- tree[p_id][s_id][ss_id] << token
89
- token = c
90
- end
91
- end
92
- when SEPARATOR then begin
93
- case token
94
- when EMPTY
95
- else
96
- tree[p_id][s_id][ss_id] << token
97
- while tree[p_id][s_id][ss_id].last == c
98
- tree[p_id][s_id][ss_id].pop
99
- end
100
- tree[p_id][s_id][ss_id] << c
101
- token = ''
102
- end
103
- end
104
- when PUNCTUATION then begin
105
- case token
106
- when EMPTY
107
- else
108
- tree[p_id][s_id][ss_id] << token
109
- tree[p_id][s_id][ss_id] << c
110
- token = ''
111
- s_id += 1
112
- ss_id = 0
113
- end
114
- end
115
- when SENTENCE_PUNCTUATION then begin
116
- case token
117
- when EMPTY
118
- else
119
- tree[p_id][s_id][ss_id] << token
120
- tree[p_id][s_id][ss_id] << c
121
- token = ''
122
- ss_id += 1
123
- end
124
- end
125
- when RUSSIAN_LEXEME then begin
126
- case token
127
- when END_OF_LINE then begin
128
- tree[p_id][s_id][ss_id] << ' '
129
- token = c
130
- end
131
- else
132
- token << c
133
- end
134
- end
135
- when ENGLISH_LEXEME then begin
136
- case token
137
- when END_OF_LINE then begin
138
- tree[p_id][s_id][ss_id] << ' '
139
- token = c
140
- end
141
- else
142
- token << c
143
- end
144
- end
145
- when DIGIT then begin
146
- case token
147
- when END_OF_LINE then begin
148
- tree[p_id][s_id][ss_id] << ' '
149
- token = c
150
- end
151
- else
152
- token << c
153
- end
154
- end
155
- when DIGIT_LETTER then begin
156
- case token
157
- when END_OF_LINE then begin
158
- tree[p_id][s_id][ss_id] << token
159
- token = c
160
- end
161
- else
162
- token << c
163
- end
164
- end
165
- end
166
- end
167
-
168
- unless token.empty?
169
- tree[p_id][s_id][ss_id] << token
170
- end
171
-
172
- tree.delete(nil)
173
-
174
- @tree = tree.to_a
175
- end
176
- end
data/lib/meta_array.rb DELETED
@@ -1,14 +0,0 @@
1
- # encoding: utf-8
2
-
3
- # MetaArray is an Array, which creates subarrays
4
- # on non-existent elements.
5
- #
6
- class MetaArray < Array
7
- def [] id
8
- super(id) or begin
9
- self.class.new.tap do |element|
10
- self[id] = element
11
- end
12
- end
13
- end
14
- end
data/spec/parser_spec.rb DELETED
@@ -1,63 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require File.expand_path('../spec_helper.rb', __FILE__)
4
-
5
- describe Greeb::Parser do
6
- it 'should parse very simple strings' do
7
- 'буба сука дебил'.should be_parsed_as([
8
- [
9
- [ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
10
- ]
11
- ])
12
- end
13
-
14
- it 'should parse one sentence with subsentences' do
15
- 'буба, сука, дебил'.should be_parsed_as([
16
- [
17
- [
18
- [ 'буба', ',' ],
19
- [ 'сука', ',' ],
20
- [ 'дебил' ]
21
- ]
22
- ]
23
- ])
24
- end
25
-
26
- it 'should parse two simple paragraphs' do
27
- "буба сука дебил\n\nточно!".should be_parsed_as([
28
- [
29
- [ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
30
- ],
31
- [
32
- [ [ 'точно', '!' ] ]
33
- ]
34
- ])
35
- end
36
-
37
- it 'should parse two sentences in paragraph' do
38
- "буба молодец? буба умница.".should be_parsed_as([
39
- [
40
- [ [ 'буба', ' ', 'молодец', '?' ] ],
41
- [ [ 'буба', ' ', 'умница', '.' ] ]
42
- ]
43
- ])
44
- end
45
-
46
- it 'should parse sentences with floating point values' do
47
- 'буба не считает Пи равной 3.14'.should be_parsed_as([
48
- [
49
- [ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
50
- 'Пи', ' ', 'равной', ' ', '3.14' ] ]
51
- ]
52
- ])
53
- end
54
-
55
- it 'should parse sentences with floating "dot" values' do
56
- 'буба не считает Пи равной 3,14'.should be_parsed_as([
57
- [
58
- [ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
59
- 'Пи', ' ', 'равной', ' ', '3,14' ] ]
60
- ]
61
- ])
62
- end
63
- end