greeb 0.0.2 → 0.1.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -27,6 +27,7 @@ nbproject
27
27
 
28
28
  ## BUNDLER
29
29
  .bundle
30
+ Gemfile.lock
30
31
 
31
32
  ## PROJECT::GENERAL
32
33
  coverage
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ branches:
2
+ only:
3
+ - develop
4
+ - master
5
+ rvm:
6
+ - 1.9.3
7
+ - rbx-19mode
data/.yardopts ADDED
@@ -0,0 +1,6 @@
1
+ --protected
2
+ --no-private
3
+ -m markdown
4
+ -
5
+ README.md
6
+ LICENSE
data/Gemfile CHANGED
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  source 'http://rubygems.org'
2
4
 
3
5
  gemspec
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010-2012 Dmitry A. Ustalov
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,140 @@
1
+ Greeb
2
+ =====
3
+
4
+ Greeb is a simple yet awesome text tokenizer that is based on regular
5
+ expressions.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'greeb'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install greeb
22
+
23
+ ## Usage
24
+
25
+ Greeb can help you to solve simple text processing problems:
26
+
27
+ ```ruby
28
+ pp Greeb::Tokenizer.new('Hello!').tokens
29
+ =begin
30
+ #<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
31
+ #<struct Greeb::Entity from=5, to=6, type=:punct>}>
32
+ =end
33
+ ```
34
+
35
+ It should be noted that it is possible to process much more complex texts:
36
+
37
+ ```ruby
38
+ text =<<-EOF
39
+ Hello! I am 18! My favourite number is 133.7...
40
+
41
+ What about you?
42
+ EOF
43
+
44
+ pp Greeb::Tokenizer.new(text).tokens
45
+ =begin
46
+ #<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
47
+ #<struct Greeb::Entity from=5, to=6, type=:punct>,
48
+ #<struct Greeb::Entity from=6, to=7, type=:separ>,
49
+ #<struct Greeb::Entity from=7, to=8, type=:letter>,
50
+ #<struct Greeb::Entity from=8, to=9, type=:separ>,
51
+ #<struct Greeb::Entity from=9, to=11, type=:letter>,
52
+ #<struct Greeb::Entity from=11, to=12, type=:separ>,
53
+ #<struct Greeb::Entity from=12, to=14, type=:integer>,
54
+ #<struct Greeb::Entity from=14, to=15, type=:punct>,
55
+ #<struct Greeb::Entity from=15, to=16, type=:separ>,
56
+ #<struct Greeb::Entity from=16, to=18, type=:letter>,
57
+ #<struct Greeb::Entity from=18, to=19, type=:separ>,
58
+ #<struct Greeb::Entity from=19, to=28, type=:letter>,
59
+ #<struct Greeb::Entity from=28, to=29, type=:separ>,
60
+ #<struct Greeb::Entity from=29, to=35, type=:letter>,
61
+ #<struct Greeb::Entity from=35, to=36, type=:separ>,
62
+ #<struct Greeb::Entity from=36, to=38, type=:letter>,
63
+ #<struct Greeb::Entity from=38, to=39, type=:separ>,
64
+ #<struct Greeb::Entity from=39, to=44, type=:float>,
65
+ #<struct Greeb::Entity from=44, to=47, type=:punct>,
66
+ #<struct Greeb::Entity from=47, to=49, type=:break>,
67
+ #<struct Greeb::Entity from=49, to=53, type=:letter>,
68
+ #<struct Greeb::Entity from=53, to=54, type=:separ>,
69
+ #<struct Greeb::Entity from=54, to=59, type=:letter>,
70
+ #<struct Greeb::Entity from=59, to=60, type=:separ>,
71
+ #<struct Greeb::Entity from=60, to=63, type=:letter>,
72
+ #<struct Greeb::Entity from=63, to=64, type=:punct>,
73
+ #<struct Greeb::Entity from=64, to=65, type=:break>}>
74
+ =end
75
+ ```
76
+
77
+ It can also be used to solve text segmentation problems
78
+ such as sentence detection tasks:
79
+
80
+ ```ruby
81
+ text = 'Hello! How are you?'
82
+ pp Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
83
+ =begin
84
+ #<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
85
+ #<struct Greeb::Entity from=7, to=19, type=:sentence>}>
86
+ =end
87
+ ```
88
+
89
+ It is possible to extract tokens that were processed by the text
90
+ segmentator:
91
+
92
+ ```ruby
93
+ text = 'Hello! How are you?'
94
+ segmentator = Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
95
+ sentences = segmentator.sentences
96
+ pp segmentator.extract(*sentences)
97
+ =begin
98
+ {#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
99
+ [#<struct Greeb::Entity from=0, to=5, type=:letter>,
100
+ #<struct Greeb::Entity from=5, to=6, type=:punct>],
101
+ #<struct Greeb::Entity from=7, to=19, type=:sentence>=>
102
+ [#<struct Greeb::Entity from=7, to=10, type=:letter>,
103
+ #<struct Greeb::Entity from=10, to=11, type=:separ>,
104
+ #<struct Greeb::Entity from=11, to=14, type=:letter>,
105
+ #<struct Greeb::Entity from=14, to=15, type=:separ>,
106
+ #<struct Greeb::Entity from=15, to=18, type=:letter>,
107
+ #<struct Greeb::Entity from=18, to=19, type=:punct>]}
108
+ =end
109
+ ```
110
+
111
+ ## Tokens
112
+
113
+ Greeb operates with entities, tuples of `<from, to, type>`, where
114
+ `from` is the beginning of the entity, `to` is the end of the entity,
115
+ and `type` is the type of the entity.
116
+
117
+ There are several entity types: `:letter`, `:float`, `:integer`,
118
+ `:separ`, `:punct` (for punctuation), `:spunct` (for in-sentence
119
+ punctuation), and `:break`.
120
+
121
+ ## Contributing
122
+
123
+ 1. Fork it;
124
+ 2. Create your feature branch (`git checkout -b my-new-feature`);
125
+ 3. Commit your changes (`git commit -am 'Added some feature'`);
126
+ 4. Push to the branch (`git push origin my-new-feature`);
127
+ 5. Create new Pull Request.
128
+
129
+ I highly recommend using git flow to make the development process more
130
+ systematic and awesome.
131
+
132
+ ## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
133
+
134
+ ## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
135
+
136
+ ## Copyright
137
+
138
+ Copyright (c) 2010-2012 [Dmitry A. Ustalov]. See LICENSE for details.
139
+
140
+ [Dmitry A. Ustalov]: http://eveel.ru
data/Rakefile CHANGED
@@ -1,12 +1,12 @@
1
+ #!/usr/bin/env rake
1
2
  # encoding: utf-8
2
3
 
3
- require 'bundler'
4
- Bundler::GemHelper.install_tasks
4
+ require 'bundler/gem_tasks'
5
5
 
6
- require 'rspec/core/rake_task'
7
- desc 'Run all examples'
8
- RSpec::Core::RakeTask.new(:spec) do |t|
9
- t.rspec_opts = %w[--color]
10
- end
6
+ task :default => :test
11
7
 
12
- task :default => :spec
8
+ require 'rake/testtask'
9
+ Rake::TestTask.new do |test|
10
+ test.pattern = 'spec/**/*_spec.rb'
11
+ test.verbose = true
12
+ end
data/greeb.gemspec CHANGED
@@ -1,25 +1,27 @@
1
1
  # encoding: utf-8
2
2
 
3
- $:.push File.expand_path('../lib', __FILE__)
4
- require 'greeb'
3
+ require File.expand_path('../lib/greeb/version', __FILE__)
5
4
 
6
5
  Gem::Specification.new do |s|
7
6
  s.name = 'greeb'
8
7
  s.version = Greeb::VERSION
9
8
  s.platform = Gem::Platform::RUBY
10
- s.authors = [ 'Dmitry A. Ustalov' ]
11
- s.email = [ 'dmitry@eveel.ru' ]
9
+ s.authors = ['Dmitry A. Ustalov']
10
+ s.email = ['dmitry@eveel.ru']
12
11
  s.homepage = 'https://github.com/eveel/greeb'
13
- s.summary = 'Greeb is a Graphematical Analyzer.'
14
- s.description = 'Greeb is awesome Graphematical Analyzer, ' \
12
+ s.summary = 'Greeb is a simple regexp-based tokenizer.'
13
+ s.description = 'Greeb is a simple yet awesome regexp-based tokenizer, ' \
15
14
  'written in Ruby.'
16
15
 
17
16
  s.rubyforge_project = 'greeb'
18
17
 
19
- s.add_dependency 'rspec', '~> 2.4.0'
18
+ s.add_development_dependency 'rake'
19
+ s.add_development_dependency 'minitest', '>= 2.11'
20
+ s.add_development_dependency 'simplecov'
21
+ s.add_development_dependency 'yard'
20
22
 
21
23
  s.files = `git ls-files`.split("\n")
22
24
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
23
25
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
24
- s.require_paths = [ 'lib' ]
26
+ s.require_paths = ['lib']
25
27
  end
data/lib/greeb.rb CHANGED
@@ -1,11 +1,25 @@
1
1
  # encoding: utf-8
2
2
 
3
- # Greeb is awesome Graphematical Analyzer.
4
- #
5
- module Greeb
6
- # Version of the Greeb.
7
- #
8
- VERSION = "0.0.2"
3
+ require 'greeb/version'
9
4
 
10
- require 'greeb/parser'
5
+ # Greeb operates with entities, tuples of `<from, to, type>`, where
6
+ # `from` is the beginning of the entity, `to` is the end of the entity,
7
+ # and `type` is the type of the entity.
8
+ #
9
+ # There are several entity types: `:letter`, `:float`, `:integer`,
10
+ # `:separ` for separators, `:punct` for punctuation characters,
11
+ # `:spunct` for in-sentence punctuation characters, and
12
+ # `:break` for line endings.
13
+ #
14
+ class Greeb::Entity < Struct.new(:from, :to, :type)
15
+ def <=> other
16
+ if (comparison = self.from <=> other.from) == 0
17
+ self.to <=> other.to
18
+ else
19
+ comparison
20
+ end
21
+ end
11
22
  end
23
+
24
+ require 'greeb/tokenizer'
25
+ require 'greeb/segmentator'
@@ -0,0 +1,95 @@
1
+ # encoding: utf-8
2
+
3
+ # It is possible to perform simple sentence detection that is based
4
+ # on Greeb's tokenization.
5
+ #
6
+ class Greeb::Segmentator
7
+ # A sentence does not start with a separator character, a line break
8
+ # character, or a punctuation character.
9
+ #
10
+ SENTENCE_DOESNT_START = [:separ, :break, :punct, :spunct]
11
+
12
+ attr_reader :tokens
13
+
14
+ # Create a new instance of {Greeb::Segmentator}.
15
+ #
16
+ # @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
17
+ # Greeb::Tokenizer or set of its results.
18
+ #
19
+ def initialize tokenizer_or_tokens
20
+ @tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
21
+ tokenizer_or_tokens.tokens
22
+ else
23
+ tokenizer_or_tokens
24
+ end
25
+ end
26
+
27
+ # Sentences memoization method.
28
+ #
29
+ # @return [Set<Greeb::Entity>] a set of sentences.
30
+ #
31
+ def sentences
32
+ detect_sentences! unless @sentences
33
+ @sentences
34
+ end
35
+
36
+ # Extract tokens from the set of sentences.
37
+ #
38
+ # @param sentences [Array<Greeb::Entity>] a list of sentences.
39
+ #
40
+ # @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
41
+ # sentences as keys and tokens arrays as values.
42
+ #
43
+ def extract *sentences
44
+ Hash[
45
+ sentences.map do |s|
46
+ [s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
47
+ end
48
+ ]
49
+ end
50
+
51
+ protected
52
+ # Implementation of the sentence detection method. This method
53
+ # changes the `@sentences` ivar.
54
+ #
55
+ # @return [nil] nothing.
56
+ #
57
+ def detect_sentences!
58
+ @sentences = SortedSet.new
59
+
60
+ rest = tokens.inject(new_sentence) do |sentence, token|
61
+ if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
62
+ next sentence
63
+ end
64
+
65
+ sentence.from = token.from unless sentence.from
66
+
67
+ next sentence if sentence.to and sentence.to > token.to
68
+
69
+ if :punct == token.type
70
+ sentence.to = tokens.
71
+ select { |t| t.from >= token.from }.
72
+ inject(token) { |r, t| break r if t.type != token.type; t }.
73
+ to
74
+
75
+ @sentences << sentence
76
+ sentence = new_sentence
77
+ elsif :separ != token.type
78
+ sentence.to = token.to
79
+ end
80
+
81
+ sentence
82
+ end
83
+
84
+ nil.tap { @sentences << rest if rest.from and rest.to }
85
+ end
86
+
87
+ private
88
+ # Create a new instance of {Greeb::Entity} with `:sentence` type.
89
+ #
90
+ # @return [Greeb::Entity] a new entity instance.
91
+ #
92
+ def new_sentence
93
+ Greeb::Entity.new(nil, nil, :sentence)
94
+ end
95
+ end
@@ -0,0 +1,112 @@
1
+ # encoding: utf-8
2
+
3
+ require 'strscan'
4
+ require 'set'
5
+
6
+ # Greeb's tokenization facilities. Use 'em with love.
7
+ #
8
+ class Greeb::Tokenizer
9
+ # English and Russian letters.
10
+ #
11
+ LETTERS = /[A-Za-zА-Яа-яЁё]+/u
12
+
13
+ # Floating point values.
14
+ #
15
+ FLOATS = /(\d+)[.,](\d+)/u
16
+
17
+ # Integer values.
18
+ #
19
+ INTEGERS = /\d+/u
20
+
21
+ # In-subsentence separator (e.g.: "*" or "=").
22
+ #
23
+ SEPARATORS = /[*=_\/\\ ]+/u
24
+
25
+ # Punctuation character (i.e.: "." or "!").
26
+ #
27
+ PUNCTUATIONS = /(\.|\!|\?)+/u
28
+
29
+ # In-sentence punctuation character (i.e.: "," or "-").
30
+ #
31
+ SENTENCE_PUNCTUATIONS = /(\,|\[|\]|\(|\)|\-|:|;)+/u
32
+
33
+ # Line breaks.
34
+ #
35
+ BREAKS = /\n+/u
36
+
37
+ attr_reader :text, :scanner
38
+ protected :scanner
39
+
40
+ # Create a new instance of {Greeb::Tokenizer}.
41
+ #
42
+ # @param text [String] text to be tokenized.
43
+ #
44
+ def initialize(text)
45
+ @text = text
46
+ end
47
+
48
+ # Tokens memoization method.
49
+ #
50
+ # @return [Set<Greeb::Entity>] a set of tokens.
51
+ #
52
+ def tokens
53
+ tokenize! unless @tokens
54
+ @tokens
55
+ end
56
+
57
+ protected
58
+ # Perform the tokenization process. This method modifies
59
+ # `@scanner` and `@tokens` instance variables.
60
+ #
61
+ # @return [nil] nothing unless exception is raised.
62
+ #
63
+ def tokenize!
64
+ @scanner = StringScanner.new(text)
65
+ @tokens = SortedSet.new
66
+ while !scanner.eos?
67
+ parse! LETTERS, :letter or
68
+ parse! FLOATS, :float or
69
+ parse! INTEGERS, :integer or
70
+ split_parse! SENTENCE_PUNCTUATIONS, :spunct or
71
+ split_parse! PUNCTUATIONS, :punct or
72
+ split_parse! SEPARATORS, :separ or
73
+ split_parse! BREAKS, :break or
74
+ raise @tokens.inspect
75
+ end
76
+ ensure
77
+ scanner.terminate
78
+ end
79
+
80
+ # Try to parse one small piece of text that is covered by pattern
81
+ # of necessary type.
82
+ #
83
+ # @param pattern [Regexp] a regular expression to extract the token.
84
+ # @param type [Symbol] a symbol that represents the necessary token
85
+ # type.
86
+ #
87
+ # @return [Set<Greeb::Entity>, false] the modified set of extracted tokens, or `false` when the pattern does not match.
88
+ #
89
+ def parse! pattern, type
90
+ return false unless token = scanner.scan(pattern)
91
+ @tokens << Greeb::Entity.new(scanner.pos - token.length, scanner.pos, type)
92
+ end
93
+
94
+ # Try to parse one small piece of text that is covered by pattern
95
+ # of necessary type. This method performs grouping of the same
96
+ # characters.
97
+ #
98
+ # @param pattern [Regexp] a regular expression to extract the token.
99
+ # @param type [Symbol] a symbol that represents the necessary token
100
+ # type.
101
+ #
102
+ # @return [Integer, false] the offset just past the last extracted token, or `false` when the pattern does not match.
103
+ #
104
+ def split_parse! pattern, type
105
+ return false unless token = scanner.scan(pattern)
106
+ position = scanner.pos - token.length
107
+ token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
108
+ @tokens << Greeb::Entity.new(before, before + s.length, type)
109
+ before + s.length
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
+ # Greeb is a simple regexp-based tokenizer.
4
+ #
5
+ module Greeb
6
+ # Version of Greeb.
7
+ #
8
+ VERSION = '0.1.0.rc1'
9
+ end
@@ -0,0 +1,112 @@
1
+ # encoding: utf-8
2
+
3
+ require File.expand_path('../spec_helper', __FILE__)
4
+
5
+ module Greeb
6
+ describe Segmentator do
7
+ describe 'initialization' do
8
+ before { @tokenizer = Tokenizer.new('Vodka') }
9
+
10
+ subject { Segmentator.new(@tokenizer) }
11
+
12
+ it 'can be initialized either with Tokenizer' do
13
+ subject.tokens.must_be_kind_of SortedSet
14
+ end
15
+
16
+ it 'can be initialized either with a set of tokens' do
17
+ subject = Segmentator.new(@tokenizer.tokens)
18
+ subject.tokens.must_be_kind_of SortedSet
19
+ end
20
+
21
+ it 'should has @tokens ivar' do
22
+ subject.instance_variable_get(:@tokens).wont_be_nil
23
+ end
24
+ end
25
+
26
+ describe 'a simple sentence' do
27
+ before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
28
+
29
+ subject { Segmentator.new(@tokenizer).sentences }
30
+
31
+ it 'should be segmented' do
32
+ subject.must_equal(
33
+ SortedSet.new([Entity.new(0, 22, :sentence)])
34
+ )
35
+ end
36
+ end
37
+
38
+ describe 'a simple sentence without punctuation' do
39
+ before { @tokenizer = Tokenizer.new('Hello, I am JC Denton') }
40
+
41
+ subject { Segmentator.new(@tokenizer).sentences }
42
+
43
+ it 'should be segmented' do
44
+ subject.must_equal(
45
+ SortedSet.new([Entity.new(0, 21, :sentence)])
46
+ )
47
+ end
48
+ end
49
+
50
+ describe 'a simple sentence with trailing whitespaces' do
51
+ before { @tokenizer = Tokenizer.new(' Hello, I am JC Denton ') }
52
+
53
+ subject { Segmentator.new(@tokenizer).sentences }
54
+
55
+ it 'should be segmented' do
56
+ subject.must_equal(
57
+ SortedSet.new([Entity.new(6, 27, :sentence)])
58
+ )
59
+ end
60
+ end
61
+
62
+ describe 'two simple sentences' do
63
+ before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
64
+
65
+ subject { Segmentator.new(@tokenizer).sentences }
66
+
67
+ it 'should be segmented' do
68
+ subject.must_equal(
69
+ SortedSet.new([Entity.new(0, 6, :sentence),
70
+ Entity.new(7, 22, :sentence)])
71
+ )
72
+ end
73
+ end
74
+
75
+ describe 'one wrong character and one simple sentence' do
76
+ before { @tokenizer = Tokenizer.new('! I am JC Denton.') }
77
+
78
+ subject { Segmentator.new(@tokenizer).sentences }
79
+
80
+ it 'should be segmented' do
81
+ subject.must_equal(
82
+ SortedSet.new([Entity.new(2, 17, :sentence)])
83
+ )
84
+ end
85
+ end
86
+
87
+ describe 'token extractor' do
88
+ before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
89
+
90
+ subject { Segmentator.new(@tokenizer) }
91
+
92
+ it 'should be extracted' do
93
+ subject.extract(*subject.sentences).must_equal({
94
+ Entity.new(0, 6, :sentence) => [
95
+ Entity.new(0, 5, :letter),
96
+ Entity.new(5, 6, :punct)
97
+ ],
98
+ Entity.new(7, 22, :sentence) => [
99
+ Entity.new(7, 8, :letter),
100
+ Entity.new(8, 9, :separ),
101
+ Entity.new(9, 11, :letter),
102
+ Entity.new(11, 12, :separ),
103
+ Entity.new(12, 14, :letter),
104
+ Entity.new(14, 15, :separ),
105
+ Entity.new(15, 21, :letter),
106
+ Entity.new(21, 22, :punct)
107
+ ]
108
+ })
109
+ end
110
+ end
111
+ end
112
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,14 +1,20 @@
1
1
  # encoding: utf-8
2
2
 
3
- require File.expand_path('../../lib/greeb', __FILE__)
3
+ require 'rubygems'
4
4
 
5
- RSpec.configure do |c|
6
- c.mock_with :rspec
5
+ $:.unshift File.expand_path('../../lib', __FILE__)
6
+
7
+ if RUBY_VERSION == '1.8'
8
+ gem 'minitest'
7
9
  end
8
10
 
9
- RSpec::Matchers.define :be_parsed_as do |expected|
10
- match do |actual|
11
- tree = Greeb::Parser.new(actual).parse
12
- tree == expected
11
+ require 'minitest/autorun'
12
+
13
+ unless 'true' == ENV['TRAVIS']
14
+ require 'simplecov'
15
+ SimpleCov.start do
16
+ add_filter '/spec/'
13
17
  end
14
18
  end
19
+
20
+ require 'greeb'
@@ -0,0 +1,91 @@
1
+ # encoding: utf-8
2
+
3
+ require File.expand_path('../spec_helper', __FILE__)
4
+
5
+ module Greeb
6
+ describe Tokenizer do
7
+ describe 'initialization' do
8
+ subject { Tokenizer.new('vodka') }
9
+
10
+ it 'should be initialized with a text' do
11
+ subject.text.must_equal 'vodka'
12
+ end
13
+
14
+ it 'should has the @text ivar' do
15
+ subject.instance_variable_get(:@text).must_equal 'vodka'
16
+ end
17
+
18
+ it 'should not has @tokens ivar' do
19
+ subject.instance_variable_get(:@tokens).must_be_nil
20
+ end
21
+ end
22
+
23
+ describe 'after tokenization' do
24
+ subject { Tokenizer.new('vodka').tap(&:tokens) }
25
+
26
+ it 'should has the @tokens ivar' do
27
+ subject.instance_variable_get(:@tokens).wont_be_nil
28
+ end
29
+
30
+ it 'should has the @scanner ivar' do
31
+ subject.instance_variable_get(:@scanner).wont_be_nil
32
+ end
33
+
34
+ it 'should has the tokens set' do
35
+ subject.tokens.must_be_kind_of SortedSet
36
+ end
37
+ end
38
+
39
+ describe 'tokenization facilities' do
40
+ it 'can handle words' do
41
+ Tokenizer.new('hello').tokens.must_equal(
42
+ SortedSet.new([Entity.new(0, 5, :letter)])
43
+ )
44
+ end
45
+
46
+ it 'can handle floats' do
47
+ Tokenizer.new('14.88').tokens.must_equal(
48
+ SortedSet.new([Entity.new(0, 5, :float)])
49
+ )
50
+ end
51
+
52
+ it 'can handle integers' do
53
+ Tokenizer.new('1337').tokens.must_equal(
54
+ SortedSet.new([Entity.new(0, 4, :integer)])
55
+ )
56
+ end
57
+
58
+ it 'can handle words and integers' do
59
+ Tokenizer.new('Hello, I am 18').tokens.must_equal(
60
+ SortedSet.new([Entity.new(0, 5, :letter),
61
+ Entity.new(5, 6, :spunct),
62
+ Entity.new(6, 7, :separ),
63
+ Entity.new(7, 8, :letter),
64
+ Entity.new(8, 9, :separ),
65
+ Entity.new(9, 11, :letter),
66
+ Entity.new(11, 12, :separ),
67
+ Entity.new(12, 14, :integer)])
68
+ )
69
+ end
70
+
71
+ it 'can handle multi-line paragraphs' do
72
+ Tokenizer.new("Brateeshka..!\n\nPrines!").tokens.must_equal(
73
+ SortedSet.new([Entity.new(0, 10, :letter),
74
+ Entity.new(10, 12, :punct),
75
+ Entity.new(12, 13, :punct),
76
+ Entity.new(13, 15, :break),
77
+ Entity.new(15, 21, :letter),
78
+ Entity.new(21, 22, :punct)])
79
+ )
80
+ end
81
+
82
+ it 'can handle separated integers' do
83
+ Tokenizer.new('228/359').tokens.must_equal(
84
+ SortedSet.new([Entity.new(0, 3, :integer),
85
+ Entity.new(3, 4, :separ),
86
+ Entity.new(4, 7, :integer)])
87
+ )
88
+ end
89
+ end
90
+ end
91
+ end
metadata CHANGED
@@ -1,29 +1,81 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: greeb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
5
- prerelease:
4
+ version: 0.1.0.rc1
5
+ prerelease: 6
6
6
  platform: ruby
7
7
  authors:
8
8
  - Dmitry A. Ustalov
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-02-20 00:00:00.000000000 +05:00
13
- default_executable:
12
+ date: 2012-07-08 00:00:00.000000000 Z
14
13
  dependencies:
15
14
  - !ruby/object:Gem::Dependency
16
- name: rspec
17
- requirement: &81165430 !ruby/object:Gem::Requirement
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
18
17
  none: false
19
18
  requirements:
20
- - - ~>
19
+ - - ! '>='
21
20
  - !ruby/object:Gem::Version
22
- version: 2.4.0
23
- type: :runtime
21
+ version: '0'
22
+ type: :development
24
23
  prerelease: false
25
- version_requirements: *81165430
26
- description: Greeb is awesome Graphematical Analyzer, written in Ruby.
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: minitest
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '2.11'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '2.11'
46
+ - !ruby/object:Gem::Dependency
47
+ name: simplecov
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: yard
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Greeb is a simple yet awesome regexp-based tokenizer, written in Ruby.
27
79
  email:
28
80
  - dmitry@eveel.ru
29
81
  executables: []
@@ -31,19 +83,20 @@ extensions: []
31
83
  extra_rdoc_files: []
32
84
  files:
33
85
  - .gitignore
86
+ - .travis.yml
87
+ - .yardopts
34
88
  - Gemfile
35
- - Gemfile.lock
36
- - README
89
+ - LICENSE
90
+ - README.md
37
91
  - Rakefile
38
- - greeb-test.rb
39
92
  - greeb.gemspec
40
- - lib/enumerable.rb
41
93
  - lib/greeb.rb
42
- - lib/greeb/parser.rb
43
- - lib/meta_array.rb
44
- - spec/parser_spec.rb
94
+ - lib/greeb/segmentator.rb
95
+ - lib/greeb/tokenizer.rb
96
+ - lib/greeb/version.rb
97
+ - spec/segmentator_spec.rb
45
98
  - spec/spec_helper.rb
46
- has_rdoc: true
99
+ - spec/tokenizer_spec.rb
47
100
  homepage: https://github.com/eveel/greeb
48
101
  licenses: []
49
102
  post_install_message:
@@ -56,18 +109,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
56
109
  - - ! '>='
57
110
  - !ruby/object:Gem::Version
58
111
  version: '0'
112
+ segments:
113
+ - 0
114
+ hash: -4603914053803130942
59
115
  required_rubygems_version: !ruby/object:Gem::Requirement
60
116
  none: false
61
117
  requirements:
62
- - - ! '>='
118
+ - - ! '>'
63
119
  - !ruby/object:Gem::Version
64
- version: '0'
120
+ version: 1.3.1
65
121
  requirements: []
66
122
  rubyforge_project: greeb
67
- rubygems_version: 1.5.2
123
+ rubygems_version: 1.8.24
68
124
  signing_key:
69
125
  specification_version: 3
70
- summary: Greeb is a Graphematical Analyzer.
126
+ summary: Greeb is a simple regexp-based tokenizer.
71
127
  test_files:
72
- - spec/parser_spec.rb
128
+ - spec/segmentator_spec.rb
73
129
  - spec/spec_helper.rb
130
+ - spec/tokenizer_spec.rb
131
+ has_rdoc:
data/Gemfile.lock DELETED
@@ -1,24 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- greeb (0.0.2)
5
- rspec (~> 2.4.0)
6
-
7
- GEM
8
- remote: http://rubygems.org/
9
- specs:
10
- diff-lcs (1.1.2)
11
- rspec (2.4.0)
12
- rspec-core (~> 2.4.0)
13
- rspec-expectations (~> 2.4.0)
14
- rspec-mocks (~> 2.4.0)
15
- rspec-core (2.4.0)
16
- rspec-expectations (2.4.0)
17
- diff-lcs (~> 1.1.2)
18
- rspec-mocks (2.4.0)
19
-
20
- PLATFORMS
21
- ruby
22
-
23
- DEPENDENCIES
24
- greeb!
data/README DELETED
File without changes
data/greeb-test.rb DELETED
@@ -1,141 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # encoding: utf-8
3
-
4
- require 'rubygems'
5
- require 'graphviz'
6
-
7
- $:.unshift('./lib')
8
- require 'greeb'
9
-
10
- origin = <<-END
11
- - Сынок, чего это от тебя зигами пахнет,
12
- опять на Манежную площадь ходил?
13
-
14
- - Нет мама, я в метро ехал, там назиговано было!!
15
-
16
-
17
-
18
- Четырнадцать, восемьдесять восемь: 14/88.
19
- Вот так блять
20
- END
21
- origin.chomp!
22
-
23
- def identify(token)
24
- case token
25
- when Greeb::RU_LEX then 'RU_LEX'
26
- when Greeb::EN_LEX then 'EN_LEX'
27
- when Greeb::EOL then 'EOL'
28
- when Greeb::SEP then 'SEP'
29
- when Greeb::PUN then 'PUN'
30
- when Greeb::SPUN then 'SPUN'
31
- when Greeb::DIG then 'DIG'
32
- when Greeb::DIL then 'DIL'
33
- else
34
- '?!'
35
- end
36
- end
37
-
38
- greeb = Greeb::Parser.new(origin)
39
- text = greeb.tree
40
-
41
- g = GraphViz.new('graphematics', 'type' => 'graph')
42
-
43
- g.node[:color] = '#ddaa66'
44
- g.node[:style] = 'filled'
45
- g.node[:shape] = 'box'
46
- g.node[:penwidth] = '1'
47
- g.node[:fontname] = 'PT Sans'
48
- g.node[:fontsize] = '8'
49
- g.node[:fillcolor]= '#ffeecc'
50
- g.node[:fontcolor]= '#775500'
51
- g.node[:margin] = '0.0'
52
-
53
- g.edge[:color] = '#999999'
54
- g.edge[:weight] = '1'
55
- g.edge[:fontname] = 'PT Sans'
56
- g.edge[:fontcolor]= '#444444'
57
- g.edge[:fontsize] = '6'
58
- g.edge[:dir] = 'forward'
59
- g.edge[:arrowsize]= '0.5'
60
-
61
- bid = 'begin'
62
- g.add_node(bid).tap do |node|
63
- node.label = "Начало\nтекста"
64
- node.shape = 'ellipse'
65
- node.style = ''
66
- end
67
-
68
- eid = 'end'
69
- g.add_node(eid).tap do |node|
70
- node.label = "Конец\nтекста"
71
- node.shape = 'ellipse'
72
- node.style = ''
73
- end
74
-
75
- tree = text.map_with_index do |paragraph, i|
76
- pid = "p#{i}"
77
- sentences = paragraph.map_with_index do |sentence, j|
78
- sid = "#{pid}s#{j}"
79
- subsentences = sentence.map_with_index do |subsentence, k|
80
- ssid = "#{sid}ss#{k}"
81
- tokens = subsentence.map_with_index do |token, l|
82
- next if ' ' == token
83
- [ "#{ssid}t#{l}", token, l ]
84
- end
85
- tokens.delete(nil)
86
- [ ssid, tokens, k ]
87
- end
88
- [ sid, subsentences, j ]
89
- end
90
- [ pid, sentences, i ]
91
- end
92
-
93
- tree.each do |pid, paragraph, i|
94
- g.add_node(pid).tap do |node|
95
- node.label = "Абзац\n№#{i + 1}"
96
- node.shape = 'ellipse'
97
- end
98
- g.add_edge(bid, pid)
99
-
100
- paragraph.each do |sid, sentence, j|
101
- g.add_node(sid).tap do |node|
102
- node.label = "Предложение\n№#{j + 1}"
103
- node.shape = 'ellipse'
104
- end
105
- g.add_edge(pid, sid)
106
-
107
- sentence.each do |ssid, subsentence, k|
108
- g.add_node(ssid).tap do |node|
109
- node.label = "Подпредложение\n№#{k + 1}"
110
- node.shape = 'ellipse'
111
- end
112
- g.add_edge(sid, ssid)
113
-
114
- subsentence.each do |tid, token, l|
115
- g.add_node(tid).label = token
116
- g.add_edge(ssid, tid).label = identify(token)
117
- g.add_edge(tid, eid)
118
- end
119
-
120
- subsentence.each_cons(2) do |(tid1, token1, l1),
121
- (tid2, token2, l2)|
122
- g.add_edge(tid1, tid2).tap do |edge|
123
- edge.weight = 0.25
124
- edge.style = 'dashed'
125
- end
126
- end
127
- end
128
-
129
- sentence.each_cons(2) do |(ssid1, subsentence1, k1),
130
- (ssid2, subsentence2, k2)|
131
- tid1, token1, l1 = subsentence1.last
132
- tid2, token2, l2 = subsentence2.first
133
- g.add_edge(tid1, tid2).tap do |edge|
134
- edge.weight = 0.5
135
- edge.style = 'dashed'
136
- end
137
- end
138
- end
139
- end
140
-
141
- g.output(:output => 'png', :file => 'graph.png')
data/lib/enumerable.rb DELETED
@@ -1,10 +0,0 @@
1
- # encoding: utf-8
2
-
3
- # Enumerable module additions.
4
- #
5
- module Enumerable
6
- def collect_with_index(i = -1) # :nodoc:
7
- collect { |e| yield(e, i += 1) }
8
- end
9
- alias map_with_index collect_with_index
10
- end
data/lib/greeb/parser.rb DELETED
@@ -1,176 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'meta_array'
4
- require 'enumerable'
5
-
6
- # Graphematical Parser of the Greeb.
7
- # Use it with love.
8
- #
9
- class Greeb::Parser
10
- # Russian lexeme (i.e.: "хуй").
11
- #
12
- RUSSIAN_LEXEME = /^[А-Яа-яЁё]+$/u
13
-
14
- # English lexeme (i.e.: "foo").
15
- #
16
- ENGLISH_LEXEME = /^[A-Za-z]+$/u
17
-
18
- # End of Line sequence (i.e.: "\n").
19
- #
20
- END_OF_LINE = /^\n+$/u
21
-
22
- # In-subsentence seprator (i.e.: "*" or "\").
23
- #
24
- SEPARATOR = /^[*=_\/\\ ]$/u
25
-
26
- # Punctuation character (i.e.: "." or "!").
27
- #
28
- PUNCTUATION = /^(\.|\!|\?)$/u
29
-
30
- # In-sentence punctuation character (i.e.: "," or "-").
31
- #
32
- SENTENCE_PUNCTUATION = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
33
-
34
- # Digit (i.e.: "1337").
35
- #
36
- DIGIT = /^[0-9]+$/u
37
-
38
- # Digit-Letter complex (i.e.: "0xDEADBEEF").
39
- #
40
- DIGIT_LETTER = /^[А-Яа-яA-Za-z0-9Ёё]+$/u
41
-
42
- # Empty string (i.e.: "").
43
- #
44
- EMPTY = ''
45
-
46
- attr_accessor :text
47
- private :text=
48
-
49
- # Create a new instance of Greeb::Parser.
50
- #
51
- # ==== Parameters
52
- # text<String>:: Source text.
53
- #
54
- def initialize(text)
55
- self.text = text
56
- end
57
-
58
- # Perform the text parsing.
59
- #
60
- # ==== Returns
61
- # Array:: Tree of Graphematical Analysis of text.
62
- #
63
- def parse
64
- return @tree if @tree
65
-
66
- # parse tree
67
- tree = MetaArray.new
68
-
69
- # paragraph, sentence, subsentence
70
- p_id, s_id, ss_id = 0, 0, 0
71
-
72
- # current token
73
- token = ''
74
-
75
- # run FSM
76
- text.each_char do |c|
77
- case c
78
- when END_OF_LINE then begin
79
- case token
80
- when EMPTY then token << c
81
- when END_OF_LINE then begin
82
- token = ''
83
- p_id += 1
84
- s_id = 0
85
- ss_id = 0
86
- end
87
- else
88
- tree[p_id][s_id][ss_id] << token
89
- token = c
90
- end
91
- end
92
- when SEPARATOR then begin
93
- case token
94
- when EMPTY
95
- else
96
- tree[p_id][s_id][ss_id] << token
97
- while tree[p_id][s_id][ss_id].last == c
98
- tree[p_id][s_id][ss_id].pop
99
- end
100
- tree[p_id][s_id][ss_id] << c
101
- token = ''
102
- end
103
- end
104
- when PUNCTUATION then begin
105
- case token
106
- when EMPTY
107
- else
108
- tree[p_id][s_id][ss_id] << token
109
- tree[p_id][s_id][ss_id] << c
110
- token = ''
111
- s_id += 1
112
- ss_id = 0
113
- end
114
- end
115
- when SENTENCE_PUNCTUATION then begin
116
- case token
117
- when EMPTY
118
- else
119
- tree[p_id][s_id][ss_id] << token
120
- tree[p_id][s_id][ss_id] << c
121
- token = ''
122
- ss_id += 1
123
- end
124
- end
125
- when RUSSIAN_LEXEME then begin
126
- case token
127
- when END_OF_LINE then begin
128
- tree[p_id][s_id][ss_id] << ' '
129
- token = c
130
- end
131
- else
132
- token << c
133
- end
134
- end
135
- when ENGLISH_LEXEME then begin
136
- case token
137
- when END_OF_LINE then begin
138
- tree[p_id][s_id][ss_id] << ' '
139
- token = c
140
- end
141
- else
142
- token << c
143
- end
144
- end
145
- when DIGIT then begin
146
- case token
147
- when END_OF_LINE then begin
148
- tree[p_id][s_id][ss_id] << ' '
149
- token = c
150
- end
151
- else
152
- token << c
153
- end
154
- end
155
- when DIGIT_LETTER then begin
156
- case token
157
- when END_OF_LINE then begin
158
- tree[p_id][s_id][ss_id] << token
159
- token = c
160
- end
161
- else
162
- token << c
163
- end
164
- end
165
- end
166
- end
167
-
168
- unless token.empty?
169
- tree[p_id][s_id][ss_id] << token
170
- end
171
-
172
- tree.delete(nil)
173
-
174
- @tree = tree.to_a
175
- end
176
- end
data/lib/meta_array.rb DELETED
@@ -1,14 +0,0 @@
1
- # encoding: utf-8
2
-
3
- # MetaArray is an Array, which creates subarrays
4
- # on non-existent elements.
5
- #
6
- class MetaArray < Array
7
- def [] id
8
- super(id) or begin
9
- self.class.new.tap do |element|
10
- self[id] = element
11
- end
12
- end
13
- end
14
- end
data/spec/parser_spec.rb DELETED
@@ -1,63 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require File.expand_path('../spec_helper.rb', __FILE__)
4
-
5
- describe Greeb::Parser do
6
- it 'should parse very simple strings' do
7
- 'буба сука дебил'.should be_parsed_as([
8
- [
9
- [ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
10
- ]
11
- ])
12
- end
13
-
14
- it 'should parse one sentence with subsentences' do
15
- 'буба, сука, дебил'.should be_parsed_as([
16
- [
17
- [
18
- [ 'буба', ',' ],
19
- [ 'сука', ',' ],
20
- [ 'дебил' ]
21
- ]
22
- ]
23
- ])
24
- end
25
-
26
- it 'should parse two simple paragraphs' do
27
- "буба сука дебил\n\nточно!".should be_parsed_as([
28
- [
29
- [ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
30
- ],
31
- [
32
- [ [ 'точно', '!' ] ]
33
- ]
34
- ])
35
- end
36
-
37
- it 'should parse two sentences in paragraph' do
38
- "буба молодец? буба умница.".should be_parsed_as([
39
- [
40
- [ [ 'буба', ' ', 'молодец', '?' ] ],
41
- [ [ 'буба', ' ', 'умница', '.' ] ]
42
- ]
43
- ])
44
- end
45
-
46
- it 'should parse sentences with floating point values' do
47
- 'буба не считает Пи равной 3.14'.should be_parsed_as([
48
- [
49
- [ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
50
- 'Пи', ' ', 'равной', ' ', '3.14' ] ]
51
- ]
52
- ])
53
- end
54
-
55
- it 'should parse sentences with floating "dot" values' do
56
- 'буба не считает Пи равной 3,14'.should be_parsed_as([
57
- [
58
- [ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
59
- 'Пи', ' ', 'равной', ' ', '3,14' ] ]
60
- ]
61
- ])
62
- end
63
- end