greeb 0.0.2 → 0.1.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/.travis.yml +7 -0
- data/.yardopts +6 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +140 -0
- data/Rakefile +8 -8
- data/greeb.gemspec +10 -8
- data/lib/greeb.rb +21 -7
- data/lib/greeb/segmentator.rb +95 -0
- data/lib/greeb/tokenizer.rb +112 -0
- data/lib/greeb/version.rb +9 -0
- data/spec/segmentator_spec.rb +112 -0
- data/spec/spec_helper.rb +13 -7
- data/spec/tokenizer_spec.rb +91 -0
- metadata +82 -24
- data/Gemfile.lock +0 -24
- data/README +0 -0
- data/greeb-test.rb +0 -141
- data/lib/enumerable.rb +0 -10
- data/lib/greeb/parser.rb +0 -176
- data/lib/meta_array.rb +0 -14
- data/spec/parser_spec.rb +0 -63
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/.yardopts
ADDED
data/Gemfile
CHANGED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010-2012 Dmitry A. Ustalov
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
Greeb
|
2
|
+
=====
|
3
|
+
|
4
|
+
Greeb is a simple yet awesome text tokenizer that is based on regular
|
5
|
+
expressions.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'greeb'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install greeb
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Greeb can help you to solve simple text processing problems:
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
pp Greeb::Tokenizer.new('Hello!').tokens
|
29
|
+
=begin
|
30
|
+
#<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
|
31
|
+
#<struct Greeb::Entity from=5, to=6, type=:punct>}>
|
32
|
+
=end
|
33
|
+
```
|
34
|
+
|
35
|
+
It should be noted that it is possible to process much more complex texts:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
text =<<-EOF
|
39
|
+
Hello! I am 18! My favourite number is 133.7...
|
40
|
+
|
41
|
+
What about you?
|
42
|
+
EOF
|
43
|
+
|
44
|
+
pp Greeb::Tokenizer.new(text).tokens
|
45
|
+
=begin
|
46
|
+
#<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
|
47
|
+
#<struct Greeb::Entity from=5, to=6, type=:punct>,
|
48
|
+
#<struct Greeb::Entity from=6, to=7, type=:separ>,
|
49
|
+
#<struct Greeb::Entity from=7, to=8, type=:letter>,
|
50
|
+
#<struct Greeb::Entity from=8, to=9, type=:separ>,
|
51
|
+
#<struct Greeb::Entity from=9, to=11, type=:letter>,
|
52
|
+
#<struct Greeb::Entity from=11, to=12, type=:separ>,
|
53
|
+
#<struct Greeb::Entity from=12, to=14, type=:integer>,
|
54
|
+
#<struct Greeb::Entity from=14, to=15, type=:punct>,
|
55
|
+
#<struct Greeb::Entity from=15, to=16, type=:separ>,
|
56
|
+
#<struct Greeb::Entity from=16, to=18, type=:letter>,
|
57
|
+
#<struct Greeb::Entity from=18, to=19, type=:separ>,
|
58
|
+
#<struct Greeb::Entity from=19, to=28, type=:letter>,
|
59
|
+
#<struct Greeb::Entity from=28, to=29, type=:separ>,
|
60
|
+
#<struct Greeb::Entity from=29, to=35, type=:letter>,
|
61
|
+
#<struct Greeb::Entity from=35, to=36, type=:separ>,
|
62
|
+
#<struct Greeb::Entity from=36, to=38, type=:letter>,
|
63
|
+
#<struct Greeb::Entity from=38, to=39, type=:separ>,
|
64
|
+
#<struct Greeb::Entity from=39, to=44, type=:float>,
|
65
|
+
#<struct Greeb::Entity from=44, to=47, type=:punct>,
|
66
|
+
#<struct Greeb::Entity from=47, to=49, type=:break>,
|
67
|
+
#<struct Greeb::Entity from=49, to=53, type=:letter>,
|
68
|
+
#<struct Greeb::Entity from=53, to=54, type=:separ>,
|
69
|
+
#<struct Greeb::Entity from=54, to=59, type=:letter>,
|
70
|
+
#<struct Greeb::Entity from=59, to=60, type=:separ>,
|
71
|
+
#<struct Greeb::Entity from=60, to=63, type=:letter>,
|
72
|
+
#<struct Greeb::Entity from=63, to=64, type=:punct>,
|
73
|
+
#<struct Greeb::Entity from=64, to=65, type=:break>}>
|
74
|
+
=end
|
75
|
+
```
|
76
|
+
|
77
|
+
Also it can be used to solve the text segmentation problems
|
78
|
+
such as sentence detection tasks:
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
text = 'Hello! How are you?'
|
82
|
+
pp Greeb::Segmentator.new(Greeb::Tokenizer.new(text)).sentences
|
83
|
+
=begin
|
84
|
+
#<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
|
85
|
+
#<struct Greeb::Entity from=7, to=19, type=:sentence>}>
|
86
|
+
=end
|
87
|
+
```
|
88
|
+
|
89
|
+
It is possible to extract tokens that were processed by the text
|
90
|
+
segmentator:
|
91
|
+
|
92
|
+
```ruby
|
93
|
+
text = 'Hello! How are you?'
|
94
|
+
segmentator = Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
|
95
|
+
sentences = segmentator.sentences
|
96
|
+
pp segmentator.extract(*sentences)
|
97
|
+
=begin
|
98
|
+
{#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
|
99
|
+
[#<struct Greeb::Entity from=0, to=5, type=:letter>,
|
100
|
+
#<struct Greeb::Entity from=5, to=6, type=:punct>],
|
101
|
+
#<struct Greeb::Entity from=7, to=19, type=:sentence>=>
|
102
|
+
[#<struct Greeb::Entity from=7, to=10, type=:letter>,
|
103
|
+
#<struct Greeb::Entity from=10, to=11, type=:separ>,
|
104
|
+
#<struct Greeb::Entity from=11, to=14, type=:letter>,
|
105
|
+
#<struct Greeb::Entity from=14, to=15, type=:separ>,
|
106
|
+
#<struct Greeb::Entity from=15, to=18, type=:letter>,
|
107
|
+
#<struct Greeb::Entity from=18, to=19, type=:punct>]}
|
108
|
+
=end
|
109
|
+
```
|
110
|
+
|
111
|
+
## Tokens
|
112
|
+
|
113
|
+
Greeb operates with entities, tuples of `<from, to, type>`, where
|
114
|
+
`from` is a beginning of the entity, `to` is an ending of the entity,
|
115
|
+
and `type` is a type of the entity.
|
116
|
+
|
117
|
+
There are several entity types: `:letter`, `:float`, `:integer`,
|
118
|
+
`:separ`, `:punct` (for punctuation), `:spunct` (for in-sentence
|
119
|
+
punctuation), and `:break`.
|
120
|
+
|
121
|
+
## Contributing
|
122
|
+
|
123
|
+
1. Fork it;
|
124
|
+
2. Create your feature branch (`git checkout -b my-new-feature`);
|
125
|
+
3. Commit your changes (`git commit -am 'Added some feature'`);
|
126
|
+
4. Push to the branch (`git push origin my-new-feature`);
|
127
|
+
5. Create new Pull Request.
|
128
|
+
|
129
|
+
I highly recommend using git flow to make the development process much more
|
130
|
+
systematic and awesome.
|
131
|
+
|
132
|
+
## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
|
133
|
+
|
134
|
+
## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
|
135
|
+
|
136
|
+
## Copyright
|
137
|
+
|
138
|
+
Copyright (c) 2010-2012 [Dmitry A. Ustalov]. See LICENSE for details.
|
139
|
+
|
140
|
+
[Dmitry A. Ustalov]: http://eveel.ru
|
data/Rakefile
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
+
#!/usr/bin/env rake
|
1
2
|
# encoding: utf-8
|
2
3
|
|
3
|
-
require 'bundler'
|
4
|
-
Bundler::GemHelper.install_tasks
|
4
|
+
require 'bundler/gem_tasks'
|
5
5
|
|
6
|
-
|
7
|
-
desc 'Run all examples'
|
8
|
-
RSpec::Core::RakeTask.new(:spec) do |t|
|
9
|
-
t.rspec_opts = %w[--color]
|
10
|
-
end
|
6
|
+
task :default => :test
|
11
7
|
|
12
|
-
|
8
|
+
require 'rake/testtask'
|
9
|
+
Rake::TestTask.new do |test|
|
10
|
+
test.pattern = 'spec/**/*_spec.rb'
|
11
|
+
test.verbose = true
|
12
|
+
end
|
data/greeb.gemspec
CHANGED
@@ -1,25 +1,27 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
require 'greeb'
|
3
|
+
require File.expand_path('../lib/greeb/version', __FILE__)
|
5
4
|
|
6
5
|
Gem::Specification.new do |s|
|
7
6
|
s.name = 'greeb'
|
8
7
|
s.version = Greeb::VERSION
|
9
8
|
s.platform = Gem::Platform::RUBY
|
10
|
-
s.authors = [
|
11
|
-
s.email = [
|
9
|
+
s.authors = ['Dmitry A. Ustalov']
|
10
|
+
s.email = ['dmitry@eveel.ru']
|
12
11
|
s.homepage = 'https://github.com/eveel/greeb'
|
13
|
-
s.summary = 'Greeb is a
|
14
|
-
s.description = 'Greeb is awesome
|
12
|
+
s.summary = 'Greeb is a simple regexp-based tokenizer.'
|
13
|
+
s.description = 'Greeb is a simple yet awesome regexp-based tokenizer, ' \
|
15
14
|
'written in Ruby.'
|
16
15
|
|
17
16
|
s.rubyforge_project = 'greeb'
|
18
17
|
|
19
|
-
s.
|
18
|
+
s.add_development_dependency 'rake'
|
19
|
+
s.add_development_dependency 'minitest', '>= 2.11'
|
20
|
+
s.add_development_dependency 'simplecov'
|
21
|
+
s.add_development_dependency 'yard'
|
20
22
|
|
21
23
|
s.files = `git ls-files`.split("\n")
|
22
24
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
23
25
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
24
|
-
s.require_paths = [
|
26
|
+
s.require_paths = ['lib']
|
25
27
|
end
|
data/lib/greeb.rb
CHANGED
@@ -1,11 +1,25 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
#
|
5
|
-
module Greeb
|
6
|
-
# Version of the Greeb.
|
7
|
-
#
|
8
|
-
VERSION = "0.0.2"
|
3
|
+
require 'greeb/version'
|
9
4
|
|
10
|
-
|
5
|
+
# Greeb operates with entities, tuples of `<from, to, type>`, where
|
6
|
+
# `from` is a beginning of the entity, `to` is an ending of the entity,
|
7
|
+
# and `type` is a type of the entity.
|
8
|
+
#
|
9
|
+
# There are several entity types: `:letter`, `:float`, `:integer`,
|
10
|
+
# `:separ` for separators, `:punct` for punctuation characters,
|
11
|
+
# `:spunct` for in-sentence punctuation characters, and
|
12
|
+
# `:break` for line endings.
|
13
|
+
#
|
14
|
+
class Greeb::Entity < Struct.new(:from, :to, :type)
|
15
|
+
def <=> other
|
16
|
+
if (comparison = self.from <=> other.from) == 0
|
17
|
+
self.to <=> other.to
|
18
|
+
else
|
19
|
+
comparison
|
20
|
+
end
|
21
|
+
end
|
11
22
|
end
|
23
|
+
|
24
|
+
require 'greeb/tokenizer'
|
25
|
+
require 'greeb/segmentator'
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# It is possible to perform simple sentence detection that is based
|
4
|
+
# on Greeb's tokenization.
|
5
|
+
#
|
6
|
+
class Greeb::Segmentator
|
7
|
+
# Sentence does not start from the separator character, line break
|
8
|
+
# character, and punctuation characters.
|
9
|
+
#
|
10
|
+
SENTENCE_DOESNT_START = [:separ, :break, :punct, :spunct]
|
11
|
+
|
12
|
+
attr_reader :tokens
|
13
|
+
|
14
|
+
# Create a new instance of {Greeb::Segmentator}.
|
15
|
+
#
|
16
|
+
# @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
|
17
|
+
# Greeb::Tokenizer or set of its results.
|
18
|
+
#
|
19
|
+
def initialize tokenizer_or_tokens
|
20
|
+
@tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
|
21
|
+
tokenizer_or_tokens.tokens
|
22
|
+
else
|
23
|
+
tokenizer_or_tokens
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Sentences memoization method.
|
28
|
+
#
|
29
|
+
# @return [Set<Greeb::Entity>] a set of sentences.
|
30
|
+
#
|
31
|
+
def sentences
|
32
|
+
detect_sentences! unless @sentences
|
33
|
+
@sentences
|
34
|
+
end
|
35
|
+
|
36
|
+
# Extract tokens from the set of sentences.
|
37
|
+
#
|
38
|
+
# @param sentences [Array<Greeb::Entity>] a list of sentences.
|
39
|
+
#
|
40
|
+
# @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
|
41
|
+
# sentences as keys and tokens arrays as values.
|
42
|
+
#
|
43
|
+
def extract *sentences
|
44
|
+
Hash[
|
45
|
+
sentences.map do |s|
|
46
|
+
[s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
|
47
|
+
end
|
48
|
+
]
|
49
|
+
end
|
50
|
+
|
51
|
+
protected
|
52
|
+
# Implementation of the sentence detection method. This method
|
53
|
+
# changes the `@sentences` ivar.
|
54
|
+
#
|
55
|
+
# @return [nil] nothing.
|
56
|
+
#
|
57
|
+
def detect_sentences!
|
58
|
+
@sentences = SortedSet.new
|
59
|
+
|
60
|
+
rest = tokens.inject(new_sentence) do |sentence, token|
|
61
|
+
if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
|
62
|
+
next sentence
|
63
|
+
end
|
64
|
+
|
65
|
+
sentence.from = token.from unless sentence.from
|
66
|
+
|
67
|
+
next sentence if sentence.to and sentence.to > token.to
|
68
|
+
|
69
|
+
if :punct == token.type
|
70
|
+
sentence.to = tokens.
|
71
|
+
select { |t| t.from >= token.from }.
|
72
|
+
inject(token) { |r, t| break r if t.type != token.type; t }.
|
73
|
+
to
|
74
|
+
|
75
|
+
@sentences << sentence
|
76
|
+
sentence = new_sentence
|
77
|
+
elsif :separ != token.type
|
78
|
+
sentence.to = token.to
|
79
|
+
end
|
80
|
+
|
81
|
+
sentence
|
82
|
+
end
|
83
|
+
|
84
|
+
nil.tap { @sentences << rest if rest.from and rest.to }
|
85
|
+
end
|
86
|
+
|
87
|
+
private
|
88
|
+
# Create a new instance of {Greeb::Entity} with `:sentence` type.
|
89
|
+
#
|
90
|
+
# @return [Greeb::Entity] a new entity instance.
|
91
|
+
#
|
92
|
+
def new_sentence
|
93
|
+
Greeb::Entity.new(nil, nil, :sentence)
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'strscan'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
# Greeb's tokenization facilities. Use 'em with love.
|
7
|
+
#
|
8
|
+
class Greeb::Tokenizer
|
9
|
+
# English and Russian letters.
|
10
|
+
#
|
11
|
+
LETTERS = /[A-Za-zА-Яа-яЁё]+/u
|
12
|
+
|
13
|
+
# Floating point values.
|
14
|
+
#
|
15
|
+
FLOATS = /(\d+)[.,](\d+)/u
|
16
|
+
|
17
|
+
# Integer values.
|
18
|
+
#
|
19
|
+
INTEGERS = /\d+/u
|
20
|
+
|
21
|
+
# In-subsentence separator (i.e.: "*" or "=").
|
22
|
+
#
|
23
|
+
SEPARATORS = /[*=_\/\\ ]+/u
|
24
|
+
|
25
|
+
# Punctuation character (i.e.: "." or "!").
|
26
|
+
#
|
27
|
+
PUNCTUATIONS = /(\.|\!|\?)+/u
|
28
|
+
|
29
|
+
# In-sentence punctuation character (i.e.: "," or "-").
|
30
|
+
#
|
31
|
+
SENTENCE_PUNCTUATIONS = /(\,|\[|\]|\(|\)|\-|:|;)+/u
|
32
|
+
|
33
|
+
# Line breaks.
|
34
|
+
#
|
35
|
+
BREAKS = /\n+/u
|
36
|
+
|
37
|
+
attr_reader :text, :scanner
|
38
|
+
protected :scanner
|
39
|
+
|
40
|
+
# Create a new instance of {Greeb::Tokenizer}.
|
41
|
+
#
|
42
|
+
# @param text [String] text to be tokenized.
|
43
|
+
#
|
44
|
+
def initialize(text)
|
45
|
+
@text = text
|
46
|
+
end
|
47
|
+
|
48
|
+
# Tokens memoization method.
|
49
|
+
#
|
50
|
+
# @return [Set<Greeb::Entity>] a set of tokens.
|
51
|
+
#
|
52
|
+
def tokens
|
53
|
+
tokenize! unless @tokens
|
54
|
+
@tokens
|
55
|
+
end
|
56
|
+
|
57
|
+
protected
|
58
|
+
# Perform the tokenization process. This method modifies
|
59
|
+
# `@scanner` and `@tokens` instance variables.
|
60
|
+
#
|
61
|
+
# @return [nil] nothing unless exception is raised.
|
62
|
+
#
|
63
|
+
def tokenize!
|
64
|
+
@scanner = StringScanner.new(text)
|
65
|
+
@tokens = SortedSet.new
|
66
|
+
while !scanner.eos?
|
67
|
+
parse! LETTERS, :letter or
|
68
|
+
parse! FLOATS, :float or
|
69
|
+
parse! INTEGERS, :integer or
|
70
|
+
split_parse! SENTENCE_PUNCTUATIONS, :spunct or
|
71
|
+
split_parse! PUNCTUATIONS, :punct or
|
72
|
+
split_parse! SEPARATORS, :separ or
|
73
|
+
split_parse! BREAKS, :break or
|
74
|
+
raise @tokens.inspect
|
75
|
+
end
|
76
|
+
ensure
|
77
|
+
scanner.terminate
|
78
|
+
end
|
79
|
+
|
80
|
+
# Try to parse one small piece of text that is covered by pattern
|
81
|
+
# of necessary type.
|
82
|
+
#
|
83
|
+
# @param pattern [Regexp] a regular expression to extract the token.
|
84
|
+
# @param type [Symbol] a symbol that represents the necessary token
|
85
|
+
# type.
|
86
|
+
#
|
87
|
+
# @return [Set<Greeb::Entity>, false] the modified set of extracted tokens, or `false` when the pattern does not match.
|
88
|
+
#
|
89
|
+
def parse! pattern, type
|
90
|
+
return false unless token = scanner.scan(pattern)
|
91
|
+
@tokens << Greeb::Entity.new(scanner.pos - token.length, scanner.pos, type)
|
92
|
+
end
|
93
|
+
|
94
|
+
# Try to parse one small piece of text that is covered by pattern
|
95
|
+
# of necessary type. This method performs grouping of the same
|
96
|
+
# characters.
|
97
|
+
#
|
98
|
+
# @param pattern [Regexp] a regular expression to extract the token.
|
99
|
+
# @param type [Symbol] a symbol that represents the necessary token
|
100
|
+
# type.
|
101
|
+
#
|
102
|
+
# @return [Integer, false] the position right after the last extracted token, or `false` when the pattern does not match.
|
103
|
+
#
|
104
|
+
def split_parse! pattern, type
|
105
|
+
return false unless token = scanner.scan(pattern)
|
106
|
+
position = scanner.pos - token.length
|
107
|
+
token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
|
108
|
+
@tokens << Greeb::Entity.new(before, before + s.length, type)
|
109
|
+
before + s.length
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.expand_path('../spec_helper', __FILE__)
|
4
|
+
|
5
|
+
module Greeb
|
6
|
+
describe Segmentator do
|
7
|
+
describe 'initialization' do
|
8
|
+
before { @tokenizer = Tokenizer.new('Vodka') }
|
9
|
+
|
10
|
+
subject { Segmentator.new(@tokenizer) }
|
11
|
+
|
12
|
+
it 'can be initialized either with Tokenizer' do
|
13
|
+
subject.tokens.must_be_kind_of SortedSet
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'can be initialized either with a set of tokens' do
|
17
|
+
subject = Segmentator.new(@tokenizer.tokens)
|
18
|
+
subject.tokens.must_be_kind_of SortedSet
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should has @tokens ivar' do
|
22
|
+
subject.instance_variable_get(:@tokens).wont_be_nil
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
describe 'a simple sentence' do
|
27
|
+
before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
|
28
|
+
|
29
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
30
|
+
|
31
|
+
it 'should be segmented' do
|
32
|
+
subject.must_equal(
|
33
|
+
SortedSet.new([Entity.new(0, 22, :sentence)])
|
34
|
+
)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe 'a simple sentence without punctuation' do
|
39
|
+
before { @tokenizer = Tokenizer.new('Hello, I am JC Denton') }
|
40
|
+
|
41
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
42
|
+
|
43
|
+
it 'should be segmented' do
|
44
|
+
subject.must_equal(
|
45
|
+
SortedSet.new([Entity.new(0, 21, :sentence)])
|
46
|
+
)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe 'a simple sentence with trailing whitespaces' do
|
51
|
+
before { @tokenizer = Tokenizer.new(' Hello, I am JC Denton ') }
|
52
|
+
|
53
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
54
|
+
|
55
|
+
it 'should be segmented' do
|
56
|
+
subject.must_equal(
|
57
|
+
SortedSet.new([Entity.new(6, 27, :sentence)])
|
58
|
+
)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
describe 'two simple sentences' do
|
63
|
+
before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
|
64
|
+
|
65
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
66
|
+
|
67
|
+
it 'should be segmented' do
|
68
|
+
subject.must_equal(
|
69
|
+
SortedSet.new([Entity.new(0, 6, :sentence),
|
70
|
+
Entity.new(7, 22, :sentence)])
|
71
|
+
)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
describe 'one wrong character and one simple sentence' do
|
76
|
+
before { @tokenizer = Tokenizer.new('! I am JC Denton.') }
|
77
|
+
|
78
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
79
|
+
|
80
|
+
it 'should be segmented' do
|
81
|
+
subject.must_equal(
|
82
|
+
SortedSet.new([Entity.new(2, 17, :sentence)])
|
83
|
+
)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe 'token extractor' do
|
88
|
+
before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
|
89
|
+
|
90
|
+
subject { Segmentator.new(@tokenizer) }
|
91
|
+
|
92
|
+
it 'should be extracted' do
|
93
|
+
subject.extract(*subject.sentences).must_equal({
|
94
|
+
Entity.new(0, 6, :sentence) => [
|
95
|
+
Entity.new(0, 5, :letter),
|
96
|
+
Entity.new(5, 6, :punct)
|
97
|
+
],
|
98
|
+
Entity.new(7, 22, :sentence) => [
|
99
|
+
Entity.new(7, 8, :letter),
|
100
|
+
Entity.new(8, 9, :separ),
|
101
|
+
Entity.new(9, 11, :letter),
|
102
|
+
Entity.new(11, 12, :separ),
|
103
|
+
Entity.new(12, 14, :letter),
|
104
|
+
Entity.new(14, 15, :separ),
|
105
|
+
Entity.new(15, 21, :letter),
|
106
|
+
Entity.new(21, 22, :punct)
|
107
|
+
]
|
108
|
+
})
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,14 +1,20 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
require
|
3
|
+
require 'rubygems'
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
$:.unshift File.expand_path('../../lib', __FILE__)
|
6
|
+
|
7
|
+
if RUBY_VERSION == '1.8'
|
8
|
+
gem 'minitest'
|
7
9
|
end
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
11
|
+
require 'minitest/autorun'
|
12
|
+
|
13
|
+
unless 'true' == ENV['TRAVIS']
|
14
|
+
require 'simplecov'
|
15
|
+
SimpleCov.start do
|
16
|
+
add_filter '/spec/'
|
13
17
|
end
|
14
18
|
end
|
19
|
+
|
20
|
+
require 'greeb'
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.expand_path('../spec_helper', __FILE__)
|
4
|
+
|
5
|
+
module Greeb
|
6
|
+
describe Tokenizer do
|
7
|
+
describe 'initialization' do
|
8
|
+
subject { Tokenizer.new('vodka') }
|
9
|
+
|
10
|
+
it 'should be initialized with a text' do
|
11
|
+
subject.text.must_equal 'vodka'
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should has the @text ivar' do
|
15
|
+
subject.instance_variable_get(:@text).must_equal 'vodka'
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should not has @tokens ivar' do
|
19
|
+
subject.instance_variable_get(:@tokens).must_be_nil
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe 'after tokenization' do
|
24
|
+
subject { Tokenizer.new('vodka').tap(&:tokens) }
|
25
|
+
|
26
|
+
it 'should has the @tokens ivar' do
|
27
|
+
subject.instance_variable_get(:@tokens).wont_be_nil
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should has the @scanner ivar' do
|
31
|
+
subject.instance_variable_get(:@scanner).wont_be_nil
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should has the tokens set' do
|
35
|
+
subject.tokens.must_be_kind_of SortedSet
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
describe 'tokenization facilities' do
|
40
|
+
it 'can handle words' do
|
41
|
+
Tokenizer.new('hello').tokens.must_equal(
|
42
|
+
SortedSet.new([Entity.new(0, 5, :letter)])
|
43
|
+
)
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'can handle floats' do
|
47
|
+
Tokenizer.new('14.88').tokens.must_equal(
|
48
|
+
SortedSet.new([Entity.new(0, 5, :float)])
|
49
|
+
)
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'can handle integers' do
|
53
|
+
Tokenizer.new('1337').tokens.must_equal(
|
54
|
+
SortedSet.new([Entity.new(0, 4, :integer)])
|
55
|
+
)
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'can handle words and integers' do
|
59
|
+
Tokenizer.new('Hello, I am 18').tokens.must_equal(
|
60
|
+
SortedSet.new([Entity.new(0, 5, :letter),
|
61
|
+
Entity.new(5, 6, :spunct),
|
62
|
+
Entity.new(6, 7, :separ),
|
63
|
+
Entity.new(7, 8, :letter),
|
64
|
+
Entity.new(8, 9, :separ),
|
65
|
+
Entity.new(9, 11, :letter),
|
66
|
+
Entity.new(11, 12, :separ),
|
67
|
+
Entity.new(12, 14, :integer)])
|
68
|
+
)
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'can handle multi-line paragraphs' do
|
72
|
+
Tokenizer.new("Brateeshka..!\n\nPrines!").tokens.must_equal(
|
73
|
+
SortedSet.new([Entity.new(0, 10, :letter),
|
74
|
+
Entity.new(10, 12, :punct),
|
75
|
+
Entity.new(12, 13, :punct),
|
76
|
+
Entity.new(13, 15, :break),
|
77
|
+
Entity.new(15, 21, :letter),
|
78
|
+
Entity.new(21, 22, :punct)])
|
79
|
+
)
|
80
|
+
end
|
81
|
+
|
82
|
+
it 'can handle separated integers' do
|
83
|
+
Tokenizer.new('228/359').tokens.must_equal(
|
84
|
+
SortedSet.new([Entity.new(0, 3, :integer),
|
85
|
+
Entity.new(3, 4, :separ),
|
86
|
+
Entity.new(4, 7, :integer)])
|
87
|
+
)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
metadata
CHANGED
@@ -1,29 +1,81 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0.rc1
|
5
|
+
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dmitry A. Ustalov
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
13
|
-
default_executable:
|
12
|
+
date: 2012-07-08 00:00:00.000000000 Z
|
14
13
|
dependencies:
|
15
14
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
17
|
-
requirement:
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
18
17
|
none: false
|
19
18
|
requirements:
|
20
|
-
- -
|
19
|
+
- - ! '>='
|
21
20
|
- !ruby/object:Gem::Version
|
22
|
-
version:
|
23
|
-
type: :
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
24
23
|
prerelease: false
|
25
|
-
version_requirements:
|
26
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: minitest
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '2.11'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2.11'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: simplecov
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: Greeb is a simple yet awesome regexp-based tokenizer, written in Ruby.
|
27
79
|
email:
|
28
80
|
- dmitry@eveel.ru
|
29
81
|
executables: []
|
@@ -31,19 +83,20 @@ extensions: []
|
|
31
83
|
extra_rdoc_files: []
|
32
84
|
files:
|
33
85
|
- .gitignore
|
86
|
+
- .travis.yml
|
87
|
+
- .yardopts
|
34
88
|
- Gemfile
|
35
|
-
-
|
36
|
-
- README
|
89
|
+
- LICENSE
|
90
|
+
- README.md
|
37
91
|
- Rakefile
|
38
|
-
- greeb-test.rb
|
39
92
|
- greeb.gemspec
|
40
|
-
- lib/enumerable.rb
|
41
93
|
- lib/greeb.rb
|
42
|
-
- lib/greeb/
|
43
|
-
- lib/
|
44
|
-
-
|
94
|
+
- lib/greeb/segmentator.rb
|
95
|
+
- lib/greeb/tokenizer.rb
|
96
|
+
- lib/greeb/version.rb
|
97
|
+
- spec/segmentator_spec.rb
|
45
98
|
- spec/spec_helper.rb
|
46
|
-
|
99
|
+
- spec/tokenizer_spec.rb
|
47
100
|
homepage: https://github.com/eveel/greeb
|
48
101
|
licenses: []
|
49
102
|
post_install_message:
|
@@ -56,18 +109,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
56
109
|
- - ! '>='
|
57
110
|
- !ruby/object:Gem::Version
|
58
111
|
version: '0'
|
112
|
+
segments:
|
113
|
+
- 0
|
114
|
+
hash: -4603914053803130942
|
59
115
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
116
|
none: false
|
61
117
|
requirements:
|
62
|
-
- - ! '
|
118
|
+
- - ! '>'
|
63
119
|
- !ruby/object:Gem::Version
|
64
|
-
version:
|
120
|
+
version: 1.3.1
|
65
121
|
requirements: []
|
66
122
|
rubyforge_project: greeb
|
67
|
-
rubygems_version: 1.
|
123
|
+
rubygems_version: 1.8.24
|
68
124
|
signing_key:
|
69
125
|
specification_version: 3
|
70
|
-
summary: Greeb is a
|
126
|
+
summary: Greeb is a simple regexp-based tokenizer.
|
71
127
|
test_files:
|
72
|
-
- spec/
|
128
|
+
- spec/segmentator_spec.rb
|
73
129
|
- spec/spec_helper.rb
|
130
|
+
- spec/tokenizer_spec.rb
|
131
|
+
has_rdoc:
|
data/Gemfile.lock
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
greeb (0.0.2)
|
5
|
-
rspec (~> 2.4.0)
|
6
|
-
|
7
|
-
GEM
|
8
|
-
remote: http://rubygems.org/
|
9
|
-
specs:
|
10
|
-
diff-lcs (1.1.2)
|
11
|
-
rspec (2.4.0)
|
12
|
-
rspec-core (~> 2.4.0)
|
13
|
-
rspec-expectations (~> 2.4.0)
|
14
|
-
rspec-mocks (~> 2.4.0)
|
15
|
-
rspec-core (2.4.0)
|
16
|
-
rspec-expectations (2.4.0)
|
17
|
-
diff-lcs (~> 1.1.2)
|
18
|
-
rspec-mocks (2.4.0)
|
19
|
-
|
20
|
-
PLATFORMS
|
21
|
-
ruby
|
22
|
-
|
23
|
-
DEPENDENCIES
|
24
|
-
greeb!
|
data/README
DELETED
File without changes
|
data/greeb-test.rb
DELETED
@@ -1,141 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# encoding: utf-8
|
3
|
-
|
4
|
-
require 'rubygems'
|
5
|
-
require 'graphviz'
|
6
|
-
|
7
|
-
$:.unshift('./lib')
|
8
|
-
require 'greeb'
|
9
|
-
|
10
|
-
origin = <<-END
|
11
|
-
- Сынок, чего это от тебя зигами пахнет,
|
12
|
-
опять на Манежную площадь ходил?
|
13
|
-
|
14
|
-
- Нет мама, я в метро ехал, там назиговано было!!
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
Четырнадцать, восемьдесять восемь: 14/88.
|
19
|
-
Вот так блять
|
20
|
-
END
|
21
|
-
origin.chomp!
|
22
|
-
|
23
|
-
def identify(token)
|
24
|
-
case token
|
25
|
-
when Greeb::RU_LEX then 'RU_LEX'
|
26
|
-
when Greeb::EN_LEX then 'EN_LEX'
|
27
|
-
when Greeb::EOL then 'EOL'
|
28
|
-
when Greeb::SEP then 'SEP'
|
29
|
-
when Greeb::PUN then 'PUN'
|
30
|
-
when Greeb::SPUN then 'SPUN'
|
31
|
-
when Greeb::DIG then 'DIG'
|
32
|
-
when Greeb::DIL then 'DIL'
|
33
|
-
else
|
34
|
-
'?!'
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
greeb = Greeb::Parser.new(origin)
|
39
|
-
text = greeb.tree
|
40
|
-
|
41
|
-
g = GraphViz.new('graphematics', 'type' => 'graph')
|
42
|
-
|
43
|
-
g.node[:color] = '#ddaa66'
|
44
|
-
g.node[:style] = 'filled'
|
45
|
-
g.node[:shape] = 'box'
|
46
|
-
g.node[:penwidth] = '1'
|
47
|
-
g.node[:fontname] = 'PT Sans'
|
48
|
-
g.node[:fontsize] = '8'
|
49
|
-
g.node[:fillcolor]= '#ffeecc'
|
50
|
-
g.node[:fontcolor]= '#775500'
|
51
|
-
g.node[:margin] = '0.0'
|
52
|
-
|
53
|
-
g.edge[:color] = '#999999'
|
54
|
-
g.edge[:weight] = '1'
|
55
|
-
g.edge[:fontname] = 'PT Sans'
|
56
|
-
g.edge[:fontcolor]= '#444444'
|
57
|
-
g.edge[:fontsize] = '6'
|
58
|
-
g.edge[:dir] = 'forward'
|
59
|
-
g.edge[:arrowsize]= '0.5'
|
60
|
-
|
61
|
-
bid = 'begin'
|
62
|
-
g.add_node(bid).tap do |node|
|
63
|
-
node.label = "Начало\nтекста"
|
64
|
-
node.shape = 'ellipse'
|
65
|
-
node.style = ''
|
66
|
-
end
|
67
|
-
|
68
|
-
eid = 'end'
|
69
|
-
g.add_node(eid).tap do |node|
|
70
|
-
node.label = "Конец\nтекста"
|
71
|
-
node.shape = 'ellipse'
|
72
|
-
node.style = ''
|
73
|
-
end
|
74
|
-
|
75
|
-
tree = text.map_with_index do |paragraph, i|
|
76
|
-
pid = "p#{i}"
|
77
|
-
sentences = paragraph.map_with_index do |sentence, j|
|
78
|
-
sid = "#{pid}s#{j}"
|
79
|
-
subsentences = sentence.map_with_index do |subsentence, k|
|
80
|
-
ssid = "#{sid}ss#{k}"
|
81
|
-
tokens = subsentence.map_with_index do |token, l|
|
82
|
-
next if ' ' == token
|
83
|
-
[ "#{ssid}t#{l}", token, l ]
|
84
|
-
end
|
85
|
-
tokens.delete(nil)
|
86
|
-
[ ssid, tokens, k ]
|
87
|
-
end
|
88
|
-
[ sid, subsentences, j ]
|
89
|
-
end
|
90
|
-
[ pid, sentences, i ]
|
91
|
-
end
|
92
|
-
|
93
|
-
tree.each do |pid, paragraph, i|
|
94
|
-
g.add_node(pid).tap do |node|
|
95
|
-
node.label = "Абзац\n№#{i + 1}"
|
96
|
-
node.shape = 'ellipse'
|
97
|
-
end
|
98
|
-
g.add_edge(bid, pid)
|
99
|
-
|
100
|
-
paragraph.each do |sid, sentence, j|
|
101
|
-
g.add_node(sid).tap do |node|
|
102
|
-
node.label = "Предложение\n№#{j + 1}"
|
103
|
-
node.shape = 'ellipse'
|
104
|
-
end
|
105
|
-
g.add_edge(pid, sid)
|
106
|
-
|
107
|
-
sentence.each do |ssid, subsentence, k|
|
108
|
-
g.add_node(ssid).tap do |node|
|
109
|
-
node.label = "Подпредложение\n№#{k + 1}"
|
110
|
-
node.shape = 'ellipse'
|
111
|
-
end
|
112
|
-
g.add_edge(sid, ssid)
|
113
|
-
|
114
|
-
subsentence.each do |tid, token, l|
|
115
|
-
g.add_node(tid).label = token
|
116
|
-
g.add_edge(ssid, tid).label = identify(token)
|
117
|
-
g.add_edge(tid, eid)
|
118
|
-
end
|
119
|
-
|
120
|
-
subsentence.each_cons(2) do |(tid1, token1, l1),
|
121
|
-
(tid2, token2, l2)|
|
122
|
-
g.add_edge(tid1, tid2).tap do |edge|
|
123
|
-
edge.weight = 0.25
|
124
|
-
edge.style = 'dashed'
|
125
|
-
end
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
sentence.each_cons(2) do |(ssid1, subsentence1, k1),
|
130
|
-
(ssid2, subsentence2, k2)|
|
131
|
-
tid1, token1, l1 = subsentence1.last
|
132
|
-
tid2, token2, l2 = subsentence2.first
|
133
|
-
g.add_edge(tid1, tid2).tap do |edge|
|
134
|
-
edge.weight = 0.5
|
135
|
-
edge.style = 'dashed'
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
g.output(:output => 'png', :file => 'graph.png')
|
data/lib/enumerable.rb
DELETED
data/lib/greeb/parser.rb
DELETED
@@ -1,176 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'meta_array'
|
4
|
-
require 'enumerable'
|
5
|
-
|
6
|
-
# Graphematical Parser of the Greeb.
|
7
|
-
# Use it with love.
|
8
|
-
#
|
9
|
-
class Greeb::Parser
|
10
|
-
# Russian lexeme (i.e.: "хуй").
|
11
|
-
#
|
12
|
-
RUSSIAN_LEXEME = /^[А-Яа-яЁё]+$/u
|
13
|
-
|
14
|
-
# English lexeme (i.e.: "foo").
|
15
|
-
#
|
16
|
-
ENGLISH_LEXEME = /^[A-Za-z]+$/u
|
17
|
-
|
18
|
-
# End of Line sequence (i.e.: "\n").
|
19
|
-
#
|
20
|
-
END_OF_LINE = /^\n+$/u
|
21
|
-
|
22
|
-
# In-subsentence separator (i.e.: "*" or "\").
|
23
|
-
#
|
24
|
-
SEPARATOR = /^[*=_\/\\ ]$/u
|
25
|
-
|
26
|
-
# Punctuation character (i.e.: "." or "!").
|
27
|
-
#
|
28
|
-
PUNCTUATION = /^(\.|\!|\?)$/u
|
29
|
-
|
30
|
-
# In-sentence punctuation character (i.e.: "," or "-").
|
31
|
-
#
|
32
|
-
SENTENCE_PUNCTUATION = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
|
33
|
-
|
34
|
-
# Digit (i.e.: "1337").
|
35
|
-
#
|
36
|
-
DIGIT = /^[0-9]+$/u
|
37
|
-
|
38
|
-
# Digit-Letter complex (i.e.: "0xDEADBEEF").
|
39
|
-
#
|
40
|
-
DIGIT_LETTER = /^[А-Яа-яA-Za-z0-9Ёё]+$/u
|
41
|
-
|
42
|
-
# Empty string (i.e.: "").
|
43
|
-
#
|
44
|
-
EMPTY = ''
|
45
|
-
|
46
|
-
attr_accessor :text
|
47
|
-
private :text=
|
48
|
-
|
49
|
-
# Create a new instance of Greeb::Parser.
|
50
|
-
#
|
51
|
-
# ==== Parameters
|
52
|
-
# text<String>:: Source text.
|
53
|
-
#
|
54
|
-
def initialize(text)
|
55
|
-
self.text = text
|
56
|
-
end
|
57
|
-
|
58
|
-
# Perform the text parsing.
|
59
|
-
#
|
60
|
-
# ==== Returns
|
61
|
-
# Array:: Tree of Graphematical Analysis of text.
|
62
|
-
#
|
63
|
-
def parse
|
64
|
-
return @tree if @tree
|
65
|
-
|
66
|
-
# parse tree
|
67
|
-
tree = MetaArray.new
|
68
|
-
|
69
|
-
# paragraph, sentence, subsentence
|
70
|
-
p_id, s_id, ss_id = 0, 0, 0
|
71
|
-
|
72
|
-
# current token
|
73
|
-
token = ''
|
74
|
-
|
75
|
-
# run FSM
|
76
|
-
text.each_char do |c|
|
77
|
-
case c
|
78
|
-
when END_OF_LINE then begin
|
79
|
-
case token
|
80
|
-
when EMPTY then token << c
|
81
|
-
when END_OF_LINE then begin
|
82
|
-
token = ''
|
83
|
-
p_id += 1
|
84
|
-
s_id = 0
|
85
|
-
ss_id = 0
|
86
|
-
end
|
87
|
-
else
|
88
|
-
tree[p_id][s_id][ss_id] << token
|
89
|
-
token = c
|
90
|
-
end
|
91
|
-
end
|
92
|
-
when SEPARATOR then begin
|
93
|
-
case token
|
94
|
-
when EMPTY
|
95
|
-
else
|
96
|
-
tree[p_id][s_id][ss_id] << token
|
97
|
-
while tree[p_id][s_id][ss_id].last == c
|
98
|
-
tree[p_id][s_id][ss_id].pop
|
99
|
-
end
|
100
|
-
tree[p_id][s_id][ss_id] << c
|
101
|
-
token = ''
|
102
|
-
end
|
103
|
-
end
|
104
|
-
when PUNCTUATION then begin
|
105
|
-
case token
|
106
|
-
when EMPTY
|
107
|
-
else
|
108
|
-
tree[p_id][s_id][ss_id] << token
|
109
|
-
tree[p_id][s_id][ss_id] << c
|
110
|
-
token = ''
|
111
|
-
s_id += 1
|
112
|
-
ss_id = 0
|
113
|
-
end
|
114
|
-
end
|
115
|
-
when SENTENCE_PUNCTUATION then begin
|
116
|
-
case token
|
117
|
-
when EMPTY
|
118
|
-
else
|
119
|
-
tree[p_id][s_id][ss_id] << token
|
120
|
-
tree[p_id][s_id][ss_id] << c
|
121
|
-
token = ''
|
122
|
-
ss_id += 1
|
123
|
-
end
|
124
|
-
end
|
125
|
-
when RUSSIAN_LEXEME then begin
|
126
|
-
case token
|
127
|
-
when END_OF_LINE then begin
|
128
|
-
tree[p_id][s_id][ss_id] << ' '
|
129
|
-
token = c
|
130
|
-
end
|
131
|
-
else
|
132
|
-
token << c
|
133
|
-
end
|
134
|
-
end
|
135
|
-
when ENGLISH_LEXEME then begin
|
136
|
-
case token
|
137
|
-
when END_OF_LINE then begin
|
138
|
-
tree[p_id][s_id][ss_id] << ' '
|
139
|
-
token = c
|
140
|
-
end
|
141
|
-
else
|
142
|
-
token << c
|
143
|
-
end
|
144
|
-
end
|
145
|
-
when DIGIT then begin
|
146
|
-
case token
|
147
|
-
when END_OF_LINE then begin
|
148
|
-
tree[p_id][s_id][ss_id] << ' '
|
149
|
-
token = c
|
150
|
-
end
|
151
|
-
else
|
152
|
-
token << c
|
153
|
-
end
|
154
|
-
end
|
155
|
-
when DIGIT_LETTER then begin
|
156
|
-
case token
|
157
|
-
when END_OF_LINE then begin
|
158
|
-
tree[p_id][s_id][ss_id] << token
|
159
|
-
token = c
|
160
|
-
end
|
161
|
-
else
|
162
|
-
token << c
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end
|
166
|
-
end
|
167
|
-
|
168
|
-
unless token.empty?
|
169
|
-
tree[p_id][s_id][ss_id] << token
|
170
|
-
end
|
171
|
-
|
172
|
-
tree.delete(nil)
|
173
|
-
|
174
|
-
@tree = tree.to_a
|
175
|
-
end
|
176
|
-
end
|
data/lib/meta_array.rb
DELETED
data/spec/parser_spec.rb
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require File.expand_path('../spec_helper.rb', __FILE__)
|
4
|
-
|
5
|
-
describe Greeb::Parser do
|
6
|
-
it 'should parse very simple strings' do
|
7
|
-
'буба сука дебил'.should be_parsed_as([
|
8
|
-
[
|
9
|
-
[ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
|
10
|
-
]
|
11
|
-
])
|
12
|
-
end
|
13
|
-
|
14
|
-
it 'should parse one sentence with subsentences' do
|
15
|
-
'буба, сука, дебил'.should be_parsed_as([
|
16
|
-
[
|
17
|
-
[
|
18
|
-
[ 'буба', ',' ],
|
19
|
-
[ 'сука', ',' ],
|
20
|
-
[ 'дебил' ]
|
21
|
-
]
|
22
|
-
]
|
23
|
-
])
|
24
|
-
end
|
25
|
-
|
26
|
-
it 'should parse two simple paragraphs' do
|
27
|
-
"буба сука дебил\n\nточно!".should be_parsed_as([
|
28
|
-
[
|
29
|
-
[ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
|
30
|
-
],
|
31
|
-
[
|
32
|
-
[ [ 'точно', '!' ] ]
|
33
|
-
]
|
34
|
-
])
|
35
|
-
end
|
36
|
-
|
37
|
-
it 'should parse two sentences in paragraph' do
|
38
|
-
"буба молодец? буба умница.".should be_parsed_as([
|
39
|
-
[
|
40
|
-
[ [ 'буба', ' ', 'молодец', '?' ] ],
|
41
|
-
[ [ 'буба', ' ', 'умница', '.' ] ]
|
42
|
-
]
|
43
|
-
])
|
44
|
-
end
|
45
|
-
|
46
|
-
it 'should parse sentences with floating point values' do
|
47
|
-
'буба не считает Пи равной 3.14'.should be_parsed_as([
|
48
|
-
[
|
49
|
-
[ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
|
50
|
-
'Пи', ' ', 'равной', ' ', '3.14' ] ]
|
51
|
-
]
|
52
|
-
])
|
53
|
-
end
|
54
|
-
|
55
|
-
it 'should parse sentences with floating "dot" values' do
|
56
|
-
'буба не считает Пи равной 3,14'.should be_parsed_as([
|
57
|
-
[
|
58
|
-
[ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
|
59
|
-
'Пи', ' ', 'равной', ' ', '3,14' ] ]
|
60
|
-
]
|
61
|
-
])
|
62
|
-
end
|
63
|
-
end
|