greeb 0.0.2 → 0.1.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/.travis.yml +7 -0
- data/.yardopts +6 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +140 -0
- data/Rakefile +8 -8
- data/greeb.gemspec +10 -8
- data/lib/greeb.rb +21 -7
- data/lib/greeb/segmentator.rb +95 -0
- data/lib/greeb/tokenizer.rb +112 -0
- data/lib/greeb/version.rb +9 -0
- data/spec/segmentator_spec.rb +112 -0
- data/spec/spec_helper.rb +13 -7
- data/spec/tokenizer_spec.rb +91 -0
- metadata +82 -24
- data/Gemfile.lock +0 -24
- data/README +0 -0
- data/greeb-test.rb +0 -141
- data/lib/enumerable.rb +0 -10
- data/lib/greeb/parser.rb +0 -176
- data/lib/meta_array.rb +0 -14
- data/spec/parser_spec.rb +0 -63
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/.yardopts
ADDED
data/Gemfile
CHANGED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010-2012 Dmitry A. Ustalov
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
Greeb
|
2
|
+
=====
|
3
|
+
|
4
|
+
Greeb is a simple yet awesome text tokenizer that is based on regular
|
5
|
+
expressions.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'greeb'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install greeb
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Greeb can help you to solve simple text processing problems:
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
pp Greeb::Tokenizer.new('Hello!').tokens
|
29
|
+
=begin
|
30
|
+
#<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
|
31
|
+
#<struct Greeb::Entity from=5, to=6, type=:punct>}>
|
32
|
+
=end
|
33
|
+
```
|
34
|
+
|
35
|
+
It should be noted that it is possible to process much more complex texts:
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
text =<<-EOF
|
39
|
+
Hello! I am 18! My favourite number is 133.7...
|
40
|
+
|
41
|
+
What about you?
|
42
|
+
EOF
|
43
|
+
|
44
|
+
pp Greeb::Tokenizer.new(text).tokens
|
45
|
+
=begin
|
46
|
+
#<SortedSet: {#<struct Greeb::Entity from=0, to=5, type=:letter>,
|
47
|
+
#<struct Greeb::Entity from=5, to=6, type=:punct>,
|
48
|
+
#<struct Greeb::Entity from=6, to=7, type=:separ>,
|
49
|
+
#<struct Greeb::Entity from=7, to=8, type=:letter>,
|
50
|
+
#<struct Greeb::Entity from=8, to=9, type=:separ>,
|
51
|
+
#<struct Greeb::Entity from=9, to=11, type=:letter>,
|
52
|
+
#<struct Greeb::Entity from=11, to=12, type=:separ>,
|
53
|
+
#<struct Greeb::Entity from=12, to=14, type=:integer>,
|
54
|
+
#<struct Greeb::Entity from=14, to=15, type=:punct>,
|
55
|
+
#<struct Greeb::Entity from=15, to=16, type=:separ>,
|
56
|
+
#<struct Greeb::Entity from=16, to=18, type=:letter>,
|
57
|
+
#<struct Greeb::Entity from=18, to=19, type=:separ>,
|
58
|
+
#<struct Greeb::Entity from=19, to=28, type=:letter>,
|
59
|
+
#<struct Greeb::Entity from=28, to=29, type=:separ>,
|
60
|
+
#<struct Greeb::Entity from=29, to=35, type=:letter>,
|
61
|
+
#<struct Greeb::Entity from=35, to=36, type=:separ>,
|
62
|
+
#<struct Greeb::Entity from=36, to=38, type=:letter>,
|
63
|
+
#<struct Greeb::Entity from=38, to=39, type=:separ>,
|
64
|
+
#<struct Greeb::Entity from=39, to=44, type=:float>,
|
65
|
+
#<struct Greeb::Entity from=44, to=47, type=:punct>,
|
66
|
+
#<struct Greeb::Entity from=47, to=49, type=:break>,
|
67
|
+
#<struct Greeb::Entity from=49, to=53, type=:letter>,
|
68
|
+
#<struct Greeb::Entity from=53, to=54, type=:separ>,
|
69
|
+
#<struct Greeb::Entity from=54, to=59, type=:letter>,
|
70
|
+
#<struct Greeb::Entity from=59, to=60, type=:separ>,
|
71
|
+
#<struct Greeb::Entity from=60, to=63, type=:letter>,
|
72
|
+
#<struct Greeb::Entity from=63, to=64, type=:punct>,
|
73
|
+
#<struct Greeb::Entity from=64, to=65, type=:break>}>
|
74
|
+
=end
|
75
|
+
```
|
76
|
+
|
77
|
+
Also it can be used to solve the text segmentation problems
|
78
|
+
such as sentence detection tasks:
|
79
|
+
|
80
|
+
```ruby
|
81
|
+
text = 'Hello! How are you?'
|
82
|
+
pp Greeb::Segmentator.new(Greeb::Tokenizer.new(text)).sentences
|
83
|
+
=begin
|
84
|
+
#<SortedSet: {#<struct Greeb::Entity from=0, to=6, type=:sentence>,
|
85
|
+
#<struct Greeb::Entity from=7, to=19, type=:sentence>}>
|
86
|
+
=end
|
87
|
+
```
|
88
|
+
|
89
|
+
It is possible to extract tokens that were processed by the text
|
90
|
+
segmentator:
|
91
|
+
|
92
|
+
```ruby
|
93
|
+
text = 'Hello! How are you?'
|
94
|
+
segmentator = Greeb::Segmentator.new(Greeb::Tokenizer.new(text))
|
95
|
+
sentences = segmentator.sentences
|
96
|
+
pp segmentator.extract(*sentences)
|
97
|
+
=begin
|
98
|
+
{#<struct Greeb::Entity from=0, to=6, type=:sentence>=>
|
99
|
+
[#<struct Greeb::Entity from=0, to=5, type=:letter>,
|
100
|
+
#<struct Greeb::Entity from=5, to=6, type=:punct>],
|
101
|
+
#<struct Greeb::Entity from=7, to=19, type=:sentence>=>
|
102
|
+
[#<struct Greeb::Entity from=7, to=10, type=:letter>,
|
103
|
+
#<struct Greeb::Entity from=10, to=11, type=:separ>,
|
104
|
+
#<struct Greeb::Entity from=11, to=14, type=:letter>,
|
105
|
+
#<struct Greeb::Entity from=14, to=15, type=:separ>,
|
106
|
+
#<struct Greeb::Entity from=15, to=18, type=:letter>,
|
107
|
+
#<struct Greeb::Entity from=18, to=19, type=:punct>]}
|
108
|
+
=end
|
109
|
+
```
|
110
|
+
|
111
|
+
## Tokens
|
112
|
+
|
113
|
+
Greeb operates with entities, tuples of `<from, to, type>`, where
|
114
|
+
`from` is a beginning of the entity, `to` is an ending of the entity,
|
115
|
+
and `type` is a type of the entity.
|
116
|
+
|
117
|
+
There are several entity types: `:letter`, `:float`, `:integer`,
|
118
|
+
`:separ`, `:punct` (for punctuation), `:spunct` (for in-sentence
|
119
|
+
punctuation), and `:break`.
|
120
|
+
|
121
|
+
## Contributing
|
122
|
+
|
123
|
+
1. Fork it;
|
124
|
+
2. Create your feature branch (`git checkout -b my-new-feature`);
|
125
|
+
3. Commit your changes (`git commit -am 'Added some feature'`);
|
126
|
+
4. Push to the branch (`git push origin my-new-feature`);
|
127
|
+
5. Create new Pull Request.
|
128
|
+
|
129
|
+
I highly recommend using git flow to make the development process much more
|
130
|
+
systematic and awesome.
|
131
|
+
|
132
|
+
## Build Status [<img src="https://secure.travis-ci.org/eveel/greeb.png"/>](http://travis-ci.org/eveel/greeb)
|
133
|
+
|
134
|
+
## Dependency Status [<img src="https://gemnasium.com/eveel/greeb.png?travis"/>](https://gemnasium.com/eveel/greeb)
|
135
|
+
|
136
|
+
## Copyright
|
137
|
+
|
138
|
+
Copyright (c) 2010-2012 [Dmitry A. Ustalov]. See LICENSE for details.
|
139
|
+
|
140
|
+
[Dmitry A. Ustalov]: http://eveel.ru
|
data/Rakefile
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
+
#!/usr/bin/env rake
|
1
2
|
# encoding: utf-8
|
2
3
|
|
3
|
-
require 'bundler'
|
4
|
-
Bundler::GemHelper.install_tasks
|
4
|
+
require 'bundler/gem_tasks'
|
5
5
|
|
6
|
-
|
7
|
-
desc 'Run all examples'
|
8
|
-
RSpec::Core::RakeTask.new(:spec) do |t|
|
9
|
-
t.rspec_opts = %w[--color]
|
10
|
-
end
|
6
|
+
task :default => :test
|
11
7
|
|
12
|
-
|
8
|
+
require 'rake/testtask'
|
9
|
+
Rake::TestTask.new do |test|
|
10
|
+
test.pattern = 'spec/**/*_spec.rb'
|
11
|
+
test.verbose = true
|
12
|
+
end
|
data/greeb.gemspec
CHANGED
@@ -1,25 +1,27 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
require 'greeb'
|
3
|
+
require File.expand_path('../lib/greeb/version', __FILE__)
|
5
4
|
|
6
5
|
Gem::Specification.new do |s|
|
7
6
|
s.name = 'greeb'
|
8
7
|
s.version = Greeb::VERSION
|
9
8
|
s.platform = Gem::Platform::RUBY
|
10
|
-
s.authors = [
|
11
|
-
s.email = [
|
9
|
+
s.authors = ['Dmitry A. Ustalov']
|
10
|
+
s.email = ['dmitry@eveel.ru']
|
12
11
|
s.homepage = 'https://github.com/eveel/greeb'
|
13
|
-
s.summary = 'Greeb is a
|
14
|
-
s.description = 'Greeb is awesome
|
12
|
+
s.summary = 'Greeb is a simple regexp-based tokenizer.'
|
13
|
+
s.description = 'Greeb is a simple yet awesome regexp-based tokenizer, ' \
|
15
14
|
'written in Ruby.'
|
16
15
|
|
17
16
|
s.rubyforge_project = 'greeb'
|
18
17
|
|
19
|
-
s.
|
18
|
+
s.add_development_dependency 'rake'
|
19
|
+
s.add_development_dependency 'minitest', '>= 2.11'
|
20
|
+
s.add_development_dependency 'simplecov'
|
21
|
+
s.add_development_dependency 'yard'
|
20
22
|
|
21
23
|
s.files = `git ls-files`.split("\n")
|
22
24
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
23
25
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
24
|
-
s.require_paths = [
|
26
|
+
s.require_paths = ['lib']
|
25
27
|
end
|
data/lib/greeb.rb
CHANGED
@@ -1,11 +1,25 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
#
|
5
|
-
module Greeb
|
6
|
-
# Version of the Greeb.
|
7
|
-
#
|
8
|
-
VERSION = "0.0.2"
|
3
|
+
require 'greeb/version'
|
9
4
|
|
10
|
-
|
5
|
+
# Greeb operates with entities, tuples of `<from, to, type>`, where
|
6
|
+
# `from` is a beginning of the entity, `to` is an ending of the entity,
|
7
|
+
# and `type` is a type of the entity.
|
8
|
+
#
|
9
|
+
# There are several entity types: `:letter`, `:float`, `:integer`,
|
10
|
+
# `:separ` for separators, `:punct` for punctuation characters,
|
11
|
+
# `:spunct` for in-sentence punctuation characters, and
|
12
|
+
# `:break` for line endings.
|
13
|
+
#
|
14
|
+
class Greeb::Entity < Struct.new(:from, :to, :type)
|
15
|
+
def <=> other
|
16
|
+
if (comparison = self.from <=> other.from) == 0
|
17
|
+
self.to <=> other.to
|
18
|
+
else
|
19
|
+
comparison
|
20
|
+
end
|
21
|
+
end
|
11
22
|
end
|
23
|
+
|
24
|
+
require 'greeb/tokenizer'
|
25
|
+
require 'greeb/segmentator'
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# It is possible to perform simple sentence detection that is based
|
4
|
+
# on Greeb's tokenization.
|
5
|
+
#
|
6
|
+
class Greeb::Segmentator
|
7
|
+
# Sentence does not start from the separator character, line break
|
8
|
+
# character, and punctuation characters.
|
9
|
+
#
|
10
|
+
SENTENCE_DOESNT_START = [:separ, :break, :punct, :spunct]
|
11
|
+
|
12
|
+
attr_reader :tokens
|
13
|
+
|
14
|
+
# Create a new instance of {Greeb::Segmentator}.
|
15
|
+
#
|
16
|
+
# @param tokenizer_or_tokens [Greeb::Tokenizer,Set] an instance of
|
17
|
+
# Greeb::Tokenizer or set of its results.
|
18
|
+
#
|
19
|
+
def initialize tokenizer_or_tokens
|
20
|
+
@tokens = if tokenizer_or_tokens.is_a? Greeb::Tokenizer
|
21
|
+
tokenizer_or_tokens.tokens
|
22
|
+
else
|
23
|
+
tokenizer_or_tokens
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Sentences memoization method.
|
28
|
+
#
|
29
|
+
# @return [Set<Greeb::Entity>] a set of sentences.
|
30
|
+
#
|
31
|
+
def sentences
|
32
|
+
detect_sentences! unless @sentences
|
33
|
+
@sentences
|
34
|
+
end
|
35
|
+
|
36
|
+
# Extract tokens from the set of sentences.
|
37
|
+
#
|
38
|
+
# @param sentences [Array<Greeb::Entity>] a list of sentences.
|
39
|
+
#
|
40
|
+
# @return [Hash<Greeb::Entity, Array<Greeb::Entity>>] a hash with
|
41
|
+
# sentences as keys and tokens arrays as values.
|
42
|
+
#
|
43
|
+
def extract *sentences
|
44
|
+
Hash[
|
45
|
+
sentences.map do |s|
|
46
|
+
[s, tokens.select { |t| t.from >= s.from and t.to <= s.to }]
|
47
|
+
end
|
48
|
+
]
|
49
|
+
end
|
50
|
+
|
51
|
+
protected
|
52
|
+
# Implementation of the sentence detection method. This method
|
53
|
+
# changes the `@sentences` ivar.
|
54
|
+
#
|
55
|
+
# @return [nil] nothing.
|
56
|
+
#
|
57
|
+
def detect_sentences!
|
58
|
+
@sentences = SortedSet.new
|
59
|
+
|
60
|
+
rest = tokens.inject(new_sentence) do |sentence, token|
|
61
|
+
if !sentence.from and SENTENCE_DOESNT_START.include?(token.type)
|
62
|
+
next sentence
|
63
|
+
end
|
64
|
+
|
65
|
+
sentence.from = token.from unless sentence.from
|
66
|
+
|
67
|
+
next sentence if sentence.to and sentence.to > token.to
|
68
|
+
|
69
|
+
if :punct == token.type
|
70
|
+
sentence.to = tokens.
|
71
|
+
select { |t| t.from >= token.from }.
|
72
|
+
inject(token) { |r, t| break r if t.type != token.type; t }.
|
73
|
+
to
|
74
|
+
|
75
|
+
@sentences << sentence
|
76
|
+
sentence = new_sentence
|
77
|
+
elsif :separ != token.type
|
78
|
+
sentence.to = token.to
|
79
|
+
end
|
80
|
+
|
81
|
+
sentence
|
82
|
+
end
|
83
|
+
|
84
|
+
nil.tap { @sentences << rest if rest.from and rest.to }
|
85
|
+
end
|
86
|
+
|
87
|
+
private
|
88
|
+
# Create a new instance of {Greeb::Entity} with `:sentence` type.
|
89
|
+
#
|
90
|
+
# @return [Greeb::Entity] a new entity instance.
|
91
|
+
#
|
92
|
+
def new_sentence
|
93
|
+
Greeb::Entity.new(nil, nil, :sentence)
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'strscan'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
# Greeb's tokenization facilities. Use 'em with love.
|
7
|
+
#
|
8
|
+
class Greeb::Tokenizer
|
9
|
+
# English and Russian letters.
|
10
|
+
#
|
11
|
+
LETTERS = /[A-Za-zА-Яа-яЁё]+/u
|
12
|
+
|
13
|
+
# Floating point values.
|
14
|
+
#
|
15
|
+
FLOATS = /(\d+)[.,](\d+)/u
|
16
|
+
|
17
|
+
# Integer values.
|
18
|
+
#
|
19
|
+
INTEGERS = /\d+/u
|
20
|
+
|
21
|
+
# In-subsentence separator (i.e.: "*" or "=").
|
22
|
+
#
|
23
|
+
SEPARATORS = /[*=_\/\\ ]+/u
|
24
|
+
|
25
|
+
# Punctuation character (i.e.: "." or "!").
|
26
|
+
#
|
27
|
+
PUNCTUATIONS = /(\.|\!|\?)+/u
|
28
|
+
|
29
|
+
# In-sentence punctuation character (i.e.: "," or "-").
|
30
|
+
#
|
31
|
+
SENTENCE_PUNCTUATIONS = /(\,|\[|\]|\(|\)|\-|:|;)+/u
|
32
|
+
|
33
|
+
# Line breaks.
|
34
|
+
#
|
35
|
+
BREAKS = /\n+/u
|
36
|
+
|
37
|
+
attr_reader :text, :scanner
|
38
|
+
protected :scanner
|
39
|
+
|
40
|
+
# Create a new instance of {Greeb::Tokenizer}.
|
41
|
+
#
|
42
|
+
# @param text [String] text to be tokenized.
|
43
|
+
#
|
44
|
+
def initialize(text)
|
45
|
+
@text = text
|
46
|
+
end
|
47
|
+
|
48
|
+
# Tokens memoization method.
|
49
|
+
#
|
50
|
+
# @return [Set<Greeb::Entity>] a set of tokens.
|
51
|
+
#
|
52
|
+
def tokens
|
53
|
+
tokenize! unless @tokens
|
54
|
+
@tokens
|
55
|
+
end
|
56
|
+
|
57
|
+
protected
|
58
|
+
# Perform the tokenization process. This method modifies
|
59
|
+
# `@scanner` and `@tokens` instance variables.
|
60
|
+
#
|
61
|
+
# @return [nil] nothing unless exception is raised.
|
62
|
+
#
|
63
|
+
def tokenize!
|
64
|
+
@scanner = StringScanner.new(text)
|
65
|
+
@tokens = SortedSet.new
|
66
|
+
while !scanner.eos?
|
67
|
+
parse! LETTERS, :letter or
|
68
|
+
parse! FLOATS, :float or
|
69
|
+
parse! INTEGERS, :integer or
|
70
|
+
split_parse! SENTENCE_PUNCTUATIONS, :spunct or
|
71
|
+
split_parse! PUNCTUATIONS, :punct or
|
72
|
+
split_parse! SEPARATORS, :separ or
|
73
|
+
split_parse! BREAKS, :break or
|
74
|
+
raise @tokens.inspect
|
75
|
+
end
|
76
|
+
ensure
|
77
|
+
scanner.terminate
|
78
|
+
end
|
79
|
+
|
80
|
+
# Try to parse one small piece of text that is covered by pattern
|
81
|
+
# of necessary type.
|
82
|
+
#
|
83
|
+
# @param pattern [Regexp] a regular expression to extract the token.
|
84
|
+
# @param type [Symbol] a symbol that represents the necessary token
|
85
|
+
# type.
|
86
|
+
#
|
87
|
+
# @return [Set<Greeb::Entity>, false] the modified set of extracted tokens, or false when the pattern does not match.
|
88
|
+
#
|
89
|
+
def parse! pattern, type
|
90
|
+
return false unless token = scanner.scan(pattern)
|
91
|
+
@tokens << Greeb::Entity.new(scanner.pos - token.length, scanner.pos, type)
|
92
|
+
end
|
93
|
+
|
94
|
+
# Try to parse one small piece of text that is covered by pattern
|
95
|
+
# of necessary type. This method performs grouping of the same
|
96
|
+
# characters.
|
97
|
+
#
|
98
|
+
# @param pattern [Regexp] a regular expression to extract the token.
|
99
|
+
# @param type [Symbol] a symbol that represents the necessary token
|
100
|
+
# type.
|
101
|
+
#
|
102
|
+
# @return [Integer, false] the position right after the last extracted token, or false when the pattern does not match.
|
103
|
+
#
|
104
|
+
def split_parse! pattern, type
|
105
|
+
return false unless token = scanner.scan(pattern)
|
106
|
+
position = scanner.pos - token.length
|
107
|
+
token.scan(/((.|\n)\2*)/).map(&:first).inject(position) do |before, s|
|
108
|
+
@tokens << Greeb::Entity.new(before, before + s.length, type)
|
109
|
+
before + s.length
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.expand_path('../spec_helper', __FILE__)
|
4
|
+
|
5
|
+
module Greeb
|
6
|
+
describe Segmentator do
|
7
|
+
describe 'initialization' do
|
8
|
+
before { @tokenizer = Tokenizer.new('Vodka') }
|
9
|
+
|
10
|
+
subject { Segmentator.new(@tokenizer) }
|
11
|
+
|
12
|
+
it 'can be initialized either with Tokenizer' do
|
13
|
+
subject.tokens.must_be_kind_of SortedSet
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'can be initialized either with a set of tokens' do
|
17
|
+
subject = Segmentator.new(@tokenizer.tokens)
|
18
|
+
subject.tokens.must_be_kind_of SortedSet
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should has @tokens ivar' do
|
22
|
+
subject.instance_variable_get(:@tokens).wont_be_nil
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
describe 'a simple sentence' do
|
27
|
+
before { @tokenizer = Tokenizer.new('Hello, I am JC Denton.') }
|
28
|
+
|
29
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
30
|
+
|
31
|
+
it 'should be segmented' do
|
32
|
+
subject.must_equal(
|
33
|
+
SortedSet.new([Entity.new(0, 22, :sentence)])
|
34
|
+
)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe 'a simple sentence without punctuation' do
|
39
|
+
before { @tokenizer = Tokenizer.new('Hello, I am JC Denton') }
|
40
|
+
|
41
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
42
|
+
|
43
|
+
it 'should be segmented' do
|
44
|
+
subject.must_equal(
|
45
|
+
SortedSet.new([Entity.new(0, 21, :sentence)])
|
46
|
+
)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe 'a simple sentence with trailing whitespaces' do
|
51
|
+
before { @tokenizer = Tokenizer.new(' Hello, I am JC Denton ') }
|
52
|
+
|
53
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
54
|
+
|
55
|
+
it 'should be segmented' do
|
56
|
+
subject.must_equal(
|
57
|
+
SortedSet.new([Entity.new(6, 27, :sentence)])
|
58
|
+
)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
describe 'two simple sentences' do
|
63
|
+
before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
|
64
|
+
|
65
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
66
|
+
|
67
|
+
it 'should be segmented' do
|
68
|
+
subject.must_equal(
|
69
|
+
SortedSet.new([Entity.new(0, 6, :sentence),
|
70
|
+
Entity.new(7, 22, :sentence)])
|
71
|
+
)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
describe 'one wrong character and one simple sentence' do
|
76
|
+
before { @tokenizer = Tokenizer.new('! I am JC Denton.') }
|
77
|
+
|
78
|
+
subject { Segmentator.new(@tokenizer).sentences }
|
79
|
+
|
80
|
+
it 'should be segmented' do
|
81
|
+
subject.must_equal(
|
82
|
+
SortedSet.new([Entity.new(2, 17, :sentence)])
|
83
|
+
)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe 'token extractor' do
|
88
|
+
before { @tokenizer = Tokenizer.new('Hello! I am JC Denton.') }
|
89
|
+
|
90
|
+
subject { Segmentator.new(@tokenizer) }
|
91
|
+
|
92
|
+
it 'should be extracted' do
|
93
|
+
subject.extract(*subject.sentences).must_equal({
|
94
|
+
Entity.new(0, 6, :sentence) => [
|
95
|
+
Entity.new(0, 5, :letter),
|
96
|
+
Entity.new(5, 6, :punct)
|
97
|
+
],
|
98
|
+
Entity.new(7, 22, :sentence) => [
|
99
|
+
Entity.new(7, 8, :letter),
|
100
|
+
Entity.new(8, 9, :separ),
|
101
|
+
Entity.new(9, 11, :letter),
|
102
|
+
Entity.new(11, 12, :separ),
|
103
|
+
Entity.new(12, 14, :letter),
|
104
|
+
Entity.new(14, 15, :separ),
|
105
|
+
Entity.new(15, 21, :letter),
|
106
|
+
Entity.new(21, 22, :punct)
|
107
|
+
]
|
108
|
+
})
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,14 +1,20 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
require
|
3
|
+
require 'rubygems'
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
$:.unshift File.expand_path('../../lib', __FILE__)
|
6
|
+
|
7
|
+
if RUBY_VERSION == '1.8'
|
8
|
+
gem 'minitest'
|
7
9
|
end
|
8
10
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
11
|
+
require 'minitest/autorun'
|
12
|
+
|
13
|
+
unless 'true' == ENV['TRAVIS']
|
14
|
+
require 'simplecov'
|
15
|
+
SimpleCov.start do
|
16
|
+
add_filter '/spec/'
|
13
17
|
end
|
14
18
|
end
|
19
|
+
|
20
|
+
require 'greeb'
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.expand_path('../spec_helper', __FILE__)
|
4
|
+
|
5
|
+
module Greeb
|
6
|
+
describe Tokenizer do
|
7
|
+
describe 'initialization' do
|
8
|
+
subject { Tokenizer.new('vodka') }
|
9
|
+
|
10
|
+
it 'should be initialized with a text' do
|
11
|
+
subject.text.must_equal 'vodka'
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should has the @text ivar' do
|
15
|
+
subject.instance_variable_get(:@text).must_equal 'vodka'
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should not has @tokens ivar' do
|
19
|
+
subject.instance_variable_get(:@tokens).must_be_nil
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe 'after tokenization' do
|
24
|
+
subject { Tokenizer.new('vodka').tap(&:tokens) }
|
25
|
+
|
26
|
+
it 'should has the @tokens ivar' do
|
27
|
+
subject.instance_variable_get(:@tokens).wont_be_nil
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should has the @scanner ivar' do
|
31
|
+
subject.instance_variable_get(:@scanner).wont_be_nil
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should has the tokens set' do
|
35
|
+
subject.tokens.must_be_kind_of SortedSet
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
describe 'tokenization facilities' do
|
40
|
+
it 'can handle words' do
|
41
|
+
Tokenizer.new('hello').tokens.must_equal(
|
42
|
+
SortedSet.new([Entity.new(0, 5, :letter)])
|
43
|
+
)
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'can handle floats' do
|
47
|
+
Tokenizer.new('14.88').tokens.must_equal(
|
48
|
+
SortedSet.new([Entity.new(0, 5, :float)])
|
49
|
+
)
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'can handle integers' do
|
53
|
+
Tokenizer.new('1337').tokens.must_equal(
|
54
|
+
SortedSet.new([Entity.new(0, 4, :integer)])
|
55
|
+
)
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'can handle words and integers' do
|
59
|
+
Tokenizer.new('Hello, I am 18').tokens.must_equal(
|
60
|
+
SortedSet.new([Entity.new(0, 5, :letter),
|
61
|
+
Entity.new(5, 6, :spunct),
|
62
|
+
Entity.new(6, 7, :separ),
|
63
|
+
Entity.new(7, 8, :letter),
|
64
|
+
Entity.new(8, 9, :separ),
|
65
|
+
Entity.new(9, 11, :letter),
|
66
|
+
Entity.new(11, 12, :separ),
|
67
|
+
Entity.new(12, 14, :integer)])
|
68
|
+
)
|
69
|
+
end
|
70
|
+
|
71
|
+
it 'can handle multi-line paragraphs' do
|
72
|
+
Tokenizer.new("Brateeshka..!\n\nPrines!").tokens.must_equal(
|
73
|
+
SortedSet.new([Entity.new(0, 10, :letter),
|
74
|
+
Entity.new(10, 12, :punct),
|
75
|
+
Entity.new(12, 13, :punct),
|
76
|
+
Entity.new(13, 15, :break),
|
77
|
+
Entity.new(15, 21, :letter),
|
78
|
+
Entity.new(21, 22, :punct)])
|
79
|
+
)
|
80
|
+
end
|
81
|
+
|
82
|
+
it 'can handle separated integers' do
|
83
|
+
Tokenizer.new('228/359').tokens.must_equal(
|
84
|
+
SortedSet.new([Entity.new(0, 3, :integer),
|
85
|
+
Entity.new(3, 4, :separ),
|
86
|
+
Entity.new(4, 7, :integer)])
|
87
|
+
)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
metadata
CHANGED
@@ -1,29 +1,81 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0.rc1
|
5
|
+
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dmitry A. Ustalov
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
13
|
-
default_executable:
|
12
|
+
date: 2012-07-08 00:00:00.000000000 Z
|
14
13
|
dependencies:
|
15
14
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
17
|
-
requirement:
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
18
17
|
none: false
|
19
18
|
requirements:
|
20
|
-
- -
|
19
|
+
- - ! '>='
|
21
20
|
- !ruby/object:Gem::Version
|
22
|
-
version:
|
23
|
-
type: :
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
24
23
|
prerelease: false
|
25
|
-
version_requirements:
|
26
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: minitest
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '2.11'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '2.11'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: simplecov
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: Greeb is a simple yet awesome regexp-based tokenizer, written in Ruby.
|
27
79
|
email:
|
28
80
|
- dmitry@eveel.ru
|
29
81
|
executables: []
|
@@ -31,19 +83,20 @@ extensions: []
|
|
31
83
|
extra_rdoc_files: []
|
32
84
|
files:
|
33
85
|
- .gitignore
|
86
|
+
- .travis.yml
|
87
|
+
- .yardopts
|
34
88
|
- Gemfile
|
35
|
-
-
|
36
|
-
- README
|
89
|
+
- LICENSE
|
90
|
+
- README.md
|
37
91
|
- Rakefile
|
38
|
-
- greeb-test.rb
|
39
92
|
- greeb.gemspec
|
40
|
-
- lib/enumerable.rb
|
41
93
|
- lib/greeb.rb
|
42
|
-
- lib/greeb/
|
43
|
-
- lib/
|
44
|
-
-
|
94
|
+
- lib/greeb/segmentator.rb
|
95
|
+
- lib/greeb/tokenizer.rb
|
96
|
+
- lib/greeb/version.rb
|
97
|
+
- spec/segmentator_spec.rb
|
45
98
|
- spec/spec_helper.rb
|
46
|
-
|
99
|
+
- spec/tokenizer_spec.rb
|
47
100
|
homepage: https://github.com/eveel/greeb
|
48
101
|
licenses: []
|
49
102
|
post_install_message:
|
@@ -56,18 +109,23 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
56
109
|
- - ! '>='
|
57
110
|
- !ruby/object:Gem::Version
|
58
111
|
version: '0'
|
112
|
+
segments:
|
113
|
+
- 0
|
114
|
+
hash: -4603914053803130942
|
59
115
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
116
|
none: false
|
61
117
|
requirements:
|
62
|
-
- - ! '
|
118
|
+
- - ! '>'
|
63
119
|
- !ruby/object:Gem::Version
|
64
|
-
version:
|
120
|
+
version: 1.3.1
|
65
121
|
requirements: []
|
66
122
|
rubyforge_project: greeb
|
67
|
-
rubygems_version: 1.
|
123
|
+
rubygems_version: 1.8.24
|
68
124
|
signing_key:
|
69
125
|
specification_version: 3
|
70
|
-
summary: Greeb is a
|
126
|
+
summary: Greeb is a simple regexp-based tokenizer.
|
71
127
|
test_files:
|
72
|
-
- spec/
|
128
|
+
- spec/segmentator_spec.rb
|
73
129
|
- spec/spec_helper.rb
|
130
|
+
- spec/tokenizer_spec.rb
|
131
|
+
has_rdoc:
|
data/Gemfile.lock
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
greeb (0.0.2)
|
5
|
-
rspec (~> 2.4.0)
|
6
|
-
|
7
|
-
GEM
|
8
|
-
remote: http://rubygems.org/
|
9
|
-
specs:
|
10
|
-
diff-lcs (1.1.2)
|
11
|
-
rspec (2.4.0)
|
12
|
-
rspec-core (~> 2.4.0)
|
13
|
-
rspec-expectations (~> 2.4.0)
|
14
|
-
rspec-mocks (~> 2.4.0)
|
15
|
-
rspec-core (2.4.0)
|
16
|
-
rspec-expectations (2.4.0)
|
17
|
-
diff-lcs (~> 1.1.2)
|
18
|
-
rspec-mocks (2.4.0)
|
19
|
-
|
20
|
-
PLATFORMS
|
21
|
-
ruby
|
22
|
-
|
23
|
-
DEPENDENCIES
|
24
|
-
greeb!
|
data/README
DELETED
File without changes
|
data/greeb-test.rb
DELETED
@@ -1,141 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# encoding: utf-8
|
3
|
-
|
4
|
-
require 'rubygems'
|
5
|
-
require 'graphviz'
|
6
|
-
|
7
|
-
$:.unshift('./lib')
|
8
|
-
require 'greeb'
|
9
|
-
|
10
|
-
origin = <<-END
|
11
|
-
- Сынок, чего это от тебя зигами пахнет,
|
12
|
-
опять на Манежную площадь ходил?
|
13
|
-
|
14
|
-
- Нет мама, я в метро ехал, там назиговано было!!
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
Четырнадцать, восемьдесять восемь: 14/88.
|
19
|
-
Вот так блять
|
20
|
-
END
|
21
|
-
origin.chomp!
|
22
|
-
|
23
|
-
def identify(token)
|
24
|
-
case token
|
25
|
-
when Greeb::RU_LEX then 'RU_LEX'
|
26
|
-
when Greeb::EN_LEX then 'EN_LEX'
|
27
|
-
when Greeb::EOL then 'EOL'
|
28
|
-
when Greeb::SEP then 'SEP'
|
29
|
-
when Greeb::PUN then 'PUN'
|
30
|
-
when Greeb::SPUN then 'SPUN'
|
31
|
-
when Greeb::DIG then 'DIG'
|
32
|
-
when Greeb::DIL then 'DIL'
|
33
|
-
else
|
34
|
-
'?!'
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
greeb = Greeb::Parser.new(origin)
|
39
|
-
text = greeb.tree
|
40
|
-
|
41
|
-
g = GraphViz.new('graphematics', 'type' => 'graph')
|
42
|
-
|
43
|
-
g.node[:color] = '#ddaa66'
|
44
|
-
g.node[:style] = 'filled'
|
45
|
-
g.node[:shape] = 'box'
|
46
|
-
g.node[:penwidth] = '1'
|
47
|
-
g.node[:fontname] = 'PT Sans'
|
48
|
-
g.node[:fontsize] = '8'
|
49
|
-
g.node[:fillcolor]= '#ffeecc'
|
50
|
-
g.node[:fontcolor]= '#775500'
|
51
|
-
g.node[:margin] = '0.0'
|
52
|
-
|
53
|
-
g.edge[:color] = '#999999'
|
54
|
-
g.edge[:weight] = '1'
|
55
|
-
g.edge[:fontname] = 'PT Sans'
|
56
|
-
g.edge[:fontcolor]= '#444444'
|
57
|
-
g.edge[:fontsize] = '6'
|
58
|
-
g.edge[:dir] = 'forward'
|
59
|
-
g.edge[:arrowsize]= '0.5'
|
60
|
-
|
61
|
-
bid = 'begin'
|
62
|
-
g.add_node(bid).tap do |node|
|
63
|
-
node.label = "Начало\nтекста"
|
64
|
-
node.shape = 'ellipse'
|
65
|
-
node.style = ''
|
66
|
-
end
|
67
|
-
|
68
|
-
eid = 'end'
|
69
|
-
g.add_node(eid).tap do |node|
|
70
|
-
node.label = "Конец\nтекста"
|
71
|
-
node.shape = 'ellipse'
|
72
|
-
node.style = ''
|
73
|
-
end
|
74
|
-
|
75
|
-
tree = text.map_with_index do |paragraph, i|
|
76
|
-
pid = "p#{i}"
|
77
|
-
sentences = paragraph.map_with_index do |sentence, j|
|
78
|
-
sid = "#{pid}s#{j}"
|
79
|
-
subsentences = sentence.map_with_index do |subsentence, k|
|
80
|
-
ssid = "#{sid}ss#{k}"
|
81
|
-
tokens = subsentence.map_with_index do |token, l|
|
82
|
-
next if ' ' == token
|
83
|
-
[ "#{ssid}t#{l}", token, l ]
|
84
|
-
end
|
85
|
-
tokens.delete(nil)
|
86
|
-
[ ssid, tokens, k ]
|
87
|
-
end
|
88
|
-
[ sid, subsentences, j ]
|
89
|
-
end
|
90
|
-
[ pid, sentences, i ]
|
91
|
-
end
|
92
|
-
|
93
|
-
tree.each do |pid, paragraph, i|
|
94
|
-
g.add_node(pid).tap do |node|
|
95
|
-
node.label = "Абзац\n№#{i + 1}"
|
96
|
-
node.shape = 'ellipse'
|
97
|
-
end
|
98
|
-
g.add_edge(bid, pid)
|
99
|
-
|
100
|
-
paragraph.each do |sid, sentence, j|
|
101
|
-
g.add_node(sid).tap do |node|
|
102
|
-
node.label = "Предложение\n№#{j + 1}"
|
103
|
-
node.shape = 'ellipse'
|
104
|
-
end
|
105
|
-
g.add_edge(pid, sid)
|
106
|
-
|
107
|
-
sentence.each do |ssid, subsentence, k|
|
108
|
-
g.add_node(ssid).tap do |node|
|
109
|
-
node.label = "Подпредложение\n№#{k + 1}"
|
110
|
-
node.shape = 'ellipse'
|
111
|
-
end
|
112
|
-
g.add_edge(sid, ssid)
|
113
|
-
|
114
|
-
subsentence.each do |tid, token, l|
|
115
|
-
g.add_node(tid).label = token
|
116
|
-
g.add_edge(ssid, tid).label = identify(token)
|
117
|
-
g.add_edge(tid, eid)
|
118
|
-
end
|
119
|
-
|
120
|
-
subsentence.each_cons(2) do |(tid1, token1, l1),
|
121
|
-
(tid2, token2, l2)|
|
122
|
-
g.add_edge(tid1, tid2).tap do |edge|
|
123
|
-
edge.weight = 0.25
|
124
|
-
edge.style = 'dashed'
|
125
|
-
end
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
sentence.each_cons(2) do |(ssid1, subsentence1, k1),
|
130
|
-
(ssid2, subsentence2, k2)|
|
131
|
-
tid1, token1, l1 = subsentence1.last
|
132
|
-
tid2, token2, l2 = subsentence2.first
|
133
|
-
g.add_edge(tid1, tid2).tap do |edge|
|
134
|
-
edge.weight = 0.5
|
135
|
-
edge.style = 'dashed'
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
g.output(:output => 'png', :file => 'graph.png')
|
data/lib/enumerable.rb
DELETED
data/lib/greeb/parser.rb
DELETED
@@ -1,176 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'meta_array'
|
4
|
-
require 'enumerable'
|
5
|
-
|
6
|
-
# Graphematical Parser of the Greeb.
|
7
|
-
# Use it with love.
|
8
|
-
#
|
9
|
-
class Greeb::Parser
|
10
|
-
# Russian lexeme (i.e.: "хуй").
|
11
|
-
#
|
12
|
-
RUSSIAN_LEXEME = /^[А-Яа-яЁё]+$/u
|
13
|
-
|
14
|
-
# English lexeme (i.e.: "foo").
|
15
|
-
#
|
16
|
-
ENGLISH_LEXEME = /^[A-Za-z]+$/u
|
17
|
-
|
18
|
-
# End of Line sequence (i.e.: "\n").
|
19
|
-
#
|
20
|
-
END_OF_LINE = /^\n+$/u
|
21
|
-
|
22
|
-
# In-subsentence seprator (i.e.: "*" or "\").
|
23
|
-
#
|
24
|
-
SEPARATOR = /^[*=_\/\\ ]$/u
|
25
|
-
|
26
|
-
# Punctuation character (i.e.: "." or "!").
|
27
|
-
#
|
28
|
-
PUNCTUATION = /^(\.|\!|\?)$/u
|
29
|
-
|
30
|
-
# In-sentence punctuation character (i.e.: "," or "-").
|
31
|
-
#
|
32
|
-
SENTENCE_PUNCTUATION = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
|
33
|
-
|
34
|
-
# Digit (i.e.: "1337").
|
35
|
-
#
|
36
|
-
DIGIT = /^[0-9]+$/u
|
37
|
-
|
38
|
-
# Digit-Letter complex (i.e.: "0xDEADBEEF").
|
39
|
-
#
|
40
|
-
DIGIT_LETTER = /^[А-Яа-яA-Za-z0-9Ёё]+$/u
|
41
|
-
|
42
|
-
# Empty string (i.e.: "").
|
43
|
-
#
|
44
|
-
EMPTY = ''
|
45
|
-
|
46
|
-
attr_accessor :text
|
47
|
-
private :text=
|
48
|
-
|
49
|
-
# Create a new instance of Greeb::Parser.
|
50
|
-
#
|
51
|
-
# ==== Parameters
|
52
|
-
# text<String>:: Source text.
|
53
|
-
#
|
54
|
-
def initialize(text)
|
55
|
-
self.text = text
|
56
|
-
end
|
57
|
-
|
58
|
-
# Perform the text parsing.
|
59
|
-
#
|
60
|
-
# ==== Returns
|
61
|
-
# Array:: Tree of Graphematical Analysis of text.
|
62
|
-
#
|
63
|
-
def parse
|
64
|
-
return @tree if @tree
|
65
|
-
|
66
|
-
# parse tree
|
67
|
-
tree = MetaArray.new
|
68
|
-
|
69
|
-
# paragraph, sentence, subsentence
|
70
|
-
p_id, s_id, ss_id = 0, 0, 0
|
71
|
-
|
72
|
-
# current token
|
73
|
-
token = ''
|
74
|
-
|
75
|
-
# run FSM
|
76
|
-
text.each_char do |c|
|
77
|
-
case c
|
78
|
-
when END_OF_LINE then begin
|
79
|
-
case token
|
80
|
-
when EMPTY then token << c
|
81
|
-
when END_OF_LINE then begin
|
82
|
-
token = ''
|
83
|
-
p_id += 1
|
84
|
-
s_id = 0
|
85
|
-
ss_id = 0
|
86
|
-
end
|
87
|
-
else
|
88
|
-
tree[p_id][s_id][ss_id] << token
|
89
|
-
token = c
|
90
|
-
end
|
91
|
-
end
|
92
|
-
when SEPARATOR then begin
|
93
|
-
case token
|
94
|
-
when EMPTY
|
95
|
-
else
|
96
|
-
tree[p_id][s_id][ss_id] << token
|
97
|
-
while tree[p_id][s_id][ss_id].last == c
|
98
|
-
tree[p_id][s_id][ss_id].pop
|
99
|
-
end
|
100
|
-
tree[p_id][s_id][ss_id] << c
|
101
|
-
token = ''
|
102
|
-
end
|
103
|
-
end
|
104
|
-
when PUNCTUATION then begin
|
105
|
-
case token
|
106
|
-
when EMPTY
|
107
|
-
else
|
108
|
-
tree[p_id][s_id][ss_id] << token
|
109
|
-
tree[p_id][s_id][ss_id] << c
|
110
|
-
token = ''
|
111
|
-
s_id += 1
|
112
|
-
ss_id = 0
|
113
|
-
end
|
114
|
-
end
|
115
|
-
when SENTENCE_PUNCTUATION then begin
|
116
|
-
case token
|
117
|
-
when EMPTY
|
118
|
-
else
|
119
|
-
tree[p_id][s_id][ss_id] << token
|
120
|
-
tree[p_id][s_id][ss_id] << c
|
121
|
-
token = ''
|
122
|
-
ss_id += 1
|
123
|
-
end
|
124
|
-
end
|
125
|
-
when RUSSIAN_LEXEME then begin
|
126
|
-
case token
|
127
|
-
when END_OF_LINE then begin
|
128
|
-
tree[p_id][s_id][ss_id] << ' '
|
129
|
-
token = c
|
130
|
-
end
|
131
|
-
else
|
132
|
-
token << c
|
133
|
-
end
|
134
|
-
end
|
135
|
-
when ENGLISH_LEXEME then begin
|
136
|
-
case token
|
137
|
-
when END_OF_LINE then begin
|
138
|
-
tree[p_id][s_id][ss_id] << ' '
|
139
|
-
token = c
|
140
|
-
end
|
141
|
-
else
|
142
|
-
token << c
|
143
|
-
end
|
144
|
-
end
|
145
|
-
when DIGIT then begin
|
146
|
-
case token
|
147
|
-
when END_OF_LINE then begin
|
148
|
-
tree[p_id][s_id][ss_id] << ' '
|
149
|
-
token = c
|
150
|
-
end
|
151
|
-
else
|
152
|
-
token << c
|
153
|
-
end
|
154
|
-
end
|
155
|
-
when DIGIT_LETTER then begin
|
156
|
-
case token
|
157
|
-
when END_OF_LINE then begin
|
158
|
-
tree[p_id][s_id][ss_id] << token
|
159
|
-
token = c
|
160
|
-
end
|
161
|
-
else
|
162
|
-
token << c
|
163
|
-
end
|
164
|
-
end
|
165
|
-
end
|
166
|
-
end
|
167
|
-
|
168
|
-
unless token.empty?
|
169
|
-
tree[p_id][s_id][ss_id] << token
|
170
|
-
end
|
171
|
-
|
172
|
-
tree.delete(nil)
|
173
|
-
|
174
|
-
@tree = tree.to_a
|
175
|
-
end
|
176
|
-
end
|
data/lib/meta_array.rb
DELETED
data/spec/parser_spec.rb
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require File.expand_path('../spec_helper.rb', __FILE__)
|
4
|
-
|
5
|
-
describe Greeb::Parser do
|
6
|
-
it 'should parse very simple strings' do
|
7
|
-
'буба сука дебил'.should be_parsed_as([
|
8
|
-
[
|
9
|
-
[ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
|
10
|
-
]
|
11
|
-
])
|
12
|
-
end
|
13
|
-
|
14
|
-
it 'should parse one sentence with subsentences' do
|
15
|
-
'буба, сука, дебил'.should be_parsed_as([
|
16
|
-
[
|
17
|
-
[
|
18
|
-
[ 'буба', ',' ],
|
19
|
-
[ 'сука', ',' ],
|
20
|
-
[ 'дебил' ]
|
21
|
-
]
|
22
|
-
]
|
23
|
-
])
|
24
|
-
end
|
25
|
-
|
26
|
-
it 'should parse two simple paragraphs' do
|
27
|
-
"буба сука дебил\n\nточно!".should be_parsed_as([
|
28
|
-
[
|
29
|
-
[ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
|
30
|
-
],
|
31
|
-
[
|
32
|
-
[ [ 'точно', '!' ] ]
|
33
|
-
]
|
34
|
-
])
|
35
|
-
end
|
36
|
-
|
37
|
-
it 'should parse two sentences in paragraph' do
|
38
|
-
"буба молодец? буба умница.".should be_parsed_as([
|
39
|
-
[
|
40
|
-
[ [ 'буба', ' ', 'молодец', '?' ] ],
|
41
|
-
[ [ 'буба', ' ', 'умница', '.' ] ]
|
42
|
-
]
|
43
|
-
])
|
44
|
-
end
|
45
|
-
|
46
|
-
it 'should parse sentences with floating point values' do
|
47
|
-
'буба не считает Пи равной 3.14'.should be_parsed_as([
|
48
|
-
[
|
49
|
-
[ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
|
50
|
-
'Пи', ' ', 'равной', ' ', '3.14' ] ]
|
51
|
-
]
|
52
|
-
])
|
53
|
-
end
|
54
|
-
|
55
|
-
it 'should parse sentences with floating "dot" values' do
|
56
|
-
'буба не считает Пи равной 3,14'.should be_parsed_as([
|
57
|
-
[
|
58
|
-
[ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
|
59
|
-
'Пи', ' ', 'равной', ' ', '3,14' ] ]
|
60
|
-
]
|
61
|
-
])
|
62
|
-
end
|
63
|
-
end
|