greeb 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +24 -0
- data/Rakefile +8 -0
- data/greeb.gemspec +4 -2
- data/lib/enumerable.rb +3 -1
- data/lib/greeb.rb +6 -139
- data/lib/greeb/parser.rb +176 -0
- data/lib/meta_array.rb +3 -0
- data/spec/parser_spec.rb +63 -0
- data/spec/spec_helper.rb +14 -0
- metadata +28 -12
- data/lib/greeb/version.rb +0 -5
data/Gemfile.lock
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
greeb (0.0.2)
|
5
|
+
rspec (~> 2.4.0)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.1.2)
|
11
|
+
rspec (2.4.0)
|
12
|
+
rspec-core (~> 2.4.0)
|
13
|
+
rspec-expectations (~> 2.4.0)
|
14
|
+
rspec-mocks (~> 2.4.0)
|
15
|
+
rspec-core (2.4.0)
|
16
|
+
rspec-expectations (2.4.0)
|
17
|
+
diff-lcs (~> 1.1.2)
|
18
|
+
rspec-mocks (2.4.0)
|
19
|
+
|
20
|
+
PLATFORMS
|
21
|
+
ruby
|
22
|
+
|
23
|
+
DEPENDENCIES
|
24
|
+
greeb!
|
data/Rakefile
CHANGED
data/greeb.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
$:.push File.expand_path('../lib', __FILE__)
|
4
|
-
require 'greeb/version'
|
4
|
+
require 'greeb'
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = 'greeb'
|
@@ -11,11 +11,13 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.email = [ 'dmitry@eveel.ru' ]
|
12
12
|
s.homepage = 'https://github.com/eveel/greeb'
|
13
13
|
s.summary = 'Greeb is a Graphematical Analyzer.'
|
14
|
-
s.description = 'Greeb is
|
14
|
+
s.description = 'Greeb is awesome Graphematical Analyzer, ' \
|
15
15
|
'written in Ruby.'
|
16
16
|
|
17
17
|
s.rubyforge_project = 'greeb'
|
18
18
|
|
19
|
+
s.add_dependency 'rspec', '~> 2.4.0'
|
20
|
+
|
19
21
|
s.files = `git ls-files`.split("\n")
|
20
22
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
21
23
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
data/lib/enumerable.rb
CHANGED
data/lib/greeb.rb
CHANGED
@@ -1,144 +1,11 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
# Greeb is an awesome Graphematical Analyzer.
|
4
|
+
#
|
6
5
|
module Greeb
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
SEP = /^[*=_\/\\ ]$/u
|
11
|
-
PUN = /^(\.|\!|\?)$/u
|
12
|
-
SPUN = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
|
13
|
-
DIG = /^[0-9]+$/u
|
14
|
-
DIL = /^[А-Яа-яA-Za-z0-9]+$/u
|
15
|
-
EMPTY = ''
|
16
|
-
|
17
|
-
class Parser
|
18
|
-
attr_accessor :origin
|
19
|
-
private :origin=
|
20
|
-
|
21
|
-
attr_writer :tree
|
22
|
-
private :tree=
|
23
|
-
|
24
|
-
def initialize(origin)
|
25
|
-
self.origin = origin
|
26
|
-
end
|
27
|
-
|
28
|
-
def tree
|
29
|
-
@tree ||= parse(origin)
|
30
|
-
end
|
31
|
-
|
32
|
-
private
|
33
|
-
def parse(origin) # :nodoc:
|
34
|
-
tree = MetaArray.new
|
35
|
-
|
36
|
-
# paragraph
|
37
|
-
p_id = 0
|
38
|
-
|
39
|
-
# sentence
|
40
|
-
s_id = 0
|
41
|
-
|
42
|
-
# subsentence
|
43
|
-
ss_id = 0
|
44
|
-
|
45
|
-
token = ''
|
6
|
+
# Version of the Greeb.
|
7
|
+
#
|
8
|
+
VERSION = "0.0.2"
|
46
9
|
|
47
|
-
|
48
|
-
puts "[#{token.inspect}] ← #{c.inspect}"
|
49
|
-
case c
|
50
|
-
when EOL then begin
|
51
|
-
case token
|
52
|
-
when EMPTY then token << c
|
53
|
-
when EOL then begin
|
54
|
-
token = ''
|
55
|
-
p_id += 1
|
56
|
-
s_id = 0
|
57
|
-
ss_id = 0
|
58
|
-
end
|
59
|
-
else
|
60
|
-
tree[p_id][s_id][ss_id] << token
|
61
|
-
token = c
|
62
|
-
end
|
63
|
-
end
|
64
|
-
when SEP then begin
|
65
|
-
case token
|
66
|
-
when EMPTY
|
67
|
-
else
|
68
|
-
tree[p_id][s_id][ss_id] << token
|
69
|
-
while tree[p_id][s_id][ss_id].last == c
|
70
|
-
tree[p_id][s_id][ss_id].pop
|
71
|
-
end
|
72
|
-
tree[p_id][s_id][ss_id] << c
|
73
|
-
token = ''
|
74
|
-
end
|
75
|
-
end
|
76
|
-
when PUN then begin
|
77
|
-
case token
|
78
|
-
when EMPTY
|
79
|
-
else
|
80
|
-
tree[p_id][s_id][ss_id] << token
|
81
|
-
tree[p_id][s_id][ss_id] << c
|
82
|
-
token = ''
|
83
|
-
s_id += 1
|
84
|
-
ss_id = 0
|
85
|
-
end
|
86
|
-
end
|
87
|
-
when SPUN then begin
|
88
|
-
case token
|
89
|
-
when EMPTY
|
90
|
-
else
|
91
|
-
tree[p_id][s_id][ss_id] << token
|
92
|
-
tree[p_id][s_id][ss_id] << c
|
93
|
-
token = ''
|
94
|
-
ss_id += 1
|
95
|
-
end
|
96
|
-
end
|
97
|
-
when RU_LEX then begin
|
98
|
-
case token
|
99
|
-
when EOL then begin
|
100
|
-
tree[p_id][s_id][ss_id] << ' '
|
101
|
-
token = c
|
102
|
-
end
|
103
|
-
else
|
104
|
-
token << c
|
105
|
-
end
|
106
|
-
end
|
107
|
-
when EN_LEX then begin
|
108
|
-
case token
|
109
|
-
when EOL then begin
|
110
|
-
tree[p_id][s_id][ss_id] << ' '
|
111
|
-
token = c
|
112
|
-
end
|
113
|
-
else
|
114
|
-
token << c
|
115
|
-
end
|
116
|
-
end
|
117
|
-
when DIG then begin
|
118
|
-
case token
|
119
|
-
when EOL then begin
|
120
|
-
tree[p_id][s_id][ss_id] << ' '
|
121
|
-
token = c
|
122
|
-
end
|
123
|
-
else
|
124
|
-
token << c
|
125
|
-
end
|
126
|
-
end
|
127
|
-
when DIL then begin
|
128
|
-
case token
|
129
|
-
when EOL then begin
|
130
|
-
tree[p_id][s_id][ss_id] << token
|
131
|
-
token = c
|
132
|
-
end
|
133
|
-
else
|
134
|
-
token << c
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
tree[p_id][s_id][ss_id] << token
|
140
|
-
tree.delete(nil)
|
141
|
-
tree.to_a
|
142
|
-
end
|
143
|
-
end
|
10
|
+
require 'greeb/parser'
|
144
11
|
end
|
data/lib/greeb/parser.rb
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'meta_array'
|
4
|
+
require 'enumerable'
|
5
|
+
|
6
|
+
# Graphematical Parser of the Greeb.
|
7
|
+
# Use it with love.
|
8
|
+
#
|
9
|
+
class Greeb::Parser
|
10
|
+
# Russian lexeme (i.e.: "хуй").
|
11
|
+
#
|
12
|
+
RUSSIAN_LEXEME = /^[А-Яа-яЁё]+$/u
|
13
|
+
|
14
|
+
# English lexeme (i.e.: "foo").
|
15
|
+
#
|
16
|
+
ENGLISH_LEXEME = /^[A-Za-z]+$/u
|
17
|
+
|
18
|
+
# End of Line sequence (i.e.: "\n").
|
19
|
+
#
|
20
|
+
END_OF_LINE = /^\n+$/u
|
21
|
+
|
22
|
+
# In-subsentence separator (i.e.: "*" or "\").
|
23
|
+
#
|
24
|
+
SEPARATOR = /^[*=_\/\\ ]$/u
|
25
|
+
|
26
|
+
# Punctuation character (i.e.: "." or "!").
|
27
|
+
#
|
28
|
+
PUNCTUATION = /^(\.|\!|\?)$/u
|
29
|
+
|
30
|
+
# In-sentence punctuation character (i.e.: "," or "-").
|
31
|
+
#
|
32
|
+
SENTENCE_PUNCTUATION = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
|
33
|
+
|
34
|
+
# Digit (i.e.: "1337").
|
35
|
+
#
|
36
|
+
DIGIT = /^[0-9]+$/u
|
37
|
+
|
38
|
+
# Digit-Letter complex (i.e.: "0xDEADBEEF").
|
39
|
+
#
|
40
|
+
DIGIT_LETTER = /^[А-Яа-яA-Za-z0-9Ёё]+$/u
|
41
|
+
|
42
|
+
# Empty string (i.e.: "").
|
43
|
+
#
|
44
|
+
EMPTY = ''
|
45
|
+
|
46
|
+
attr_accessor :text
|
47
|
+
private :text=
|
48
|
+
|
49
|
+
# Create a new instance of Greeb::Parser.
|
50
|
+
#
|
51
|
+
# ==== Parameters
|
52
|
+
# text<String>:: Source text.
|
53
|
+
#
|
54
|
+
def initialize(text)
|
55
|
+
self.text = text
|
56
|
+
end
|
57
|
+
|
58
|
+
# Perform the text parsing.
|
59
|
+
#
|
60
|
+
# ==== Returns
|
61
|
+
# Array:: Tree of Graphematical Analysis of text.
|
62
|
+
#
|
63
|
+
def parse
|
64
|
+
return @tree if @tree
|
65
|
+
|
66
|
+
# parse tree
|
67
|
+
tree = MetaArray.new
|
68
|
+
|
69
|
+
# paragraph, sentence, subsentence
|
70
|
+
p_id, s_id, ss_id = 0, 0, 0
|
71
|
+
|
72
|
+
# current token
|
73
|
+
token = ''
|
74
|
+
|
75
|
+
# run FSM
|
76
|
+
text.each_char do |c|
|
77
|
+
case c
|
78
|
+
when END_OF_LINE then begin
|
79
|
+
case token
|
80
|
+
when EMPTY then token << c
|
81
|
+
when END_OF_LINE then begin
|
82
|
+
token = ''
|
83
|
+
p_id += 1
|
84
|
+
s_id = 0
|
85
|
+
ss_id = 0
|
86
|
+
end
|
87
|
+
else
|
88
|
+
tree[p_id][s_id][ss_id] << token
|
89
|
+
token = c
|
90
|
+
end
|
91
|
+
end
|
92
|
+
when SEPARATOR then begin
|
93
|
+
case token
|
94
|
+
when EMPTY
|
95
|
+
else
|
96
|
+
tree[p_id][s_id][ss_id] << token
|
97
|
+
while tree[p_id][s_id][ss_id].last == c
|
98
|
+
tree[p_id][s_id][ss_id].pop
|
99
|
+
end
|
100
|
+
tree[p_id][s_id][ss_id] << c
|
101
|
+
token = ''
|
102
|
+
end
|
103
|
+
end
|
104
|
+
when PUNCTUATION then begin
|
105
|
+
case token
|
106
|
+
when EMPTY
|
107
|
+
else
|
108
|
+
tree[p_id][s_id][ss_id] << token
|
109
|
+
tree[p_id][s_id][ss_id] << c
|
110
|
+
token = ''
|
111
|
+
s_id += 1
|
112
|
+
ss_id = 0
|
113
|
+
end
|
114
|
+
end
|
115
|
+
when SENTENCE_PUNCTUATION then begin
|
116
|
+
case token
|
117
|
+
when EMPTY
|
118
|
+
else
|
119
|
+
tree[p_id][s_id][ss_id] << token
|
120
|
+
tree[p_id][s_id][ss_id] << c
|
121
|
+
token = ''
|
122
|
+
ss_id += 1
|
123
|
+
end
|
124
|
+
end
|
125
|
+
when RUSSIAN_LEXEME then begin
|
126
|
+
case token
|
127
|
+
when END_OF_LINE then begin
|
128
|
+
tree[p_id][s_id][ss_id] << ' '
|
129
|
+
token = c
|
130
|
+
end
|
131
|
+
else
|
132
|
+
token << c
|
133
|
+
end
|
134
|
+
end
|
135
|
+
when ENGLISH_LEXEME then begin
|
136
|
+
case token
|
137
|
+
when END_OF_LINE then begin
|
138
|
+
tree[p_id][s_id][ss_id] << ' '
|
139
|
+
token = c
|
140
|
+
end
|
141
|
+
else
|
142
|
+
token << c
|
143
|
+
end
|
144
|
+
end
|
145
|
+
when DIGIT then begin
|
146
|
+
case token
|
147
|
+
when END_OF_LINE then begin
|
148
|
+
tree[p_id][s_id][ss_id] << ' '
|
149
|
+
token = c
|
150
|
+
end
|
151
|
+
else
|
152
|
+
token << c
|
153
|
+
end
|
154
|
+
end
|
155
|
+
when DIGIT_LETTER then begin
|
156
|
+
case token
|
157
|
+
when END_OF_LINE then begin
|
158
|
+
tree[p_id][s_id][ss_id] << token
|
159
|
+
token = c
|
160
|
+
end
|
161
|
+
else
|
162
|
+
token << c
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
unless token.empty?
|
169
|
+
tree[p_id][s_id][ss_id] << token
|
170
|
+
end
|
171
|
+
|
172
|
+
tree.delete(nil)
|
173
|
+
|
174
|
+
@tree = tree.to_a
|
175
|
+
end
|
176
|
+
end
|
data/lib/meta_array.rb
CHANGED
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.expand_path('../spec_helper.rb', __FILE__)
|
4
|
+
|
5
|
+
describe Greeb::Parser do
|
6
|
+
it 'should parse very simple strings' do
|
7
|
+
'буба сука дебил'.should be_parsed_as([
|
8
|
+
[
|
9
|
+
[ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
|
10
|
+
]
|
11
|
+
])
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should parse one sentence with subsentences' do
|
15
|
+
'буба, сука, дебил'.should be_parsed_as([
|
16
|
+
[
|
17
|
+
[
|
18
|
+
[ 'буба', ',' ],
|
19
|
+
[ 'сука', ',' ],
|
20
|
+
[ 'дебил' ]
|
21
|
+
]
|
22
|
+
]
|
23
|
+
])
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'should parse two simple paragraphs' do
|
27
|
+
"буба сука дебил\n\nточно!".should be_parsed_as([
|
28
|
+
[
|
29
|
+
[ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
|
30
|
+
],
|
31
|
+
[
|
32
|
+
[ [ 'точно', '!' ] ]
|
33
|
+
]
|
34
|
+
])
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should parse two sentences in paragraph' do
|
38
|
+
"буба молодец? буба умница.".should be_parsed_as([
|
39
|
+
[
|
40
|
+
[ [ 'буба', ' ', 'молодец', '?' ] ],
|
41
|
+
[ [ 'буба', ' ', 'умница', '.' ] ]
|
42
|
+
]
|
43
|
+
])
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should parse sentences with floating point values' do
|
47
|
+
'буба не считает Пи равной 3.14'.should be_parsed_as([
|
48
|
+
[
|
49
|
+
[ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
|
50
|
+
'Пи', ' ', 'равной', ' ', '3.14' ] ]
|
51
|
+
]
|
52
|
+
])
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should parse sentences with floating "dot" values' do
|
56
|
+
'буба не считает Пи равной 3,14'.should be_parsed_as([
|
57
|
+
[
|
58
|
+
[ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
|
59
|
+
'Пи', ' ', 'равной', ' ', '3,14' ] ]
|
60
|
+
]
|
61
|
+
])
|
62
|
+
end
|
63
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.expand_path('../../lib/greeb', __FILE__)
|
4
|
+
|
5
|
+
RSpec.configure do |c|
|
6
|
+
c.mock_with :rspec
|
7
|
+
end
|
8
|
+
|
9
|
+
RSpec::Matchers.define :be_parsed_as do |expected|
|
10
|
+
match do |actual|
|
11
|
+
tree = Greeb::Parser.new(actual).parse
|
12
|
+
tree == expected
|
13
|
+
end
|
14
|
+
end
|
metadata
CHANGED
@@ -1,18 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dmitry A. Ustalov
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-02-
|
13
|
-
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
12
|
+
date: 2011-02-20 00:00:00.000000000 +05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
requirement: &81165430 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.4.0
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *81165430
|
26
|
+
description: Greeb is awesome Graphematical Analyzer, written in Ruby.
|
16
27
|
email:
|
17
28
|
- dmitry@eveel.ru
|
18
29
|
executables: []
|
@@ -21,18 +32,21 @@ extra_rdoc_files: []
|
|
21
32
|
files:
|
22
33
|
- .gitignore
|
23
34
|
- Gemfile
|
35
|
+
- Gemfile.lock
|
24
36
|
- README
|
25
37
|
- Rakefile
|
26
38
|
- greeb-test.rb
|
27
39
|
- greeb.gemspec
|
28
40
|
- lib/enumerable.rb
|
29
41
|
- lib/greeb.rb
|
30
|
-
- lib/greeb/version.rb
|
42
|
+
- lib/greeb/parser.rb
|
31
43
|
- lib/meta_array.rb
|
44
|
+
- spec/parser_spec.rb
|
45
|
+
- spec/spec_helper.rb
|
32
46
|
has_rdoc: true
|
33
47
|
homepage: https://github.com/eveel/greeb
|
34
48
|
licenses: []
|
35
|
-
post_install_message:
|
49
|
+
post_install_message:
|
36
50
|
rdoc_options: []
|
37
51
|
require_paths:
|
38
52
|
- lib
|
@@ -50,8 +64,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
50
64
|
version: '0'
|
51
65
|
requirements: []
|
52
66
|
rubyforge_project: greeb
|
53
|
-
rubygems_version: 1.5.
|
54
|
-
signing_key:
|
67
|
+
rubygems_version: 1.5.2
|
68
|
+
signing_key:
|
55
69
|
specification_version: 3
|
56
70
|
summary: Greeb is a Graphematical Analyzer.
|
57
|
-
test_files:
|
71
|
+
test_files:
|
72
|
+
- spec/parser_spec.rb
|
73
|
+
- spec/spec_helper.rb
|