greeb 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +24 -0
- data/Rakefile +8 -0
- data/greeb.gemspec +4 -2
- data/lib/enumerable.rb +3 -1
- data/lib/greeb.rb +6 -139
- data/lib/greeb/parser.rb +176 -0
- data/lib/meta_array.rb +3 -0
- data/spec/parser_spec.rb +63 -0
- data/spec/spec_helper.rb +14 -0
- metadata +28 -12
- data/lib/greeb/version.rb +0 -5
data/Gemfile.lock
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
greeb (0.0.2)
|
5
|
+
rspec (~> 2.4.0)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.1.2)
|
11
|
+
rspec (2.4.0)
|
12
|
+
rspec-core (~> 2.4.0)
|
13
|
+
rspec-expectations (~> 2.4.0)
|
14
|
+
rspec-mocks (~> 2.4.0)
|
15
|
+
rspec-core (2.4.0)
|
16
|
+
rspec-expectations (2.4.0)
|
17
|
+
diff-lcs (~> 1.1.2)
|
18
|
+
rspec-mocks (2.4.0)
|
19
|
+
|
20
|
+
PLATFORMS
|
21
|
+
ruby
|
22
|
+
|
23
|
+
DEPENDENCIES
|
24
|
+
greeb!
|
data/Rakefile
CHANGED
data/greeb.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
$:.push File.expand_path('../lib', __FILE__)
|
4
|
-
require 'greeb
|
4
|
+
require 'greeb'
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = 'greeb'
|
@@ -11,11 +11,13 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.email = [ 'dmitry@eveel.ru' ]
|
12
12
|
s.homepage = 'https://github.com/eveel/greeb'
|
13
13
|
s.summary = 'Greeb is a Graphematical Analyzer.'
|
14
|
-
s.description = 'Greeb is
|
14
|
+
s.description = 'Greeb is awesome Graphematical Analyzer, ' \
|
15
15
|
'written in Ruby.'
|
16
16
|
|
17
17
|
s.rubyforge_project = 'greeb'
|
18
18
|
|
19
|
+
s.add_dependency 'rspec', '~> 2.4.0'
|
20
|
+
|
19
21
|
s.files = `git ls-files`.split("\n")
|
20
22
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
21
23
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
data/lib/enumerable.rb
CHANGED
data/lib/greeb.rb
CHANGED
@@ -1,144 +1,11 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
# Greeb is an awesome Graphematical Analyzer.
|
4
|
+
#
|
6
5
|
module Greeb
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
SEP = /^[*=_\/\\ ]$/u
|
11
|
-
PUN = /^(\.|\!|\?)$/u
|
12
|
-
SPUN = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
|
13
|
-
DIG = /^[0-9]+$/u
|
14
|
-
DIL = /^[А-Яа-яA-Za-z0-9]+$/u
|
15
|
-
EMPTY = ''
|
16
|
-
|
17
|
-
class Parser
|
18
|
-
attr_accessor :origin
|
19
|
-
private :origin=
|
20
|
-
|
21
|
-
attr_writer :tree
|
22
|
-
private :tree=
|
23
|
-
|
24
|
-
def initialize(origin)
|
25
|
-
self.origin = origin
|
26
|
-
end
|
27
|
-
|
28
|
-
def tree
|
29
|
-
@tree ||= parse(origin)
|
30
|
-
end
|
31
|
-
|
32
|
-
private
|
33
|
-
def parse(origin) # :nodoc:
|
34
|
-
tree = MetaArray.new
|
35
|
-
|
36
|
-
# paragraph
|
37
|
-
p_id = 0
|
38
|
-
|
39
|
-
# sentence
|
40
|
-
s_id = 0
|
41
|
-
|
42
|
-
# subsentence
|
43
|
-
ss_id = 0
|
44
|
-
|
45
|
-
token = ''
|
6
|
+
# Version of the Greeb.
|
7
|
+
#
|
8
|
+
VERSION = "0.0.2"
|
46
9
|
|
47
|
-
|
48
|
-
puts "[#{token.inspect}] ← #{c.inspect}"
|
49
|
-
case c
|
50
|
-
when EOL then begin
|
51
|
-
case token
|
52
|
-
when EMPTY then token << c
|
53
|
-
when EOL then begin
|
54
|
-
token = ''
|
55
|
-
p_id += 1
|
56
|
-
s_id = 0
|
57
|
-
ss_id = 0
|
58
|
-
end
|
59
|
-
else
|
60
|
-
tree[p_id][s_id][ss_id] << token
|
61
|
-
token = c
|
62
|
-
end
|
63
|
-
end
|
64
|
-
when SEP then begin
|
65
|
-
case token
|
66
|
-
when EMPTY
|
67
|
-
else
|
68
|
-
tree[p_id][s_id][ss_id] << token
|
69
|
-
while tree[p_id][s_id][ss_id].last == c
|
70
|
-
tree[p_id][s_id][ss_id].pop
|
71
|
-
end
|
72
|
-
tree[p_id][s_id][ss_id] << c
|
73
|
-
token = ''
|
74
|
-
end
|
75
|
-
end
|
76
|
-
when PUN then begin
|
77
|
-
case token
|
78
|
-
when EMPTY
|
79
|
-
else
|
80
|
-
tree[p_id][s_id][ss_id] << token
|
81
|
-
tree[p_id][s_id][ss_id] << c
|
82
|
-
token = ''
|
83
|
-
s_id += 1
|
84
|
-
ss_id = 0
|
85
|
-
end
|
86
|
-
end
|
87
|
-
when SPUN then begin
|
88
|
-
case token
|
89
|
-
when EMPTY
|
90
|
-
else
|
91
|
-
tree[p_id][s_id][ss_id] << token
|
92
|
-
tree[p_id][s_id][ss_id] << c
|
93
|
-
token = ''
|
94
|
-
ss_id += 1
|
95
|
-
end
|
96
|
-
end
|
97
|
-
when RU_LEX then begin
|
98
|
-
case token
|
99
|
-
when EOL then begin
|
100
|
-
tree[p_id][s_id][ss_id] << ' '
|
101
|
-
token = c
|
102
|
-
end
|
103
|
-
else
|
104
|
-
token << c
|
105
|
-
end
|
106
|
-
end
|
107
|
-
when EN_LEX then begin
|
108
|
-
case token
|
109
|
-
when EOL then begin
|
110
|
-
tree[p_id][s_id][ss_id] << ' '
|
111
|
-
token = c
|
112
|
-
end
|
113
|
-
else
|
114
|
-
token << c
|
115
|
-
end
|
116
|
-
end
|
117
|
-
when DIG then begin
|
118
|
-
case token
|
119
|
-
when EOL then begin
|
120
|
-
tree[p_id][s_id][ss_id] << ' '
|
121
|
-
token = c
|
122
|
-
end
|
123
|
-
else
|
124
|
-
token << c
|
125
|
-
end
|
126
|
-
end
|
127
|
-
when DIL then begin
|
128
|
-
case token
|
129
|
-
when EOL then begin
|
130
|
-
tree[p_id][s_id][ss_id] << token
|
131
|
-
token = c
|
132
|
-
end
|
133
|
-
else
|
134
|
-
token << c
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
tree[p_id][s_id][ss_id] << token
|
140
|
-
tree.delete(nil)
|
141
|
-
tree.to_a
|
142
|
-
end
|
143
|
-
end
|
10
|
+
require 'greeb/parser'
|
144
11
|
end
|
data/lib/greeb/parser.rb
ADDED
@@ -0,0 +1,176 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'meta_array'
|
4
|
+
require 'enumerable'
|
5
|
+
|
6
|
+
# Graphematical Parser of the Greeb.
|
7
|
+
# Use it with love.
|
8
|
+
#
|
9
|
+
class Greeb::Parser
|
10
|
+
# Russian lexeme (i.e.: "хуй").
|
11
|
+
#
|
12
|
+
RUSSIAN_LEXEME = /^[А-Яа-яЁё]+$/u
|
13
|
+
|
14
|
+
# English lexeme (e.g.: "foo").
|
15
|
+
#
|
16
|
+
ENGLISH_LEXEME = /^[A-Za-z]+$/u
|
17
|
+
|
18
|
+
# End of Line sequence (i.e.: "\n").
|
19
|
+
#
|
20
|
+
END_OF_LINE = /^\n+$/u
|
21
|
+
|
22
|
+
# In-subsentence separator (e.g.: "*" or "\").
|
23
|
+
#
|
24
|
+
SEPARATOR = /^[*=_\/\\ ]$/u
|
25
|
+
|
26
|
+
# Punctuation character (e.g.: "." or "!").
|
27
|
+
#
|
28
|
+
PUNCTUATION = /^(\.|\!|\?)$/u
|
29
|
+
|
30
|
+
# In-sentence punctuation character (e.g.: "," or "-").
|
31
|
+
#
|
32
|
+
SENTENCE_PUNCTUATION = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
|
33
|
+
|
34
|
+
# Digit (i.e.: "1337").
|
35
|
+
#
|
36
|
+
DIGIT = /^[0-9]+$/u
|
37
|
+
|
38
|
+
# Digit-Letter complex (e.g.: "0xDEADBEEF").
|
39
|
+
#
|
40
|
+
DIGIT_LETTER = /^[А-Яа-яA-Za-z0-9Ёё]+$/u
|
41
|
+
|
42
|
+
# Empty string (i.e.: "").
|
43
|
+
#
|
44
|
+
EMPTY = ''
|
45
|
+
|
46
|
+
attr_accessor :text
|
47
|
+
private :text=
|
48
|
+
|
49
|
+
# Create a new instance of Greeb::Parser.
|
50
|
+
#
|
51
|
+
# ==== Parameters
|
52
|
+
# text<String>:: Source text.
|
53
|
+
#
|
54
|
+
def initialize(text)
|
55
|
+
self.text = text
|
56
|
+
end
|
57
|
+
|
58
|
+
# Perform the text parsing.
|
59
|
+
#
|
60
|
+
# ==== Returns
|
61
|
+
# Array:: Tree of Graphematical Analysis of text.
|
62
|
+
#
|
63
|
+
def parse
|
64
|
+
return @tree if @tree
|
65
|
+
|
66
|
+
# parse tree
|
67
|
+
tree = MetaArray.new
|
68
|
+
|
69
|
+
# paragraph, sentence, subsentence
|
70
|
+
p_id, s_id, ss_id = 0, 0, 0
|
71
|
+
|
72
|
+
# current token
|
73
|
+
token = ''
|
74
|
+
|
75
|
+
# run FSM
|
76
|
+
text.each_char do |c|
|
77
|
+
case c
|
78
|
+
when END_OF_LINE then begin
|
79
|
+
case token
|
80
|
+
when EMPTY then token << c
|
81
|
+
when END_OF_LINE then begin
|
82
|
+
token = ''
|
83
|
+
p_id += 1
|
84
|
+
s_id = 0
|
85
|
+
ss_id = 0
|
86
|
+
end
|
87
|
+
else
|
88
|
+
tree[p_id][s_id][ss_id] << token
|
89
|
+
token = c
|
90
|
+
end
|
91
|
+
end
|
92
|
+
when SEPARATOR then begin
|
93
|
+
case token
|
94
|
+
when EMPTY
|
95
|
+
else
|
96
|
+
tree[p_id][s_id][ss_id] << token
|
97
|
+
while tree[p_id][s_id][ss_id].last == c
|
98
|
+
tree[p_id][s_id][ss_id].pop
|
99
|
+
end
|
100
|
+
tree[p_id][s_id][ss_id] << c
|
101
|
+
token = ''
|
102
|
+
end
|
103
|
+
end
|
104
|
+
when PUNCTUATION then begin
|
105
|
+
case token
|
106
|
+
when EMPTY
|
107
|
+
else
|
108
|
+
tree[p_id][s_id][ss_id] << token
|
109
|
+
tree[p_id][s_id][ss_id] << c
|
110
|
+
token = ''
|
111
|
+
s_id += 1
|
112
|
+
ss_id = 0
|
113
|
+
end
|
114
|
+
end
|
115
|
+
when SENTENCE_PUNCTUATION then begin
|
116
|
+
case token
|
117
|
+
when EMPTY
|
118
|
+
else
|
119
|
+
tree[p_id][s_id][ss_id] << token
|
120
|
+
tree[p_id][s_id][ss_id] << c
|
121
|
+
token = ''
|
122
|
+
ss_id += 1
|
123
|
+
end
|
124
|
+
end
|
125
|
+
when RUSSIAN_LEXEME then begin
|
126
|
+
case token
|
127
|
+
when END_OF_LINE then begin
|
128
|
+
tree[p_id][s_id][ss_id] << ' '
|
129
|
+
token = c
|
130
|
+
end
|
131
|
+
else
|
132
|
+
token << c
|
133
|
+
end
|
134
|
+
end
|
135
|
+
when ENGLISH_LEXEME then begin
|
136
|
+
case token
|
137
|
+
when END_OF_LINE then begin
|
138
|
+
tree[p_id][s_id][ss_id] << ' '
|
139
|
+
token = c
|
140
|
+
end
|
141
|
+
else
|
142
|
+
token << c
|
143
|
+
end
|
144
|
+
end
|
145
|
+
when DIGIT then begin
|
146
|
+
case token
|
147
|
+
when END_OF_LINE then begin
|
148
|
+
tree[p_id][s_id][ss_id] << ' '
|
149
|
+
token = c
|
150
|
+
end
|
151
|
+
else
|
152
|
+
token << c
|
153
|
+
end
|
154
|
+
end
|
155
|
+
when DIGIT_LETTER then begin
|
156
|
+
case token
|
157
|
+
when END_OF_LINE then begin
|
158
|
+
tree[p_id][s_id][ss_id] << token
|
159
|
+
token = c
|
160
|
+
end
|
161
|
+
else
|
162
|
+
token << c
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
unless token.empty?
|
169
|
+
tree[p_id][s_id][ss_id] << token
|
170
|
+
end
|
171
|
+
|
172
|
+
tree.delete(nil)
|
173
|
+
|
174
|
+
@tree = tree.to_a
|
175
|
+
end
|
176
|
+
end
|
data/lib/meta_array.rb
CHANGED
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.expand_path('../spec_helper.rb', __FILE__)
|
4
|
+
|
5
|
+
describe Greeb::Parser do
|
6
|
+
it 'should parse very simple strings' do
|
7
|
+
'буба сука дебил'.should be_parsed_as([
|
8
|
+
[
|
9
|
+
[ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
|
10
|
+
]
|
11
|
+
])
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'should parse one sentence with subsentences' do
|
15
|
+
'буба, сука, дебил'.should be_parsed_as([
|
16
|
+
[
|
17
|
+
[
|
18
|
+
[ 'буба', ',' ],
|
19
|
+
[ 'сука', ',' ],
|
20
|
+
[ 'дебил' ]
|
21
|
+
]
|
22
|
+
]
|
23
|
+
])
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'should parse two simple paragraphs' do
|
27
|
+
"буба сука дебил\n\nточно!".should be_parsed_as([
|
28
|
+
[
|
29
|
+
[ [ 'буба', ' ', 'сука', ' ', 'дебил' ] ]
|
30
|
+
],
|
31
|
+
[
|
32
|
+
[ [ 'точно', '!' ] ]
|
33
|
+
]
|
34
|
+
])
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should parse two sentences in paragraph' do
|
38
|
+
"буба молодец? буба умница.".should be_parsed_as([
|
39
|
+
[
|
40
|
+
[ [ 'буба', ' ', 'молодец', '?' ] ],
|
41
|
+
[ [ 'буба', ' ', 'умница', '.' ] ]
|
42
|
+
]
|
43
|
+
])
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should parse sentences with floating point values' do
|
47
|
+
'буба не считает Пи равной 3.14'.should be_parsed_as([
|
48
|
+
[
|
49
|
+
[ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
|
50
|
+
'Пи', ' ', 'равной', ' ', '3.14' ] ]
|
51
|
+
]
|
52
|
+
])
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should parse sentences with floating "dot" values' do
|
56
|
+
'буба не считает Пи равной 3,14'.should be_parsed_as([
|
57
|
+
[
|
58
|
+
[ [ 'буба', ' ', 'не', ' ', 'считает', ' ',
|
59
|
+
'Пи', ' ', 'равной', ' ', '3,14' ] ]
|
60
|
+
]
|
61
|
+
])
|
62
|
+
end
|
63
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require File.expand_path('../../lib/greeb', __FILE__)
|
4
|
+
|
5
|
+
RSpec.configure do |c|
|
6
|
+
c.mock_with :rspec
|
7
|
+
end
|
8
|
+
|
9
|
+
RSpec::Matchers.define :be_parsed_as do |expected|
|
10
|
+
match do |actual|
|
11
|
+
tree = Greeb::Parser.new(actual).parse
|
12
|
+
tree == expected
|
13
|
+
end
|
14
|
+
end
|
metadata
CHANGED
@@ -1,18 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: greeb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Dmitry A. Ustalov
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-02-
|
13
|
-
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
12
|
+
date: 2011-02-20 00:00:00.000000000 +05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
requirement: &81165430 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.4.0
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *81165430
|
26
|
+
description: Greeb is awesome Graphematical Analyzer, written in Ruby.
|
16
27
|
email:
|
17
28
|
- dmitry@eveel.ru
|
18
29
|
executables: []
|
@@ -21,18 +32,21 @@ extra_rdoc_files: []
|
|
21
32
|
files:
|
22
33
|
- .gitignore
|
23
34
|
- Gemfile
|
35
|
+
- Gemfile.lock
|
24
36
|
- README
|
25
37
|
- Rakefile
|
26
38
|
- greeb-test.rb
|
27
39
|
- greeb.gemspec
|
28
40
|
- lib/enumerable.rb
|
29
41
|
- lib/greeb.rb
|
30
|
-
- lib/greeb/
|
42
|
+
- lib/greeb/parser.rb
|
31
43
|
- lib/meta_array.rb
|
44
|
+
- spec/parser_spec.rb
|
45
|
+
- spec/spec_helper.rb
|
32
46
|
has_rdoc: true
|
33
47
|
homepage: https://github.com/eveel/greeb
|
34
48
|
licenses: []
|
35
|
-
post_install_message:
|
49
|
+
post_install_message:
|
36
50
|
rdoc_options: []
|
37
51
|
require_paths:
|
38
52
|
- lib
|
@@ -50,8 +64,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
50
64
|
version: '0'
|
51
65
|
requirements: []
|
52
66
|
rubyforge_project: greeb
|
53
|
-
rubygems_version: 1.5.
|
54
|
-
signing_key:
|
67
|
+
rubygems_version: 1.5.2
|
68
|
+
signing_key:
|
55
69
|
specification_version: 3
|
56
70
|
summary: Greeb is a Graphematical Analyzer.
|
57
|
-
test_files:
|
71
|
+
test_files:
|
72
|
+
- spec/parser_spec.rb
|
73
|
+
- spec/spec_helper.rb
|