greeb 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +35 -0
- data/Gemfile +3 -0
- data/README +0 -0
- data/Rakefile +4 -0
- data/greeb-test.rb +141 -0
- data/greeb.gemspec +23 -0
- data/lib/enumerable.rb +8 -0
- data/lib/greeb.rb +144 -0
- data/lib/greeb/version.rb +5 -0
- data/lib/meta_array.rb +11 -0
- metadata +57 -0
data/.gitignore
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
## MAC OS
|
2
|
+
.DS_Store
|
3
|
+
|
4
|
+
## TEXTMATE
|
5
|
+
*.tmproj
|
6
|
+
tmtags
|
7
|
+
|
8
|
+
## EMACS
|
9
|
+
*~
|
10
|
+
\#*
|
11
|
+
.\#*
|
12
|
+
|
13
|
+
## VIM
|
14
|
+
*.swp
|
15
|
+
|
16
|
+
## RUBINIUS
|
17
|
+
*.rbc
|
18
|
+
|
19
|
+
## NETBEANS
|
20
|
+
nbproject
|
21
|
+
|
22
|
+
## REDCAR
|
23
|
+
.redcar
|
24
|
+
|
25
|
+
## RVM
|
26
|
+
.rvmrc
|
27
|
+
|
28
|
+
## BUNDLER
|
29
|
+
.bundle
|
30
|
+
|
31
|
+
## PROJECT::GENERAL
|
32
|
+
coverage
|
33
|
+
pkg
|
34
|
+
|
35
|
+
## PROJECT::SPECIFIC
|
data/Gemfile
ADDED
data/README
ADDED
File without changes
|
data/Rakefile
ADDED
data/greeb-test.rb
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'graphviz'
|
6
|
+
|
7
|
+
$:.unshift('./lib')
|
8
|
+
require 'greeb'
|
9
|
+
|
10
|
+
origin = <<-END
|
11
|
+
- Сынок, чего это от тебя зигами пахнет,
|
12
|
+
опять на Манежную площадь ходил?
|
13
|
+
|
14
|
+
- Нет мама, я в метро ехал, там назиговано было!!
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
Четырнадцать, восемьдесять восемь: 14/88.
|
19
|
+
Вот так блять
|
20
|
+
END
|
21
|
+
origin.chomp!
|
22
|
+
|
23
|
+
# Returns the name of the first Greeb token class whose pattern matches
# +token+, or '?!' when none of the patterns match.
#
# The patterns are tried in the order listed below, so a token that
# satisfies several of them (for example digits, which match both DIG and
# DIL) is reported under the earliest one.
def identify(token)
  token_classes = [
    [ Greeb::RU_LEX, 'RU_LEX' ],
    [ Greeb::EN_LEX, 'EN_LEX' ],
    [ Greeb::EOL,    'EOL' ],
    [ Greeb::SEP,    'SEP' ],
    [ Greeb::PUN,    'PUN' ],
    [ Greeb::SPUN,   'SPUN' ],
    [ Greeb::DIG,    'DIG' ],
    [ Greeb::DIL,    'DIL' ]
  ]

  # Same matching operator that +case+/+when+ applies.
  found = token_classes.find { |pattern, _name| pattern === token }
  found ? found.last : '?!'
end
|
37
|
+
|
38
|
+
# Parse the sample text into its nested paragraph / sentence /
# subsentence / token structure.
parser = Greeb::Parser.new(origin)
parsed = parser.tree

graph = GraphViz.new('graphematics', 'type' => 'graph')

# Default appearance of every node.
[
  [ :color,     '#ddaa66' ],
  [ :style,     'filled' ],
  [ :shape,     'box' ],
  [ :penwidth,  '1' ],
  [ :fontname,  'PT Sans' ],
  [ :fontsize,  '8' ],
  [ :fillcolor, '#ffeecc' ],
  [ :fontcolor, '#775500' ],
  [ :margin,    '0.0' ]
].each { |attribute, value| graph.node[attribute] = value }

# Default appearance of every edge.
[
  [ :color,     '#999999' ],
  [ :weight,    '1' ],
  [ :fontname,  'PT Sans' ],
  [ :fontcolor, '#444444' ],
  [ :fontsize,  '6' ],
  [ :dir,       'forward' ],
  [ :arrowsize, '0.5' ]
].each { |attribute, value| graph.edge[attribute] = value }

# Terminal node that every paragraph hangs off.
start_id = 'begin'
start_node = graph.add_node(start_id)
start_node.label = "Начало\nтекста"
start_node.shape = 'ellipse'
start_node.style = ''

# Terminal node that every token points to.
finish_id = 'end'
finish_node = graph.add_node(finish_id)
finish_node.label = "Конец\nтекста"
finish_node.shape = 'ellipse'
finish_node.style = ''

# Annotate every level of the parsed tree with a unique node id and its
# index, producing [ id, children, index ] triples. Single-space tokens
# are left out of the token lists.
labelled = parsed.map_with_index do |paragraph, p_index|
  paragraph_id = "p#{p_index}"

  sentences = paragraph.map_with_index do |sentence, s_index|
    sentence_id = "#{paragraph_id}s#{s_index}"

    subsentences = sentence.map_with_index do |subsentence, ss_index|
      subsentence_id = "#{sentence_id}ss#{ss_index}"

      tokens = subsentence.map_with_index do |token, t_index|
        next if ' ' == token
        [ "#{subsentence_id}t#{t_index}", token, t_index ]
      end
      tokens.delete(nil) # drop the slots left by skipped spaces

      [ subsentence_id, tokens, ss_index ]
    end

    [ sentence_id, subsentences, s_index ]
  end

  [ paragraph_id, sentences, p_index ]
end

# Emit nodes and edges for every level of the annotated tree.
labelled.each do |paragraph_id, sentences, p_index|
  paragraph_node = graph.add_node(paragraph_id)
  paragraph_node.label = "Абзац\n№#{p_index + 1}"
  paragraph_node.shape = 'ellipse'
  graph.add_edge(start_id, paragraph_id)

  sentences.each do |sentence_id, subsentences, s_index|
    sentence_node = graph.add_node(sentence_id)
    sentence_node.label = "Предложение\n№#{s_index + 1}"
    sentence_node.shape = 'ellipse'
    graph.add_edge(paragraph_id, sentence_id)

    subsentences.each do |subsentence_id, tokens, ss_index|
      subsentence_node = graph.add_node(subsentence_id)
      subsentence_node.label = "Подпредложение\n№#{ss_index + 1}"
      subsentence_node.shape = 'ellipse'
      graph.add_edge(sentence_id, subsentence_id)

      # One node per token; the incoming edge is labelled with the
      # token class reported by identify.
      tokens.each do |token_id, token, _t_index|
        graph.add_node(token_id).label = token
        graph.add_edge(subsentence_id, token_id).label = identify(token)
        graph.add_edge(token_id, finish_id)
      end

      # Dashed edges between neighbouring tokens of one subsentence.
      tokens.each_cons(2) do |left, right|
        neighbour_edge = graph.add_edge(left.first, right.first)
        neighbour_edge.weight = 0.25
        neighbour_edge.style = 'dashed'
      end
    end

    # Dashed edges from the last token of a subsentence to the first
    # token of the following one.
    subsentences.each_cons(2) do |(_id_a, tokens_a, _k_a),
                                  (_id_b, tokens_b, _k_b)|
      from_id, = tokens_a.last
      to_id, = tokens_b.first
      bridge_edge = graph.add_edge(from_id, to_id)
      bridge_edge.weight = 0.5
      bridge_edge.style = 'dashed'
    end
  end
end

graph.output(:output => 'png', :file => 'graph.png')
|
data/greeb.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
$:.push File.expand_path('../lib', __FILE__)
|
4
|
+
require 'greeb/version'
|
5
|
+
|
6
|
+
# Gem specification for greeb. The file lists are taken from the git
# index, so this must be evaluated inside a git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name     = 'greeb'
  spec.version  = Greeb::VERSION
  spec.platform = Gem::Platform::RUBY

  # Authorship
  spec.authors  = [ 'Dmitry A. Ustalov' ]
  spec.email    = [ 'dmitry@eveel.ru' ]
  spec.homepage = 'https://github.com/eveel/greeb'

  # Descriptions
  spec.summary     = 'Greeb is a Graphematical Analyzer.'
  spec.description = 'Greeb is a Graphematical Analyzer, written in Ruby.'

  spec.rubyforge_project = 'greeb'

  # Packaged files: everything tracked by git; tests and executables are
  # the tracked files under the conventional directories.
  tracked_files = `git ls-files`
  tracked_tests = `git ls-files -- {test,spec,features}/*`
  tracked_bins  = `git ls-files -- bin/*`

  spec.files         = tracked_files.split("\n")
  spec.test_files    = tracked_tests.split("\n")
  spec.executables   = tracked_bins.split("\n").map { |path| File.basename(path) }
  spec.require_paths = [ 'lib' ]
end
|
data/lib/enumerable.rb
ADDED
data/lib/greeb.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'meta_array'
|
4
|
+
require 'enumerable'
|
5
|
+
|
6
|
+
# Greeb splits a text into paragraphs, sentences, subsentences and tokens.
#
# The constants below are the token-class patterns. The parser applies
# them to single characters and to the token accumulated so far.
module Greeb
  # One or more Cyrillic letters in the ranges А-Я / а-я.
  RU_LEX = /^[А-Яа-я]+$/u
  # One or more Latin letters.
  EN_LEX = /^[A-Za-z]+$/u
  # One or more line feeds.
  EOL = /^\n+$/u
  # A single separator character: * = _ / \ or a space.
  SEP = /^[*=_\/\\ ]$/u
  # Sentence-terminating punctuation: . ! ?
  PUN = /^(\.|\!|\?)$/u
  # Subsentence-terminating punctuation: , [ ] ( ) - : ;
  SPUN = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
  # One or more decimal digits.
  DIG = /^[0-9]+$/u
  # One or more letters (either alphabet) and/or digits.
  DIL = /^[А-Яа-яA-Za-z0-9]+$/u
  # The empty token (nothing accumulated yet).
  EMPTY = ''

  # Parses a text into a nested array of
  # paragraphs -> sentences -> subsentences -> tokens.
  class Parser
    # The text being analysed. Readable by anyone, writable only from
    # inside the parser.
    attr_accessor :origin
    private :origin=

    attr_writer :tree
    private :tree=

    # origin:: the String to analyse.
    def initialize(origin)
      self.origin = origin
    end

    # Returns the parsed tree. The text is parsed on the first call and
    # the result is reused afterwards.
    def tree
      @tree ||= parse(origin)
    end

    private

    # Walks +origin+ character by character and builds the token tree.
    #
    # * two consecutive line feeds start a new paragraph;
    # * a single line feed followed by a letter or digit is recorded as
    #   a ' ' token;
    # * a PUN character closes the current sentence;
    # * a SPUN character closes the current subsentence;
    # * a SEP character closes the current token and is stored as a
    #   token of its own.
    #
    # Punctuation and separators that arrive while no token is being
    # accumulated are skipped, and characters matching none of the
    # classes are ignored (the +case+ has no +else+ branch).
    #
    # Relies on MetaArray (required by this file, defined elsewhere) to
    # create the nested containers on index access -- presumably it
    # auto-vivifies missing elements; confirm against lib/meta_array.rb.
    def parse(origin) # :nodoc:
      tree = MetaArray.new

      p_id = 0  # paragraph index
      s_id = 0  # sentence index within the paragraph
      ss_id = 0 # subsentence index within the sentence

      token = ''

      origin.each_char do |c|
        # Per-character trace; only printed when Ruby runs with -d /
        # $DEBUG so that library users do not get stdout noise.
        puts "[#{token.inspect}] ← #{c.inspect}" if $DEBUG

        case c
        when EOL
          case token
          when EMPTY then token << c
          when EOL
            # Second line feed in a row: paragraph break.
            token = ''
            p_id += 1
            s_id = 0
            ss_id = 0
          else
            tree[p_id][s_id][ss_id] << token
            token = c
          end
        when SEP
          unless token.empty?
            tree[p_id][s_id][ss_id] << token
            # NOTE(review): the element just pushed is the finished
            # token, which never seems to equal a separator character,
            # so this loop looks like a no-op. Kept as in the original.
            while tree[p_id][s_id][ss_id].last == c
              tree[p_id][s_id][ss_id].pop
            end
            tree[p_id][s_id][ss_id] << c
            token = ''
          end
        when PUN
          unless token.empty?
            tree[p_id][s_id][ss_id] << token
            tree[p_id][s_id][ss_id] << c
            token = ''
            s_id += 1
            ss_id = 0
          end
        when SPUN
          unless token.empty?
            tree[p_id][s_id][ss_id] << token
            tree[p_id][s_id][ss_id] << c
            token = ''
            ss_id += 1
          end
        when RU_LEX, EN_LEX, DIG
          case token
          when EOL
            # A lone line feed inside a paragraph acts as a space.
            tree[p_id][s_id][ss_id] << ' '
            token = c
          else
            token << c
          end
        when DIL
          # NOTE(review): appears unreachable -- any single character
          # matching DIL already matches RU_LEX, EN_LEX or DIG above.
          case token
          when EOL
            tree[p_id][s_id][ss_id] << token
            token = c
          else
            token << c
          end
        end
      end

      # Flush the last token. Skipping an empty one avoids appending a
      # bogus sentence holding '' when the text ends with punctuation
      # (or is empty).
      tree[p_id][s_id][ss_id] << token unless token.empty?
      tree.delete(nil)
      tree.to_a
    end
  end
end
|
data/lib/meta_array.rb
ADDED
metadata
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: greeb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease: !!null
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Dmitry A. Ustalov
|
9
|
+
autorequire: !!null
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-02-06 00:00:00.000000000 +05:00
|
13
|
+
default_executable: !!null
|
14
|
+
dependencies: []
|
15
|
+
description: Greeb is a Graphematical Analyzer, written in Ruby.
|
16
|
+
email:
|
17
|
+
- dmitry@eveel.ru
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- .gitignore
|
23
|
+
- Gemfile
|
24
|
+
- README
|
25
|
+
- Rakefile
|
26
|
+
- greeb-test.rb
|
27
|
+
- greeb.gemspec
|
28
|
+
- lib/enumerable.rb
|
29
|
+
- lib/greeb.rb
|
30
|
+
- lib/greeb/version.rb
|
31
|
+
- lib/meta_array.rb
|
32
|
+
has_rdoc: true
|
33
|
+
homepage: https://github.com/eveel/greeb
|
34
|
+
licenses: []
|
35
|
+
post_install_message: !!null
|
36
|
+
rdoc_options: []
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
46
|
+
none: false
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
requirements: []
|
52
|
+
rubyforge_project: greeb
|
53
|
+
rubygems_version: 1.5.0
|
54
|
+
signing_key: !!null
|
55
|
+
specification_version: 3
|
56
|
+
summary: Greeb is a Graphematical Analyzer.
|
57
|
+
test_files: []
|