greeb 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +35 -0
- data/Gemfile +3 -0
- data/README +0 -0
- data/Rakefile +4 -0
- data/greeb-test.rb +141 -0
- data/greeb.gemspec +23 -0
- data/lib/enumerable.rb +8 -0
- data/lib/greeb.rb +144 -0
- data/lib/greeb/version.rb +5 -0
- data/lib/meta_array.rb +11 -0
- metadata +57 -0
data/.gitignore
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
## MAC OS
|
2
|
+
.DS_Store
|
3
|
+
|
4
|
+
## TEXTMATE
|
5
|
+
*.tmproj
|
6
|
+
tmtags
|
7
|
+
|
8
|
+
## EMACS
|
9
|
+
*~
|
10
|
+
\#*
|
11
|
+
.\#*
|
12
|
+
|
13
|
+
## VIM
|
14
|
+
*.swp
|
15
|
+
|
16
|
+
## RUBINIUS
|
17
|
+
*.rbc
|
18
|
+
|
19
|
+
## NETBEANS
|
20
|
+
nbproject
|
21
|
+
|
22
|
+
## REDCAR
|
23
|
+
.redcar
|
24
|
+
|
25
|
+
## RVM
|
26
|
+
.rvmrc
|
27
|
+
|
28
|
+
## BUNDLER
|
29
|
+
.bundle
|
30
|
+
|
31
|
+
## PROJECT::GENERAL
|
32
|
+
coverage
|
33
|
+
pkg
|
34
|
+
|
35
|
+
## PROJECT::SPECIFIC
|
data/Gemfile
ADDED
data/README
ADDED
File without changes
|
data/Rakefile
ADDED
data/greeb-test.rb
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'graphviz'
|
6
|
+
|
7
|
+
$:.unshift('./lib')
|
8
|
+
require 'greeb'
|
9
|
+
|
10
|
+
origin = <<-END
|
11
|
+
- Сынок, чего это от тебя зигами пахнет,
|
12
|
+
опять на Манежную площадь ходил?
|
13
|
+
|
14
|
+
- Нет мама, я в метро ехал, там назиговано было!!
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
Четырнадцать, восемьдесять восемь: 14/88.
|
19
|
+
Вот так блять
|
20
|
+
END
|
21
|
+
origin.chomp!
|
22
|
+
|
23
|
+
# Names the Greeb token class that +token+ belongs to.
#
# The patterns are tried in the order listed and the first one that matches
# wins; '?!' is returned when no pattern matches.
def identify(token)
  categories = [
    [Greeb::RU_LEX, 'RU_LEX'],
    [Greeb::EN_LEX, 'EN_LEX'],
    [Greeb::EOL,    'EOL'],
    [Greeb::SEP,    'SEP'],
    [Greeb::PUN,    'PUN'],
    [Greeb::SPUN,   'SPUN'],
    [Greeb::DIG,    'DIG'],
    [Greeb::DIL,    'DIL']
  ]

  # `===` is used on purpose: it is the same comparison `case/when` performs.
  hit = categories.find { |pattern, _label| pattern === token }
  hit ? hit.last : '?!'
end
|
37
|
+
|
38
|
+
# Parse the sample text; the result is nested as
# paragraphs -> sentences -> subsentences -> tokens.
parser = Greeb::Parser.new(origin)
text = parser.tree

graph = GraphViz.new('graphematics', 'type' => 'graph')

# Default attributes for every node, applied in the listed order.
[
  [:color,     '#ddaa66'],
  [:style,     'filled'],
  [:shape,     'box'],
  [:penwidth,  '1'],
  [:fontname,  'PT Sans'],
  [:fontsize,  '8'],
  [:fillcolor, '#ffeecc'],
  [:fontcolor, '#775500'],
  [:margin,    '0.0']
].each { |attribute, value| graph.node[attribute] = value }

# Default attributes for every edge, applied in the listed order.
[
  [:color,     '#999999'],
  [:weight,    '1'],
  [:fontname,  'PT Sans'],
  [:fontcolor, '#444444'],
  [:fontsize,  '6'],
  [:dir,       'forward'],
  [:arrowsize, '0.5']
].each { |attribute, value| graph.edge[attribute] = value }

# Adds an elliptical node with the given label. When +unfilled+ is true the
# node's style is reset to '' (used for the begin/end markers).
ellipse = lambda do |id, label, unfilled|
  node = graph.add_node(id)
  node.label = label
  node.shape = 'ellipse'
  node.style = '' if unfilled
  node
end

# Adds a dashed edge of the given weight between two nodes.
dashed_edge = lambda do |from, to, weight|
  edge = graph.add_edge(from, to)
  edge.weight = weight
  edge.style = 'dashed'
  edge
end

start_id = 'begin'
ellipse.call(start_id, "Начало\nтекста", true)

finish_id = 'end'
ellipse.call(finish_id, "Конец\nтекста", true)

# Attach a node id and a position to every level of the parsed text.
# Each entry has the shape [id, children_or_token, index].
labelled = text.map_with_index do |paragraph, p_index|
  p_key = "p#{p_index}"

  p_children = paragraph.map_with_index do |sentence, s_index|
    s_key = "#{p_key}s#{s_index}"

    s_children = sentence.map_with_index do |subsentence, ss_index|
      ss_key = "#{s_key}ss#{ss_index}"

      entries = subsentence.map_with_index do |token, t_index|
        # Single-space tokens get no node of their own.
        token == ' ' ? nil : ["#{ss_key}t#{t_index}", token, t_index]
      end
      entries.delete(nil)

      [ss_key, entries, ss_index]
    end

    [s_key, s_children, s_index]
  end

  [p_key, p_children, p_index]
end

# Emit nodes and edges level by level.
labelled.each do |p_key, p_children, p_index|
  ellipse.call(p_key, "Абзац\n№#{p_index + 1}", false)
  graph.add_edge(start_id, p_key)

  p_children.each do |s_key, s_children, s_index|
    ellipse.call(s_key, "Предложение\n№#{s_index + 1}", false)
    graph.add_edge(p_key, s_key)

    s_children.each do |ss_key, entries, ss_index|
      ellipse.call(ss_key, "Подпредложение\n№#{ss_index + 1}", false)
      graph.add_edge(s_key, ss_key)

      # One node per token, linked from its subsentence (the edge is
      # labelled with the token class) and on to the end marker.
      entries.each do |t_key, token|
        graph.add_node(t_key).label = token
        graph.add_edge(ss_key, t_key).label = identify(token)
        graph.add_edge(t_key, finish_id)
      end

      # Chain neighbouring tokens inside the subsentence.
      entries.each_cons(2) do |left, right|
        dashed_edge.call(left.first, right.first, 0.25)
      end
    end

    # Link the last token of each subsentence to the first token of the next.
    s_children.each_cons(2) do |earlier, later|
      from_key, = earlier[1].last
      to_key, = later[1].first
      dashed_edge.call(from_key, to_key, 0.5)
    end
  end
end

graph.output(:output => 'png', :file => 'graph.png')
|
data/greeb.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
$:.push File.expand_path('../lib', __FILE__)
|
4
|
+
require 'greeb/version'
|
5
|
+
|
6
|
+
# Gem specification for greeb. The file lists are taken from the files
# tracked by git, so this must be evaluated inside the git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name     = 'greeb'
  spec.version  = Greeb::VERSION
  spec.platform = Gem::Platform::RUBY

  # Authorship
  spec.authors  = ['Dmitry A. Ustalov']
  spec.email    = ['dmitry@eveel.ru']
  spec.homepage = 'https://github.com/eveel/greeb'

  # Descriptions
  spec.summary     = 'Greeb is a Graphematical Analyzer.'
  spec.description = 'Greeb is a Graphematical Analyzer, written in Ruby.'

  spec.rubyforge_project = 'greeb'

  # Contents
  tracked_files = `git ls-files`
  tracked_tests = `git ls-files -- {test,spec,features}/*`
  tracked_bins  = `git ls-files -- bin/*`

  spec.files         = tracked_files.split("\n")
  spec.test_files    = tracked_tests.split("\n")
  spec.executables   = tracked_bins.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ['lib']
end
|
data/lib/enumerable.rb
ADDED
data/lib/greeb.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'meta_array'
|
4
|
+
require 'enumerable'
|
5
|
+
|
6
|
+
# Greeb is a graphematical analyzer: Greeb::Parser splits a text into
# paragraphs, sentences, subsentences and tokens.
module Greeb
  # A run of Russian letters. Ё/ё are listed explicitly because they lie
  # outside the contiguous А-Я / а-я code point ranges.
  RU_LEX = /^[А-Яа-яЁё]+$/u
  # A run of Latin letters.
  EN_LEX = /^[A-Za-z]+$/u
  # One or more line feeds.
  EOL = /^\n+$/u
  # A single separator character (the space included).
  SEP = /^[*=_\/\\ ]$/u
  # Punctuation that closes a sentence.
  PUN = /^(\.|\!|\?)$/u
  # Punctuation that closes a subsentence.
  SPUN = /^(\,|\[|\]|\(|\)|\-|:|;)$/u
  # A run of decimal digits.
  DIG = /^[0-9]+$/u
  # A run mixing Russian letters, Latin letters and digits.
  DIL = /^[А-Яа-яЁёA-Za-z0-9]+$/u
  # The empty token.
  EMPTY = ''.freeze

  # Character-by-character parser that builds the text tree lazily.
  #
  #   Greeb::Parser.new("Привет, мир!").tree
  #   # => [[[["Привет", ","], ["мир", "!"]]]]
  class Parser
    attr_accessor :origin
    private :origin=

    attr_writer :tree
    private :tree=

    # origin:: the String to analyze.
    def initialize(origin)
      self.origin = origin
    end

    # Returns the parsed text as nested arrays indexed as
    # tree[paragraph][sentence][subsentence] -> array of token strings.
    # The text is parsed on first access and the result is memoized.
    def tree
      @tree ||= parse(origin)
    end

    private

    # Walks +origin+ one character at a time, accumulating lexeme characters
    # in +token+ and flushing it into the tree at every boundary character.
    #
    # Characters matching none of the classes below are skipped, and
    # separators/punctuation that arrive while no token is pending are
    # skipped as well.
    def parse(origin) # :nodoc:
      tree = MetaArray.new

      # Indices of the current paragraph, sentence and subsentence.
      p_id = 0
      s_id = 0
      ss_id = 0

      # `dup` guarantees a mutable buffer: it is appended to with `<<`.
      token = ''.dup

      origin.each_char do |c|
        # Per-character trace, on stderr and only under `ruby -d`, so that
        # library users do not get debug noise on stdout.
        warn "[#{token.inspect}] ← #{c.inspect}" if $DEBUG

        case c
        when EOL
          case token
          when EMPTY
            # First line feed after a boundary: remember it.
            token << c
          when EOL
            # Second consecutive line feed: start a new paragraph.
            token = ''.dup
            p_id += 1
            s_id = 0
            ss_id = 0
          else
            # Line feed right after a lexeme: flush the lexeme.
            tree[p_id][s_id][ss_id] << token
            token = c
          end
        when SEP
          unless token.empty?
            tree[p_id][s_id][ss_id] << token
            # NOTE(review): the element just pushed is `token`, which is never
            # a separator, so this loop looks like a no-op — confirm intent.
            while tree[p_id][s_id][ss_id].last == c
              tree[p_id][s_id][ss_id].pop
            end
            tree[p_id][s_id][ss_id] << c
            token = ''.dup
          end
        when PUN
          unless token.empty?
            tree[p_id][s_id][ss_id] << token
            tree[p_id][s_id][ss_id] << c
            token = ''.dup
            # Sentence boundary.
            s_id += 1
            ss_id = 0
          end
        when SPUN
          unless token.empty?
            tree[p_id][s_id][ss_id] << token
            tree[p_id][s_id][ss_id] << c
            token = ''.dup
            # Subsentence boundary.
            ss_id += 1
          end
        when RU_LEX, EN_LEX, DIG, DIL
          # Every character DIL accepts is already accepted by one of the
          # three classes before it; it is listed only for completeness.
          if token =~ EOL
            # A single pending line feed is recorded as a space token.
            tree[p_id][s_id][ss_id] << ' '
            token = c
          else
            token << c
          end
        end
      end

      # Flush what is left, but never emit an empty token: otherwise a text
      # ending in punctuation would gain a spurious trailing [''] sentence.
      tree[p_id][s_id][ss_id] << token unless token.empty?

      # Paragraph slots that were skipped over are nil; drop them.
      tree.delete(nil)
      tree.to_a
    end
  end
end
|
data/lib/meta_array.rb
ADDED
metadata
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: greeb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease: !!null
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Dmitry A. Ustalov
|
9
|
+
autorequire: !!null
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-02-06 00:00:00.000000000 +05:00
|
13
|
+
default_executable: !!null
|
14
|
+
dependencies: []
|
15
|
+
description: Greeb is a Graphematical Analyzer, written in Ruby.
|
16
|
+
email:
|
17
|
+
- dmitry@eveel.ru
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- .gitignore
|
23
|
+
- Gemfile
|
24
|
+
- README
|
25
|
+
- Rakefile
|
26
|
+
- greeb-test.rb
|
27
|
+
- greeb.gemspec
|
28
|
+
- lib/enumerable.rb
|
29
|
+
- lib/greeb.rb
|
30
|
+
- lib/greeb/version.rb
|
31
|
+
- lib/meta_array.rb
|
32
|
+
has_rdoc: true
|
33
|
+
homepage: https://github.com/eveel/greeb
|
34
|
+
licenses: []
|
35
|
+
post_install_message: !!null
|
36
|
+
rdoc_options: []
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
46
|
+
none: false
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
requirements: []
|
52
|
+
rubyforge_project: greeb
|
53
|
+
rubygems_version: 1.5.0
|
54
|
+
signing_key: !!null
|
55
|
+
specification_version: 3
|
56
|
+
summary: Greeb is a Graphematical Analyzer.
|
57
|
+
test_files: []
|