srx2ruby 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/Rakefile +2 -0
- data/bin/srx2ruby +2 -0
- data/lib/srx2ruby.rb +150 -0
- data/srx2ruby.gemspec +22 -0
- metadata +63 -0
data/.gitignore
ADDED
data/Rakefile
ADDED
data/bin/srx2ruby
ADDED
data/lib/srx2ruby.rb
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
require 'rexml/document'
|
4
|
+
require 'pp'
|
5
|
+
include REXML
|
6
|
+
|
7
|
+
|
8
|
+
# The converter needs the SRX file, the output path and at least one
# language rule set name; otherwise print the usage text and stop.
unless ARGV.size >= 3
  puts [
    "Usage: srx2ruby rules_file.srx output.rb LanguageRuleSet1 [LanguageRuleSet2 ...]",
    "rules_file.srx - file with SRX rules",
    "output.rb - the file with Ruby code implementing the breaking rules",
    "LanguageRuleSet* - selected language rules"
  ]
  exit
end
|
15
|
+
|
16
|
+
# Parse the SRX file named by the first command-line argument; the block
# form of File.open closes the file once the document has been built.
xml = File.open(ARGV[0]) { |srx_file| Document.new(srx_file) }
|
20
|
+
|
21
|
+
# Statistics printed once all selected rules have been read.
breaking_rules = 0
nonbreaking_rules = 0
invalid_rules = 0

# Accumulates [beforebreak, afterbreak, should_break] triples in file order.
RULES = []

# Matches the opening parenthesis of a plain capturing group so it can be
# rewritten to a non-capturing "(?:".
# * The look-behind skips parentheses preceded by a backslash without
#   consuming the previous character, so adjacent parentheses such as
#   "((a)b)" are both rewritten.
# * The look-ahead leaves every "(?" construct untouched (look-around,
#   atomic, named and flag groups); rewriting those would yield an invalid
#   pattern such as "(?:?=x)".
# The pattern has no capture groups, so "\\1"/"\\2" in a replacement string
# simply expand to nothing.
PAR_RE = /(?<!\\)\((?!\?)/

# The "(?iu)" flag group found in SRX rules; replaced with "(?i)".
GROUP_RE = /\(\?iu\)/

# A character class containing "-" directly followed by an en dash; the
# replacement moves the "-" to the end of the class.
DASH_RE = /(\[(?:[^\]]|\\\])+)-(–(?:[^\]]|\\\])+)\]/
|
29
|
+
|
30
|
+
# Read every <rule> of the language rule sets named on the command line,
# translate its patterns to Ruby regexp syntax and append the result to RULES.
selected_languages = ARGV[2..-1]

# Rewrites one SRX pattern into Ruby syntax; nil (missing or empty element)
# is passed through unchanged.
to_ruby_syntax = lambda do |pattern|
  pattern && pattern.gsub(PAR_RE, "\\1(?:").
                     gsub(GROUP_RE, "(?i)").
                     gsub(DASH_RE, "\\1\\2-]")
end

xml.each_element('//languagerule/') do |language|
  language_name = language.attributes['languagerulename']
  next unless selected_languages.include?(language_name)
  puts language_name

  language.each_element('rule') do |rule|
    # SRX makes the "break" attribute optional with a default of "yes",
    # so only an explicit "no" marks an exception (non-breaking) rule.
    should_break = rule.attributes['break'] != "no"
    if should_break
      breaking_rules += 1
    else
      nonbreaking_rules += 1
    end

    # Either element may be absent from a rule; treat that as "no pattern"
    # instead of calling #text on nil.
    before, after = %w[beforebreak afterbreak].map do |tag|
      element = rule.elements[tag]
      to_ruby_syntax.call(element && element.text)
    end

    begin
      # Compile once purely to validate the translated patterns; a
      # RegexpError means the rule cannot be expressed and is skipped.
      Regexp.new("(?:(#{before})(#{after}))")
      RULES << [before, after, should_break]
    rescue RegexpError => ex
      puts ex
      invalid_rules += 1
    end
  end
end
|
58
|
+
|
59
|
+
# Merge runs of consecutive rules that share the same afterbreak pattern and
# break flag into a single rule whose beforebreak pattern is the alternation
# of the originals. Only adjacent rules are merged, so rule order is kept.
# An empty RULES list (no rule set matched the requested names) yields an
# empty result instead of failing on RULES.first.
warn "No rules were selected; the generated code will contain no rules." if RULES.empty?

rule_runs = RULES.chunk { |_before, after, should_break| [after, should_break] }
CONSOLIDATED_RULES = rule_runs.map do |(after, should_break), run|
  before_union = run.map { |before, _after, _flag| "(?:#{before})" }.join("|")
  [before_union, after, should_break]
end

# Three counters are printed, so all three are named in the label.
puts "Breaking/nonbreaking/invalid #{breaking_rules}/#{nonbreaking_rules}/#{invalid_rules}"
|
76
|
+
|
77
|
+
# Templates for the generated Ruby file. The output is
#   result1 + pretty-printed rule list + result2
# Both heredocs are single-quoted: their text is emitted verbatim, so the
# "#{...}" sequences below are interpolated when the *generated* file is
# loaded, not while this script runs.

# Header of the generated file, up to the assignment of the rule list.
result1 = <<-'END'
#encoding: utf-8
require 'stringio'
module SRX
  RULES =
END

# Remainder of the generated file: regexps compiled from RULES and the
# Sentence splitter.
#
# Sentence#each keeps two sliding windows of at most 10 characters around the
# current position (text before / text after) and tests the rules at every
# position. Compared with the first version of the template:
# * the look-ahead window is filled without reading past the end of the
#   input, so texts shorter than the window no longer raise EOFError;
# * scanning continues until the look-ahead window is drained, so break
#   positions inside the last 10 characters are tested and the tail of the
#   text is always yielded;
# * term/ansicolor is loaded only when debug output is requested.
result2 = <<-'END'
  BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
  REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
  FIRST_CHAR = /\A./m

  class Sentence
    attr_accessor :input
    attr_writer :debug

    def initialize(text=nil)
      if text.is_a?(String)
        @input = StringIO.new(text,"r:utf-8")
      else
        @input = text
      end
    end

    def each
      require 'term/ansicolor' if @debug
      buffer_length = 10
      sentence = ""
      before_buffer = ""
      after_buffer = ""
      buffer_length.times do
        break if @input.eof?
        after_buffer << @input.readchar
      end
      until after_buffer.empty?
        if BEFORE_RE.match(before_buffer)
          if @debug
            puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
          end
          REGEXPS.each do |before_re,after_re,value|
            next unless before_re.match(before_buffer) && after_re.match(after_buffer)
            if @debug
              color = value ? :red : :green
              sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
            end
            if value
              yield sentence
              sentence = ""
            end
            # only the first matching rule decides
            break
          end
        end
        before_buffer.sub!(FIRST_CHAR,"") if before_buffer.size >= buffer_length
        moved = after_buffer.slice!(0,1)
        before_buffer << moved
        sentence << moved
        after_buffer << @input.readchar unless @input.eof?
      end
      yield sentence unless sentence.empty?
    end
  end
end
END
|
146
|
+
# Write the generated file to the path given as the second argument:
# header template, pretty-printed consolidated rules, then the code template.
File.open(ARGV[1], "w") do |generated|
  generated.puts(result1)
  PP.pp(CONSOLIDATED_RULES, generated)
  generated.puts(result2)
end
|
data/srx2ruby.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)

# Gem specification for srx2ruby. The file lists are taken from the files
# tracked by git, so the gem must be built from inside a git checkout.
# The has_rdoc and rubyforge_project settings were dropped; they are
# believed to be deprecated in RubyGems.
Gem::Specification.new do |s|
  s.name        = "srx2ruby"
  s.version     = "0.1.0"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  s.homepage    = "http://github.com/apohllo/srx2ruby"
  s.summary     = %q{srx2ruby translates SRX files to Ruby.}
  s.description = %q{This project allows for generating Ruby class
  providing sentence breaking capabilities based on given SRX file.}

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: srx2ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aleksander Pohl
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-01 00:00:00 +02:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: |-
|
18
|
+
This project allows for generating Ruby class
|
19
|
+
providing sentence breaking capabilities based on given SRX file.
|
20
|
+
email:
|
21
|
+
- apohllo@o2.pl
|
22
|
+
executables:
|
23
|
+
- srx2ruby
|
24
|
+
extensions: []
|
25
|
+
|
26
|
+
extra_rdoc_files: []
|
27
|
+
|
28
|
+
files:
|
29
|
+
- .gitignore
|
30
|
+
- Rakefile
|
31
|
+
- bin/srx2ruby
|
32
|
+
- lib/srx2ruby.rb
|
33
|
+
- srx2ruby.gemspec
|
34
|
+
has_rdoc: true
|
35
|
+
homepage: http://github.com/apohllo/srx2ruby
|
36
|
+
licenses: []
|
37
|
+
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
none: false
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
requirements: []
|
56
|
+
|
57
|
+
rubyforge_project: srx2ruby
|
58
|
+
rubygems_version: 1.5.2
|
59
|
+
signing_key:
|
60
|
+
specification_version: 3
|
61
|
+
summary: srx2ruby translates SRX files to Ruby.
|
62
|
+
test_files: []
|
63
|
+
|