srx2ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Rakefile +2 -0
- data/bin/srx2ruby +2 -0
- data/lib/srx2ruby.rb +150 -0
- data/srx2ruby.gemspec +22 -0
- metadata +63 -0
data/.gitignore
ADDED
data/Rakefile
ADDED
data/bin/srx2ruby
ADDED
data/lib/srx2ruby.rb
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
require 'rexml/document'
|
4
|
+
require 'pp'
|
5
|
+
include REXML
|
6
|
+
|
7
|
+
|
8
|
+
# Print the command-line synopsis and stop when fewer than three arguments
# (rules file, output file, at least one language rule set) were given.
unless ARGV.size >= 3
  [
    "Usage: srx2ruby rules_file.srx output.rb LanguageRuleSet1 [LanguageRuleSet2 ...]",
    "rules_file.srx - file with SRX rules",
    "output.rb - the file with Ruby code implementing the breaking rules",
    "LanguageRuleSet* - selected language rules"
  ].each { |usage_line| puts usage_line }
  exit
end
|
15
|
+
|
16
|
+
# Parse the SRX rules file named by the first argument; the block form of
# File.open closes the file as soon as the document has been built.
xml = File.open(ARGV[0]) { |srx_file| Document.new(srx_file) }
|
20
|
+
|
21
|
+
# Counters reported in the summary line after all rules have been read.
breaking_rules = nonbreaking_rules = invalid_rules = 0
# Collected rules as [before_pattern, after_pattern, should_break] triples.
RULES = []

# Matches the opening parenthesis of a plain capturing group so that it can
# be rewritten to a non-capturing "(?:".  Escaped parentheses ("\(") and
# every "(?" construct (non-capturing groups, look-arounds, inline flags,
# named and atomic groups) are left untouched.  A look-behind is used for
# the backslash check so that adjacent parentheses, as in "((a)b)", are
# each matched.
PAR_RE = /(?<!\\)\((?!\?)/
# The "(?iu)" inline flag group found in SRX rules; it is replaced by "(?i)".
GROUP_RE = /\(\?iu\)/
# A character class containing "-" directly followed by an en dash; the
# hyphen is moved to the end of the class so it is not read as a range.
DASH_RE = /(\[(?:[^\]]|\\\])+)-(–(?:[^\]]|\\\])+)\]/
|
29
|
+
|
30
|
+
# Walk every <languagerule> whose name was requested on the command line and
# append its rules to RULES as [before_pattern, after_pattern, should_break]
# triples, translating the SRX patterns to Ruby regexp syntax on the way.
selected_languages = ARGV[2..-1]
xml.each_element('//languagerule/') do |language|
  language_name = language.attributes['languagerulename']
  next unless selected_languages.include?(language_name)
  puts language_name
  language.each_element('rule') do |rule|
    # SRX treats a rule without a "break" attribute as a breaking rule,
    # so only an explicit "no" makes the rule non-breaking.
    should_break = rule.attributes['break'] != "no"
    if should_break
      breaking_rules += 1
    else
      nonbreaking_rules += 1
    end
    # <beforebreak> and <afterbreak> may each be missing or empty; both
    # cases yield nil, which interpolates as an empty pattern below.
    before, after = %w[beforebreak afterbreak].map do |tag|
      element = rule.elements[tag]
      pattern = element && element.text
      next unless pattern
      # Non-destructive gsub keeps the parsed XML document untouched.
      pattern.gsub(PAR_RE, "\\1(?:").gsub(GROUP_RE, "(?i)").gsub(DASH_RE, "\\1\\2-]")
    end
    begin
      # Compile the combined pattern only to validate it; rules Ruby cannot
      # compile are reported and counted as invalid instead of being stored.
      Regexp.new("(?:(#{before})(#{after}))")
      RULES << [before, after, should_break]
    rescue RegexpError => ex
      puts ex
      invalid_rules += 1
    end
  end
end
|
58
|
+
|
59
|
+
# Merge runs of consecutive rules that share the same after-break pattern and
# break flag: their before-break patterns are joined into one alternation.
# Only neighbouring rules are merged, so the relative rule order is kept.
rule_groups = []
RULES.each do |rule_s, rule_e, value|
  group = rule_groups.last
  # Start a new group for the first rule and whenever the key changes.
  if group.nil? || group[1] != rule_e || group[2] != value
    group = [[], rule_e, value]
    rule_groups << group
  end
  group[0] << rule_s
end
# Each entry is [before_alternation, after_pattern, should_break].  With no
# collected rules this is simply an empty list instead of a crash.
CONSOLIDATED_RULES = rule_groups.map do |start_rules, rule_e, value|
  rule_s_union = start_rules.map { |rule_s| "(?:#{rule_s})" }.join("|")
  [rule_s_union, rule_e, value]
end
warn "No valid rules were collected; the generated splitter will never break." if RULES.empty?
puts "Breaking/nonbreaking #{breaking_rules}/#{nonbreaking_rules}/#{invalid_rules}"
|
76
|
+
|
77
|
+
# Templates for the generated Ruby file.  The file is assembled as
# result1 + pretty-printed rule table + result2.  Single-quoted heredocs are
# used so the generated code can be written verbatim, without escaping.

# Header of the generated file, ending right before the rule table literal.
result1 = <<-'END'
#encoding: utf-8
require 'stringio'
module SRX
  RULES =
END

# Runtime part of the generated file: the regexps derived from RULES and the
# SRX::Sentence class that splits a text into sentences.
result2 = <<-'END'
  BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
  REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
  FIRST_CHAR = /\A./m

  class Sentence
    attr_accessor :input
    attr_writer :debug

    def initialize(text=nil)
      if text.is_a?(String)
        @input = StringIO.new(text,"r:utf-8")
      else
        @input = text
      end
    end

    # Yields the consecutive sentences of the input.  The text is scanned
    # one character at a time with a window of up to 10 characters before
    # and after the current position.
    def each
      buffer_length = 10
      sentence = ""
      before_buffer = ""
      after_buffer = ""
      # Pre-fill the look-ahead window; the input may be shorter than it.
      buffer_length.times do
        break if @input.eof?
        after_buffer << @input.readchar
      end
      # Keep scanning until the look-ahead window is drained, so that
      # breaks close to the end of the input are detected as well.
      until after_buffer.empty?
        if BEFORE_RE.match(before_buffer)
          if @debug
            puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
          end
          REGEXPS.each do |before_re,after_re,value|
            # The first rule matching on both sides decides.
            if before_re.match(before_buffer) && after_re.match(after_buffer)
              if @debug
                # Loaded lazily: only the debug output needs the colors.
                require 'term/ansicolor'
                color = value ? :red : :green
                sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
              end
              if value
                yield sentence
                sentence = ""
              end
              break
            end
          end
        end
        # Move one character from the look-ahead window to the look-behind
        # window and to the current sentence.
        before_buffer.sub!(FIRST_CHAR,"") if before_buffer.size >= buffer_length
        after_buffer.sub!(FIRST_CHAR,"")
        before_buffer << $&
        sentence << $&
        after_buffer << @input.readchar unless @input.eof?
      end
      yield sentence unless sentence.empty?
    end
  end
end
END
|
146
|
+
# Write the generated file named by the second argument: the header, the
# pretty-printed rule table and finally the runtime code.
File.open(ARGV[1], "w") do |generated|
  generated.puts(result1)
  generated << CONSOLIDATED_RULES.pretty_inspect
  generated.puts(result2)
end
|
data/srx2ruby.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)

# Gem specification for srx2ruby.  The file lists are taken from git, so the
# gem has to be built from a git checkout.
Gem::Specification.new do |s|
  s.name        = "srx2ruby"
  s.version     = "0.1.0"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  s.homepage    = "http://github.com/apohllo/srx2ruby"
  s.summary     = %q{srx2ruby translates SRX files to Ruby.}
  s.description = %q{This project allows for generating Ruby class
providing sentence breaking capabilities based on given SRX file.}

  # has_rdoc and rubyforge_project are intentionally not set: RubyGems has
  # deprecated both attributes.

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: srx2ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aleksander Pohl
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-01 00:00:00 +02:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: |-
|
18
|
+
This project allows for generating Ruby class
|
19
|
+
providing sentence breaking capabilities based on given SRX file.
|
20
|
+
email:
|
21
|
+
- apohllo@o2.pl
|
22
|
+
executables:
|
23
|
+
- srx2ruby
|
24
|
+
extensions: []
|
25
|
+
|
26
|
+
extra_rdoc_files: []
|
27
|
+
|
28
|
+
files:
|
29
|
+
- .gitignore
|
30
|
+
- Rakefile
|
31
|
+
- bin/srx2ruby
|
32
|
+
- lib/srx2ruby.rb
|
33
|
+
- srx2ruby.gemspec
|
34
|
+
has_rdoc: true
|
35
|
+
homepage: http://github.com/apohllo/srx2ruby
|
36
|
+
licenses: []
|
37
|
+
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
none: false
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
requirements: []
|
56
|
+
|
57
|
+
rubyforge_project: srx2ruby
|
58
|
+
rubygems_version: 1.5.2
|
59
|
+
signing_key:
|
60
|
+
specification_version: 3
|
61
|
+
summary: srx2ruby translates SRX files to Ruby.
|
62
|
+
test_files: []
|
63
|
+
|