srx2ruby 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ work
2
+ .*.sw?
3
+ *.gem
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
data/bin/srx2ruby ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env ruby
2
+ require 'srx2ruby'
data/lib/srx2ruby.rb ADDED
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+ require 'rexml/document'
4
+ require 'pp'
5
+ include REXML
6
+
7
+
8
# Abort with a usage summary unless the rules file, the output file and at
# least one language rule set name were given on the command line.
# The summary goes to stderr and the process exits with a non-zero status so
# that calling scripts can detect the misuse.
if ARGV.size < 3
  $stderr.puts "Usage: srx2ruby rules_file.srx output.rb LanguageRuleSet1 [LanguageRuleSet2 ...]"
  $stderr.puts "rules_file.srx - file with SRX rules"
  $stderr.puts "output.rb - the file with Ruby code implementing the breaking rules"
  $stderr.puts "LanguageRuleSet* - selected language rules"
  exit 1
end
15
+
16
# Parse the SRX rules file named by the first command-line argument into a
# REXML document; the file handle is closed as soon as the block returns.
xml = File.open(ARGV[0]) { |srx_file| Document.new(srx_file) }
20
+
21
# Counters reported in the summary line once all rules have been read.
# Note that a rule is counted as breaking/non-breaking before it is validated,
# so invalid rules are included in those two counters as well.
breaking_rules = 0
nonbreaking_rules = 0
invalid_rules = 0
# Accepted rules as [before_pattern, after_pattern, should_break] triples,
# kept in document order.
RULES = []

# An unescaped "(" that opens a plain capturing group. Every "(?..." construct
# (non-capturing groups, look-arounds, inline flags, named groups) is left
# alone. The look-behind is zero-width, so directly nested parentheses such as
# "((" are both found.
# NOTE(review): a literal "(" inside a character class (e.g. "[(]") is still
# rewritten; handling that would require parsing character classes.
PAR_RE = /(?<!\\)\((?!\?)/
# The "(?iu)" inline flag group, which is rewritten to "(?i)" below
# (presumably because the targeted Ruby regexp engine did not accept the "u"
# flag - confirm).
GROUP_RE = /\(\?iu\)/
# A character class in which "-" is directly followed by an en dash; the "-"
# is moved to the end of the class below so it reads as a literal hyphen.
DASH_RE = /(\[(?:[^\]]|\\\])+)-(–(?:[^\]]|\\\])+)\]/

# Rewrites one SRX pattern into the form used by the generated code.
# A nil pattern (absent or empty element) is passed through unchanged.
to_ruby_pattern = lambda do |pattern|
  pattern && pattern.gsub(PAR_RE, "(?:").gsub(GROUP_RE, "(?i)").gsub(DASH_RE, "\\1\\2-]")
end

# Text of the named child element of a rule, or nil when the element is
# absent (a rule may carry only one of beforebreak/afterbreak).
child_text = lambda do |rule, name|
  element = rule.elements[name]
  element && element.text
end

selected_languages = ARGV[2..-1]

xml.each_element('//languagerule/') do |language|
  language_name = language.attributes['languagerulename']
  next unless selected_languages.include?(language_name)
  puts language_name
  language.each_element('rule') do |rule|
    # The break attribute is optional and defaults to "yes".
    should_break = rule.attributes['break'] != "no"
    if should_break
      breaking_rules += 1
    else
      nonbreaking_rules += 1
    end
    before = to_ruby_pattern.call(child_text.call(rule, 'beforebreak'))
    after = to_ruby_pattern.call(child_text.call(rule, 'afterbreak'))
    begin
      # Compile the combined pattern only to validate it; rules that Ruby
      # cannot compile are reported and skipped.
      Regexp.new("(?:(#{before})(#{after}))")
      RULES << [before, after, should_break]
    rescue RegexpError => ex
      puts ex
      invalid_rules += 1
    end
  end
end
58
+
59
# Merge runs of consecutive rules that share the same after-break pattern and
# break flag: their before-break patterns are joined into one alternation.
# Rule order is preserved because the first matching rule decides.
# An empty RULES list yields an empty CONSOLIDATED_RULES list instead of
# raising on RULES.first.
CONSOLIDATED_RULES = []
RULES.each do |rule_s, rule_e, value|
  group = CONSOLIDATED_RULES.last
  if group.nil? || group[1] != rule_e || group[2] != value
    group = [[], rule_e, value]
    CONSOLIDATED_RULES << group
  end
  group[0] << rule_s
end
CONSOLIDATED_RULES.map! do |start_rules, rule_e, value|
  rule_s_union = start_rules.map { |rule_s| "(?:#{rule_s})" }.join("|")
  [rule_s_union, rule_e, value]
end

if RULES.empty?
  $stderr.puts "No valid rules found for: #{ARGV[2..-1].join(', ')}"
end
puts "Breaking/nonbreaking/invalid #{breaking_rules}/#{nonbreaking_rules}/#{invalid_rules}"
76
+
77
# Templates for the generated Ruby file. The output is result1, followed by
# the pretty-printed rule table, followed by result2. Both heredocs are
# single-quoted, so their content is emitted verbatim (the "#{...}" sequences
# are interpolated when the generated file is loaded, not here).

# Header of the generated file, ending right before the RULES literal.
result1 = <<-'END'
#encoding: utf-8
require 'stringio'
module SRX
  RULES =
END

# Runtime part of the generated file.
#
# SRX::Sentence#each walks the input one character at a time, keeping up to
# 10 characters on each side of the current position. At every position the
# first rule whose before-pattern matches the end of the preceding text and
# whose after-pattern matches the start of the following text decides whether
# the text is split there.
#
# Differences from the previous template:
# * input shorter than the look-ahead buffer no longer raises EOFError;
# * the last characters of the input are scanned for breaks as well, and the
#   trailing text is always yielded when it is non-empty;
# * before-patterns are anchored with \z, so they cannot match across a
#   trailing newline of the preceding text;
# * the third-party term/ansicolor library is loaded only when debugging is
#   switched on;
# * Sentence includes Enumerable and #each returns an enumerator when called
#   without a block.
result2 = <<-'END'
  BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\z/m
  REGEXPS = RULES.map{|s,e,v| [/(#{s})\z/m,/\A(#{e})/m,v] }
  FIRST_CHAR = /\A./m


  class Sentence
    include Enumerable

    attr_accessor :input

    def initialize(text=nil)
      if text.is_a?(String)
        @input = StringIO.new(text,"r:utf-8")
      else
        @input = text
      end
    end

    # Enables or disables debug output; the colouring library is only
    # needed (and therefore only loaded) when debugging is enabled.
    def debug=(value)
      require 'term/ansicolor' if value
      @debug = value
    end

    def each
      return enum_for(:each) unless block_given?
      buffer_length = 10
      sentence = "".dup
      before_buffer = "".dup
      after_buffer = "".dup
      buffer_length.times do
        break if @input.eof?
        after_buffer << @input.readchar
      end
      until after_buffer.empty?
        if BEFORE_RE.match(before_buffer)
          if @debug
            puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
          end
          REGEXPS.each do |before_re,after_re,value|
            next unless before_re.match(before_buffer) && after_re.match(after_buffer)
            if @debug
              color = value ? :red : :green
              sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
            end
            if value
              yield sentence
              sentence = "".dup
            end
            # the first matching rule decides; skip the remaining ones
            break
          end
        end
        before_buffer.slice!(0,1) if before_buffer.size >= buffer_length
        current = after_buffer.slice!(0,1)
        before_buffer << current
        sentence << current
        after_buffer << @input.readchar unless @input.eof?
      end
      yield sentence unless sentence.empty?
    end
  end
end
END
146
# Write the generated module to the output path given as the second
# command-line argument: header template, pretty-printed rule table, then the
# runtime template.
File.open(ARGV[1], "w") do |generated|
  rule_table = PP.pp(CONSOLIDATED_RULES, "".dup)
  [result1, rule_table, result2].each { |part| generated.puts(part) }
end
data/srx2ruby.gemspec ADDED
@@ -0,0 +1,22 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for srx2ruby.
$:.push File.expand_path("../lib", __FILE__)

Gem::Specification.new do |s|
  s.name        = "srx2ruby"
  s.version     = "0.1.0"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  s.homepage    = "http://github.com/apohllo/srx2ruby"
  s.summary     = %q{srx2ruby translates SRX files to Ruby.}
  s.description = %q{This project allows for generating Ruby class
  providing sentence breaking capabilities based on given SRX file.}

  # The former `rubyforge_project=` and `has_rdoc=` assignments were dropped:
  # both setters were deprecated by RubyGems and are no longer available in
  # later releases, where calling them makes the gemspec fail to load.

  # File lists are taken from the git index, so the gem must be built from a
  # git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: srx2ruby
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - Aleksander Pohl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-06-01 00:00:00 +02:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: |-
18
+ This project allows for generating Ruby class
19
+ providing sentence breaking capabilities based on given SRX file.
20
+ email:
21
+ - apohllo@o2.pl
22
+ executables:
23
+ - srx2ruby
24
+ extensions: []
25
+
26
+ extra_rdoc_files: []
27
+
28
+ files:
29
+ - .gitignore
30
+ - Rakefile
31
+ - bin/srx2ruby
32
+ - lib/srx2ruby.rb
33
+ - srx2ruby.gemspec
34
+ has_rdoc: true
35
+ homepage: http://github.com/apohllo/srx2ruby
36
+ licenses: []
37
+
38
+ post_install_message:
39
+ rdoc_options: []
40
+
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ requirements: []
56
+
57
+ rubyforge_project: srx2ruby
58
+ rubygems_version: 1.5.2
59
+ signing_key:
60
+ specification_version: 3
61
+ summary: srx2ruby translates SRX files to Ruby.
62
+ test_files: []
63
+