srx2ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ work
2
+ .*.sw?
3
+ *.gem
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
data/bin/srx2ruby ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env ruby
2
+ require 'srx2ruby'
data/lib/srx2ruby.rb ADDED
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+ require 'rexml/document'
4
+ require 'pp'
5
+ include REXML
6
+
7
+
8
# The converter needs an SRX file, an output path and at least one language
# rule name. A wrong invocation is reported on stderr and ends with a
# non-zero exit status so that calling scripts can detect the failure.
if ARGV.size < 3
  warn "Usage: srx2ruby rules_file.srx output.rb LanguageRuleSet1 [LanguageRuleSet2 ...]"
  warn "rules_file.srx - file with SRX rules"
  warn "output.rb - the file with Ruby code implementing the breaking rules"
  warn "LanguageRuleSet* - selected language rules"
  exit 1
end
15
+
16
# Parse the SRX rules file named by the first command-line argument into a
# REXML document. The block form of File.open closes the file afterwards and
# returns the block's value, i.e. the parsed document.
xml = File.open(ARGV[0]) { |srx_file| Document.new(srx_file) }
20
+
21
# Statistics printed once all selected rules have been processed.
breaking_rules = 0
nonbreaking_rules = 0
invalid_rules = 0
# Collected rules as [beforebreak, afterbreak, should_break] triples.
RULES = []

# An unescaped "(" that is not already followed by "?<", "?:" or "?i".
# The rule conversion rewrites each such parenthesis to "(?:".
# A lookbehind is used instead of consuming the preceding character so that
# directly adjacent parentheses such as "((" are both matched.
PAR_RE = /(?<!\\)\((?!\?[<:i])/
# The "(?iu)" flag group, rewritten to "(?i)" by the rule conversion.
GROUP_RE = /\(\?iu\)/
# A bracket expression containing "-" directly followed by an en dash.
# Group 1 holds the part before the hyphen, group 2 the part from the en dash
# on, so the conversion can move the hyphen to the end of the brackets.
DASH_RE = /(\[(?:[^\]]|\\\])+)-(–(?:[^\]]|\\\])+)\]/
29
+
30
# Walk every <languagerule> whose name was given on the command line and
# append its rules to RULES as [beforebreak, afterbreak, should_break].
selected_languages = ARGV[2..-1]
xml.each_element('//languagerule/') do |language|
  language_name = language.attributes['languagerulename']
  next unless selected_languages.include?(language_name)
  puts language_name
  language.each_element('rule') do |rule|
    # SRX 2.0 declares "yes" as the default of the optional break attribute,
    # so only an explicit "no" marks an exception (non-breaking) rule.
    should_break = rule.attributes['break'] != "no"
    if should_break
      breaking_rules += 1
    else
      nonbreaking_rules += 1
    end
    begin
      # Either element may be missing or empty (both are optional in SRX 2.0);
      # such a side stays nil and interpolates as an empty pattern below.
      before, after = %w[beforebreak afterbreak].map do |tag|
        element = rule.elements[tag]
        pattern = element && element.text
        next unless pattern
        pattern.gsub(PAR_RE, "\\1(?:")
               .gsub(GROUP_RE, "(?i)")
               .gsub(DASH_RE, "\\1\\2-]")
      end
      # Compile the pair once purely as a validity check: a pattern Ruby
      # cannot parse raises RegexpError and the rule is counted as invalid.
      Regexp.new("(?:(#{before})(#{after}))")
      RULES << [before, after, should_break]
    rescue RegexpError => ex
      puts ex
      invalid_rules += 1
    end
  end
end
58
+
59
# Merge runs of consecutive rules that share the same afterbreak pattern and
# break flag into a single [before_union, afterbreak, should_break] entry,
# where before_union joins the run's beforebreak patterns as alternatives.
# Rule order is preserved. An empty RULES list yields an empty result
# instead of failing on a missing first rule.
rule_runs = RULES.chunk { |_before, after, should_break| [after, should_break] }
CONSOLIDATED_RULES = rule_runs.map do |(after, should_break), run|
  before_union = run.map { |before, _after, _flag| "(?:#{before})" }.join("|")
  [before_union, after, should_break]
end
warn "No valid rules were found for the selected language rule sets" if CONSOLIDATED_RULES.empty?
puts "Breaking/nonbreaking/invalid #{breaking_rules}/#{nonbreaking_rules}/#{invalid_rules}"
76
+
77
# Templates of the generated Ruby file. Both heredocs are non-interpolating,
# so their text is emitted verbatim: "#{...}" and backslashes below belong to
# the generated code, not to this script.
#
# result1 is the file header; the pretty-printed rule table is written right
# after its trailing "RULES =" line.
result1 = <<-'END'
#encoding: utf-8
require 'stringio'
module SRX
  RULES =
END

# result2 holds the constants derived from RULES and the Sentence class.
result2 = <<-'END'
  BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
  REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
  # No longer used by Sentence#each; kept so existing references still resolve.
  FIRST_CHAR = /\A./m

  class Sentence
    attr_accessor :input

    # text - a String (wrapped in a UTF-8 StringIO) or any object responding
    #        to eof? and readchar.
    def initialize(text=nil)
      if text.is_a?(String)
        @input = StringIO.new(text,"r:utf-8")
      else
        @input = text
      end
    end

    # Turns debug output on or off. The term-ansicolor gem is loaded only
    # when debugging is enabled, so normal use does not depend on it.
    def debug=(value)
      require 'term/ansicolor' if value
      @debug = value
    end

    # Yields each sentence of the input in turn. Without a block an
    # Enumerator is returned. The input is consumed while iterating.
    def each
      return enum_for(:each) unless block_given?
      buffer_length = 10
      sentence = ""
      before_buffer = ""
      after_buffer = ""
      # Fill the look-ahead window; the input may be shorter than the window.
      buffer_length.times do
        break if @input.eof?
        after_buffer << @input.readchar
      end
      # Keep going until the look-ahead window is drained, so that breaks
      # inside the last buffer_length characters are detected as well.
      until after_buffer.empty?
        if BEFORE_RE.match(before_buffer)
          if @debug
            puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
          end
          # The first rule matching on both sides decides: a breaking rule
          # ends the sentence, a non-breaking rule suppresses later rules.
          REGEXPS.each do |before_re,after_re,value|
            next unless before_re.match(before_buffer) && after_re.match(after_buffer)
            if @debug
              color = value ? :red : :green
              sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
            end
            if value
              yield sentence
              sentence = ""
            end
            break
          end
        end
        # Move one character from the look-ahead window to the look-behind
        # window (bounded to buffer_length) and to the current sentence.
        before_buffer.slice!(0) if before_buffer.size >= buffer_length
        current = after_buffer.slice!(0)
        before_buffer << current
        sentence << current
        after_buffer << @input.readchar unless @input.eof?
      end
      yield sentence unless sentence.empty?
    end
  end
end
END
146
# Write the generated source to the path given as the second argument:
# the header, then the pretty-printed consolidated rule table, then the
# constants and the Sentence class.
File.open(ARGV[1], "w") do |generated|
  generated.puts(result1)
  PP.pp(CONSOLIDATED_RULES, generated)
  generated.puts(result2)
end
data/srx2ruby.gemspec ADDED
@@ -0,0 +1,22 @@
1
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)

# Gem specification for srx2ruby. The file lists are taken from the files
# tracked by git, so the gem has to be built from a git checkout.
Gem::Specification.new do |s|
  s.name        = "srx2ruby"
  s.version     = "0.1.0"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  s.homepage    = "http://github.com/apohllo/srx2ruby"
  s.summary     = %q{srx2ruby translates SRX files to Ruby.}
  s.description = %q{This project allows for generating Ruby class
providing sentence breaking capabilities based on given SRX file.}

  # The former `has_rdoc` and `rubyforge_project` assignments were dropped:
  # RubyGems deprecated both attributes with no replacement.

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,63 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: srx2ruby
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - Aleksander Pohl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-06-01 00:00:00 +02:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: |-
18
+ This project allows for generating Ruby class
19
+ providing sentence breaking capabilities based on given SRX file.
20
+ email:
21
+ - apohllo@o2.pl
22
+ executables:
23
+ - srx2ruby
24
+ extensions: []
25
+
26
+ extra_rdoc_files: []
27
+
28
+ files:
29
+ - .gitignore
30
+ - Rakefile
31
+ - bin/srx2ruby
32
+ - lib/srx2ruby.rb
33
+ - srx2ruby.gemspec
34
+ has_rdoc: true
35
+ homepage: http://github.com/apohllo/srx2ruby
36
+ licenses: []
37
+
38
+ post_install_message:
39
+ rdoc_options: []
40
+
41
+ require_paths:
42
+ - lib
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ none: false
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: "0"
55
+ requirements: []
56
+
57
+ rubyforge_project: srx2ruby
58
+ rubygems_version: 1.5.2
59
+ signing_key:
60
+ specification_version: 3
61
+ summary: srx2ruby translates SRX files to Ruby.
62
+ test_files: []
63
+