onix 0.7.3 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -0
- data/lib/onix.rb +1 -1
- data/lib/onix/normaliser.rb +35 -7
- data/spec/normaliser_spec.rb +48 -1
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
v0.7.4 (2nd September 2009)
|
2
|
+
- Expand ONIX::Normaliser
|
3
|
+
- strip control chars
|
4
|
+
- add encoding declaration to valid utf-8 files that aren't declared
|
5
|
+
as such
|
6
|
+
|
1
7
|
v0.7.3 (19th August 2009)
|
2
8
|
- Switch from java to xsltproc to convert short tag ONIX files
|
3
9
|
to reference tags
|
data/lib/onix.rb
CHANGED
data/lib/onix/normaliser.rb
CHANGED
@@ -43,6 +43,7 @@ module ONIX
|
|
43
43
|
raise "isutf8 app not found" unless app_available?("isutf8")
|
44
44
|
raise "iconv app not found" unless app_available?("iconv")
|
45
45
|
raise "sed app not found" unless app_available?("sed")
|
46
|
+
raise "tr app not found" unless app_available?("tr")
|
46
47
|
|
47
48
|
@oldfile = oldfile
|
48
49
|
@newfile = newfile
|
@@ -64,6 +65,11 @@ module ONIX
|
|
64
65
|
to_utf8(@curfile, dest)
|
65
66
|
@curfile = dest
|
66
67
|
|
68
|
+
# remove control chars
|
69
|
+
dest = next_tempfile
|
70
|
+
remove_control_chars(@curfile, dest)
|
71
|
+
@curfile = dest
|
72
|
+
|
67
73
|
# remove entities
|
68
74
|
replace_named_entities(@curfile)
|
69
75
|
|
@@ -102,7 +108,18 @@ module ONIX
|
|
102
108
|
`xsltproc -o #{outpath} #{xsltpath} #{inpath}`
|
103
109
|
end
|
104
110
|
|
105
|
-
# ensure the file is valid utf8, then make sure it's declared as such
|
111
|
+
# ensure the file is valid utf8, then make sure it's declared as such.
|
112
|
+
#
|
113
|
+
# The following behaviour is expected:
|
114
|
+
#
|
115
|
+
# file is valid utf8, is marked correctly
|
116
|
+
# - copied untouched
|
117
|
+
# file is valid utf8, is marked incorrectly or has no marked encoding
|
118
|
+
# - copied and encoding mark fixed or added
|
119
|
+
# file is not utf8, encoding is marked
|
120
|
+
# - file is converted to utf8 and encoding mark is updated
|
121
|
+
# file is not utf8, encoding is not marked
|
122
|
+
# - file is copied untouched
|
106
123
|
#
|
107
124
|
def to_utf8(src, dest)
|
108
125
|
inpath = File.expand_path(src)
|
@@ -112,17 +129,28 @@ module ONIX
|
|
112
129
|
|
113
130
|
# ensure the file is actually utf8
|
114
131
|
if `isutf8 #{inpath}`.strip == ""
|
115
|
-
|
116
|
-
|
132
|
+
if src_enc.to_s.downcase == "utf-8"
|
133
|
+
FileUtils.cp(inpath, outpath)
|
134
|
+
else
|
135
|
+
FileUtils.cp(inpath, outpath)
|
136
|
+
`sed -i 's/<?xml.*?>/<?xml version=\"1.0\" encoding=\"UTF-8\"?>/' #{outpath}`
|
137
|
+
end
|
138
|
+
elsif src_enc
|
117
139
|
`iconv --from-code=#{src_enc} --to-code=UTF-8 #{inpath} > #{outpath}`
|
118
|
-
end
|
119
|
-
|
120
|
-
# ensure the encoding delcaration is correct
|
121
|
-
if src_enc.downcase != "utf-8"
|
122
140
|
`sed -i 's/#{src_enc}/UTF-8/' #{outpath}`
|
141
|
+
else
|
142
|
+
FileUtils.cp(inpath, outpath)
|
123
143
|
end
|
124
144
|
end
|
125
145
|
|
146
|
+
# XML files shouldn't contain low ASCII control chars. Strip them.
|
147
|
+
#
|
148
|
+
def remove_control_chars(src, dest)
|
149
|
+
inpath = File.expand_path(src)
|
150
|
+
outpath = File.expand_path(dest)
|
151
|
+
`cat #{inpath} | tr -d "\\000-\\010\\013\\014\\016-\\037" > #{outpath}`
|
152
|
+
end
|
153
|
+
|
126
154
|
# replace all named entities in the specified file with
|
127
155
|
# numeric entities.
|
128
156
|
#
|
data/spec/normaliser_spec.rb
CHANGED
@@ -21,7 +21,6 @@ context "ONIX::Normaliser", "with a simple short tag file" do
|
|
21
21
|
|
22
22
|
File.file?(@outfile).should be_true
|
23
23
|
content = File.read(@outfile)
|
24
|
-
puts content
|
25
24
|
content.include?("<m174>").should be_false
|
26
25
|
content.include?("<FromCompany>").should be_true
|
27
26
|
end
|
@@ -76,3 +75,51 @@ context "ONIX::Normaliser", "with an file using entities" do
|
|
76
75
|
content.include?("–").should be_true
|
77
76
|
end
|
78
77
|
end
|
78
|
+
|
79
|
+
context "ONIX::Normaliser", "with a utf8 file that has no declared encoding" do
|
80
|
+
|
81
|
+
before(:each) do
|
82
|
+
@data_path = File.join(File.dirname(__FILE__),"..","data")
|
83
|
+
@filename = File.join(@data_path, "no_encoding.xml")
|
84
|
+
@outfile = @filename + ".new"
|
85
|
+
end
|
86
|
+
|
87
|
+
after(:each) do
|
88
|
+
File.unlink(@outfile) if File.file?(@outfile)
|
89
|
+
end
|
90
|
+
|
91
|
+
# this is to test for a bug where an exception was raised on files that
|
92
|
+
# had no declared encoding
|
93
|
+
specify "should add a utf-8 marker to the file" do
|
94
|
+
ONIX::Normaliser.process(@filename, @outfile)
|
95
|
+
|
96
|
+
File.file?(@outfile).should be_true
|
97
|
+
content = File.read(@outfile)
|
98
|
+
|
99
|
+
content.include?("encoding=\"UTF-8\"").should be_true
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
context "ONIX::Normaliser", "with a utf8 file that has illegal control chars" do
|
104
|
+
|
105
|
+
before(:each) do
|
106
|
+
@data_path = File.join(File.dirname(__FILE__),"..","data")
|
107
|
+
@filename = File.join(@data_path, "control_chars.xml")
|
108
|
+
@outfile = @filename + ".new"
|
109
|
+
end
|
110
|
+
|
111
|
+
after(:each) do
|
112
|
+
File.unlink(@outfile) if File.file?(@outfile)
|
113
|
+
end
|
114
|
+
|
115
|
+
# this is to test that illegal control chars are stripped from files that
|
116
|
+
# contain them
|
117
|
+
specify "should remove all control chars except LF, CR and TAB" do
|
118
|
+
ONIX::Normaliser.process(@filename, @outfile)
|
119
|
+
|
120
|
+
File.file?(@outfile).should be_true
|
121
|
+
content = File.read(@outfile)
|
122
|
+
|
123
|
+
content.include?("<TitleText>OXFORDPICTURE DICTIONARY CHINESE</TitleText>").should be_true
|
124
|
+
end
|
125
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: onix
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.3
|
4
|
+
version: 0.7.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-09-02 00:00:00 +10:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|