onix 0.7.3 → 0.7.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -0
- data/lib/onix.rb +1 -1
- data/lib/onix/normaliser.rb +35 -7
- data/spec/normaliser_spec.rb +48 -1
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
v0.7.4 (2nd September 2009)
|
2
|
+
- Expand ONIX::Normaliser
|
3
|
+
- strip control chars
|
4
|
+
- add encoding declaration to valid utf-8 files that aren't declared
|
5
|
+
as such
|
6
|
+
|
1
7
|
v0.7.3 (19th August 2009)
|
2
8
|
- Switch from java to xsltproc to convert short tag ONIX files
|
3
9
|
to reference tags
|
data/lib/onix.rb
CHANGED
data/lib/onix/normaliser.rb
CHANGED
@@ -43,6 +43,7 @@ module ONIX
|
|
43
43
|
raise "isutf8 app not found" unless app_available?("isutf8")
|
44
44
|
raise "iconv app not found" unless app_available?("iconv")
|
45
45
|
raise "sed app not found" unless app_available?("sed")
|
46
|
+
raise "tr app not found" unless app_available?("tr")
|
46
47
|
|
47
48
|
@oldfile = oldfile
|
48
49
|
@newfile = newfile
|
@@ -64,6 +65,11 @@ module ONIX
|
|
64
65
|
to_utf8(@curfile, dest)
|
65
66
|
@curfile = dest
|
66
67
|
|
68
|
+
# remove control chars
|
69
|
+
dest = next_tempfile
|
70
|
+
remove_control_chars(@curfile, dest)
|
71
|
+
@curfile = dest
|
72
|
+
|
67
73
|
# remove entities
|
68
74
|
replace_named_entities(@curfile)
|
69
75
|
|
@@ -102,7 +108,18 @@ module ONIX
|
|
102
108
|
`xsltproc -o #{outpath} #{xsltpath} #{inpath}`
|
103
109
|
end
|
104
110
|
|
105
|
-
# ensure the file is valid utf8, then make sure it's declared as such
|
111
|
+
# ensure the file is valid utf8, then make sure it's declared as such.
|
112
|
+
#
|
113
|
+
# The following behaviour is expected:
|
114
|
+
#
|
115
|
+
# file is valid utf8, is marked correctly
|
116
|
+
# - copied untouched
|
117
|
+
# file is valid utf8, is marked incorrectly or has no marked encoding
|
118
|
+
# - copied and encoding mark fixed or added
|
119
|
+
# file is not utf8, encoding is marked
|
120
|
+
# - file is converted to utf8 and encoding mark is updated
|
121
|
+
# file is not utf8, encoding is not marked
|
122
|
+
# - file is copied untouched
|
106
123
|
#
|
107
124
|
def to_utf8(src, dest)
|
108
125
|
inpath = File.expand_path(src)
|
@@ -112,17 +129,28 @@ module ONIX
|
|
112
129
|
|
113
130
|
# ensure the file is actually utf8
|
114
131
|
if `isutf8 #{inpath}`.strip == ""
|
115
|
-
|
116
|
-
|
132
|
+
if src_enc.to_s.downcase == "utf-8"
|
133
|
+
FileUtils.cp(inpath, outpath)
|
134
|
+
else
|
135
|
+
FileUtils.cp(inpath, outpath)
|
136
|
+
`sed -i 's/<?xml.*?>/<?xml version=\"1.0\" encoding=\"UTF-8\"?>/' #{outpath}`
|
137
|
+
end
|
138
|
+
elsif src_enc
|
117
139
|
`iconv --from-code=#{src_enc} --to-code=UTF-8 #{inpath} > #{outpath}`
|
118
|
-
end
|
119
|
-
|
120
|
-
# ensure the encoding declaration is correct
|
121
|
-
if src_enc.downcase != "utf-8"
|
122
140
|
`sed -i 's/#{src_enc}/UTF-8/' #{outpath}`
|
141
|
+
else
|
142
|
+
FileUtils.cp(inpath, outpath)
|
123
143
|
end
|
124
144
|
end
|
125
145
|
|
146
|
+
# XML files shouldn't contain low ASCII control chars. Strip them.
|
147
|
+
#
|
148
|
+
def remove_control_chars(src, dest)
|
149
|
+
inpath = File.expand_path(src)
|
150
|
+
outpath = File.expand_path(dest)
|
151
|
+
`cat #{inpath} | tr -d "\\000-\\010\\013\\014\\016-\\037" > #{outpath}`
|
152
|
+
end
|
153
|
+
|
126
154
|
# replace all named entities in the specified file with
|
127
155
|
# numeric entities.
|
128
156
|
#
|
data/spec/normaliser_spec.rb
CHANGED
@@ -21,7 +21,6 @@ context "ONIX::Normaliser", "with a simple short tag file" do
|
|
21
21
|
|
22
22
|
File.file?(@outfile).should be_true
|
23
23
|
content = File.read(@outfile)
|
24
|
-
puts content
|
25
24
|
content.include?("<m174>").should be_false
|
26
25
|
content.include?("<FromCompany>").should be_true
|
27
26
|
end
|
@@ -76,3 +75,51 @@ context "ONIX::Normaliser", "with an file using entities" do
|
|
76
75
|
content.include?("–").should be_true
|
77
76
|
end
|
78
77
|
end
|
78
|
+
|
79
|
+
context "ONIX::Normaliser", "with a utf8 file that has no declared encoding" do
|
80
|
+
|
81
|
+
before(:each) do
|
82
|
+
@data_path = File.join(File.dirname(__FILE__),"..","data")
|
83
|
+
@filename = File.join(@data_path, "no_encoding.xml")
|
84
|
+
@outfile = @filename + ".new"
|
85
|
+
end
|
86
|
+
|
87
|
+
after(:each) do
|
88
|
+
File.unlink(@outfile) if File.file?(@outfile)
|
89
|
+
end
|
90
|
+
|
91
|
+
# this is to test for a bug where an exception was raised on files that
|
92
|
+
# had no declared encoding
|
93
|
+
specify "should add a utf-8 marker to the file" do
|
94
|
+
ONIX::Normaliser.process(@filename, @outfile)
|
95
|
+
|
96
|
+
File.file?(@outfile).should be_true
|
97
|
+
content = File.read(@outfile)
|
98
|
+
|
99
|
+
content.include?("encoding=\"UTF-8\"").should be_true
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
context "ONIX::Normaliser", "with a utf8 file that has illegal control chars" do
|
104
|
+
|
105
|
+
before(:each) do
|
106
|
+
@data_path = File.join(File.dirname(__FILE__),"..","data")
|
107
|
+
@filename = File.join(@data_path, "control_chars.xml")
|
108
|
+
@outfile = @filename + ".new"
|
109
|
+
end
|
110
|
+
|
111
|
+
after(:each) do
|
112
|
+
File.unlink(@outfile) if File.file?(@outfile)
|
113
|
+
end
|
114
|
+
|
115
|
+
# this is to test that low ASCII control chars, which shouldn't appear in
|
116
|
+
# XML files, are stripped from the file during normalisation
|
117
|
+
specify "should remove all control chars except LF, CR and TAB" do
|
118
|
+
ONIX::Normaliser.process(@filename, @outfile)
|
119
|
+
|
120
|
+
File.file?(@outfile).should be_true
|
121
|
+
content = File.read(@outfile)
|
122
|
+
|
123
|
+
content.include?("<TitleText>OXFORDPICTURE DICTIONARY CHINESE</TitleText>").should be_true
|
124
|
+
end
|
125
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: onix
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.3
|
4
|
+
version: 0.7.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-09-02 00:00:00 +10:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|