onix 0.7.3 → 0.7.4

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,9 @@
1
+ v0.7.4 (2nd September 2009)
2
+ - Expand ONIX::Normaliser
3
+ - strip control chars
4
+ - add encoding declaration to valid utf-8 files that aren't declared
5
+ as such
6
+
1
7
  v0.7.3 (19th August 2009)
2
8
  - Switch from java to xsltproc to convert short tag ONIX files
3
9
  to reference tags
data/lib/onix.rb CHANGED
@@ -16,7 +16,7 @@ module ONIX
16
16
  module Version #:nodoc:
17
17
  Major = 0
18
18
  Minor = 7
19
- Tiny = 3
19
+ Tiny = 4
20
20
 
21
21
  String = [Major, Minor, Tiny].join('.')
22
22
  end
@@ -43,6 +43,7 @@ module ONIX
43
43
  raise "isutf8 app not found" unless app_available?("isutf8")
44
44
  raise "iconv app not found" unless app_available?("iconv")
45
45
  raise "sed app not found" unless app_available?("sed")
46
+ raise "tr app not found" unless app_available?("tr")
46
47
 
47
48
  @oldfile = oldfile
48
49
  @newfile = newfile
@@ -64,6 +65,11 @@ module ONIX
64
65
  to_utf8(@curfile, dest)
65
66
  @curfile = dest
66
67
 
68
+ # remove control chars
69
+ dest = next_tempfile
70
+ remove_control_chars(@curfile, dest)
71
+ @curfile = dest
72
+
67
73
  # remove entities
68
74
  replace_named_entities(@curfile)
69
75
 
@@ -102,7 +108,18 @@ module ONIX
102
108
  `xsltproc -o #{outpath} #{xsltpath} #{inpath}`
103
109
  end
104
110
 
105
- # ensure the file is valid utf8, then make sure it's declared as such
111
+ # ensure the file is valid utf8, then make sure it's declared as such.
112
+ #
113
+ # The following behaviour is expected:
114
+ #
115
+ # file is valid utf8, is marked correctly
116
+ # - copied untouched
117
+ # file is valid utf8, is marked incorrectly or has no marked encoding
118
+ # - copied and encoding mark fixed or added
119
+ # file is not utf8, encoding is marked
120
+ # - file is converted to utf8 and encoding mark is updated
121
+ # file is not utf8, encoding is not marked
122
+ # - file is copied untouched
106
123
  #
107
124
  def to_utf8(src, dest)
108
125
  inpath = File.expand_path(src)
@@ -112,17 +129,28 @@ module ONIX
112
129
 
113
130
  # ensure the file is actually utf8
114
131
  if `isutf8 #{inpath}`.strip == ""
115
- FileUtils.cp(inpath, outpath)
116
- else
132
+ if src_enc.to_s.downcase == "utf-8"
133
+ FileUtils.cp(inpath, outpath)
134
+ else
135
+ FileUtils.cp(inpath, outpath)
136
+ `sed -i 's/<?xml.*?>/<?xml version=\"1.0\" encoding=\"UTF-8\"?>/' #{outpath}`
137
+ end
138
+ elsif src_enc
117
139
  `iconv --from-code=#{src_enc} --to-code=UTF-8 #{inpath} > #{outpath}`
118
- end
119
-
120
- # ensure the encoding delcaration is correct
121
- if src_enc.downcase != "utf-8"
122
140
  `sed -i 's/#{src_enc}/UTF-8/' #{outpath}`
141
+ else
142
+ FileUtils.cp(inpath, outpath)
123
143
  end
124
144
  end
125
145
 
146
+ # XML files shouldn't contain low ASCII control chars. Strip them.
147
+ #
148
+ def remove_control_chars(src, dest)
149
+ inpath = File.expand_path(src)
150
+ outpath = File.expand_path(dest)
151
+ `cat #{inpath} | tr -d "\\000-\\010\\013\\014\\016-\\037" > #{outpath}`
152
+ end
153
+
126
154
  # replace all named entities in the specified file with
127
155
  # numeric entities.
128
156
  #
@@ -21,7 +21,6 @@ context "ONIX::Normaliser", "with a simple short tag file" do
21
21
 
22
22
  File.file?(@outfile).should be_true
23
23
  content = File.read(@outfile)
24
- puts content
25
24
  content.include?("<m174>").should be_false
26
25
  content.include?("<FromCompany>").should be_true
27
26
  end
@@ -76,3 +75,51 @@ context "ONIX::Normaliser", "with an file using entities" do
76
75
  content.include?("&#x02013;").should be_true
77
76
  end
78
77
  end
78
+
79
+ context "ONIX::Normaliser", "with a utf8 file that has no declared encoding" do
80
+
81
+ before(:each) do
82
+ @data_path = File.join(File.dirname(__FILE__),"..","data")
83
+ @filename = File.join(@data_path, "no_encoding.xml")
84
+ @outfile = @filename + ".new"
85
+ end
86
+
87
+ after(:each) do
88
+ File.unlink(@outfile) if File.file?(@outfile)
89
+ end
90
+
91
+ # this is to test for a bug where an exception was raised on files that
92
+ # had no declared encoding
93
+ specify "should add a utf-8 marker to the file" do
94
+ ONIX::Normaliser.process(@filename, @outfile)
95
+
96
+ File.file?(@outfile).should be_true
97
+ content = File.read(@outfile)
98
+
99
+ content.include?("encoding=\"UTF-8\"").should be_true
100
+ end
101
+ end
102
+
103
+ context "ONIX::Normaliser", "with a utf8 file that has illegal control chars" do
104
+
105
+ before(:each) do
106
+ @data_path = File.join(File.dirname(__FILE__),"..","data")
107
+ @filename = File.join(@data_path, "control_chars.xml")
108
+ @outfile = @filename + ".new"
109
+ end
110
+
111
+ after(:each) do
112
+ File.unlink(@outfile) if File.file?(@outfile)
113
+ end
114
+
115
+ # this is to test that illegal low ASCII control chars are stripped
116
+ # from the file, while LF, CR and TAB are left in place
117
+ specify "should remove all control chars except LF, CR and TAB" do
118
+ ONIX::Normaliser.process(@filename, @outfile)
119
+
120
+ File.file?(@outfile).should be_true
121
+ content = File.read(@outfile)
122
+
123
+ content.include?("<TitleText>OXFORDPICTURE DICTIONARY CHINESE</TitleText>").should be_true
124
+ end
125
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: onix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.7.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-19 00:00:00 +10:00
12
+ date: 2009-09-02 00:00:00 +10:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency