onix 0.7.3 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,9 @@
1
+ v0.7.4 (2nd September 2009)
2
+ - Expand ONIX::Normaliser
3
+ - strip control chars
4
+ - add encoding declaration to valid utf-8 files that aren't declared
5
+ as such
6
+
1
7
  v0.7.3 (19th August 2009)
2
8
  - Switch from java to xsltproc to convert short tag ONIX files
3
9
  to reference tags
data/lib/onix.rb CHANGED
@@ -16,7 +16,7 @@ module ONIX
16
16
  module Version #:nodoc:
17
17
  Major = 0
18
18
  Minor = 7
19
- Tiny = 3
19
+ Tiny = 4
20
20
 
21
21
  String = [Major, Minor, Tiny].join('.')
22
22
  end
@@ -43,6 +43,7 @@ module ONIX
43
43
  raise "isutf8 app not found" unless app_available?("isutf8")
44
44
  raise "iconv app not found" unless app_available?("iconv")
45
45
  raise "sed app not found" unless app_available?("sed")
46
+ raise "tr app not found" unless app_available?("tr")
46
47
 
47
48
  @oldfile = oldfile
48
49
  @newfile = newfile
@@ -64,6 +65,11 @@ module ONIX
64
65
  to_utf8(@curfile, dest)
65
66
  @curfile = dest
66
67
 
68
+ # remove control chars
69
+ dest = next_tempfile
70
+ remove_control_chars(@curfile, dest)
71
+ @curfile = dest
72
+
67
73
  # remove entities
68
74
  replace_named_entities(@curfile)
69
75
 
@@ -102,7 +108,18 @@ module ONIX
102
108
  `xsltproc -o #{outpath} #{xsltpath} #{inpath}`
103
109
  end
104
110
 
105
- # ensure the file is valid utf8, then make sure it's declared as such
111
+ # ensure the file is valid utf8, then make sure it's declared as such.
112
+ #
113
+ # The following behaviour is expected:
114
+ #
115
+ # file is valid utf8, is marked correctly
116
+ # - copied untouched
117
+ # file is valid utf8, is marked incorrectly or has no marked encoding
118
+ # - copied and encoding mark fixed or added
119
+ # file is not utf8, encoding is marked
120
+ # - file is converted to utf8 and encoding mark is updated
121
+ # file is not utf8, encoding is not marked
122
+ # - file is copied untouched
106
123
  #
107
124
  def to_utf8(src, dest)
108
125
  inpath = File.expand_path(src)
@@ -112,17 +129,28 @@ module ONIX
112
129
 
113
130
  # ensure the file is actually utf8
114
131
  if `isutf8 #{inpath}`.strip == ""
115
- FileUtils.cp(inpath, outpath)
116
- else
132
+ if src_enc.to_s.downcase == "utf-8"
133
+ FileUtils.cp(inpath, outpath)
134
+ else
135
+ FileUtils.cp(inpath, outpath)
136
+ `sed -i 's/<?xml.*?>/<?xml version=\"1.0\" encoding=\"UTF-8\"?>/' #{outpath}`
137
+ end
138
+ elsif src_enc
117
139
  `iconv --from-code=#{src_enc} --to-code=UTF-8 #{inpath} > #{outpath}`
118
- end
119
-
120
- # ensure the encoding delcaration is correct
121
- if src_enc.downcase != "utf-8"
122
140
  `sed -i 's/#{src_enc}/UTF-8/' #{outpath}`
141
+ else
142
+ FileUtils.cp(inpath, outpath)
123
143
  end
124
144
  end
125
145
 
146
+ # XML files shouldn't contain low ASCII control chars. Strip them.
147
+ #
148
+ def remove_control_chars(src, dest)
149
+ inpath = File.expand_path(src)
150
+ outpath = File.expand_path(dest)
151
+ `cat #{inpath} | tr -d "\\000-\\010\\013\\014\\016-\\037" > #{outpath}`
152
+ end
153
+
126
154
  # replace all named entities in the specified file with
127
155
  # numeric entities.
128
156
  #
@@ -21,7 +21,6 @@ context "ONIX::Normaliser", "with a simple short tag file" do
21
21
 
22
22
  File.file?(@outfile).should be_true
23
23
  content = File.read(@outfile)
24
- puts content
25
24
  content.include?("<m174>").should be_false
26
25
  content.include?("<FromCompany>").should be_true
27
26
  end
@@ -76,3 +75,51 @@ context "ONIX::Normaliser", "with an file using entities" do
76
75
  content.include?("&#x02013;").should be_true
77
76
  end
78
77
  end
78
+
79
+ context "ONIX::Normaliser", "with a utf8 file that has no declared encoding" do
80
+
81
+ before(:each) do
82
+ @data_path = File.join(File.dirname(__FILE__),"..","data")
83
+ @filename = File.join(@data_path, "no_encoding.xml")
84
+ @outfile = @filename + ".new"
85
+ end
86
+
87
+ after(:each) do
88
+ File.unlink(@outfile) if File.file?(@outfile)
89
+ end
90
+
91
+ # this is to test for a bug where an exception was raised on files that
92
+ # had no declared encoding
93
+ specify "should add a utf-8 marker to the file" do
94
+ ONIX::Normaliser.process(@filename, @outfile)
95
+
96
+ File.file?(@outfile).should be_true
97
+ content = File.read(@outfile)
98
+
99
+ content.include?("encoding=\"UTF-8\"").should be_true
100
+ end
101
+ end
102
+
103
+ context "ONIX::Normaliser", "with a utf8 file that has illegal control chars" do
104
+
105
+ before(:each) do
106
+ @data_path = File.join(File.dirname(__FILE__),"..","data")
107
+ @filename = File.join(@data_path, "control_chars.xml")
108
+ @outfile = @filename + ".new"
109
+ end
110
+
111
+ after(:each) do
112
+ File.unlink(@outfile) if File.file?(@outfile)
113
+ end
114
+
115
+ # this is to test that low ASCII control chars, which shouldn't appear
116
+ # in XML files, are stripped from the normalised output
117
+ specify "should remove all control chars except LF, CR and TAB" do
118
+ ONIX::Normaliser.process(@filename, @outfile)
119
+
120
+ File.file?(@outfile).should be_true
121
+ content = File.read(@outfile)
122
+
123
+ content.include?("<TitleText>OXFORDPICTURE DICTIONARY CHINESE</TitleText>").should be_true
124
+ end
125
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: onix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.7.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-19 00:00:00 +10:00
12
+ date: 2009-09-02 00:00:00 +10:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency