onix 0.7.1 → 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,79 @@
1
+ <!-- Special characters for HTML -->
2
+
3
+ <!-- Character entity set. Typical invocation:
4
+ <!ENTITY % HTMLspecial PUBLIC
5
+ "-//W3C//ENTITIES Special for XHTML//EN"
6
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent">
7
+ %HTMLspecial;
8
+ -->
9
+
10
+ <!-- Portions (C) International Organization for Standardization 1986:
11
+ Permission to copy in any form is granted for use with
12
+ conforming SGML systems and applications as defined in
13
+ ISO 8879, provided this notice is included in all copies.
14
+ -->
15
+
16
+ <!-- Relevant ISO entity set is given unless names are newly introduced.
17
+ New names (i.e., not in ISO 8879 list) do not clash with any
18
+ existing ISO 8879 entity names. ISO 10646 character numbers
19
+ are given for each character, in hex. values are decimal
20
+ conversions of the ISO 10646 values and refer to the document
21
+ character set. Names are Unicode names.
22
+ -->
23
+
24
+ <!-- C0 Controls and Basic Latin -->
25
+ <!ENTITY quot "&#34;"> <!-- quotation mark = APL quote,
26
+ U+0022 ISOnum -->
27
+ <!ENTITY amp "&#38;#38;"> <!-- ampersand, U+0026 ISOnum -->
28
+ <!ENTITY lt "&#38;#60;"> <!-- less-than sign, U+003C ISOnum -->
29
+ <!ENTITY gt "&#62;"> <!-- greater-than sign, U+003E ISOnum -->
30
+ <!ENTITY apos "&#39;"> <!-- apostrophe mark, U+0027 ISOnum -->
31
+
32
+ <!-- Latin Extended-A -->
33
+ <!ENTITY OElig "&#338;"> <!-- latin capital ligature OE,
34
+ U+0152 ISOlat2 -->
35
+ <!ENTITY oelig "&#339;"> <!-- latin small ligature oe, U+0153 ISOlat2 -->
36
+ <!-- ligature is a misnomer, this is a separate character in some languages -->
37
+ <!ENTITY Scaron "&#352;"> <!-- latin capital letter S with caron,
38
+ U+0160 ISOlat2 -->
39
+ <!ENTITY scaron "&#353;"> <!-- latin small letter s with caron,
40
+ U+0161 ISOlat2 -->
41
+ <!ENTITY Yuml "&#376;"> <!-- latin capital letter Y with diaeresis,
42
+ U+0178 ISOlat2 -->
43
+
44
+ <!-- Spacing Modifier Letters -->
45
+ <!ENTITY circ "&#710;"> <!-- modifier letter circumflex accent,
46
+ U+02C6 ISOpub -->
47
+ <!ENTITY tilde "&#732;"> <!-- small tilde, U+02DC ISOdia -->
48
+
49
+ <!-- General Punctuation -->
50
+ <!ENTITY ensp "&#8194;"> <!-- en space, U+2002 ISOpub -->
51
+ <!ENTITY emsp "&#8195;"> <!-- em space, U+2003 ISOpub -->
52
+ <!ENTITY thinsp "&#8201;"> <!-- thin space, U+2009 ISOpub -->
53
+ <!ENTITY zwnj "&#8204;"> <!-- zero width non-joiner,
54
+ U+200C NEW RFC 2070 -->
55
+ <!ENTITY zwj "&#8205;"> <!-- zero width joiner, U+200D NEW RFC 2070 -->
56
+ <!ENTITY lrm "&#8206;"> <!-- left-to-right mark, U+200E NEW RFC 2070 -->
57
+ <!ENTITY rlm "&#8207;"> <!-- right-to-left mark, U+200F NEW RFC 2070 -->
58
+ <!ENTITY ndash "&#8211;"> <!-- en dash, U+2013 ISOpub -->
59
+ <!ENTITY mdash "&#8212;"> <!-- em dash, U+2014 ISOpub -->
60
+ <!ENTITY lsquo "&#8216;"> <!-- left single quotation mark,
61
+ U+2018 ISOnum -->
62
+ <!ENTITY rsquo "&#8217;"> <!-- right single quotation mark,
63
+ U+2019 ISOnum -->
64
+ <!ENTITY sbquo "&#8218;"> <!-- single low-9 quotation mark, U+201A NEW -->
65
+ <!ENTITY ldquo "&#8220;"> <!-- left double quotation mark,
66
+ U+201C ISOnum -->
67
+ <!ENTITY rdquo "&#8221;"> <!-- right double quotation mark,
68
+ U+201D ISOnum -->
69
+ <!ENTITY bdquo "&#8222;"> <!-- double low-9 quotation mark, U+201E NEW -->
70
+ <!ENTITY dagger "&#8224;"> <!-- dagger, U+2020 ISOpub -->
71
+ <!ENTITY Dagger "&#8225;"> <!-- double dagger, U+2021 ISOpub -->
72
+ <!ENTITY permil "&#8240;"> <!-- per mille sign, U+2030 ISOtech -->
73
+ <!ENTITY lsaquo "&#8249;"> <!-- single left-pointing angle quotation mark,
74
+ U+2039 ISO proposed -->
75
+ <!-- lsaquo is proposed but not yet ISO standardized -->
76
+ <!ENTITY rsaquo "&#8250;"> <!-- single right-pointing angle quotation mark,
77
+ U+203A ISO proposed -->
78
+ <!-- rsaquo is proposed but not yet ISO standardized -->
79
+ <!ENTITY euro "&#8364;"> <!-- euro sign, U+20AC NEW -->
@@ -0,0 +1,242 @@
1
+ <!-- Mathematical, Greek and Symbolic characters for HTML -->
2
+
3
+ <!-- Character entity set. Typical invocation:
4
+ <!ENTITY % HTMLsymbol PUBLIC
5
+ "-//W3C//ENTITIES Symbols for XHTML//EN"
6
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent">
7
+ %HTMLsymbol;
8
+ -->
9
+
10
+ <!-- Portions (C) International Organization for Standardization 1986:
11
+ Permission to copy in any form is granted for use with
12
+ conforming SGML systems and applications as defined in
13
+ ISO 8879, provided this notice is included in all copies.
14
+ -->
15
+
16
+ <!-- Relevant ISO entity set is given unless names are newly introduced.
17
+ New names (i.e., not in ISO 8879 list) do not clash with any
18
+ existing ISO 8879 entity names. ISO 10646 character numbers
19
+ are given for each character, in hex. values are decimal
20
+ conversions of the ISO 10646 values and refer to the document
21
+ character set. Names are Unicode names.
22
+ -->
23
+
24
+ <!-- Latin Extended-B -->
25
+ <!ENTITY fnof "&#402;"> <!-- latin small f with hook = function
26
+ = florin, U+0192 ISOtech -->
27
+
28
+ <!-- Greek -->
29
+ <!ENTITY Alpha "&#913;"> <!-- greek capital letter alpha, U+0391 -->
30
+ <!ENTITY Beta "&#914;"> <!-- greek capital letter beta, U+0392 -->
31
+ <!ENTITY Gamma "&#915;"> <!-- greek capital letter gamma,
32
+ U+0393 ISOgrk3 -->
33
+ <!ENTITY Delta "&#916;"> <!-- greek capital letter delta,
34
+ U+0394 ISOgrk3 -->
35
+ <!ENTITY Epsilon "&#917;"> <!-- greek capital letter epsilon, U+0395 -->
36
+ <!ENTITY Zeta "&#918;"> <!-- greek capital letter zeta, U+0396 -->
37
+ <!ENTITY Eta "&#919;"> <!-- greek capital letter eta, U+0397 -->
38
+ <!ENTITY Theta "&#920;"> <!-- greek capital letter theta,
39
+ U+0398 ISOgrk3 -->
40
+ <!ENTITY Iota "&#921;"> <!-- greek capital letter iota, U+0399 -->
41
+ <!ENTITY Kappa "&#922;"> <!-- greek capital letter kappa, U+039A -->
42
+ <!ENTITY Lambda "&#923;"> <!-- greek capital letter lambda,
43
+ U+039B ISOgrk3 -->
44
+ <!ENTITY Mu "&#924;"> <!-- greek capital letter mu, U+039C -->
45
+ <!ENTITY Nu "&#925;"> <!-- greek capital letter nu, U+039D -->
46
+ <!ENTITY Xi "&#926;"> <!-- greek capital letter xi, U+039E ISOgrk3 -->
47
+ <!ENTITY Omicron "&#927;"> <!-- greek capital letter omicron, U+039F -->
48
+ <!ENTITY Pi "&#928;"> <!-- greek capital letter pi, U+03A0 ISOgrk3 -->
49
+ <!ENTITY Rho "&#929;"> <!-- greek capital letter rho, U+03A1 -->
50
+ <!-- there is no Sigmaf, and no U+03A2 character either -->
51
+ <!ENTITY Sigma "&#931;"> <!-- greek capital letter sigma,
52
+ U+03A3 ISOgrk3 -->
53
+ <!ENTITY Tau "&#932;"> <!-- greek capital letter tau, U+03A4 -->
54
+ <!ENTITY Upsilon "&#933;"> <!-- greek capital letter upsilon,
55
+ U+03A5 ISOgrk3 -->
56
+ <!ENTITY Phi "&#934;"> <!-- greek capital letter phi,
57
+ U+03A6 ISOgrk3 -->
58
+ <!ENTITY Chi "&#935;"> <!-- greek capital letter chi, U+03A7 -->
59
+ <!ENTITY Psi "&#936;"> <!-- greek capital letter psi,
60
+ U+03A8 ISOgrk3 -->
61
+ <!ENTITY Omega "&#937;"> <!-- greek capital letter omega,
62
+ U+03A9 ISOgrk3 -->
63
+
64
+ <!ENTITY alpha "&#945;"> <!-- greek small letter alpha,
65
+ U+03B1 ISOgrk3 -->
66
+ <!ENTITY beta "&#946;"> <!-- greek small letter beta, U+03B2 ISOgrk3 -->
67
+ <!ENTITY gamma "&#947;"> <!-- greek small letter gamma,
68
+ U+03B3 ISOgrk3 -->
69
+ <!ENTITY delta "&#948;"> <!-- greek small letter delta,
70
+ U+03B4 ISOgrk3 -->
71
+ <!ENTITY epsilon "&#949;"> <!-- greek small letter epsilon,
72
+ U+03B5 ISOgrk3 -->
73
+ <!ENTITY zeta "&#950;"> <!-- greek small letter zeta, U+03B6 ISOgrk3 -->
74
+ <!ENTITY eta "&#951;"> <!-- greek small letter eta, U+03B7 ISOgrk3 -->
75
+ <!ENTITY theta "&#952;"> <!-- greek small letter theta,
76
+ U+03B8 ISOgrk3 -->
77
+ <!ENTITY iota "&#953;"> <!-- greek small letter iota, U+03B9 ISOgrk3 -->
78
+ <!ENTITY kappa "&#954;"> <!-- greek small letter kappa,
79
+ U+03BA ISOgrk3 -->
80
+ <!ENTITY lambda "&#955;"> <!-- greek small letter lambda,
81
+ U+03BB ISOgrk3 -->
82
+ <!ENTITY mu "&#956;"> <!-- greek small letter mu, U+03BC ISOgrk3 -->
83
+ <!ENTITY nu "&#957;"> <!-- greek small letter nu, U+03BD ISOgrk3 -->
84
+ <!ENTITY xi "&#958;"> <!-- greek small letter xi, U+03BE ISOgrk3 -->
85
+ <!ENTITY omicron "&#959;"> <!-- greek small letter omicron, U+03BF NEW -->
86
+ <!ENTITY pi "&#960;"> <!-- greek small letter pi, U+03C0 ISOgrk3 -->
87
+ <!ENTITY rho "&#961;"> <!-- greek small letter rho, U+03C1 ISOgrk3 -->
88
+ <!ENTITY sigmaf "&#962;"> <!-- greek small letter final sigma,
89
+ U+03C2 ISOgrk3 -->
90
+ <!ENTITY sigma "&#963;"> <!-- greek small letter sigma,
91
+ U+03C3 ISOgrk3 -->
92
+ <!ENTITY tau "&#964;"> <!-- greek small letter tau, U+03C4 ISOgrk3 -->
93
+ <!ENTITY upsilon "&#965;"> <!-- greek small letter upsilon,
94
+ U+03C5 ISOgrk3 -->
95
+ <!ENTITY phi "&#966;"> <!-- greek small letter phi, U+03C6 ISOgrk3 -->
96
+ <!ENTITY chi "&#967;"> <!-- greek small letter chi, U+03C7 ISOgrk3 -->
97
+ <!ENTITY psi "&#968;"> <!-- greek small letter psi, U+03C8 ISOgrk3 -->
98
+ <!ENTITY omega "&#969;"> <!-- greek small letter omega,
99
+ U+03C9 ISOgrk3 -->
100
+ <!ENTITY thetasym "&#977;"> <!-- greek small letter theta symbol,
101
+ U+03D1 NEW -->
102
+ <!ENTITY upsih "&#978;"> <!-- greek upsilon with hook symbol,
103
+ U+03D2 NEW -->
104
+ <!ENTITY piv "&#982;"> <!-- greek pi symbol, U+03D6 ISOgrk3 -->
105
+
106
+ <!-- General Punctuation -->
107
+ <!ENTITY bull "&#8226;"> <!-- bullet = black small circle,
108
+ U+2022 ISOpub -->
109
+ <!-- bullet is NOT the same as bullet operator, U+2219 -->
110
+ <!ENTITY hellip "&#8230;"> <!-- horizontal ellipsis = three dot leader,
111
+ U+2026 ISOpub -->
112
+ <!ENTITY prime "&#8242;"> <!-- prime = minutes = feet, U+2032 ISOtech -->
113
+ <!ENTITY Prime "&#8243;"> <!-- double prime = seconds = inches,
114
+ U+2033 ISOtech -->
115
+ <!ENTITY oline "&#8254;"> <!-- overline = spacing overscore,
116
+ U+203E NEW -->
117
+ <!ENTITY frasl "&#8260;"> <!-- fraction slash, U+2044 NEW -->
118
+
119
+ <!-- Letterlike Symbols -->
120
+ <!ENTITY weierp "&#8472;"> <!-- script capital P = power set
121
+ = Weierstrass p, U+2118 ISOamso -->
122
+ <!ENTITY image "&#8465;"> <!-- blackletter capital I = imaginary part,
123
+ U+2111 ISOamso -->
124
+ <!ENTITY real "&#8476;"> <!-- blackletter capital R = real part symbol,
125
+ U+211C ISOamso -->
126
+ <!ENTITY trade "&#8482;"> <!-- trade mark sign, U+2122 ISOnum -->
127
+ <!ENTITY alefsym "&#8501;"> <!-- alef symbol = first transfinite cardinal,
128
+ U+2135 NEW -->
129
+ <!-- alef symbol is NOT the same as hebrew letter alef,
130
+ U+05D0 although the same glyph could be used to depict both characters -->
131
+
132
+ <!-- Arrows -->
133
+ <!ENTITY larr "&#8592;"> <!-- leftwards arrow, U+2190 ISOnum -->
134
+ <!ENTITY uarr "&#8593;"> <!-- upwards arrow, U+2191 ISOnum-->
135
+ <!ENTITY rarr "&#8594;"> <!-- rightwards arrow, U+2192 ISOnum -->
136
+ <!ENTITY darr "&#8595;"> <!-- downwards arrow, U+2193 ISOnum -->
137
+ <!ENTITY harr "&#8596;"> <!-- left right arrow, U+2194 ISOamsa -->
138
+ <!ENTITY crarr "&#8629;"> <!-- downwards arrow with corner leftwards
139
+ = carriage return, U+21B5 NEW -->
140
+ <!ENTITY lArr "&#8656;"> <!-- leftwards double arrow, U+21D0 ISOtech -->
141
+ <!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
142
+ but also does not have any other character for that function. So ? lArr can
143
+ be used for 'is implied by' as ISOtech suggests -->
144
+ <!ENTITY uArr "&#8657;"> <!-- upwards double arrow, U+21D1 ISOamsa -->
145
+ <!ENTITY rArr "&#8658;"> <!-- rightwards double arrow,
146
+ U+21D2 ISOtech -->
147
+ <!-- Unicode does not say this is the 'implies' character but does not have
148
+ another character with this function so ?
149
+ rArr can be used for 'implies' as ISOtech suggests -->
150
+ <!ENTITY dArr "&#8659;"> <!-- downwards double arrow, U+21D3 ISOamsa -->
151
+ <!ENTITY hArr "&#8660;"> <!-- left right double arrow,
152
+ U+21D4 ISOamsa -->
153
+
154
+ <!-- Mathematical Operators -->
155
+ <!ENTITY forall "&#8704;"> <!-- for all, U+2200 ISOtech -->
156
+ <!ENTITY part "&#8706;"> <!-- partial differential, U+2202 ISOtech -->
157
+ <!ENTITY exist "&#8707;"> <!-- there exists, U+2203 ISOtech -->
158
+ <!ENTITY empty "&#8709;"> <!-- empty set = null set = diameter,
159
+ U+2205 ISOamso -->
160
+ <!ENTITY nabla "&#8711;"> <!-- nabla = backward difference,
161
+ U+2207 ISOtech -->
162
+ <!ENTITY isin "&#8712;"> <!-- element of, U+2208 ISOtech -->
163
+ <!ENTITY notin "&#8713;"> <!-- not an element of, U+2209 ISOtech -->
164
+ <!ENTITY ni "&#8715;"> <!-- contains as member, U+220B ISOtech -->
165
+ <!-- should there be a more memorable name than 'ni'? -->
166
+ <!ENTITY prod "&#8719;"> <!-- n-ary product = product sign,
167
+ U+220F ISOamsb -->
168
+ <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
169
+ the same glyph might be used for both -->
170
+ <!ENTITY sum "&#8721;"> <!-- n-ary summation, U+2211 ISOamsb -->
171
+ <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
172
+ though the same glyph might be used for both -->
173
+ <!ENTITY minus "&#8722;"> <!-- minus sign, U+2212 ISOtech -->
174
+ <!ENTITY lowast "&#8727;"> <!-- asterisk operator, U+2217 ISOtech -->
175
+ <!ENTITY radic "&#8730;"> <!-- square root = radical sign,
176
+ U+221A ISOtech -->
177
+ <!ENTITY prop "&#8733;"> <!-- proportional to, U+221D ISOtech -->
178
+ <!ENTITY infin "&#8734;"> <!-- infinity, U+221E ISOtech -->
179
+ <!ENTITY ang "&#8736;"> <!-- angle, U+2220 ISOamso -->
180
+ <!ENTITY and "&#8743;"> <!-- logical and = wedge, U+2227 ISOtech -->
181
+ <!ENTITY or "&#8744;"> <!-- logical or = vee, U+2228 ISOtech -->
182
+ <!ENTITY cap "&#8745;"> <!-- intersection = cap, U+2229 ISOtech -->
183
+ <!ENTITY cup "&#8746;"> <!-- union = cup, U+222A ISOtech -->
184
+ <!ENTITY int "&#8747;"> <!-- integral, U+222B ISOtech -->
185
+ <!ENTITY there4 "&#8756;"> <!-- therefore, U+2234 ISOtech -->
186
+ <!ENTITY sim "&#8764;"> <!-- tilde operator = varies with = similar to,
187
+ U+223C ISOtech -->
188
+ <!-- tilde operator is NOT the same character as the tilde, U+007E,
189
+ although the same glyph might be used to represent both -->
190
+ <!ENTITY cong "&#8773;"> <!-- approximately equal to, U+2245 ISOtech -->
191
+ <!ENTITY asymp "&#8776;"> <!-- almost equal to = asymptotic to,
192
+ U+2248 ISOamsr -->
193
+ <!ENTITY ne "&#8800;"> <!-- not equal to, U+2260 ISOtech -->
194
+ <!ENTITY equiv "&#8801;"> <!-- identical to, U+2261 ISOtech -->
195
+ <!ENTITY le "&#8804;"> <!-- less-than or equal to, U+2264 ISOtech -->
196
+ <!ENTITY ge "&#8805;"> <!-- greater-than or equal to,
197
+ U+2265 ISOtech -->
198
+ <!ENTITY sub "&#8834;"> <!-- subset of, U+2282 ISOtech -->
199
+ <!ENTITY sup "&#8835;"> <!-- superset of, U+2283 ISOtech -->
200
+ <!-- note that nsup, 'not a superset of, U+2285' is not covered by the Symbol
201
+ font encoding and is not included. Should it be, for symmetry?
202
+ It is in ISOamsn -->
203
+ <!ENTITY nsub "&#8836;"> <!-- not a subset of, U+2284 ISOamsn -->
204
+ <!ENTITY sube "&#8838;"> <!-- subset of or equal to, U+2286 ISOtech -->
205
+ <!ENTITY supe "&#8839;"> <!-- superset of or equal to,
206
+ U+2287 ISOtech -->
207
+ <!ENTITY oplus "&#8853;"> <!-- circled plus = direct sum,
208
+ U+2295 ISOamsb -->
209
+ <!ENTITY otimes "&#8855;"> <!-- circled times = vector product,
210
+ U+2297 ISOamsb -->
211
+ <!ENTITY perp "&#8869;"> <!-- up tack = orthogonal to = perpendicular,
212
+ U+22A5 ISOtech -->
213
+ <!ENTITY sdot "&#8901;"> <!-- dot operator, U+22C5 ISOamsb -->
214
+ <!-- dot operator is NOT the same character as U+00B7 middle dot -->
215
+
216
+ <!-- Miscellaneous Technical -->
217
+ <!ENTITY lceil "&#8968;"> <!-- left ceiling = apl upstile,
218
+ U+2308 ISOamsc -->
219
+ <!ENTITY rceil "&#8969;"> <!-- right ceiling, U+2309 ISOamsc -->
220
+ <!ENTITY lfloor "&#8970;"> <!-- left floor = apl downstile,
221
+ U+230A ISOamsc -->
222
+ <!ENTITY rfloor "&#8971;"> <!-- right floor, U+230B ISOamsc -->
223
+ <!ENTITY lang "&#9001;"> <!-- left-pointing angle bracket = bra,
224
+ U+2329 ISOtech -->
225
+ <!-- lang is NOT the same character as U+003C 'less than'
226
+ or U+2039 'single left-pointing angle quotation mark' -->
227
+ <!ENTITY rang "&#9002;"> <!-- right-pointing angle bracket = ket,
228
+ U+232A ISOtech -->
229
+ <!-- rang is NOT the same character as U+003E 'greater than'
230
+ or U+203A 'single right-pointing angle quotation mark' -->
231
+
232
+ <!-- Geometric Shapes -->
233
+ <!ENTITY loz "&#9674;"> <!-- lozenge, U+25CA ISOpub -->
234
+
235
+ <!-- Miscellaneous Symbols -->
236
+ <!ENTITY spades "&#9824;"> <!-- black spade suit, U+2660 ISOpub -->
237
+ <!-- black here seems to mean filled as opposed to hollow -->
238
+ <!ENTITY clubs "&#9827;"> <!-- black club suit = shamrock,
239
+ U+2663 ISOpub -->
240
+ <!ENTITY hearts "&#9829;"> <!-- black heart suit = valentine,
241
+ U+2665 ISOpub -->
242
+ <!ENTITY diams "&#9830;"> <!-- black diamond suit, U+2666 ISOpub -->
@@ -16,7 +16,7 @@ module ONIX
16
16
  module Version #:nodoc:
17
17
  Major = 0
18
18
  Minor = 7
19
- Tiny = 1
19
+ Tiny = 2
20
20
 
21
21
  String = [Major, Minor, Tiny].join('.')
22
22
  end
@@ -100,3 +100,5 @@ require File.join(File.dirname(__FILE__), "onix", "lists", "product_availability
100
100
  require File.join(File.dirname(__FILE__), "onix", "simple_product")
101
101
  require File.join(File.dirname(__FILE__), "onix", "apa_product")
102
102
 
103
+ # misc
104
+ require File.join(File.dirname(__FILE__), "onix", "normaliser")
@@ -0,0 +1,156 @@
1
+ # coding: utf-8
2
+
3
+ require 'tempfile'
4
+ require 'fileutils'
5
+
6
module ONIX

  # A standalone class that can be used to normalise ONIX files
  # into a standardised form. If you're accepting ONIX files from a wide range
  # of suppliers, you're guaranteed to get all sorts of dialects.
  #
  # This will create a new file that:
  #
  # - is UTF-8 encoded
  # - uses reference tags, not short
  # - has its named entities (ndash, etc) replaced with numeric ones, for
  #   every entity listed in support/entities.txt
  #
  # Usage:
  #
  #   ONIX::Normaliser.process("oldfile.xml", "newfile.xml")
  #
  # Dependencies:
  #
  # At this stage the class depends on several external apps, all commonly
  # available on *nix systems: isutf8, iconv and sed. Files that use short
  # tags additionally need java and the saxon jar (see to_reference_tags).
  #
  class Normaliser

    class << self

      # normalise oldfile and save it as newfile. oldfile
      # will be left untouched
      #
      def process(oldfile, newfile)
        self.new(oldfile, newfile).run
      end
    end

    # Prepare a normalisation run.
    #
    # Raises ArgumentError if oldfile is missing or newfile already exists,
    # and RuntimeError if a required external app is not on the PATH.
    #
    def initialize(oldfile, newfile)
      raise ArgumentError, "#{oldfile} does not exist" unless File.file?(oldfile)
      raise ArgumentError, "#{newfile} already exists" if File.file?(newfile)

      # The first 1024 bytes are enough to sniff the root tag and the XML
      # declaration. read returns nil for an empty file, hence the fallback.
      @head = File.open(oldfile, "r") { |f| f.read(1024) } || ""

      # java is only used for the short tag conversion, so only demand it
      # when that step will actually run.
      raise "java app not found"   if short_tags? && !app_available?("java")
      raise "isutf8 app not found" unless app_available?("isutf8")
      raise "iconv app not found"  unless app_available?("iconv")
      raise "sed app not found"    unless app_available?("sed")

      @oldfile = oldfile
      @newfile = newfile
      @curfile = next_tempfile
      FileUtils.cp(@oldfile, @curfile)
    end

    # Run each normalisation step in turn. Every step works on a temp file,
    # the final result is copied to the newfile passed to the constructor.
    #
    def run
      # remove short tags
      if short_tags?
        dest = next_tempfile
        to_reference_tags(@curfile, dest)
        @curfile = dest
      end

      # convert to utf8
      dest = next_tempfile
      to_utf8(@curfile, dest)
      @curfile = dest

      # remove entities
      replace_named_entities(@curfile)

      FileUtils.cp(@curfile, @newfile)
    end

    private

    # true if the start of the source file contains the short tag
    # root element name
    #
    def short_tags?
      @head.include?("ONIXmessage")
    end

    # the encoding named in the XML declaration at the start of the source
    # file, or nil if there isn't one
    #
    def declared_encoding
      match = @head.match(/encoding\s*=\s*["']([A-Za-z][A-Za-z0-9._\-]*)["']/i)
      match && match[1]
    end

    # check the specified app is available on the system by looking for
    # an executable file with that name in each PATH directory
    #
    def app_available?(app)
      (ENV["PATH"] || "").split(File::PATH_SEPARATOR).any? do |dir|
        # an empty PATH entry means the current directory
        dir = "." if dir.empty?
        candidate = File.join(dir, app)
        File.file?(candidate) && File.executable?(candidate)
      end
    end

    # generate a temp filename
    #
    # The Tempfile object is kept in @tempfiles: Tempfile deletes its file
    # when the object is garbage collected, which must not happen while
    # we are still using the path.
    #
    def next_tempfile
      tf = Tempfile.new("onix")
      tf.close
      (@tempfiles ||= []) << tf
      tf.path
    end

    # uses an XSLT stylesheet provided by editeur to convert
    # a file from short tags to long tags.
    #
    # more detail here:
    #   http://www.editeur.org/files/ONIX%203/ONIX%20tagname%20converter%20v2.htm
    #
    def to_reference_tags(src, dest)
      inpath   = File.expand_path(src)
      outpath  = File.expand_path(dest)
      xsltpath = File.dirname(__FILE__) + "/../../support/switch-onix-tagnames-1.1.xsl"
      # xsltproc doesn't set the DTD correctly in the output. Using
      # saxon instead.
      `java -jar /usr/share/java/saxon.jar #{inpath} #{xsltpath} > #{outpath}`
    end

    # ensure the file is valid utf8, then make sure it's declared as such
    #
    # Raises RuntimeError if the file is not valid UTF-8 and the source
    # file declares no encoding to convert from.
    #
    def to_utf8(src, dest)
      inpath  = File.expand_path(src)
      outpath = File.expand_path(dest)
      src_enc = declared_encoding

      # ensure the file is actually utf8. isutf8 prints nothing for a
      # valid file.
      if `isutf8 #{inpath}`.strip == ""
        FileUtils.cp(inpath, outpath)
      elsif src_enc
        `iconv --from-code=#{src_enc} --to-code=UTF-8 #{inpath} > #{outpath}`
      else
        raise "#{inpath} is not valid UTF-8 and no encoding declaration was found"
      end

      # ensure the encoding declaration is correct
      if src_enc && src_enc.downcase != "utf-8"
        `sed -i 's/#{src_enc}/UTF-8/' #{outpath}`
      end
    end

    # replace all named entities in the specified file with
    # numeric entities.
    #
    # The file is streamed line by line into a temp file which then
    # replaces the original. References that aren't in the entity map
    # (including numeric ones) are left untouched.
    #
    def replace_named_entities(path)
      map  = entity_map
      dest = next_tempfile

      # binary mode: only ASCII markup is matched, so the bytes of the
      # document text are copied through untouched
      File.open(path, "rb") do |input|
        File.open(dest, "wb") do |output|
          input.each_line do |line|
            output << line.gsub(/&([^&;\s]+);/) { map.key?($1) ? "&#{map[$1]};" : $& }
          end
        end
      end

      FileUtils.mv(dest, path)
    end

    # return a named entity to numeric entity mapping, built by reading
    # support/entities.txt: whitespace separated "name:value" entries.
    #
    def entity_map
      return @map if @map

      path = File.dirname(__FILE__) + "/../../support/entities.txt"
      @map = {}
      File.read(path).split.each do |line|
        elements = line.split(":")
        @map[elements.first] = elements.last
      end
      @map
    end

  end

end