onix 0.7.1 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -0
- data/dtd/2.1r3/iso-amsa.ent +173 -0
- data/dtd/2.1r3/iso-amsb.ent +146 -0
- data/dtd/2.1r3/iso-amsc.ent +49 -0
- data/dtd/2.1r3/iso-amsn.ent +117 -0
- data/dtd/2.1r3/iso-amso.ent +77 -0
- data/dtd/2.1r3/iso-amsr.ent +205 -0
- data/dtd/2.1r3/iso-box.ent +67 -0
- data/dtd/2.1r3/iso-cyr1.ent +94 -0
- data/dtd/2.1r3/iso-cyr2.ent +53 -0
- data/dtd/2.1r3/iso-dia.ent +41 -0
- data/dtd/2.1r3/iso-grk3.ent +70 -0
- data/dtd/2.1r3/iso-lat1.ent +89 -0
- data/dtd/2.1r3/iso-lat2.ent +148 -0
- data/dtd/2.1r3/iso-mfrk.ent +79 -0
- data/dtd/2.1r3/iso-mopf.ent +53 -0
- data/dtd/2.1r3/iso-mscr.ent +79 -0
- data/dtd/2.1r3/iso-num.ent +103 -0
- data/dtd/2.1r3/iso-num.old.ent +103 -0
- data/dtd/2.1r3/iso-pub.ent +110 -0
- data/dtd/2.1r3/iso-tech.ent +183 -0
- data/dtd/2.1r3/onix-international.dtd +1012 -0
- data/dtd/2.1r3/onix-xhtml.elt +672 -0
- data/dtd/2.1r3/reference.elt +4758 -0
- data/dtd/2.1r3/xhtml-special.ent +79 -0
- data/dtd/2.1r3/xhtml-symbol.ent +242 -0
- data/lib/onix.rb +3 -1
- data/lib/onix/normaliser.rb +156 -0
- data/spec/normaliser_spec.rb +77 -0
- data/support/entities.txt +1499 -0
- data/support/extract.rb +25 -0
- data/support/switch-onix-tagnames-1.1.xsl +25 -0
- data/support/switch-onix-tagnames-2.0.xsl +37 -0
- metadata +54 -2
@@ -0,0 +1,79 @@
|
|
1
|
+
<!-- Special characters for HTML -->
|
2
|
+
|
3
|
+
<!-- Character entity set. Typical invocation:
|
4
|
+
<!ENTITY % HTMLspecial PUBLIC
|
5
|
+
"-//W3C//ENTITIES Special for XHTML//EN"
|
6
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent">
|
7
|
+
%HTMLspecial;
|
8
|
+
-->
|
9
|
+
|
10
|
+
<!-- Portions (C) International Organization for Standardization 1986:
|
11
|
+
Permission to copy in any form is granted for use with
|
12
|
+
conforming SGML systems and applications as defined in
|
13
|
+
ISO 8879, provided this notice is included in all copies.
|
14
|
+
-->
|
15
|
+
|
16
|
+
<!-- Relevant ISO entity set is given unless names are newly introduced.
|
17
|
+
New names (i.e., not in ISO 8879 list) do not clash with any
|
18
|
+
existing ISO 8879 entity names. ISO 10646 character numbers
|
19
|
+
are given for each character, in hex. values are decimal
|
20
|
+
conversions of the ISO 10646 values and refer to the document
|
21
|
+
character set. Names are Unicode names.
|
22
|
+
-->
|
23
|
+
|
24
|
+
<!-- C0 Controls and Basic Latin -->
|
25
|
+
<!ENTITY quot "&#34;"> <!-- quotation mark = APL quote,
|
26
|
+
U+0022 ISOnum -->
|
27
|
+
<!ENTITY amp "&#38;#38;"> <!-- ampersand, U+0026 ISOnum -->
|
28
|
+
<!ENTITY lt "&#38;#60;"> <!-- less-than sign, U+003C ISOnum -->
|
29
|
+
<!ENTITY gt ">"> <!-- greater-than sign, U+003E ISOnum -->
|
30
|
+
<!ENTITY apos "'"> <!-- apostrophe mark, U+0027 ISOnum -->
|
31
|
+
|
32
|
+
<!-- Latin Extended-A -->
|
33
|
+
<!ENTITY OElig "Œ"> <!-- latin capital ligature OE,
|
34
|
+
U+0152 ISOlat2 -->
|
35
|
+
<!ENTITY oelig "œ"> <!-- latin small ligature oe, U+0153 ISOlat2 -->
|
36
|
+
<!-- ligature is a misnomer, this is a separate character in some languages -->
|
37
|
+
<!ENTITY Scaron "Š"> <!-- latin capital letter S with caron,
|
38
|
+
U+0160 ISOlat2 -->
|
39
|
+
<!ENTITY scaron "š"> <!-- latin small letter s with caron,
|
40
|
+
U+0161 ISOlat2 -->
|
41
|
+
<!ENTITY Yuml "Ÿ"> <!-- latin capital letter Y with diaeresis,
|
42
|
+
U+0178 ISOlat2 -->
|
43
|
+
|
44
|
+
<!-- Spacing Modifier Letters -->
|
45
|
+
<!ENTITY circ "ˆ"> <!-- modifier letter circumflex accent,
|
46
|
+
U+02C6 ISOpub -->
|
47
|
+
<!ENTITY tilde "˜"> <!-- small tilde, U+02DC ISOdia -->
|
48
|
+
|
49
|
+
<!-- General Punctuation -->
|
50
|
+
<!ENTITY ensp "&#8194;"> <!-- en space, U+2002 ISOpub -->
|
51
|
+
<!ENTITY emsp "&#8195;"> <!-- em space, U+2003 ISOpub -->
|
52
|
+
<!ENTITY thinsp "&#8201;"> <!-- thin space, U+2009 ISOpub -->
|
53
|
+
<!ENTITY zwnj "&#8204;"> <!-- zero width non-joiner,
|
54
|
+
U+200C NEW RFC 2070 -->
|
55
|
+
<!ENTITY zwj "&#8205;"> <!-- zero width joiner, U+200D NEW RFC 2070 -->
|
56
|
+
<!ENTITY lrm "&#8206;"> <!-- left-to-right mark, U+200E NEW RFC 2070 -->
|
57
|
+
<!ENTITY rlm "&#8207;"> <!-- right-to-left mark, U+200F NEW RFC 2070 -->
|
58
|
+
<!ENTITY ndash "–"> <!-- en dash, U+2013 ISOpub -->
|
59
|
+
<!ENTITY mdash "—"> <!-- em dash, U+2014 ISOpub -->
|
60
|
+
<!ENTITY lsquo "‘"> <!-- left single quotation mark,
|
61
|
+
U+2018 ISOnum -->
|
62
|
+
<!ENTITY rsquo "’"> <!-- right single quotation mark,
|
63
|
+
U+2019 ISOnum -->
|
64
|
+
<!ENTITY sbquo "‚"> <!-- single low-9 quotation mark, U+201A NEW -->
|
65
|
+
<!ENTITY ldquo "“"> <!-- left double quotation mark,
|
66
|
+
U+201C ISOnum -->
|
67
|
+
<!ENTITY rdquo "”"> <!-- right double quotation mark,
|
68
|
+
U+201D ISOnum -->
|
69
|
+
<!ENTITY bdquo "„"> <!-- double low-9 quotation mark, U+201E NEW -->
|
70
|
+
<!ENTITY dagger "†"> <!-- dagger, U+2020 ISOpub -->
|
71
|
+
<!ENTITY Dagger "‡"> <!-- double dagger, U+2021 ISOpub -->
|
72
|
+
<!ENTITY permil "‰"> <!-- per mille sign, U+2030 ISOtech -->
|
73
|
+
<!ENTITY lsaquo "‹"> <!-- single left-pointing angle quotation mark,
|
74
|
+
U+2039 ISO proposed -->
|
75
|
+
<!-- lsaquo is proposed but not yet ISO standardized -->
|
76
|
+
<!ENTITY rsaquo "›"> <!-- single right-pointing angle quotation mark,
|
77
|
+
U+203A ISO proposed -->
|
78
|
+
<!-- rsaquo is proposed but not yet ISO standardized -->
|
79
|
+
<!ENTITY euro "€"> <!-- euro sign, U+20AC NEW -->
|
@@ -0,0 +1,242 @@
|
|
1
|
+
<!-- Mathematical, Greek and Symbolic characters for HTML -->
|
2
|
+
|
3
|
+
<!-- Character entity set. Typical invocation:
|
4
|
+
<!ENTITY % HTMLsymbol PUBLIC
|
5
|
+
"-//W3C//ENTITIES Symbols for XHTML//EN"
|
6
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent">
|
7
|
+
%HTMLsymbol;
|
8
|
+
-->
|
9
|
+
|
10
|
+
<!-- Portions (C) International Organization for Standardization 1986:
|
11
|
+
Permission to copy in any form is granted for use with
|
12
|
+
conforming SGML systems and applications as defined in
|
13
|
+
ISO 8879, provided this notice is included in all copies.
|
14
|
+
-->
|
15
|
+
|
16
|
+
<!-- Relevant ISO entity set is given unless names are newly introduced.
|
17
|
+
New names (i.e., not in ISO 8879 list) do not clash with any
|
18
|
+
existing ISO 8879 entity names. ISO 10646 character numbers
|
19
|
+
are given for each character, in hex. values are decimal
|
20
|
+
conversions of the ISO 10646 values and refer to the document
|
21
|
+
character set. Names are Unicode names.
|
22
|
+
-->
|
23
|
+
|
24
|
+
<!-- Latin Extended-B -->
|
25
|
+
<!ENTITY fnof "ƒ"> <!-- latin small f with hook = function
|
26
|
+
= florin, U+0192 ISOtech -->
|
27
|
+
|
28
|
+
<!-- Greek -->
|
29
|
+
<!ENTITY Alpha "Α"> <!-- greek capital letter alpha, U+0391 -->
|
30
|
+
<!ENTITY Beta "Β"> <!-- greek capital letter beta, U+0392 -->
|
31
|
+
<!ENTITY Gamma "Γ"> <!-- greek capital letter gamma,
|
32
|
+
U+0393 ISOgrk3 -->
|
33
|
+
<!ENTITY Delta "Δ"> <!-- greek capital letter delta,
|
34
|
+
U+0394 ISOgrk3 -->
|
35
|
+
<!ENTITY Epsilon "Ε"> <!-- greek capital letter epsilon, U+0395 -->
|
36
|
+
<!ENTITY Zeta "Ζ"> <!-- greek capital letter zeta, U+0396 -->
|
37
|
+
<!ENTITY Eta "Η"> <!-- greek capital letter eta, U+0397 -->
|
38
|
+
<!ENTITY Theta "Θ"> <!-- greek capital letter theta,
|
39
|
+
U+0398 ISOgrk3 -->
|
40
|
+
<!ENTITY Iota "Ι"> <!-- greek capital letter iota, U+0399 -->
|
41
|
+
<!ENTITY Kappa "Κ"> <!-- greek capital letter kappa, U+039A -->
|
42
|
+
<!ENTITY Lambda "Λ"> <!-- greek capital letter lambda,
|
43
|
+
U+039B ISOgrk3 -->
|
44
|
+
<!ENTITY Mu "Μ"> <!-- greek capital letter mu, U+039C -->
|
45
|
+
<!ENTITY Nu "Ν"> <!-- greek capital letter nu, U+039D -->
|
46
|
+
<!ENTITY Xi "Ξ"> <!-- greek capital letter xi, U+039E ISOgrk3 -->
|
47
|
+
<!ENTITY Omicron "Ο"> <!-- greek capital letter omicron, U+039F -->
|
48
|
+
<!ENTITY Pi "Π"> <!-- greek capital letter pi, U+03A0 ISOgrk3 -->
|
49
|
+
<!ENTITY Rho "Ρ"> <!-- greek capital letter rho, U+03A1 -->
|
50
|
+
<!-- there is no Sigmaf, and no U+03A2 character either -->
|
51
|
+
<!ENTITY Sigma "Σ"> <!-- greek capital letter sigma,
|
52
|
+
U+03A3 ISOgrk3 -->
|
53
|
+
<!ENTITY Tau "Τ"> <!-- greek capital letter tau, U+03A4 -->
|
54
|
+
<!ENTITY Upsilon "Υ"> <!-- greek capital letter upsilon,
|
55
|
+
U+03A5 ISOgrk3 -->
|
56
|
+
<!ENTITY Phi "Φ"> <!-- greek capital letter phi,
|
57
|
+
U+03A6 ISOgrk3 -->
|
58
|
+
<!ENTITY Chi "Χ"> <!-- greek capital letter chi, U+03A7 -->
|
59
|
+
<!ENTITY Psi "Ψ"> <!-- greek capital letter psi,
|
60
|
+
U+03A8 ISOgrk3 -->
|
61
|
+
<!ENTITY Omega "Ω"> <!-- greek capital letter omega,
|
62
|
+
U+03A9 ISOgrk3 -->
|
63
|
+
|
64
|
+
<!ENTITY alpha "α"> <!-- greek small letter alpha,
|
65
|
+
U+03B1 ISOgrk3 -->
|
66
|
+
<!ENTITY beta "β"> <!-- greek small letter beta, U+03B2 ISOgrk3 -->
|
67
|
+
<!ENTITY gamma "γ"> <!-- greek small letter gamma,
|
68
|
+
U+03B3 ISOgrk3 -->
|
69
|
+
<!ENTITY delta "δ"> <!-- greek small letter delta,
|
70
|
+
U+03B4 ISOgrk3 -->
|
71
|
+
<!ENTITY epsilon "ε"> <!-- greek small letter epsilon,
|
72
|
+
U+03B5 ISOgrk3 -->
|
73
|
+
<!ENTITY zeta "ζ"> <!-- greek small letter zeta, U+03B6 ISOgrk3 -->
|
74
|
+
<!ENTITY eta "η"> <!-- greek small letter eta, U+03B7 ISOgrk3 -->
|
75
|
+
<!ENTITY theta "θ"> <!-- greek small letter theta,
|
76
|
+
U+03B8 ISOgrk3 -->
|
77
|
+
<!ENTITY iota "ι"> <!-- greek small letter iota, U+03B9 ISOgrk3 -->
|
78
|
+
<!ENTITY kappa "κ"> <!-- greek small letter kappa,
|
79
|
+
U+03BA ISOgrk3 -->
|
80
|
+
<!ENTITY lambda "λ"> <!-- greek small letter lambda,
|
81
|
+
U+03BB ISOgrk3 -->
|
82
|
+
<!ENTITY mu "μ"> <!-- greek small letter mu, U+03BC ISOgrk3 -->
|
83
|
+
<!ENTITY nu "ν"> <!-- greek small letter nu, U+03BD ISOgrk3 -->
|
84
|
+
<!ENTITY xi "ξ"> <!-- greek small letter xi, U+03BE ISOgrk3 -->
|
85
|
+
<!ENTITY omicron "ο"> <!-- greek small letter omicron, U+03BF NEW -->
|
86
|
+
<!ENTITY pi "π"> <!-- greek small letter pi, U+03C0 ISOgrk3 -->
|
87
|
+
<!ENTITY rho "ρ"> <!-- greek small letter rho, U+03C1 ISOgrk3 -->
|
88
|
+
<!ENTITY sigmaf "ς"> <!-- greek small letter final sigma,
|
89
|
+
U+03C2 ISOgrk3 -->
|
90
|
+
<!ENTITY sigma "σ"> <!-- greek small letter sigma,
|
91
|
+
U+03C3 ISOgrk3 -->
|
92
|
+
<!ENTITY tau "τ"> <!-- greek small letter tau, U+03C4 ISOgrk3 -->
|
93
|
+
<!ENTITY upsilon "υ"> <!-- greek small letter upsilon,
|
94
|
+
U+03C5 ISOgrk3 -->
|
95
|
+
<!ENTITY phi "φ"> <!-- greek small letter phi, U+03C6 ISOgrk3 -->
|
96
|
+
<!ENTITY chi "χ"> <!-- greek small letter chi, U+03C7 ISOgrk3 -->
|
97
|
+
<!ENTITY psi "ψ"> <!-- greek small letter psi, U+03C8 ISOgrk3 -->
|
98
|
+
<!ENTITY omega "ω"> <!-- greek small letter omega,
|
99
|
+
U+03C9 ISOgrk3 -->
|
100
|
+
<!ENTITY thetasym "ϑ"> <!-- greek small letter theta symbol,
|
101
|
+
U+03D1 NEW -->
|
102
|
+
<!ENTITY upsih "ϒ"> <!-- greek upsilon with hook symbol,
|
103
|
+
U+03D2 NEW -->
|
104
|
+
<!ENTITY piv "ϖ"> <!-- greek pi symbol, U+03D6 ISOgrk3 -->
|
105
|
+
|
106
|
+
<!-- General Punctuation -->
|
107
|
+
<!ENTITY bull "•"> <!-- bullet = black small circle,
|
108
|
+
U+2022 ISOpub -->
|
109
|
+
<!-- bullet is NOT the same as bullet operator, U+2219 -->
|
110
|
+
<!ENTITY hellip "…"> <!-- horizontal ellipsis = three dot leader,
|
111
|
+
U+2026 ISOpub -->
|
112
|
+
<!ENTITY prime "′"> <!-- prime = minutes = feet, U+2032 ISOtech -->
|
113
|
+
<!ENTITY Prime "″"> <!-- double prime = seconds = inches,
|
114
|
+
U+2033 ISOtech -->
|
115
|
+
<!ENTITY oline "‾"> <!-- overline = spacing overscore,
|
116
|
+
U+203E NEW -->
|
117
|
+
<!ENTITY frasl "⁄"> <!-- fraction slash, U+2044 NEW -->
|
118
|
+
|
119
|
+
<!-- Letterlike Symbols -->
|
120
|
+
<!ENTITY weierp "℘"> <!-- script capital P = power set
|
121
|
+
= Weierstrass p, U+2118 ISOamso -->
|
122
|
+
<!ENTITY image "ℑ"> <!-- blackletter capital I = imaginary part,
|
123
|
+
U+2111 ISOamso -->
|
124
|
+
<!ENTITY real "ℜ"> <!-- blackletter capital R = real part symbol,
|
125
|
+
U+211C ISOamso -->
|
126
|
+
<!ENTITY trade "™"> <!-- trade mark sign, U+2122 ISOnum -->
|
127
|
+
<!ENTITY alefsym "ℵ"> <!-- alef symbol = first transfinite cardinal,
|
128
|
+
U+2135 NEW -->
|
129
|
+
<!-- alef symbol is NOT the same as hebrew letter alef,
|
130
|
+
U+05D0 although the same glyph could be used to depict both characters -->
|
131
|
+
|
132
|
+
<!-- Arrows -->
|
133
|
+
<!ENTITY larr "←"> <!-- leftwards arrow, U+2190 ISOnum -->
|
134
|
+
<!ENTITY uarr "↑"> <!-- upwards arrow, U+2191 ISOnum-->
|
135
|
+
<!ENTITY rarr "→"> <!-- rightwards arrow, U+2192 ISOnum -->
|
136
|
+
<!ENTITY darr "↓"> <!-- downwards arrow, U+2193 ISOnum -->
|
137
|
+
<!ENTITY harr "↔"> <!-- left right arrow, U+2194 ISOamsa -->
|
138
|
+
<!ENTITY crarr "↵"> <!-- downwards arrow with corner leftwards
|
139
|
+
= carriage return, U+21B5 NEW -->
|
140
|
+
<!ENTITY lArr "⇐"> <!-- leftwards double arrow, U+21D0 ISOtech -->
|
141
|
+
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
|
142
|
+
but also does not have any other character for that function. So ? lArr can
|
143
|
+
be used for 'is implied by' as ISOtech suggests -->
|
144
|
+
<!ENTITY uArr "⇑"> <!-- upwards double arrow, U+21D1 ISOamsa -->
|
145
|
+
<!ENTITY rArr "⇒"> <!-- rightwards double arrow,
|
146
|
+
U+21D2 ISOtech -->
|
147
|
+
<!-- Unicode does not say this is the 'implies' character but does not have
|
148
|
+
another character with this function so ?
|
149
|
+
rArr can be used for 'implies' as ISOtech suggests -->
|
150
|
+
<!ENTITY dArr "⇓"> <!-- downwards double arrow, U+21D3 ISOamsa -->
|
151
|
+
<!ENTITY hArr "⇔"> <!-- left right double arrow,
|
152
|
+
U+21D4 ISOamsa -->
|
153
|
+
|
154
|
+
<!-- Mathematical Operators -->
|
155
|
+
<!ENTITY forall "∀"> <!-- for all, U+2200 ISOtech -->
|
156
|
+
<!ENTITY part "∂"> <!-- partial differential, U+2202 ISOtech -->
|
157
|
+
<!ENTITY exist "∃"> <!-- there exists, U+2203 ISOtech -->
|
158
|
+
<!ENTITY empty "∅"> <!-- empty set = null set = diameter,
|
159
|
+
U+2205 ISOamso -->
|
160
|
+
<!ENTITY nabla "∇"> <!-- nabla = backward difference,
|
161
|
+
U+2207 ISOtech -->
|
162
|
+
<!ENTITY isin "∈"> <!-- element of, U+2208 ISOtech -->
|
163
|
+
<!ENTITY notin "∉"> <!-- not an element of, U+2209 ISOtech -->
|
164
|
+
<!ENTITY ni "∋"> <!-- contains as member, U+220B ISOtech -->
|
165
|
+
<!-- should there be a more memorable name than 'ni'? -->
|
166
|
+
<!ENTITY prod "∏"> <!-- n-ary product = product sign,
|
167
|
+
U+220F ISOamsb -->
|
168
|
+
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
|
169
|
+
the same glyph might be used for both -->
|
170
|
+
<!ENTITY sum "∑"> <!-- n-ary sumation, U+2211 ISOamsb -->
|
171
|
+
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
|
172
|
+
though the same glyph might be used for both -->
|
173
|
+
<!ENTITY minus "−"> <!-- minus sign, U+2212 ISOtech -->
|
174
|
+
<!ENTITY lowast "∗"> <!-- asterisk operator, U+2217 ISOtech -->
|
175
|
+
<!ENTITY radic "√"> <!-- square root = radical sign,
|
176
|
+
U+221A ISOtech -->
|
177
|
+
<!ENTITY prop "∝"> <!-- proportional to, U+221D ISOtech -->
|
178
|
+
<!ENTITY infin "∞"> <!-- infinity, U+221E ISOtech -->
|
179
|
+
<!ENTITY ang "∠"> <!-- angle, U+2220 ISOamso -->
|
180
|
+
<!ENTITY and "∧"> <!-- logical and = wedge, U+2227 ISOtech -->
|
181
|
+
<!ENTITY or "∨"> <!-- logical or = vee, U+2228 ISOtech -->
|
182
|
+
<!ENTITY cap "∩"> <!-- intersection = cap, U+2229 ISOtech -->
|
183
|
+
<!ENTITY cup "∪"> <!-- union = cup, U+222A ISOtech -->
|
184
|
+
<!ENTITY int "∫"> <!-- integral, U+222B ISOtech -->
|
185
|
+
<!ENTITY there4 "∴"> <!-- therefore, U+2234 ISOtech -->
|
186
|
+
<!ENTITY sim "∼"> <!-- tilde operator = varies with = similar to,
|
187
|
+
U+223C ISOtech -->
|
188
|
+
<!-- tilde operator is NOT the same character as the tilde, U+007E,
|
189
|
+
although the same glyph might be used to represent both -->
|
190
|
+
<!ENTITY cong "≅"> <!-- approximately equal to, U+2245 ISOtech -->
|
191
|
+
<!ENTITY asymp "≈"> <!-- almost equal to = asymptotic to,
|
192
|
+
U+2248 ISOamsr -->
|
193
|
+
<!ENTITY ne "≠"> <!-- not equal to, U+2260 ISOtech -->
|
194
|
+
<!ENTITY equiv "≡"> <!-- identical to, U+2261 ISOtech -->
|
195
|
+
<!ENTITY le "≤"> <!-- less-than or equal to, U+2264 ISOtech -->
|
196
|
+
<!ENTITY ge "≥"> <!-- greater-than or equal to,
|
197
|
+
U+2265 ISOtech -->
|
198
|
+
<!ENTITY sub "⊂"> <!-- subset of, U+2282 ISOtech -->
|
199
|
+
<!ENTITY sup "⊃"> <!-- superset of, U+2283 ISOtech -->
|
200
|
+
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
|
201
|
+
font encoding and is not included. Should it be, for symmetry?
|
202
|
+
It is in ISOamsn -->
|
203
|
+
<!ENTITY nsub "⊄"> <!-- not a subset of, U+2284 ISOamsn -->
|
204
|
+
<!ENTITY sube "⊆"> <!-- subset of or equal to, U+2286 ISOtech -->
|
205
|
+
<!ENTITY supe "⊇"> <!-- superset of or equal to,
|
206
|
+
U+2287 ISOtech -->
|
207
|
+
<!ENTITY oplus "⊕"> <!-- circled plus = direct sum,
|
208
|
+
U+2295 ISOamsb -->
|
209
|
+
<!ENTITY otimes "⊗"> <!-- circled times = vector product,
|
210
|
+
U+2297 ISOamsb -->
|
211
|
+
<!ENTITY perp "⊥"> <!-- up tack = orthogonal to = perpendicular,
|
212
|
+
U+22A5 ISOtech -->
|
213
|
+
<!ENTITY sdot "⋅"> <!-- dot operator, U+22C5 ISOamsb -->
|
214
|
+
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
|
215
|
+
|
216
|
+
<!-- Miscellaneous Technical -->
|
217
|
+
<!ENTITY lceil "⌈"> <!-- left ceiling = apl upstile,
|
218
|
+
U+2308 ISOamsc -->
|
219
|
+
<!ENTITY rceil "⌉"> <!-- right ceiling, U+2309 ISOamsc -->
|
220
|
+
<!ENTITY lfloor "⌊"> <!-- left floor = apl downstile,
|
221
|
+
U+230A ISOamsc -->
|
222
|
+
<!ENTITY rfloor "⌋"> <!-- right floor, U+230B ISOamsc -->
|
223
|
+
<!ENTITY lang "〈"> <!-- left-pointing angle bracket = bra,
|
224
|
+
U+2329 ISOtech -->
|
225
|
+
<!-- lang is NOT the same character as U+003C 'less than'
|
226
|
+
or U+2039 'single left-pointing angle quotation mark' -->
|
227
|
+
<!ENTITY rang "〉"> <!-- right-pointing angle bracket = ket,
|
228
|
+
U+232A ISOtech -->
|
229
|
+
<!-- rang is NOT the same character as U+003E 'greater than'
|
230
|
+
or U+203A 'single right-pointing angle quotation mark' -->
|
231
|
+
|
232
|
+
<!-- Geometric Shapes -->
|
233
|
+
<!ENTITY loz "◊"> <!-- lozenge, U+25CA ISOpub -->
|
234
|
+
|
235
|
+
<!-- Miscellaneous Symbols -->
|
236
|
+
<!ENTITY spades "♠"> <!-- black spade suit, U+2660 ISOpub -->
|
237
|
+
<!-- black here seems to mean filled as opposed to hollow -->
|
238
|
+
<!ENTITY clubs "♣"> <!-- black club suit = shamrock,
|
239
|
+
U+2663 ISOpub -->
|
240
|
+
<!ENTITY hearts "♥"> <!-- black heart suit = valentine,
|
241
|
+
U+2665 ISOpub -->
|
242
|
+
<!ENTITY diams "♦"> <!-- black diamond suit, U+2666 ISOpub -->
|
data/lib/onix.rb
CHANGED
@@ -16,7 +16,7 @@ module ONIX
|
|
16
16
|
module Version #:nodoc:
|
17
17
|
Major = 0
|
18
18
|
Minor = 7
|
19
|
-
Tiny =
|
19
|
+
Tiny = 2
|
20
20
|
|
21
21
|
String = [Major, Minor, Tiny].join('.')
|
22
22
|
end
|
@@ -100,3 +100,5 @@ require File.join(File.dirname(__FILE__), "onix", "lists", "product_availability
|
|
100
100
|
require File.join(File.dirname(__FILE__), "onix", "simple_product")
|
101
101
|
require File.join(File.dirname(__FILE__), "onix", "apa_product")
|
102
102
|
|
103
|
+
# misc
|
104
|
+
require File.join(File.dirname(__FILE__), "onix", "normaliser")
|
@@ -0,0 +1,156 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'fileutils'
require 'shellwords'
require 'tempfile'
|
5
|
+
|
6
|
+
module ONIX
|
7
|
+
|
8
|
+
# A standalone class that can be used to normalise ONIX files
|
9
|
+
# into a standardised form. If you're accepting ONIX files from a wide range
|
10
|
+
# of suppliers, you're guaranteed to get all sorts of dialects.
|
11
|
+
#
|
12
|
+
# This will create a new file that:
|
13
|
+
#
|
14
|
+
# - is UTF-8 encoded
|
15
|
+
# - uses reference tags, not short
|
16
|
+
# - has no named entities (ndash, etc) other than &amp;, &lt; and &gt;
|
17
|
+
#
|
18
|
+
# Usage:
|
19
|
+
#
|
20
|
+
# ONIX::Normaliser.process("oldfile.xml", "newfile.xml")
|
21
|
+
#
|
22
|
+
# Dependencies:
|
23
|
+
#
|
24
|
+
# At this stage the class depends on several external apps, all commonly available
|
25
|
+
# on *nix systems: java (with the saxon jar), isutf8, iconv and sed
|
26
|
+
#
|
27
|
+
class Normaliser

  # External command-line tools this class shells out to. Each one is looked
  # up on PATH in #initialize before any work starts.
  REQUIRED_APPS = %w[java isutf8 iconv sed].freeze

  # Saxon jar used to run the short-tag -> reference-tag stylesheet.
  SAXON_JAR = "/usr/share/java/saxon.jar"

  class << self

    # Normalise oldfile and save the result as newfile. oldfile is left
    # untouched.
    #
    # Raises ArgumentError if oldfile is missing or newfile already exists,
    # and RuntimeError if a required external app cannot be found.
    #
    def process(oldfile, newfile)
      self.new(oldfile, newfile).run
    end
  end

  # Validate the arguments and the environment, then take a working copy of
  # oldfile and remember the first 1024 bytes of it for sniffing the tag
  # style and the declared encoding.
  #
  def initialize(oldfile, newfile)
    raise ArgumentError, "#{oldfile} does not exist" unless File.file?(oldfile)
    raise ArgumentError, "#{newfile} already exists" if File.file?(newfile)

    # The java check used to probe for "which" by mistake, so a missing java
    # was never reported here.
    REQUIRED_APPS.each do |app|
      raise "#{app} app not found" unless app_available?(app)
    end

    @oldfile = oldfile
    @newfile = newfile
    @curfile = next_tempfile
    FileUtils.cp(@oldfile, @curfile)
    # File#read(length) returns nil at EOF, i.e. for an empty input file;
    # fall back to "" so the string checks in #run and #to_utf8 still work.
    @head = File.open(@oldfile, "r") { |f| f.read(1024) } || ""
  end

  # Run every normalisation step in turn, each one writing to a fresh temp
  # file, then copy the final result to newfile.
  #
  def run
    # remove short tags
    if @head.include?("ONIXmessage")
      dest = next_tempfile
      to_reference_tags(@curfile, dest)
      @curfile = dest
    end

    # convert to utf8
    dest = next_tempfile
    to_utf8(@curfile, dest)
    @curfile = dest

    # remove entities
    replace_named_entities(@curfile)

    FileUtils.cp(@curfile, @newfile)
  end

  private

  # Check the specified app is available on the system by searching PATH for
  # an executable file with that name. Returns true or false.
  #
  # Done in Ruby rather than by shelling out to `which`, so the check needs
  # no external tool and never passes the name through a shell.
  #
  def app_available?(app)
    ENV["PATH"].to_s.split(File::PATH_SEPARATOR).any? do |dir|
      dir = "." if dir.empty? # an empty PATH entry means the current directory
      candidate = File.join(dir, app)
      File.file?(candidate) && File.executable?(candidate)
    end
  end

  # Quote a single argument for safe interpolation into a shell command.
  #
  def shell_escape(arg)
    Shellwords.escape(arg.to_s)
  end

  # Create an empty temp file and return its path.
  #
  # The Tempfile object is kept in @tempfiles on purpose: Tempfile unlinks
  # its file when the object is garbage collected, so dropping the reference
  # could delete a working file while it is still in use.
  #
  def next_tempfile
    tf = Tempfile.new("onix")
    tf.close
    (@tempfiles ||= []) << tf
    tf.path
  end

  # Uses an XSLT stylesheet provided by EDItEUR to convert a file from short
  # tags to long (reference) tags, reading src and writing dest.
  #
  # more detail here:
  #   http://www.editeur.org/files/ONIX%203/ONIX%20tagname%20converter%20v2.htm
  #
  def to_reference_tags(src, dest)
    inpath   = File.expand_path(src)
    outpath  = File.expand_path(dest)
    xsltpath = File.dirname(__FILE__) + "/../../support/switch-onix-tagnames-1.1.xsl"
    # xsltproc doesn't set the DTD correctly in the output. Using
    # saxon instead.
    #`xsltproc -o #{outpath} #{xsltpath} #{inpath}`
    #
    # Every argument is escaped: xsltpath is derived from the install
    # location of this file and may contain spaces.
    command = ["java", "-jar", SAXON_JAR, inpath, xsltpath].map { |arg| shell_escape(arg) }.join(" ")
    `#{command} > #{shell_escape(outpath)}`
  end

  # Ensure the file is valid utf8, then make sure it's declared as such.
  # Reads src and writes dest.
  #
  # Raises RuntimeError when the file is not valid UTF-8 and @head contains
  # no encoding declaration to convert from.
  #
  def to_utf8(src, dest)
    inpath  = File.expand_path(src)
    outpath = File.expand_path(dest)

    decl    = @head.match(/encoding=.([a-zA-Z0-9\-]+)./i)
    src_enc = decl && decl[1]

    # empty output from isutf8 is treated as "already valid UTF-8"
    if `isutf8 #{shell_escape(inpath)}`.strip == ""
      FileUtils.cp(inpath, outpath)
    elsif src_enc
      `iconv --from-code=#{shell_escape(src_enc)} --to-code=UTF-8 #{shell_escape(inpath)} > #{shell_escape(outpath)}`
    else
      raise "#{inpath} is not valid UTF-8 and does not declare an encoding"
    end

    # ensure the encoding declaration is correct
    if src_enc && src_enc.downcase != "utf-8"
      # Only rewrite the name where it follows the encoding attribute, so
      # the encoding name is left alone when it appears in the document text.
      # keyword is the attribute name exactly as it is spelt in the file.
      keyword = decl[0][0, "encoding".size]
      script  = "s/\\(#{keyword}=.\\)#{src_enc}/\\1UTF-8/"
      `sed -i #{shell_escape(script)} #{shell_escape(outpath)}`
    end
  end

  # Replace all named entities in the specified file with numeric entities.
  # The file is edited in place.
  #
  # All substitutions are written to one sed script and applied in a single
  # pass, instead of running sed once per entity.
  #
  def replace_named_entities(path)
    return if entity_map.empty?

    script = Tempfile.new("onix-sed")
    begin
      entity_map.each do |named, numeric|
        # \& is a literal ampersand on both sides of the substitution
        script.puts "s/\\&#{named};/\\&#{numeric};/g"
      end
      script.close
      `sed -i -f #{shell_escape(script.path)} #{shell_escape(path)}`
    ensure
      script.close!
    end
  end

  # Return a named entity to numeric entity mapping, read from
  # support/entities.txt. Each whitespace separated token in that file is
  # split on ":"; the first part is the key and the last part the value.
  #
  # The result is memoised in @map. It is only assigned once the file has
  # been read, so a failed read is not cached as an empty map.
  #
  def entity_map
    return @map if @map

    path = File.dirname(__FILE__) + "/../../support/entities.txt"
    map = {}
    File.read(path).split.each do |line|
      elements = line.split(":")
      map[elements.first] = elements.last
    end
    @map = map
  end

end
|
155
|
+
|
156
|
+
end
|