ruby-feedparser 0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +340 -0
- data/ChangeLog +59 -0
- data/LICENSE +60 -0
- data/README +14 -0
- data/Rakefile +84 -0
- data/lib/feedparser.rb +28 -0
- data/lib/feedparser/feedparser.rb +343 -0
- data/lib/feedparser/filesizes.rb +14 -0
- data/lib/feedparser/html-output.rb +126 -0
- data/lib/feedparser/html2text-parser.rb +413 -0
- data/lib/feedparser/rexml_patch.rb +28 -0
- data/lib/feedparser/sgml-parser.rb +332 -0
- data/lib/feedparser/text-output.rb +108 -0
- data/lib/feedparser/textconverters.rb +120 -0
- data/setup.rb +1586 -0
- data/test/tc_feed_parse.rb +117 -0
- data/test/tc_htmloutput.rb +52 -0
- data/test/tc_parser.rb +48 -0
- data/test/tc_textoutput.rb +48 -0
- data/test/tc_textwrappedoutput.rb +48 -0
- data/test/ts_feedparser.rb +12 -0
- data/tools/doctoweb.bash +30 -0
- metadata +76 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
# Extends Integer with a formatter for byte counts.
class Integer
  # Formats the receiver, taken as a number of bytes, using binary
  # (1024-based) units.
  #
  # Values below 1024 are printed verbatim followed by "B" (this includes
  # negative values); larger values are scaled to KB, MB or GB and printed
  # with one decimal place.  GB is the largest unit used.
  #
  # Returns a String such as "512 B", "1.5 KB" or "2.0 GB".
  def to_human_readable
    kib = 1024
    mib = kib * 1024
    gib = mib * 1024

    # Each branch is reached only after the smaller thresholds have been
    # ruled out, so no explicit lower-bound test is required.
    if self < kib
      "#{self} B"
    elsif self < mib
      format('%.1f KB', to_f / kib)
    elsif self < gib
      format('%.1f MB', to_f / mib)
    else
      format('%.1f GB', to_f / gib)
    end
  end
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'feedparser'
|
2
|
+
require 'feedparser/filesizes'
|
3
|
+
|
4
|
+
module FeedParser
|
5
|
+
class Feed
  # Renders the whole feed as a standalone HTML 4.01 document.
  #
  # The page consists of a header table (feed title, type, encoding and,
  # when set, creator), the feed description, and then every item's own
  # +to_html+ output, each preceded by a horizontal rule.
  #
  # localtime - forwarded unchanged to each item's +to_html+.
  #
  # Returns the document as a String.
  def to_html(localtime = true)
    # @title may be nil.  The header table always coped with that by
    # falling back to the link and then to a fixed label; the <title>
    # element now uses the same fallback instead of raising NoMethodError.
    display_name = @title || @link || 'Unnamed feed'
    headline = "<tr><td align=\"right\"><b>%s</b></td>\n<td width=\"100%%\">%s</td></tr>"

    s = ''.dup
    s << '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'
    s << "\n"
    s << "<html>\n"
    s << "<head>\n"
    s << "<title>#{display_name.escape_html}</title>\n"
    s << "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\">\n"
    s << "</head>\n"
    s << "<body>\n"

    s << <<-EOF
<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>
<table width="100%" bgcolor="#EDEDED" cellpadding="4" cellspacing="2">
    EOF

    # NOTE(review): @link, @type and @encoding are written without HTML
    # escaping, exactly as before -- confirm they cannot carry markup.
    r = ''.dup
    r << "<a href=\"#{@link}\">\n" if @link
    r << "<b>#{display_name.escape_html}</b>\n"
    r << "</a>\n" if @link
    s << (headline % ['Feed title:', r])
    s << (headline % ['Type:', @type])
    s << (headline % ['Encoding:', @encoding])
    s << (headline % ['Creator:', @creator.escape_html]) if @creator
    s << "</table></td></tr></table>\n"

    if @description
      # A description that does not start with a tag gets a line break so
      # it is visually separated from the header table.
      s << "<br/>\n" unless @description =~ /\A\s*</m
      s << @description.to_s
    end

    @items.each do |item|
      s << "\n<hr/><!-- *********************************** -->\n"
      s << item.to_html(localtime)
    end
    s << "\n</body></html>\n"
    s
  end
end
|
51
|
+
|
52
|
+
class FeedItem
  # Wraps +to_html+ in a minimal HTML 4.01 document (doctype, <html> and
  # <body>) so a single item can be shown on its own.
  #
  # localtime - forwarded unchanged to +to_html+.
  #
  # Returns the document as a String.
  def to_html_with_headers(localtime = true)
    header = <<-EOF
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<body>
    EOF
    [header, to_html(localtime), "\n</body>\n</html>"].join
  end

  # Renders this item as an HTML fragment: a header table naming the feed
  # and the item, the item content, an optional table of enclosures and a
  # footer with date, author, subject and categories.
  #
  # localtime - when true the date is printed with @date.to_s, otherwise
  #             with @date.getutc.to_s.
  #
  # Returns the fragment as a String.
  def to_html(localtime = true)
    headline = "<tr><td align=\"right\"><b>%s</b></td>\n<td width=\"100%%\">%s</td></tr>"

    s = ''.dup
    s << <<-EOF
<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>
<table width="100%" bgcolor="#EDEDED" cellpadding="4" cellspacing="2">
    EOF

    # NOTE(review): link URLs are written into href attributes without
    # escaping, exactly as before -- confirm they cannot carry quotes.
    feed_link = @feed.link
    r = ''.dup
    r << "<a href=\"#{feed_link}\">\n" if feed_link
    feed_name = @feed.title || feed_link || 'Unnamed feed'
    r << "<b>#{feed_name.escape_html}</b>\n"
    r << "</a>\n" if feed_link
    s << (headline % ['Feed:', r])

    # Unlike the feed row there is no fixed fallback label: an item with
    # neither title nor link yields an empty cell.
    r = ''.dup
    r << "<a href=\"#{@link}\">" if @link
    item_name = @title || @link
    r << "<b>#{item_name.escape_html}</b>\n" if item_name
    r << "</a>\n" if @link
    s << (headline % ['Item:', r])
    s << "</table></td></tr></table>\n"
    s << "\n"

    if @content
      # Content that does not start with a tag gets a leading line break.
      s << "<br/>\n" unless @content =~ /\A\s*</m
      s << @content.to_s
    end

    if @enclosures && !@enclosures.empty?
      s << <<-EOF
<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>
<table width="100%" bgcolor="#EDEDED" cellpadding="2" cellspacing="2">
      EOF
      s << '<tr><td width="100%"><b>Files:</b></td></tr>'
      s << "\n"
      # Each enclosure is indexed as [url, size, third field]; the third
      # field is printed as-is (presumably a MIME type -- TODO confirm).
      @enclosures.each do |e|
        s << "<tr><td> <a href=\"#{e[0]}\">#{e[0].split('/')[-1]}</a> (#{e[1].to_i.to_human_readable}, #{e[2]})</td></tr>\n"
      end
      s << "</table></td></tr></table>\n"
    end

    s << "\n<hr width=\"100%\"/>\n"
    s << '<table width="100%" cellpadding="0" cellspacing="0">' + "\n"
    l = '<tr><td align="right"><font color="#ababab">%s</font> </td><td><font color="#ababab">%s</font></td></tr>' + "\n"
    if @date
      stamp = localtime ? @date : @date.getutc
      s << (l % ['Date:', stamp.to_s])
    end
    s << (l % ['Author:', creator.escape_html]) if creator
    s << (l % ['Subject:', @subject.escape_html]) if @subject
    # @categories is guarded against nil in the same way as @enclosures.
    if @categories && !@categories.empty?
      s << (l % ['Filed under:', @categories.join(', ').escape_html])
    end
    s << "</table>\n"
    s
  end
end
|
126
|
+
end
|
@@ -0,0 +1,413 @@
|
|
1
|
+
require 'feedparser/sgml-parser'
|
2
|
+
|
3
|
+
module FeedParser
|
4
|
+
# this class provides a simple SGML parser that removes HTML tags
|
5
|
+
class HTML2TextParser < SGMLParser
|
6
|
+
|
7
|
+
attr_reader :savedata
|
8
|
+
|
9
|
+
# Creates a parser with empty conversion state and then hands +verbose+
# to the superclass constructor.
def initialize(verbose = false)
  # Converted text collected so far (exposed through attr_reader).
  @savedata = ''
  # Link and image targets gathered while parsing; listed by #close.
  @links = []
  @imgs = []
  # Target of the <a> element currently open, if any.
  @href = nil
  # True while inside <pre>, where whitespace is kept untouched.
  @pre = false
  # Label of the last image seen; '@' is the character just before 'A'.
  @img_index = '@'
  super(verbose)
end
|
18
|
+
|
19
|
+
# Advances the image label and returns it.
#
# Labels start at "A" (the initial value "@" is the character right
# before it) and run through "Z"; String#succ then carries on with "AA",
# "AB", ... so labels stay alphabetic however many images there are.
#
# The former implementation did arithmetic on @img_index[0], which only
# worked while String#[] returned an Integer (Ruby 1.8); on later Rubies
# it raised TypeError.
#
# Returns the new label as a fresh String, so labels already handed out
# are never modified by later calls.
def next_img_index
  @img_index = @img_index.succ
end
|
25
|
+
|
26
|
+
# Appends a run of character data to the converted text.
#
# Outside <pre> every newline becomes a space and runs of spaces are
# squeezed to a single one; inside <pre> the data is appended untouched.
#
# data - the text to append.  It is never modified: the previous version
#        used gsub! and therefore altered the caller's string in place
#        (and raised on frozen strings).
#
# Returns @savedata.
def handle_data(data)
  text = @pre ? data : data.tr("\n", ' ').squeeze(' ')
  @savedata << text
end
|
34
|
+
|
35
|
+
# Emits the plain-text rendering of an opening tag.
#
# tag   - the tag name.
# attrs - the tag's attributes as a list of [name, value] pairs.
#
# Block-level tags add line breaks, inline emphasis tags add a marker
# character, <pre> switches whitespace preservation on, <a> records its
# href for the link list, and <img> records its src and writes a
# "[label]" marker.  Every other tag is ignored.
def unknown_starttag(tag, attrs)
  case tag
  when 'p', 'h4'
    @savedata << "\n\n"
  when 'h1', 'h2', 'h3'
    @savedata << "\n\n "
  when 'br', 'ul'
    @savedata << "\n"
  when 'li'
    @savedata << "\n - "
  when 'b', 'strong', 'em'
    @savedata << '*'
  when 'u'
    @savedata << '_'
  when 'i'
    @savedata << '/'
  when 'pre'
    @savedata << "\n\n"
    @pre = true
  when 'a'
    # @href stays set until the matching end tag, which writes the
    # "[n]" reference; an <a> without href resets it to nil.
    @href = last_attr_value(attrs, 'href')
    @links << strip_attr_quotes(@href) if @href
  when 'img'
    src = last_attr_value(attrs, 'src')
    if src
      idx = next_img_index
      @imgs << [idx, strip_attr_quotes(src)]
      @savedata << "[#{idx}]"
    end
  end
end

# Returns the value of the last attribute called +name+ in +attrs+, or
# nil when there is none.
def last_attr_value(attrs, name)
  pair = attrs.select { |attr| attr[0] == name }.last
  pair && pair[1]
end

# Removes one pair of surrounding quotes from an attribute value.
def strip_attr_quotes(value)
  value.gsub(/^("|'|)(.*)("|')$/, '\2')
end

private :last_attr_value, :strip_attr_quotes
|
92
|
+
|
93
|
+
# Finishes parsing via the superclass and then appends the reference
# lists to the converted text: first the numbered link targets, then the
# labelled image sources.  A list is written only when it has entries.
def close
  super
  unless @links.empty?
    @savedata << "\n\n"
    # Links are numbered from 1, matching the "[n]" markers in the text.
    @links.each_with_index do |url, pos|
      @savedata << "[#{pos + 1}] #{url}\n"
    end
  end
  unless @imgs.empty?
    @savedata << "\n\n"
    @imgs.each do |label, src|
      @savedata << "[#{label}] #{src}\n"
    end
  end
end
|
108
|
+
|
109
|
+
# Emits the plain-text rendering of a closing tag.
#
# tag - the tag name.
#
# Emphasis tags repeat their marker character, </ul> adds a line break,
# </pre> adds a blank line and switches whitespace preservation off, and
# </a> writes the "[n]" reference of the link that was opened.  Every
# other tag is ignored.
def unknown_endtag(tag)
  case tag
  when 'ul'
    @savedata << "\n"
  when 'b', 'strong', 'em'
    @savedata << '*'
  when 'u'
    @savedata << '_'
  when 'i'
    @savedata << '/'
  when 'pre'
    @savedata << "\n\n"
    @pre = false
  when 'a'
    if @href
      # The link was pushed when the tag opened, so the list length is
      # its 1-based reference number.
      @savedata << "[#{@links.length}]"
      @href = nil
    end
  end
end
|
133
|
+
|
134
|
+
# Converts a numeric character reference to the character it denotes and
# passes it to handle_data.
#
# ref - the reference without "&#" and ";".  Decimal digits ("8364") are
#       read as before; a leading "x" or "X" ("x20AC") is now read as
#       hexadecimal, where to_i alone used to yield code point 0.
#
# Returns whatever handle_data returns.
def unknown_charref(ref)
  digits = ref.to_s
  codepoint = digits =~ /\A[xX]/ ? digits[1..-1].hex : digits.to_i
  handle_data([codepoint].pack('U*'))
end
|
137
|
+
|
138
|
+
# Class-level accessor for the table that maps HTML entity names to
# Unicode code points.
def self.entities
  HTML_ENTITIES
end
|
141
|
+
|
142
|
+
HTML_ENTITIES = {
|
143
|
+
"quot" => 34,
|
144
|
+
"amp" => 38,
|
145
|
+
"lt" => 60,
|
146
|
+
"gt" => 62,
|
147
|
+
"apos" => 39,
|
148
|
+
|
149
|
+
"nbsp" => 160,
|
150
|
+
"iexcl" => 161,
|
151
|
+
"cent" => 162,
|
152
|
+
"pound" => 163,
|
153
|
+
"curren" => 164,
|
154
|
+
"yen" => 165,
|
155
|
+
"brvbar" => 166,
|
156
|
+
"sect" => 167,
|
157
|
+
"uml" => 168,
|
158
|
+
"copy" => 169,
|
159
|
+
"ordf" => 170,
|
160
|
+
"laquo" => 171,
|
161
|
+
"not" => 172,
|
162
|
+
"shy" => 173,
|
163
|
+
"reg" => 174,
|
164
|
+
"macr" => 175,
|
165
|
+
"deg" => 176,
|
166
|
+
"plusmn" => 177,
|
167
|
+
"sup2" => 178,
|
168
|
+
"sup3" => 179,
|
169
|
+
"acute" => 180,
|
170
|
+
"micro" => 181,
|
171
|
+
"para" => 182,
|
172
|
+
"middot" => 183,
|
173
|
+
"cedil" => 184,
|
174
|
+
"sup1" => 185,
|
175
|
+
"ordm" => 186,
|
176
|
+
"raquo" => 187,
|
177
|
+
"frac14" => 188,
|
178
|
+
"frac12" => 189,
|
179
|
+
"frac34" => 190,
|
180
|
+
"iquest" => 191,
|
181
|
+
"Agrave" => 192,
|
182
|
+
"Aacute" => 193,
|
183
|
+
"Acirc" => 194,
|
184
|
+
"Atilde" => 195,
|
185
|
+
"Auml" => 196,
|
186
|
+
"Aring" => 197,
|
187
|
+
"AElig" => 198,
|
188
|
+
"Ccedil" => 199,
|
189
|
+
"Egrave" => 200,
|
190
|
+
"Eacute" => 201,
|
191
|
+
"Ecirc" => 202,
|
192
|
+
"Euml" => 203,
|
193
|
+
"Igrave" => 204,
|
194
|
+
"Iacute" => 205,
|
195
|
+
"Icirc" => 206,
|
196
|
+
"Iuml" => 207,
|
197
|
+
"ETH" => 208,
|
198
|
+
"Ntilde" => 209,
|
199
|
+
"Ograve" => 210,
|
200
|
+
"Oacute" => 211,
|
201
|
+
"Ocirc" => 212,
|
202
|
+
"Otilde" => 213,
|
203
|
+
"Ouml" => 214,
|
204
|
+
"times" => 215,
|
205
|
+
"Oslash" => 216,
|
206
|
+
"Ugrave" => 217,
|
207
|
+
"Uacute" => 218,
|
208
|
+
"Ucirc" => 219,
|
209
|
+
"Uuml" => 220,
|
210
|
+
"Yacute" => 221,
|
211
|
+
"THORN" => 222,
|
212
|
+
"szlig" => 223,
|
213
|
+
"agrave" => 224,
|
214
|
+
"aacute" => 225,
|
215
|
+
"acirc" => 226,
|
216
|
+
"atilde" => 227,
|
217
|
+
"auml" => 228,
|
218
|
+
"aring" => 229,
|
219
|
+
"aelig" => 230,
|
220
|
+
"ccedil" => 231,
|
221
|
+
"egrave" => 232,
|
222
|
+
"eacute" => 233,
|
223
|
+
"ecirc" => 234,
|
224
|
+
"euml" => 235,
|
225
|
+
"igrave" => 236,
|
226
|
+
"iacute" => 237,
|
227
|
+
"icirc" => 238,
|
228
|
+
"iuml" => 239,
|
229
|
+
"eth" => 240,
|
230
|
+
"ntilde" => 241,
|
231
|
+
"ograve" => 242,
|
232
|
+
"oacute" => 243,
|
233
|
+
"ocirc" => 244,
|
234
|
+
"otilde" => 245,
|
235
|
+
"ouml" => 246,
|
236
|
+
"divide" => 247,
|
237
|
+
"oslash" => 248,
|
238
|
+
"ugrave" => 249,
|
239
|
+
"uacute" => 250,
|
240
|
+
"ucirc" => 251,
|
241
|
+
"uuml" => 252,
|
242
|
+
"yacute" => 253,
|
243
|
+
"thorn" => 254,
|
244
|
+
"yuml" => 255,
|
245
|
+
|
246
|
+
|
247
|
+
"fnof" => 402,
|
248
|
+
"Alpha" => 913,
|
249
|
+
"Beta" => 914,
|
250
|
+
"Gamma" => 915,
|
251
|
+
"Delta" => 916,
|
252
|
+
"Epsilon" => 917,
|
253
|
+
"Zeta" => 918,
|
254
|
+
"Eta" => 919,
|
255
|
+
"Theta" => 920,
|
256
|
+
"Iota" => 921,
|
257
|
+
"Kappa" => 922,
|
258
|
+
"Lambda" => 923,
|
259
|
+
"Mu" => 924,
|
260
|
+
"Nu" => 925,
|
261
|
+
"Xi" => 926,
|
262
|
+
"Omicron" => 927,
|
263
|
+
"Pi" => 928,
|
264
|
+
"Rho" => 929,
|
265
|
+
"Sigma" => 931,
|
266
|
+
"Tau" => 932,
|
267
|
+
"Upsilon" => 933,
|
268
|
+
"Phi" => 934,
|
269
|
+
"Chi" => 935,
|
270
|
+
"Psi" => 936,
|
271
|
+
"Omega" => 937,
|
272
|
+
"alpha" => 945,
|
273
|
+
"beta" => 946,
|
274
|
+
"gamma" => 947,
|
275
|
+
"delta" => 948,
|
276
|
+
"epsilon" => 949,
|
277
|
+
"zeta" => 950,
|
278
|
+
"eta" => 951,
|
279
|
+
"theta" => 952,
|
280
|
+
"iota" => 953,
|
281
|
+
"kappa" => 954,
|
282
|
+
"lambda" => 955,
|
283
|
+
"mu" => 956,
|
284
|
+
"nu" => 957,
|
285
|
+
"xi" => 958,
|
286
|
+
"omicron" => 959,
|
287
|
+
"pi" => 960,
|
288
|
+
"rho" => 961,
|
289
|
+
"sigmaf" => 962,
|
290
|
+
"sigma" => 963,
|
291
|
+
"tau" => 964,
|
292
|
+
"upsilon" => 965,
|
293
|
+
"phi" => 966,
|
294
|
+
"chi" => 967,
|
295
|
+
"psi" => 968,
|
296
|
+
"omega" => 969,
|
297
|
+
"thetasym" => 977,
|
298
|
+
"upsih" => 978,
|
299
|
+
"piv" => 982,
|
300
|
+
"bull" => 8226,
|
301
|
+
"hellip" => 8230,
|
302
|
+
"prime" => 8242,
|
303
|
+
"Prime" => 8243,
|
304
|
+
"oline" => 8254,
|
305
|
+
"frasl" => 8260,
|
306
|
+
"weierp" => 8472,
|
307
|
+
"image" => 8465,
|
308
|
+
"real" => 8476,
|
309
|
+
"trade" => 8482,
|
310
|
+
"alefsym" => 8501,
|
311
|
+
"larr" => 8592,
|
312
|
+
"uarr" => 8593,
|
313
|
+
"rarr" => 8594,
|
314
|
+
"darr" => 8595,
|
315
|
+
"harr" => 8596,
|
316
|
+
"crarr" => 8629,
|
317
|
+
"lArr" => 8656,
|
318
|
+
"uArr" => 8657,
|
319
|
+
"rArr" => 8658,
|
320
|
+
"dArr" => 8659,
|
321
|
+
"hArr" => 8660,
|
322
|
+
"forall" => 8704,
|
323
|
+
"part" => 8706,
|
324
|
+
"exist" => 8707,
|
325
|
+
"empty" => 8709,
|
326
|
+
"nabla" => 8711,
|
327
|
+
"isin" => 8712,
|
328
|
+
"notin" => 8713,
|
329
|
+
"ni" => 8715,
|
330
|
+
"prod" => 8719,
|
331
|
+
"sum" => 8721,
|
332
|
+
"minus" => 8722,
|
333
|
+
"lowast" => 8727,
|
334
|
+
"radic" => 8730,
|
335
|
+
"prop" => 8733,
|
336
|
+
"infin" => 8734,
|
337
|
+
"ang" => 8736,
|
338
|
+
"and" => 8743,
|
339
|
+
"or" => 8744,
|
340
|
+
"cap" => 8745,
|
341
|
+
"cup" => 8746,
|
342
|
+
"int" => 8747,
|
343
|
+
"there4" => 8756,
|
344
|
+
"sim" => 8764,
|
345
|
+
"cong" => 8773,
|
346
|
+
"asymp" => 8776,
|
347
|
+
"ne" => 8800,
|
348
|
+
"equiv" => 8801,
|
349
|
+
"le" => 8804,
|
350
|
+
"ge" => 8805,
|
351
|
+
"sub" => 8834,
|
352
|
+
"sup" => 8835,
|
353
|
+
"nsub" => 8836,
|
354
|
+
"sube" => 8838,
|
355
|
+
"supe" => 8839,
|
356
|
+
"oplus" => 8853,
|
357
|
+
"otimes" => 8855,
|
358
|
+
"perp" => 8869,
|
359
|
+
"sdot" => 8901,
|
360
|
+
"lceil" => 8968,
|
361
|
+
"rceil" => 8969,
|
362
|
+
"lfloor" => 8970,
|
363
|
+
"rfloor" => 8971,
|
364
|
+
"lang" => 9001,
|
365
|
+
"rang" => 9002,
|
366
|
+
"loz" => 9674,
|
367
|
+
"spades" => 9824,
|
368
|
+
"clubs" => 9827,
|
369
|
+
"hearts" => 9829,
|
370
|
+
"diams" => 9830,
|
371
|
+
|
372
|
+
"OElig" => 338,
|
373
|
+
"oelig" => 339,
|
374
|
+
"Scaron" => 352,
|
375
|
+
"scaron" => 353,
|
376
|
+
"Yuml" => 376,
|
377
|
+
"circ" => 710,
|
378
|
+
"tilde" => 732,
|
379
|
+
"ensp" => 8194,
|
380
|
+
"emsp" => 8195,
|
381
|
+
"thinsp" => 8201,
|
382
|
+
"zwnj" => 8204,
|
383
|
+
"zwj" => 8205,
|
384
|
+
"lrm" => 8206,
|
385
|
+
"rlm" => 8207,
|
386
|
+
"ndash" => 8211,
|
387
|
+
"mdash" => 8212,
|
388
|
+
"lsquo" => 8216,
|
389
|
+
"rsquo" => 8217,
|
390
|
+
"sbquo" => 8218,
|
391
|
+
"ldquo" => 8220,
|
392
|
+
"rdquo" => 8221,
|
393
|
+
"bdquo" => 8222,
|
394
|
+
"dagger" => 8224,
|
395
|
+
"Dagger" => 8225,
|
396
|
+
"permil" => 8240,
|
397
|
+
"lsaquo" => 8249,
|
398
|
+
"rsaquo" => 8250,
|
399
|
+
"euro" => 8364
|
400
|
+
}
|
401
|
+
# Converts a named entity reference to text and passes it to handle_data.
#
# ref - the entity name without "&" and ";".
#
# Names found in HTML_ENTITIES become the corresponding character.  Names
# that are not in the table are passed through as the bare name.
#
# Returns whatever handle_data returns.
def unknown_entityref(ref)
  text =
    if ref == 'shy'
      # Hack: &shy; (soft hyphen) is dropped altogether because some blog
      # software (dotclear2) misuses it.
      # See http://www.cs.tut.fi/~jkorpela/shy.html
      ''
    elsif HTML_ENTITIES.key?(ref)
      [HTML_ENTITIES[ref]].pack('U*')
    else
      ref
    end
  handle_data(text)
end
|
412
|
+
end
|
413
|
+
end
|