ruby-feedparser 0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
class Integer
  # Formats the receiver, read as a number of bytes, as a short
  # human-readable size string.
  #
  # Values below 1024 are printed verbatim with a "B" suffix; larger values
  # are divided by the matching power of 1024 and printed with one decimal
  # place and a "KB", "MB" or "GB" suffix. Anything from 1024**3 upwards is
  # reported in GB.
  #
  # Returns a String.
  def to_human_readable
    kilo = 1024
    mega = kilo * 1024
    giga = mega * 1024

    return "#{self} B" if self < kilo
    return format('%.1f KB', to_f / kilo) if self < mega
    return format('%.1f MB', to_f / mega) if self < giga

    format('%.1f GB', to_f / giga)
  end
end
@@ -0,0 +1,126 @@
1
+ require 'feedparser'
2
+ require 'feedparser/filesizes'
3
+
4
+ module FeedParser
5
class Feed
  # Renders the feed, followed by every item in @items, as one HTML 4.01
  # document.
  #
  # localtime:: passed through unchanged to each item's to_html.
  #
  # Returns the document as a String. @description is inserted as-is (it is
  # treated as HTML); title, link and creator are escaped with
  # String#escape_html.
  def to_html(localtime = true)
    # Label used for both <title> and the header table. The fallback chain
    # (title, then link, then a fixed placeholder) used to exist only for
    # the table, so a feed without a title raised NoMethodError while
    # building <title>.
    label = if @title
              @title.escape_html
            elsif @link
              @link.escape_html
            else
              'Unnamed feed'
            end

    s = ''.dup
    s << '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'
    s << "\n"
    s << "<html>\n"
    s << "<head>\n"
    s << "<title>#{label}</title>\n"
    s << "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\">\n"
    s << "</head>\n"
    s << "<body>\n"

    s << '<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>' << "\n"
    s << '<table width="100%" bgcolor="#EDEDED" cellpadding="4" cellspacing="2">' << "\n"

    r = ''.dup
    if @link
      # The link ends up inside a double-quoted attribute, so '"' must be
      # neutralised as well as the markup characters.
      # NOTE(review): escape_html is defined outside this file; the extra
      # gsub covers the case where it leaves double quotes untouched.
      href = @link.escape_html.gsub('"', '&quot;')
      r << "<a href=\"#{href}\">\n"
    end
    r << "<b>#{label}</b>\n"
    r << "</a>\n" if @link

    headline = "<tr><td align=\"right\"><b>%s</b></td>\n<td width=\"100%%\">%s</td></tr>"
    s << (headline % ['Feed title:', r])
    s << (headline % ['Type:', @type])
    s << (headline % ['Encoding:', @encoding])
    s << (headline % ['Creator:', @creator.escape_html]) if @creator
    s << "</table></td></tr></table>\n"

    if @description
      # Only add a line break when the description does not start with a
      # tag of its own.
      s << "<br/>\n" if @description !~ /\A\s*</m
      s << @description.to_s
    end

    @items.each do |i|
      s << "\n<hr/><!-- *********************************** -->\n"
      s << i.to_html(localtime)
    end
    s << "\n</body></html>\n"
    s
  end
end
51
+
52
class FeedItem
  # Wraps the output of #to_html in a minimal standalone HTML 4.01 document.
  #
  # localtime:: forwarded to #to_html.
  #
  # Returns the document as a String.
  def to_html_with_headers(localtime = true)
    s = ''.dup
    s << "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
    s << "<html>\n"
    s << "<body>\n"
    s << to_html(localtime)
    s << "\n</body>\n</html>"
    s
  end

  # Renders this item as an HTML fragment: a header table naming the feed
  # and the item, the item content, an optional list of enclosures, and a
  # footer with date, author, subject and categories.
  #
  # localtime:: when true the date is printed with @date.to_s, otherwise
  #             with @date.getutc.to_s.
  #
  # Returns the fragment as a String. @content is inserted as-is (it is
  # treated as HTML); every other feed-supplied value is escaped.
  def to_html(localtime = true)
    # Escapes a value for use inside a double-quoted HTML attribute.
    # NOTE(review): escape_html is defined outside this file; the extra
    # gsub covers the case where it leaves double quotes untouched.
    attr = lambda { |value| value.to_s.escape_html.gsub('"', '&quot;') }
    headline = "<tr><td align=\"right\"><b>%s</b></td>\n<td width=\"100%%\">%s</td></tr>"

    s = ''.dup
    s << '<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>' << "\n"
    s << '<table width="100%" bgcolor="#EDEDED" cellpadding="4" cellspacing="2">' << "\n"

    # "Feed:" row: feed title (or link, or a placeholder), linked when the
    # feed has a link.
    r = ''.dup
    r << "<a href=\"#{attr.call(@feed.link)}\">\n" if @feed.link
    if @feed.title
      r << "<b>#{@feed.title.escape_html}</b>\n"
    elsif @feed.link
      r << "<b>#{@feed.link.escape_html}</b>\n"
    else
      r << "<b>Unnamed feed</b>\n"
    end
    r << "</a>\n" if @feed.link
    s << (headline % ['Feed:', r])

    # "Item:" row: item title (or link); left empty when the item has
    # neither.
    r = ''.dup
    r << "<a href=\"#{attr.call(@link)}\">" if @link
    if @title
      r << "<b>#{@title.escape_html}</b>\n"
    elsif @link
      r << "<b>#{@link.escape_html}</b>\n"
    end
    r << "</a>\n" if @link
    s << (headline % ['Item:', r])
    s << "</table></td></tr></table>\n"
    s << "\n"

    if @content
      # Only add a line break when the content does not start with a tag
      # of its own.
      s << "<br/>\n" if @content !~ /\A\s*</m
      s << @content.to_s
    end

    if @enclosures and @enclosures.length > 0
      s << '<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>' << "\n"
      s << '<table width="100%" bgcolor="#EDEDED" cellpadding="2" cellspacing="2">' << "\n"
      s << '<tr><td width="100%"><b>Files:</b></td></tr>'
      s << "\n"
      @enclosures.each do |e|
        # e[0] is used as the URL, e[1] as the size and e[2] is printed
        # after the size; the last path segment of the URL is the label.
        file_name = e[0].split('/')[-1].to_s.escape_html
        s << "<tr><td>&nbsp;&nbsp;&nbsp;<a href=\"#{attr.call(e[0])}\">#{file_name}</a> " \
             "(#{e[1].to_i.to_human_readable}, #{e[2].to_s.escape_html})</td></tr>\n"
      end
      s << "</table></td></tr></table>\n"
    end

    s << "\n<hr width=\"100%\"/>\n"
    s << '<table width="100%" cellpadding="0" cellspacing="0">' << "\n"
    l = '<tr><td align="right"><font color="#ababab">%s</font>&nbsp;&nbsp;</td><td><font color="#ababab">%s</font></td></tr>' + "\n"
    if @date
      shown_date = localtime ? @date : @date.getutc
      s << (l % ['Date:', shown_date.to_s])
    end
    s << (l % ['Author:', creator.escape_html]) if creator
    s << (l % ['Subject:', @subject.escape_html]) if @subject
    # Guarded against nil in the same way as @enclosures above.
    if @categories and !@categories.empty?
      s << (l % ['Filed under:', @categories.join(', ').escape_html])
    end
    s << "</table>\n"
    s
  end
end
126
+ end
@@ -0,0 +1,413 @@
1
+ require 'feedparser/sgml-parser'
2
+
3
+ module FeedParser
4
+ # this class provides a simple SGML parser that removes HTML tags
5
+ class HTML2TextParser < SGMLParser
6
+
7
+ attr_reader :savedata
8
+
9
# Sets up an empty conversion state and then hands +verbose+ to the
# superclass constructor.
def initialize(verbose = false)
  # Text produced so far, and whether we are currently inside <pre>.
  @savedata = ''
  @pre = false

  # Link bookkeeping: href of the <a> being processed and every href seen.
  @href = nil
  @links = []

  # Image bookkeeping: collected [marker, src] pairs and the last marker
  # handed out ('@' is the character right before 'A').
  @imgs = []
  @img_index = '@'

  super(verbose)
end
18
+
19
# Advances the image marker to the next character ('@' -> 'A' -> 'B' ...)
# and returns it.
#
# The previous implementation relied on String#[] returning a character
# code, which is only true on Ruby 1.8; on later versions it raised
# TypeError (String + Integer). Going through unpack/pack yields the same
# sequence on every Ruby version.
#
# A fresh String is stored on each call, so markers already handed out are
# never altered by later calls.
def next_img_index
  code = @img_index.unpack('U').first + 1
  @img_index = [code].pack('U')
end
25
+
26
# Appends a chunk of text to @savedata.
#
# Outside of <pre> (i.e. while @pre is false) every line feed becomes a
# space and each run of spaces is collapsed to a single space. Inside <pre>
# the chunk is appended untouched.
#
# The argument itself is never modified: the earlier gsub! calls altered
# the caller's string in place and raised on frozen strings.
#
# data:: the text chunk (String).
#
# Returns @savedata.
def handle_data(data)
  text = @pre ? data : data.tr("\n", ' ').squeeze(' ')
  @savedata << text
end
34
+
35
# Emits the plain-text equivalent of an opening tag.
#
# tag::   tag name, compared against lower-case literals.
# attrs:: list of [name, value] pairs; only consulted for <a> and <img>.
#
# Block-level tags add line breaks, inline emphasis tags add a marker
# character, <pre> switches on @pre, <a> remembers its href in @href and
# @links, and <img> records its src in @imgs and prints a "[marker]"
# placeholder. Any other tag is ignored.
def unknown_starttag(tag, attrs)
  # Strips one pair of surrounding quotes from an attribute value.
  unquote = /^("|'|)(.*)("|')$/

  case tag
  when 'p', 'h4'
    @savedata << "\n\n"
  when 'h1', 'h2', 'h3'
    @savedata << "\n\n "
  when 'br', 'ul'
    @savedata << "\n"
  when 'li'
    @savedata << "\n - "
  when 'b', 'strong', 'em'
    @savedata << '*'
  when 'u'
    @savedata << '_'
  when 'i'
    @savedata << '/'
  when 'pre'
    @savedata << "\n\n"
    @pre = true
  when 'a'
    # When several href attributes are present the last one wins.
    match = attrs.select { |pair| pair[0] == 'href' }.last
    @href = match ? match[1] : nil
    @links << @href.gsub(unquote, '\2') if @href
  when 'img'
    # When several src attributes are present the last one wins.
    match = attrs.select { |pair| pair[0] == 'src' }.last
    src = match ? match[1] : nil
    if src
      marker = next_img_index
      @imgs << [marker, src.gsub(unquote, '\2')]
      @savedata << "[#{marker}]"
    end
  end
end
92
+
93
# Finishes parsing (via the superclass) and then appends the reference
# lists to @savedata: first the numbered links, then the lettered images.
# Each list is preceded by a blank line and skipped when empty.
def close
  super

  unless @links.empty?
    @savedata << "\n\n"
    @links.each_with_index do |url, position|
      @savedata << "[#{position + 1}] #{url}\n"
    end
  end

  unless @imgs.empty?
    @savedata << "\n\n"
    @imgs.each { |entry| @savedata << "[#{entry[0]}] #{entry[1]}\n" }
  end
end
108
+
109
# Emits the plain-text equivalent of a closing tag.
#
# Emphasis tags repeat the marker written for the opening tag, </ul> adds
# a line break, </pre> adds a blank line and switches @pre off, and </a>
# prints the "[n]" reference for the link that was opened (n being the
# current size of @links). Any other tag is ignored.
def unknown_endtag(tag)
  case tag
  when 'ul'
    @savedata << "\n"
  when 'b', 'strong', 'em'
    @savedata << '*'
  when 'u'
    @savedata << '_'
  when 'i'
    @savedata << '/'
  when 'pre'
    @savedata << "\n\n"
    @pre = false
  when 'a'
    if @href
      @savedata << "[#{@links.length}]"
      # Cleared so a stray second </a> does not print the reference again.
      @href = nil
    end
  end
end
133
+
134
# Converts a numeric character reference to its UTF-8 character and passes
# it to handle_data.
#
# ref:: the reference without "&#" and ";". Decimal digits ("8364") are
#       read as before; a leading "x" or "X" ("x20AC") is now read as
#       hexadecimal. Previously such a value went through to_i, became 0
#       and produced a NUL character.
#       NOTE(review): whether the base parser ever forwards hexadecimal
#       references is not visible from this file.
def unknown_charref(ref)
  digits = ref.to_s
  codepoint = digits =~ /\A[xX]/ ? digits[1..-1].hex : digits.to_i
  handle_data([codepoint].pack('U*'))
end
137
+
138
# Class-level reader for the entity table: returns the HTML_ENTITIES hash
# itself (not a copy).
def self.entities
  HTML_ENTITIES
end
141
+
142
# Maps an HTML entity name (without "&" and ";") to its Unicode code point.
HTML_ENTITIES = {
  # markup-significant characters
  'quot' => 34,
  'amp' => 38,
  'lt' => 60,
  'gt' => 62,
  'apos' => 39,

  # code points 160-255
  'nbsp' => 160,
  'iexcl' => 161,
  'cent' => 162,
  'pound' => 163,
  'curren' => 164,
  'yen' => 165,
  'brvbar' => 166,
  'sect' => 167,
  'uml' => 168,
  'copy' => 169,
  'ordf' => 170,
  'laquo' => 171,
  'not' => 172,
  'shy' => 173,
  'reg' => 174,
  'macr' => 175,
  'deg' => 176,
  'plusmn' => 177,
  'sup2' => 178,
  'sup3' => 179,
  'acute' => 180,
  'micro' => 181,
  'para' => 182,
  'middot' => 183,
  'cedil' => 184,
  'sup1' => 185,
  'ordm' => 186,
  'raquo' => 187,
  'frac14' => 188,
  'frac12' => 189,
  'frac34' => 190,
  'iquest' => 191,
  'Agrave' => 192,
  'Aacute' => 193,
  'Acirc' => 194,
  'Atilde' => 195,
  'Auml' => 196,
  'Aring' => 197,
  'AElig' => 198,
  'Ccedil' => 199,
  'Egrave' => 200,
  'Eacute' => 201,
  'Ecirc' => 202,
  'Euml' => 203,
  'Igrave' => 204,
  'Iacute' => 205,
  'Icirc' => 206,
  'Iuml' => 207,
  'ETH' => 208,
  'Ntilde' => 209,
  'Ograve' => 210,
  'Oacute' => 211,
  'Ocirc' => 212,
  'Otilde' => 213,
  'Ouml' => 214,
  'times' => 215,
  'Oslash' => 216,
  'Ugrave' => 217,
  'Uacute' => 218,
  'Ucirc' => 219,
  'Uuml' => 220,
  'Yacute' => 221,
  'THORN' => 222,
  'szlig' => 223,
  'agrave' => 224,
  'aacute' => 225,
  'acirc' => 226,
  'atilde' => 227,
  'auml' => 228,
  'aring' => 229,
  'aelig' => 230,
  'ccedil' => 231,
  'egrave' => 232,
  'eacute' => 233,
  'ecirc' => 234,
  'euml' => 235,
  'igrave' => 236,
  'iacute' => 237,
  'icirc' => 238,
  'iuml' => 239,
  'eth' => 240,
  'ntilde' => 241,
  'ograve' => 242,
  'oacute' => 243,
  'ocirc' => 244,
  'otilde' => 245,
  'ouml' => 246,
  'divide' => 247,
  'oslash' => 248,
  'ugrave' => 249,
  'uacute' => 250,
  'ucirc' => 251,
  'uuml' => 252,
  'yacute' => 253,
  'thorn' => 254,
  'yuml' => 255,

  # Greek letters, mathematical and other symbols
  'fnof' => 402,
  'Alpha' => 913,
  'Beta' => 914,
  'Gamma' => 915,
  'Delta' => 916,
  'Epsilon' => 917,
  'Zeta' => 918,
  'Eta' => 919,
  'Theta' => 920,
  'Iota' => 921,
  'Kappa' => 922,
  'Lambda' => 923,
  'Mu' => 924,
  'Nu' => 925,
  'Xi' => 926,
  'Omicron' => 927,
  'Pi' => 928,
  'Rho' => 929,
  'Sigma' => 931,
  'Tau' => 932,
  'Upsilon' => 933,
  'Phi' => 934,
  'Chi' => 935,
  'Psi' => 936,
  'Omega' => 937,
  'alpha' => 945,
  'beta' => 946,
  'gamma' => 947,
  'delta' => 948,
  'epsilon' => 949,
  'zeta' => 950,
  'eta' => 951,
  'theta' => 952,
  'iota' => 953,
  'kappa' => 954,
  'lambda' => 955,
  'mu' => 956,
  'nu' => 957,
  'xi' => 958,
  'omicron' => 959,
  'pi' => 960,
  'rho' => 961,
  'sigmaf' => 962,
  'sigma' => 963,
  'tau' => 964,
  'upsilon' => 965,
  'phi' => 966,
  'chi' => 967,
  'psi' => 968,
  'omega' => 969,
  'thetasym' => 977,
  'upsih' => 978,
  'piv' => 982,
  'bull' => 8226,
  'hellip' => 8230,
  'prime' => 8242,
  'Prime' => 8243,
  'oline' => 8254,
  'frasl' => 8260,
  'weierp' => 8472,
  'image' => 8465,
  'real' => 8476,
  'trade' => 8482,
  'alefsym' => 8501,
  'larr' => 8592,
  'uarr' => 8593,
  'rarr' => 8594,
  'darr' => 8595,
  'harr' => 8596,
  'crarr' => 8629,
  'lArr' => 8656,
  'uArr' => 8657,
  'rArr' => 8658,
  'dArr' => 8659,
  'hArr' => 8660,
  'forall' => 8704,
  'part' => 8706,
  'exist' => 8707,
  'empty' => 8709,
  'nabla' => 8711,
  'isin' => 8712,
  'notin' => 8713,
  'ni' => 8715,
  'prod' => 8719,
  'sum' => 8721,
  'minus' => 8722,
  'lowast' => 8727,
  'radic' => 8730,
  'prop' => 8733,
  'infin' => 8734,
  'ang' => 8736,
  'and' => 8743,
  'or' => 8744,
  'cap' => 8745,
  'cup' => 8746,
  'int' => 8747,
  'there4' => 8756,
  'sim' => 8764,
  'cong' => 8773,
  'asymp' => 8776,
  'ne' => 8800,
  'equiv' => 8801,
  'le' => 8804,
  'ge' => 8805,
  'sub' => 8834,
  'sup' => 8835,
  'nsub' => 8836,
  'sube' => 8838,
  'supe' => 8839,
  'oplus' => 8853,
  'otimes' => 8855,
  'perp' => 8869,
  'sdot' => 8901,
  'lceil' => 8968,
  'rceil' => 8969,
  'lfloor' => 8970,
  'rfloor' => 8971,
  'lang' => 9001,
  'rang' => 9002,
  'loz' => 9674,
  'spades' => 9824,
  'clubs' => 9827,
  'hearts' => 9829,
  'diams' => 9830,

  # additional Latin letters, spacing, dashes, quotation marks and the rest
  'OElig' => 338,
  'oelig' => 339,
  'Scaron' => 352,
  'scaron' => 353,
  'Yuml' => 376,
  'circ' => 710,
  'tilde' => 732,
  'ensp' => 8194,
  'emsp' => 8195,
  'thinsp' => 8201,
  'zwnj' => 8204,
  'zwj' => 8205,
  'lrm' => 8206,
  'rlm' => 8207,
  'ndash' => 8211,
  'mdash' => 8212,
  'lsquo' => 8216,
  'rsquo' => 8217,
  'sbquo' => 8218,
  'ldquo' => 8220,
  'rdquo' => 8221,
  'bdquo' => 8222,
  'dagger' => 8224,
  'Dagger' => 8225,
  'permil' => 8240,
  'lsaquo' => 8249,
  'rsaquo' => 8250,
  'euro' => 8364
}
401
# Resolves a named entity reference and passes the result to handle_data.
#
# ref:: entity name without "&" and ";".
#
# Names found in HTML_ENTITIES are replaced by the corresponding UTF-8
# character; unknown names are passed through as their bare name.
def unknown_entityref(ref)
  # &shy; is dropped on purpose: some blog software (dotclear2) misuses it.
  # See http://www.cs.tut.fi/~jkorpela/shy.html
  return handle_data('') if ref == 'shy'

  text = HTML_ENTITIES.key?(ref) ? [HTML_ENTITIES[ref]].pack('U*') : ref
  handle_data(text)
end
412
+ end
413
+ end