ruby-feedparser 0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
# Extends Integer with a formatter that renders a byte count using
# 1024-based units.
class Integer
  # Formats the receiver as a human-readable size string.
  #
  # Values below 1024 (including negative ones) are printed verbatim with a
  # " B" suffix. Larger values are divided by the matching power of 1024 and
  # printed with exactly one decimal, suffixed "KB", "MB" or "GB". GB is the
  # largest unit; anything at or above 1024**3 is expressed in GB.
  #
  # NOTE(review): because of the one-decimal rounding, a value just below a
  # unit boundary prints as e.g. "1024.0 KB" rather than "1.0 MB".
  #
  # @return [String]
  def to_human_readable
    kib = 1024
    mib = kib * 1024
    gib = mib * 1024

    # Each branch is only reached when the previous upper bound failed, so no
    # explicit lower-bound test is needed.
    if self < kib
      "#{self} B"
    elsif self < mib
      format('%.1f KB', to_f / kib)
    elsif self < gib
      format('%.1f MB', to_f / mib)
    else
      format('%.1f GB', to_f / gib)
    end
  end
end
@@ -0,0 +1,126 @@
1
+ require 'feedparser'
2
+ require 'feedparser/filesizes'
3
+
4
+ module FeedParser
5
# HTML rendering for a parsed feed.
class Feed
  # Renders the whole feed as a standalone HTML 4.01 document.
  #
  # The document consists of a header table (feed title, type, encoding and,
  # when set, creator), the feed description, and then every item rendered
  # through its own +to_html+, each preceded by a horizontal rule.
  #
  # The name shown in <title> and in the header is the escaped title, or the
  # escaped link when there is no title, or "Unnamed feed" when there is
  # neither. (Previously <title> called +escape_html+ on a nil title and
  # raised.)
  #
  # NOTE(review): @link is interpolated into the href attribute without any
  # escaping, and @type, @encoding and @description are emitted verbatim.
  #
  # @param localtime forwarded unchanged to each item's +to_html+
  # @return [String] the HTML document
  def to_html(localtime = true)
    display_name =
      if @title
        @title.escape_html
      elsif @link
        @link.escape_html
      else
        'Unnamed feed'
      end
    # "%%" yields a literal percent sign once the template is formatted.
    headline = "<tr><td align=\"right\"><b>%s</b></td>\n<td width=\"100%%\">%s</td></tr>"

    s = ''.dup
    s << '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'
    s << "\n"
    s << "<html>\n"
    s << "<head>\n"
    s << "<title>#{display_name}</title>\n"
    s << "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\">\n"
    s << "</head>\n"
    s << "<body>\n"

    s << <<-EOF
<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>
<table width="100%" bgcolor="#EDEDED" cellpadding="4" cellspacing="2">
    EOF

    # Feed name, wrapped in a link to the feed when a link is known.
    name_cell = ''.dup
    name_cell << "<a href=\"#{@link}\">\n" if @link
    name_cell << "<b>#{display_name}</b>\n"
    name_cell << "</a>\n" if @link

    s << (headline % ['Feed title:', name_cell])
    s << (headline % ['Type:', @type])
    s << (headline % ['Encoding:', @encoding])
    s << (headline % ['Creator:', @creator.escape_html]) if @creator
    s << "</table></td></tr></table>\n"

    # A description that does not open with a tag gets a line break so it
    # does not sit directly against the header table.
    s << "<br/>\n" if @description && @description !~ /\A\s*</m
    s << "#{@description}" if @description

    @items.each do |item|
      s << "\n<hr/><!-- *********************************** -->\n"
      s << item.to_html(localtime)
    end
    s << "\n</body></html>\n"
    s
  end
end
51
+
52
# HTML rendering for a single feed item.
class FeedItem
  # Wraps #to_html in a minimal standalone HTML 4.01 document
  # (doctype, <html> and <body> only, no <head>).
  #
  # @param localtime forwarded unchanged to #to_html
  # @return [String]
  def to_html_with_headers(localtime = true)
    s = ''.dup
    s << <<-EOF
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<body>
    EOF
    s << to_html(localtime)
    s << "\n</body>\n</html>"
    s
  end

  # Renders the item as an HTML fragment:
  #
  # 1. a header table naming the parent feed and the item (each linked when
  #    a link is available),
  # 2. the item content,
  # 3. a "Files" table listing enclosures, when there are any,
  # 4. a grey footer table with date, author, subject and categories.
  #
  # Each enclosure is indexed as [0] URL, [1] size (converted with +to_i+ and
  # formatted through Integer#to_human_readable) and [2] a trailing label
  # printed verbatim.
  #
  # A nil @categories is now treated like an empty list instead of raising,
  # matching the nil guard already applied to @enclosures.
  #
  # NOTE(review): links, enclosure URLs and @content are emitted without
  # escaping.
  #
  # @param localtime when true the date is printed via +@date.to_s+,
  #   otherwise via +@date.getutc.to_s+
  # @return [String]
  def to_html(localtime = true)
    # "%%" yields a literal percent sign once the template is formatted.
    headline = "<tr><td align=\"right\"><b>%s</b></td>\n<td width=\"100%%\">%s</td></tr>"

    s = ''.dup
    s << <<-EOF
<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>
<table width="100%" bgcolor="#EDEDED" cellpadding="4" cellspacing="2">
    EOF

    # Parent feed: title, else link, else a fixed placeholder.
    feed_cell = ''.dup
    feed_cell << "<a href=\"#{@feed.link}\">\n" if @feed.link
    feed_cell <<
      if @feed.title
        "<b>#{@feed.title.escape_html}</b>\n"
      elsif @feed.link
        "<b>#{@feed.link.escape_html}</b>\n"
      else
        "<b>Unnamed feed</b>\n"
      end
    feed_cell << "</a>\n" if @feed.link
    s << (headline % ['Feed:', feed_cell])

    # The item itself: title, else link, else an empty cell.
    item_cell = ''.dup
    item_cell << "<a href=\"#{@link}\">" if @link
    if @title
      item_cell << "<b>#{@title.escape_html}</b>\n"
    elsif @link
      item_cell << "<b>#{@link.escape_html}</b>\n"
    end
    item_cell << "</a>\n" if @link
    s << (headline % ['Item:', item_cell])
    s << "</table></td></tr></table>\n"
    s << "\n"

    # Content that does not open with a tag gets a leading line break.
    s << "<br/>\n" if @content && @content !~ /\A\s*</m
    s << "#{@content}" if @content

    if @enclosures && !@enclosures.empty?
      s << <<-EOF
<table border="1" width="100%" cellpadding="0" cellspacing="0" borderspacing="0"><tr><td>
<table width="100%" bgcolor="#EDEDED" cellpadding="2" cellspacing="2">
      EOF
      s << '<tr><td width="100%"><b>Files:</b></td></tr>'
      s << "\n"
      @enclosures.each do |e|
        # The link text is the last path segment of the enclosure URL.
        s << "<tr><td>&nbsp;&nbsp;&nbsp;<a href=\"#{e[0]}\">#{e[0].split('/')[-1]}</a> (#{e[1].to_i.to_human_readable}, #{e[2]})</td></tr>\n"
      end
      s << "</table></td></tr></table>\n"
    end

    s << "\n<hr width=\"100%\"/>\n"
    s << "<table width=\"100%\" cellpadding=\"0\" cellspacing=\"0\">\n"
    row = '<tr><td align="right"><font color="#ababab">%s</font>&nbsp;&nbsp;</td><td><font color="#ababab">%s</font></td></tr>' + "\n"
    if @date
      stamp = localtime ? @date.to_s : @date.getutc.to_s
      s << (row % ['Date:', stamp])
    end
    s << (row % ['Author:', creator.escape_html]) if creator
    s << (row % ['Subject:', @subject.escape_html]) if @subject
    if @categories && !@categories.empty?
      s << (row % ['Filed under:', @categories.join(', ').escape_html])
    end
    s << "</table>\n"
    s
  end
end
126
+ end
@@ -0,0 +1,413 @@
1
+ require 'feedparser/sgml-parser'
2
+
3
+ module FeedParser
4
+ # this class provides a simple SGML parser that removes HTML tags
5
+ class HTML2TextParser < SGMLParser
6
+
7
+ attr_reader :savedata
8
+
9
# Creates a parser with empty conversion state and hands +verbose+ to the
# parent parser's constructor.
#
# State set up here:
# * @savedata  - the plain-text output accumulated so far
# * @pre       - true while inside a <pre> element
# * @href      - href of the <a> element currently open, if any
# * @links     - link targets collected so far (listed by #close)
# * @imgs      - [label, src] pairs for images seen so far
# * @img_index - label given to the most recent image; starts at "@" so the
#                first image is labelled with the character after it
def initialize(verbose = false)
  @savedata = ''
  @pre = false
  @href = nil
  @links = []
  @imgs = []
  @img_index = '@'
  super(verbose)
end
18
+
19
# Advances the image label and returns the new one.
#
# Starting from the initial "@", successive calls yield "A", "B", ... "Z".
# After "Z" the sequence continues with "AA", "AB", ... (String#succ).
#
# The previous implementation did arithmetic on +@img_index[0]+, which is an
# Integer only on Ruby 1.8; on 1.9 and later it is a one-character String and
# adding 1 to it raises TypeError.
#
# A fresh String is produced on every call, so labels already stored in
# @imgs are never altered by later calls.
#
# @return [String] the label for the next image
def next_img_index
  @img_index = @img_index.succ
end
25
+
26
# Appends a run of character data to the output.
#
# Outside <pre>, every newline becomes a space and each run of spaces is
# collapsed to a single space. Inside <pre> the text is appended untouched.
#
# The normalisation works on a copy: the caller's string is left unmodified,
# so frozen strings are accepted too (the earlier in-place gsub! altered the
# argument and raised on frozen input).
#
# @param data [String] text to append
def handle_data(data)
  data = data.gsub(/\n/, ' ').gsub(/( )+/, ' ') unless @pre
  @savedata << data
end
34
+
35
# Emits the plain-text equivalent of an opening tag.
#
# * p, h4           -> blank line
# * h1, h2, h3      -> blank line followed by one space
# * br, ul          -> newline
# * li              -> newline and " - " bullet
# * b, strong, em   -> "*"
# * u               -> "_"
# * i               -> "/"
# * pre             -> blank line, and whitespace collapsing is switched off
# * a               -> remembers the href (surrounding quotes stripped) in
#                      @links; the "[n]" marker is written by the end tag
# * img             -> records [label, src] in @imgs and writes "[label]"
#
# Any other tag is ignored.
#
# @param tag [String] tag name
# @param attrs list of [name, value] pairs; when an attribute is repeated
#   the last occurrence is used
def unknown_starttag(tag, attrs)
  case tag
  when 'p', 'h4'
    @savedata << "\n\n"
  when 'h1', 'h2', 'h3'
    @savedata << "\n\n "
  when 'br', 'ul'
    @savedata << "\n"
  when 'li'
    @savedata << "\n - "
  when 'b', 'strong', 'em'
    @savedata << '*'
  when 'u'
    @savedata << '_'
  when 'i'
    @savedata << '/'
  when 'pre'
    @savedata << "\n\n"
    @pre = true
  when 'a'
    # Searching from the end reproduces "last href wins".
    pair = attrs.reverse_each.find { |a| a[0] == 'href' }
    @href = pair ? pair[1] : nil
    @links << @href.gsub(/^("|'|)(.*)("|')$/, '\2') if @href
  when 'img'
    pair = attrs.reverse_each.find { |a| a[0] == 'src' }
    src = pair ? pair[1] : nil
    if src
      idx = next_img_index
      @imgs << [idx, src.gsub(/^("|'|)(.*)("|')$/, '\2')]
      @savedata << "[#{idx}]"
    end
  end
end
92
+
93
# Finishes parsing (via the parent's +close+) and appends the reference
# lists to the output.
#
# When links were collected, a blank line is written followed by one
# "[n] target" line per link, numbered from 1 in order of appearance. When
# images were collected, another blank line is written followed by one
# "[label] src" line per image.
def close
  super
  unless @links.empty?
    @savedata << "\n\n"
    @links.each_with_index do |link, i|
      @savedata << "[#{i + 1}] #{link}\n"
    end
  end
  unless @imgs.empty?
    @savedata << "\n\n"
    @imgs.each do |label, src|
      @savedata << "[#{label}] #{src}\n"
    end
  end
end
108
+
109
# Emits the plain-text equivalent of a closing tag.
#
# * ul              -> newline
# * b, strong, em   -> "*"
# * u               -> "_"
# * i               -> "/"
# * pre             -> blank line, and whitespace collapsing is switched
#                      back on
# * a               -> when the matching start tag carried an href, writes
#                      "[n]" where n is the number of links collected so far,
#                      then forgets the href
#
# Any other tag is ignored.
#
# @param tag [String] tag name
def unknown_endtag(tag)
  case tag
  when 'ul'
    @savedata << "\n"
  when 'b', 'strong', 'em'
    @savedata << '*'
  when 'u'
    @savedata << '_'
  when 'i'
    @savedata << '/'
  when 'pre'
    @savedata << "\n\n"
    @pre = false
  when 'a'
    if @href
      @savedata << "[#{@links.length}]"
      @href = nil
    end
  end
end
133
+
134
# Converts a numeric character reference into its UTF-8 character and passes
# it to #handle_data.
#
# Decimal references ("8364") are decoded as before. A reference starting
# with "x" or "X" is now decoded as hexadecimal ("x20AC"); previously +to_i+
# turned such a reference into 0 and a NUL character was emitted.
# NOTE(review): whether the parent parser ever passes the "x"-prefixed form
# is not visible here - confirm against SGMLParser.
#
# @param ref the reference body, without the leading "&#" and trailing ";"
def unknown_charref(ref)
  text = ref.to_s
  codepoint = text =~ /\A[xX]/ ? text[1..-1].to_i(16) : text.to_i
  handle_data([codepoint].pack('U*'))
end
137
+
138
# Returns the table mapping HTML entity names to Unicode code points
# (the HTML_ENTITIES constant itself, not a copy).
def self.entities
  HTML_ENTITIES
end
141
+
142
# Maps HTML entity names (without the surrounding "&" and ";") to their
# Unicode code points. Frozen so the shared table cannot be altered at
# runtime.
HTML_ENTITIES = {
  # markup-significant characters
  'quot' => 34,
  'amp' => 38,
  'lt' => 60,
  'gt' => 62,
  'apos' => 39,

  # code points 160-255
  'nbsp' => 160,
  'iexcl' => 161,
  'cent' => 162,
  'pound' => 163,
  'curren' => 164,
  'yen' => 165,
  'brvbar' => 166,
  'sect' => 167,
  'uml' => 168,
  'copy' => 169,
  'ordf' => 170,
  'laquo' => 171,
  'not' => 172,
  'shy' => 173,
  'reg' => 174,
  'macr' => 175,
  'deg' => 176,
  'plusmn' => 177,
  'sup2' => 178,
  'sup3' => 179,
  'acute' => 180,
  'micro' => 181,
  'para' => 182,
  'middot' => 183,
  'cedil' => 184,
  'sup1' => 185,
  'ordm' => 186,
  'raquo' => 187,
  'frac14' => 188,
  'frac12' => 189,
  'frac34' => 190,
  'iquest' => 191,
  'Agrave' => 192,
  'Aacute' => 193,
  'Acirc' => 194,
  'Atilde' => 195,
  'Auml' => 196,
  'Aring' => 197,
  'AElig' => 198,
  'Ccedil' => 199,
  'Egrave' => 200,
  'Eacute' => 201,
  'Ecirc' => 202,
  'Euml' => 203,
  'Igrave' => 204,
  'Iacute' => 205,
  'Icirc' => 206,
  'Iuml' => 207,
  'ETH' => 208,
  'Ntilde' => 209,
  'Ograve' => 210,
  'Oacute' => 211,
  'Ocirc' => 212,
  'Otilde' => 213,
  'Ouml' => 214,
  'times' => 215,
  'Oslash' => 216,
  'Ugrave' => 217,
  'Uacute' => 218,
  'Ucirc' => 219,
  'Uuml' => 220,
  'Yacute' => 221,
  'THORN' => 222,
  'szlig' => 223,
  'agrave' => 224,
  'aacute' => 225,
  'acirc' => 226,
  'atilde' => 227,
  'auml' => 228,
  'aring' => 229,
  'aelig' => 230,
  'ccedil' => 231,
  'egrave' => 232,
  'eacute' => 233,
  'ecirc' => 234,
  'euml' => 235,
  'igrave' => 236,
  'iacute' => 237,
  'icirc' => 238,
  'iuml' => 239,
  'eth' => 240,
  'ntilde' => 241,
  'ograve' => 242,
  'oacute' => 243,
  'ocirc' => 244,
  'otilde' => 245,
  'ouml' => 246,
  'divide' => 247,
  'oslash' => 248,
  'ugrave' => 249,
  'uacute' => 250,
  'ucirc' => 251,
  'uuml' => 252,
  'yacute' => 253,
  'thorn' => 254,
  'yuml' => 255,

  # Greek letters, arrows and mathematical/technical symbols
  'fnof' => 402,
  'Alpha' => 913,
  'Beta' => 914,
  'Gamma' => 915,
  'Delta' => 916,
  'Epsilon' => 917,
  'Zeta' => 918,
  'Eta' => 919,
  'Theta' => 920,
  'Iota' => 921,
  'Kappa' => 922,
  'Lambda' => 923,
  'Mu' => 924,
  'Nu' => 925,
  'Xi' => 926,
  'Omicron' => 927,
  'Pi' => 928,
  'Rho' => 929,
  'Sigma' => 931,
  'Tau' => 932,
  'Upsilon' => 933,
  'Phi' => 934,
  'Chi' => 935,
  'Psi' => 936,
  'Omega' => 937,
  'alpha' => 945,
  'beta' => 946,
  'gamma' => 947,
  'delta' => 948,
  'epsilon' => 949,
  'zeta' => 950,
  'eta' => 951,
  'theta' => 952,
  'iota' => 953,
  'kappa' => 954,
  'lambda' => 955,
  'mu' => 956,
  'nu' => 957,
  'xi' => 958,
  'omicron' => 959,
  'pi' => 960,
  'rho' => 961,
  'sigmaf' => 962,
  'sigma' => 963,
  'tau' => 964,
  'upsilon' => 965,
  'phi' => 966,
  'chi' => 967,
  'psi' => 968,
  'omega' => 969,
  'thetasym' => 977,
  'upsih' => 978,
  'piv' => 982,
  'bull' => 8226,
  'hellip' => 8230,
  'prime' => 8242,
  'Prime' => 8243,
  'oline' => 8254,
  'frasl' => 8260,
  'weierp' => 8472,
  'image' => 8465,
  'real' => 8476,
  'trade' => 8482,
  'alefsym' => 8501,
  'larr' => 8592,
  'uarr' => 8593,
  'rarr' => 8594,
  'darr' => 8595,
  'harr' => 8596,
  'crarr' => 8629,
  'lArr' => 8656,
  'uArr' => 8657,
  'rArr' => 8658,
  'dArr' => 8659,
  'hArr' => 8660,
  'forall' => 8704,
  'part' => 8706,
  'exist' => 8707,
  'empty' => 8709,
  'nabla' => 8711,
  'isin' => 8712,
  'notin' => 8713,
  'ni' => 8715,
  'prod' => 8719,
  'sum' => 8721,
  'minus' => 8722,
  'lowast' => 8727,
  'radic' => 8730,
  'prop' => 8733,
  'infin' => 8734,
  'ang' => 8736,
  'and' => 8743,
  'or' => 8744,
  'cap' => 8745,
  'cup' => 8746,
  'int' => 8747,
  'there4' => 8756,
  'sim' => 8764,
  'cong' => 8773,
  'asymp' => 8776,
  'ne' => 8800,
  'equiv' => 8801,
  'le' => 8804,
  'ge' => 8805,
  'sub' => 8834,
  'sup' => 8835,
  'nsub' => 8836,
  'sube' => 8838,
  'supe' => 8839,
  'oplus' => 8853,
  'otimes' => 8855,
  'perp' => 8869,
  'sdot' => 8901,
  'lceil' => 8968,
  'rceil' => 8969,
  'lfloor' => 8970,
  'rfloor' => 8971,
  'lang' => 9001,
  'rang' => 9002,
  'loz' => 9674,
  'spades' => 9824,
  'clubs' => 9827,
  'hearts' => 9829,
  'diams' => 9830,

  # additional Latin letters, spacing, punctuation and currency
  'OElig' => 338,
  'oelig' => 339,
  'Scaron' => 352,
  'scaron' => 353,
  'Yuml' => 376,
  'circ' => 710,
  'tilde' => 732,
  'ensp' => 8194,
  'emsp' => 8195,
  'thinsp' => 8201,
  'zwnj' => 8204,
  'zwj' => 8205,
  'lrm' => 8206,
  'rlm' => 8207,
  'ndash' => 8211,
  'mdash' => 8212,
  'lsquo' => 8216,
  'rsquo' => 8217,
  'sbquo' => 8218,
  'ldquo' => 8220,
  'rdquo' => 8221,
  'bdquo' => 8222,
  'dagger' => 8224,
  'Dagger' => 8225,
  'permil' => 8240,
  'lsaquo' => 8249,
  'rsaquo' => 8250,
  'euro' => 8364
}.freeze
401
# Resolves a named entity reference and passes the result to #handle_data.
#
# * "shy" is deliberately rendered as an empty string: the soft hyphen is
#   misused by some blog software (dotclear2), see
#   http://www.cs.tut.fi/~jkorpela/shy.html
# * a name found in HTML_ENTITIES becomes the corresponding UTF-8 character
# * any other name is passed through as-is (the bare name, without "&" and
#   ";")
#
# @param ref [String] the entity name
def unknown_entityref(ref)
  text =
    if ref == 'shy'
      ''
    elsif HTML_ENTITIES.key?(ref)
      [HTML_ENTITIES[ref]].pack('U*')
    else
      ref
    end
  handle_data(text)
end
412
+ end
413
+ end