rfeedparser 0.9.8 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rfeedparser.rb CHANGED
@@ -14,3314 +14,138 @@ require 'stringio'
14
14
  require 'uri'
15
15
  require 'cgi' # escaping html
16
16
  require 'time'
17
- require 'xml/saxdriver' # calling expat
18
17
  require 'pp'
19
18
  require 'rubygems'
20
19
  require 'base64'
21
20
  require 'iconv'
22
- gem 'hpricot', ">=0.5"
23
- gem 'character-encodings', ">=0.2.0"
24
- gem 'htmltools', ">=1.10"
25
- gem 'htmlentities', ">=4.0.0"
26
- gem 'activesupport', ">=1.4.2"
27
- gem 'rchardet', ">=1.0"
28
-
29
- require 'rchardet'
30
- $chardet = true
31
-
32
- require 'hpricot'
33
- require 'encoding/character/utf-8'
34
- require 'html/sgml-parser'
35
- require 'htmlentities'
36
- require 'active_support'
37
- require 'open-uri'
38
- include OpenURI
39
-
40
- $debug = false
41
- $compatible = true
42
-
43
- Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
44
- # ascii codec
45
- '646' => 'ascii',
46
- 'ansi_x3.4_1968' => 'ascii',
47
- 'ansi_x3_4_1968' => 'ascii', # some email headers use this non-standard name
48
- 'ansi_x3.4_1986' => 'ascii',
49
- 'cp367' => 'ascii',
50
- 'csascii' => 'ascii',
51
- 'ibm367' => 'ascii',
52
- 'iso646_us' => 'ascii',
53
- 'iso_646.irv_1991' => 'ascii',
54
- 'iso_ir_6' => 'ascii',
55
- 'us' => 'ascii',
56
- 'us_ascii' => 'ascii',
57
-
58
- # big5 codec
59
- 'big5_tw' => 'big5',
60
- 'csbig5' => 'big5',
61
-
62
- # big5hkscs codec
63
- 'big5_hkscs' => 'big5hkscs',
64
- 'hkscs' => 'big5hkscs',
65
-
66
- # cp037 codec
67
- '037' => 'cp037',
68
- 'csibm037' => 'cp037',
69
- 'ebcdic_cp_ca' => 'cp037',
70
- 'ebcdic_cp_nl' => 'cp037',
71
- 'ebcdic_cp_us' => 'cp037',
72
- 'ebcdic_cp_wt' => 'cp037',
73
- 'ibm037' => 'cp037',
74
- 'ibm039' => 'cp037',
75
-
76
- # cp1026 codec
77
- '1026' => 'cp1026',
78
- 'csibm1026' => 'cp1026',
79
- 'ibm1026' => 'cp1026',
80
-
81
- # cp1140 codec
82
- '1140' => 'cp1140',
83
- 'ibm1140' => 'cp1140',
84
-
85
- # cp1250 codec
86
- '1250' => 'cp1250',
87
- 'windows_1250' => 'cp1250',
88
-
89
- # cp1251 codec
90
- '1251' => 'cp1251',
91
- 'windows_1251' => 'cp1251',
92
-
93
- # cp1252 codec
94
- '1252' => 'cp1252',
95
- 'windows_1252' => 'cp1252',
96
-
97
- # cp1253 codec
98
- '1253' => 'cp1253',
99
- 'windows_1253' => 'cp1253',
100
-
101
- # cp1254 codec
102
- '1254' => 'cp1254',
103
- 'windows_1254' => 'cp1254',
104
-
105
- # cp1255 codec
106
- '1255' => 'cp1255',
107
- 'windows_1255' => 'cp1255',
108
-
109
- # cp1256 codec
110
- '1256' => 'cp1256',
111
- 'windows_1256' => 'cp1256',
112
-
113
- # cp1257 codec
114
- '1257' => 'cp1257',
115
- 'windows_1257' => 'cp1257',
116
-
117
- # cp1258 codec
118
- '1258' => 'cp1258',
119
- 'windows_1258' => 'cp1258',
120
-
121
- # cp424 codec
122
- '424' => 'cp424',
123
- 'csibm424' => 'cp424',
124
- 'ebcdic_cp_he' => 'cp424',
125
- 'ibm424' => 'cp424',
126
-
127
- # cp437 codec
128
- '437' => 'cp437',
129
- 'cspc8codepage437' => 'cp437',
130
- 'ibm437' => 'cp437',
131
-
132
- # cp500 codec
133
- '500' => 'cp500',
134
- 'csibm500' => 'cp500',
135
- 'ebcdic_cp_be' => 'cp500',
136
- 'ebcdic_cp_ch' => 'cp500',
137
- 'ibm500' => 'cp500',
138
-
139
- # cp775 codec
140
- '775' => 'cp775',
141
- 'cspc775baltic' => 'cp775',
142
- 'ibm775' => 'cp775',
143
-
144
- # cp850 codec
145
- '850' => 'cp850',
146
- 'cspc850multilingual' => 'cp850',
147
- 'ibm850' => 'cp850',
148
-
149
- # cp852 codec
150
- '852' => 'cp852',
151
- 'cspcp852' => 'cp852',
152
- 'ibm852' => 'cp852',
153
-
154
- # cp855 codec
155
- '855' => 'cp855',
156
- 'csibm855' => 'cp855',
157
- 'ibm855' => 'cp855',
158
-
159
- # cp857 codec
160
- '857' => 'cp857',
161
- 'csibm857' => 'cp857',
162
- 'ibm857' => 'cp857',
163
-
164
- # cp860 codec
165
- '860' => 'cp860',
166
- 'csibm860' => 'cp860',
167
- 'ibm860' => 'cp860',
168
-
169
- # cp861 codec
170
- '861' => 'cp861',
171
- 'cp_is' => 'cp861',
172
- 'csibm861' => 'cp861',
173
- 'ibm861' => 'cp861',
174
-
175
- # cp862 codec
176
- '862' => 'cp862',
177
- 'cspc862latinhebrew' => 'cp862',
178
- 'ibm862' => 'cp862',
179
-
180
- # cp863 codec
181
- '863' => 'cp863',
182
- 'csibm863' => 'cp863',
183
- 'ibm863' => 'cp863',
184
-
185
- # cp864 codec
186
- '864' => 'cp864',
187
- 'csibm864' => 'cp864',
188
- 'ibm864' => 'cp864',
189
-
190
- # cp865 codec
191
- '865' => 'cp865',
192
- 'csibm865' => 'cp865',
193
- 'ibm865' => 'cp865',
194
-
195
- # cp866 codec
196
- '866' => 'cp866',
197
- 'csibm866' => 'cp866',
198
- 'ibm866' => 'cp866',
199
-
200
- # cp869 codec
201
- '869' => 'cp869',
202
- 'cp_gr' => 'cp869',
203
- 'csibm869' => 'cp869',
204
- 'ibm869' => 'cp869',
205
-
206
- # cp932 codec
207
- '932' => 'cp932',
208
- 'ms932' => 'cp932',
209
- 'mskanji' => 'cp932',
210
- 'ms_kanji' => 'cp932',
211
-
212
- # cp949 codec
213
- '949' => 'cp949',
214
- 'ms949' => 'cp949',
215
- 'uhc' => 'cp949',
216
-
217
- # cp950 codec
218
- '950' => 'cp950',
219
- 'ms950' => 'cp950',
220
-
221
- # euc_jp codec
222
- 'euc_jp' => 'euc-jp',
223
- 'eucjp' => 'euc-jp',
224
- 'ujis' => 'euc-jp',
225
- 'u_jis' => 'euc-jp',
226
-
227
- # euc_kr codec
228
- 'euc_kr' => 'euc-kr',
229
- 'euckr' => 'euc-kr',
230
- 'korean' => 'euc-kr',
231
- 'ksc5601' => 'euc-kr',
232
- 'ks_c_5601' => 'euc-kr',
233
- 'ks_c_5601_1987' => 'euc-kr',
234
- 'ksx1001' => 'euc-kr',
235
- 'ks_x_1001' => 'euc-kr',
236
-
237
- # gb18030 codec
238
- 'gb18030_2000' => 'gb18030',
239
-
240
- # gb2312 codec
241
- 'chinese' => 'gb2312',
242
- 'csiso58gb231280' => 'gb2312',
243
- 'euc_cn' => 'gb2312',
244
- 'euccn' => 'gb2312',
245
- 'eucgb2312_cn' => 'gb2312',
246
- 'gb2312_1980' => 'gb2312',
247
- 'gb2312_80' => 'gb2312',
248
- 'iso_ir_58' => 'gb2312',
249
-
250
- # gbk codec
251
- '936' => 'gbk',
252
- 'cp936' => 'gbk',
253
- 'ms936' => 'gbk',
254
-
255
- # hp-roman8 codec
256
- 'hp_roman8' => 'hp-roman8',
257
- 'roman8' => 'hp-roman8',
258
- 'r8' => 'hp-roman8',
259
- 'csHPRoman8' => 'hp-roman8',
260
-
261
- # iso2022_jp codec
262
- 'iso2022_jp' => 'iso-2022-jp',
263
- 'csiso2022jp' => 'iso-2022-jp',
264
- 'iso2022jp' => 'iso-2022-jp',
265
- 'iso_2022_jp' => 'iso-2022-jp',
266
-
267
- # iso2022_jp_1 codec
268
- 'iso2002_jp_1' => 'iso-2022-jp-1',
269
- 'iso2022jp_1' => 'iso-2022-jp-1',
270
- 'iso_2022_jp_1' => 'iso-2022-jp-1',
271
-
272
- # iso2022_jp_2 codec
273
- 'iso2022_jp_2' => 'iso-2002-jp-2',
274
- 'iso2022jp_2' => 'iso-2022-jp-2',
275
- 'iso_2022_jp_2' => 'iso-2022-jp-2',
276
-
277
- # iso2022_jp_3 codec
278
- 'iso2002_jp_3' => 'iso-2022-jp-3',
279
- 'iso2022jp_3' => 'iso-2022-jp-3',
280
- 'iso_2022_jp_3' => 'iso-2022-jp-3',
281
-
282
- # iso2022_kr codec
283
- 'iso2022_kr' => 'iso-2022-kr',
284
- 'csiso2022kr' => 'iso-2022-kr',
285
- 'iso2022kr' => 'iso-2022-kr',
286
- 'iso_2022_kr' => 'iso-2022-kr',
287
-
288
- # iso8859_10 codec
289
- 'iso8859_10' => 'iso-8859-10',
290
- 'csisolatin6' => 'iso-8859-10',
291
- 'iso_8859_10' => 'iso-8859-10',
292
- 'iso_8859_10_1992' => 'iso-8859-10',
293
- 'iso_ir_157' => 'iso-8859-10',
294
- 'l6' => 'iso-8859-10',
295
- 'latin6' => 'iso-8859-10',
296
-
297
- # iso8859_13 codec
298
- 'iso8859_13' => 'iso-8859-13',
299
- 'iso_8859_13' => 'iso-8859-13',
300
-
301
- # iso8859_14 codec
302
- 'iso8859_14' => 'iso-8859-14',
303
- 'iso_8859_14' => 'iso-8859-14',
304
- 'iso_8859_14_1998' => 'iso-8859-14',
305
- 'iso_celtic' => 'iso-8859-14',
306
- 'iso_ir_199' => 'iso-8859-14',
307
- 'l8' => 'iso-8859-14',
308
- 'latin8' => 'iso-8859-14',
309
-
310
- # iso8859_15 codec
311
- 'iso8859_15' => 'iso-8859-15',
312
- 'iso_8859_15' => 'iso-8859-15',
313
-
314
- # iso8859_1 codec
315
- 'latin_1' => 'iso-8859-1',
316
- 'cp819' => 'iso-8859-1',
317
- 'csisolatin1' => 'iso-8859-1',
318
- 'ibm819' => 'iso-8859-1',
319
- 'iso8859' => 'iso-8859-1',
320
- 'iso_8859_1' => 'iso-8859-1',
321
- 'iso_8859_1_1987' => 'iso-8859-1',
322
- 'iso_ir_100' => 'iso-8859-1',
323
- 'l1' => 'iso-8859-1',
324
- 'latin' => 'iso-8859-1',
325
- 'latin1' => 'iso-8859-1',
326
-
327
- # iso8859_2 codec
328
- 'iso8859_2' => 'iso-8859-2',
329
- 'csisolatin2' => 'iso-8859-2',
330
- 'iso_8859_2' => 'iso-8859-2',
331
- 'iso_8859_2_1987' => 'iso-8859-2',
332
- 'iso_ir_101' => 'iso-8859-2',
333
- 'l2' => 'iso-8859-2',
334
- 'latin2' => 'iso-8859-2',
335
-
336
- # iso8859_3 codec
337
- 'iso8859_3' => 'iso-8859-3',
338
- 'csisolatin3' => 'iso-8859-3',
339
- 'iso_8859_3' => 'iso-8859-3',
340
- 'iso_8859_3_1988' => 'iso-8859-3',
341
- 'iso_ir_109' => 'iso-8859-3',
342
- 'l3' => 'iso-8859-3',
343
- 'latin3' => 'iso-8859-3',
344
-
345
- # iso8859_4 codec
346
- 'iso8849_4' => 'iso-8859-4',
347
- 'csisolatin4' => 'iso-8859-4',
348
- 'iso_8859_4' => 'iso-8859-4',
349
- 'iso_8859_4_1988' => 'iso-8859-4',
350
- 'iso_ir_110' => 'iso-8859-4',
351
- 'l4' => 'iso-8859-4',
352
- 'latin4' => 'iso-8859-4',
353
-
354
- # iso8859_5 codec
355
- 'iso8859_5' => 'iso-8859-5',
356
- 'csisolatincyrillic' => 'iso-8859-5',
357
- 'cyrillic' => 'iso-8859-5',
358
- 'iso_8859_5' => 'iso-8859-5',
359
- 'iso_8859_5_1988' => 'iso-8859-5',
360
- 'iso_ir_144' => 'iso-8859-5',
361
-
362
- # iso8859_6 codec
363
- 'iso8859_6' => 'iso-8859-6',
364
- 'arabic' => 'iso-8859-6',
365
- 'asmo_708' => 'iso-8859-6',
366
- 'csisolatinarabic' => 'iso-8859-6',
367
- 'ecma_114' => 'iso-8859-6',
368
- 'iso_8859_6' => 'iso-8859-6',
369
- 'iso_8859_6_1987' => 'iso-8859-6',
370
- 'iso_ir_127' => 'iso-8859-6',
371
-
372
- # iso8859_7 codec
373
- 'iso8859_7' => 'iso-8859-7',
374
- 'csisolatingreek' => 'iso-8859-7',
375
- 'ecma_118' => 'iso-8859-7',
376
- 'elot_928' => 'iso-8859-7',
377
- 'greek' => 'iso-8859-7',
378
- 'greek8' => 'iso-8859-7',
379
- 'iso_8859_7' => 'iso-8859-7',
380
- 'iso_8859_7_1987' => 'iso-8859-7',
381
- 'iso_ir_126' => 'iso-8859-7',
382
-
383
- # iso8859_8 codec
384
- 'iso8859_9' => 'iso8859_8',
385
- 'csisolatinhebrew' => 'iso-8859-8',
386
- 'hebrew' => 'iso-8859-8',
387
- 'iso_8859_8' => 'iso-8859-8',
388
- 'iso_8859_8_1988' => 'iso-8859-8',
389
- 'iso_ir_138' => 'iso-8859-8',
390
-
391
- # iso8859_9 codec
392
- 'iso8859_9' => 'iso-8859-9',
393
- 'csisolatin5' => 'iso-8859-9',
394
- 'iso_8859_9' => 'iso-8859-9',
395
- 'iso_8859_9_1989' => 'iso-8859-9',
396
- 'iso_ir_148' => 'iso-8859-9',
397
- 'l5' => 'iso-8859-9',
398
- 'latin5' => 'iso-8859-9',
399
-
400
- # iso8859_11 codec
401
- 'iso8859_11' => 'iso-8859-11',
402
- 'thai' => 'iso-8859-11',
403
- 'iso_8859_11' => 'iso-8859-11',
404
- 'iso_8859_11_2001' => 'iso-8859-11',
405
-
406
- # iso8859_16 codec
407
- 'iso8859_16' => 'iso-8859-16',
408
- 'iso_8859_16' => 'iso-8859-16',
409
- 'iso_8859_16_2001' => 'iso-8859-16',
410
- 'iso_ir_226' => 'iso-8859-16',
411
- 'l10' => 'iso-8859-16',
412
- 'latin10' => 'iso-8859-16',
413
-
414
- # cskoi8r codec
415
- 'koi8_r' => 'cskoi8r',
416
-
417
- # mac_cyrillic codec
418
- 'mac_cyrillic' => 'maccyrillic',
419
-
420
- # shift_jis codec
421
- 'csshiftjis' => 'shift_jis',
422
- 'shiftjis' => 'shift_jis',
423
- 'sjis' => 'shift_jis',
424
- 's_jis' => 'shift_jis',
425
-
426
- # shift_jisx0213 codec
427
- 'shiftjisx0213' => 'shift_jisx0213',
428
- 'sjisx0213' => 'shift_jisx0213',
429
- 's_jisx0213' => 'shift_jisx0213',
430
-
431
- # utf_16 codec
432
- 'utf_16' => 'utf-16',
433
- 'u16' => 'utf-16',
434
- 'utf16' => 'utf-16',
435
-
436
- # utf_16_be codec
437
- 'utf_16_be' => 'utf-16be',
438
- 'unicodebigunmarked' => 'utf-16be',
439
- 'utf_16be' => 'utf-16be',
440
-
441
- # utf_16_le codec
442
- 'utf_16_le' => 'utf-16le',
443
- 'unicodelittleunmarked' => 'utf-16le',
444
- 'utf_16le' => 'utf-16le',
445
-
446
- # utf_7 codec
447
- 'utf_7' => 'utf-7',
448
- 'u7' => 'utf-7',
449
- 'utf7' => 'utf-7',
450
-
451
- # utf_8 codec
452
- 'utf_8' => 'utf-8',
453
- 'u8' => 'utf-8',
454
- 'utf' => 'utf-8',
455
- 'utf8' => 'utf-8',
456
- 'utf8_ucs2' => 'utf-8',
457
- 'utf8_ucs4' => 'utf-8',
458
- }
459
-
460
- def unicode(data, from_encoding)
461
- # Takes a single string and converts it from the encoding in
462
- # from_encoding to unicode.
463
- uconvert(data, from_encoding, 'unicode')
464
- end
465
-
466
- def uconvert(data, from_encoding, to_encoding = 'utf-8')
467
- from_encoding = Encoding_Aliases[from_encoding] || from_encoding
468
- to_encoding = Encoding_Aliases[to_encoding] || to_encoding
469
- Iconv.iconv(to_encoding, from_encoding, data)[0]
470
- end
471
-
472
- def unichr(i)
473
- [i].pack('U*')
474
- end
475
-
476
- def index_match(stri,regexp, offset)
477
- if offset == 241
478
- end
479
- i = stri.index(regexp, offset)
480
-
481
- return nil, nil unless i
482
-
483
- full = stri[i..-1].match(regexp)
484
- return i, full
485
- end
486
-
487
- def _ebcdic_to_ascii(s)
488
- return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
489
- end
490
-
491
- def urljoin(base, uri)
492
- urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
493
- uri = uri.sub(urifixer, '\1\3')
494
- begin
495
- return URI.join(base, uri).to_s
496
- rescue URI::BadURIError => e
497
- if URI.parse(base).relative?
498
- return URI::parse(uri).to_s
499
- end
500
- end
501
- end
502
-
503
- def py2rtime(pytuple)
504
- Time.utc(pytuple[0..5])
505
- end
506
-
507
- # http://intertwingly.net/stories/2005/09/28/xchar.rb
508
- module XChar
509
- # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
510
- CP1252 = {
511
- 128 => 8364, # euro sign
512
- 130 => 8218, # single low-9 quotation mark
513
- 131 => 402, # latin small letter f with hook
514
- 132 => 8222, # double low-9 quotation mark
515
- 133 => 8230, # horizontal ellipsis
516
- 134 => 8224, # dagger
517
- 135 => 8225, # double dagger
518
- 136 => 710, # modifier letter circumflex accent
519
- 137 => 8240, # per mille sign
520
- 138 => 352, # latin capital letter s with caron
521
- 139 => 8249, # single left-pointing angle quotation mark
522
- 140 => 338, # latin capital ligature oe
523
- 142 => 381, # latin capital letter z with caron
524
- 145 => 8216, # left single quotation mark
525
- 146 => 8217, # right single quotation mark
526
- 147 => 8220, # left double quotation mark
527
- 148 => 8221, # right double quotation mark
528
- 149 => 8226, # bullet
529
- 150 => 8211, # en dash
530
- 151 => 8212, # em dash
531
- 152 => 732, # small tilde
532
- 153 => 8482, # trade mark sign
533
- 154 => 353, # latin small letter s with caron
534
- 155 => 8250, # single right-pointing angle quotation mark
535
- 156 => 339, # latin small ligature oe
536
- 158 => 382, # latin small letter z with caron
537
- 159 => 376} # latin capital letter y with diaeresis
538
-
539
- # http://www.w3.org/TR/REC-xml/#dt-chardata
540
- PREDEFINED = {
541
- 38 => '&', # ampersand
542
- 60 => '<', # left angle bracket
543
- 62 => '>'} # right angle bracket
544
-
545
- # http://www.w3.org/TR/REC-xml/#charsets
546
- VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
547
- (0xE000..0xFFFD), (0x10000..0x10FFFF)]
548
- end
549
-
550
- class Fixnum
551
- # xml escaped version of chr
552
- def xchr
553
- n = XChar::CP1252[self] || self
554
- n = 42 unless XChar::VALID.find {|range| range.include? n}
555
- XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
556
- end
557
- end
558
-
559
- class String
560
- alias :old_index :index
561
- def to_xs
562
- unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
563
- rescue
564
- unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
565
- end
566
- end
567
-
568
- class BetterSGMLParserError < Exception; end;
569
- class BetterSGMLParser < HTML::SGMLParser
570
- # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
571
- # This makes things work.
572
- Interesting = /[&<]/u
573
- Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
574
- '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
575
- '![^<>]*)?', 64) # 64 is the unicode flag
576
-
577
- Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
578
- Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
579
-
580
- Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
581
- Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
582
- Endtagopen = /<\//u # Matching the Python SGMLParser
583
- Endbracket = /[<>]/u
584
- Declopen = /<!/u
585
- Piopenbegin = /^<\?/u
586
- Piclose = />/u
587
-
588
- Commentopen = /<!--/u
589
- Commentclose = /--\s*>/u
590
- Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
591
- Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
592
- '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
593
- 64)
594
- Endtagfind = /\s*\/\s*>/u
595
- def initialize(verbose=false)
596
- super(verbose)
597
- end
598
- def feed(*args)
599
- super(*args)
600
- end
601
-
602
- def goahead(_end)
603
- rawdata = @rawdata # woo, utf-8 magic
604
- i = 0
605
- n = rawdata.length
606
- while i < n
607
- if @nomoretags
608
- # handle_data_range does nothing more than set a "Range" that is never used. wtf?
609
- handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
610
- i = n
611
- break
612
- end
613
- j = rawdata.index(Interesting, i)
614
- j = n unless j
615
- handle_data(rawdata[i...j]) if i < j
616
- i = j
617
- break if (i == n)
618
- if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
619
- if rawdata.index(Starttagopen,i) == i
620
- if @literal
621
- handle_data(rawdata[i..i])
622
- i = i+1
623
- next
624
- end
625
- k = parse_starttag(i)
626
- break unless k
627
- i = k
628
- next
629
- end
630
- if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
631
- k = parse_endtag(i)
632
- break unless k
633
- i = k
634
- @literal = false
635
- next
636
- end
637
- if @literal
638
- if n > (i+1)
639
- handle_data("<")
640
- i = i+1
641
- else
642
- #incomplete
643
- break
644
- end
645
- next
646
- end
647
- if rawdata.index(Commentopen,i) == i
648
- k = parse_comment(i)
649
- break unless k
650
- i = k
651
- next
652
- end
653
- if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
654
- k = parse_pi(i)
655
- break unless k
656
- i += k
657
- next
658
- end
659
- if rawdata.index(Declopen,i) == i
660
- # This is some sort of declaration; in "HTML as
661
- # deployed," this should only be the document type
662
- # declaration ("<!DOCTYPE html...>").
663
- k = parse_declaration(i)
664
- break unless k
665
- i = k
666
- next
667
- end
668
- elsif rawdata[i..i] == '&'
669
- if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
670
- handle_data(rawdata[i..i])
671
- i += 1
672
- next
673
- end
674
-
675
- # the Char must come first as its #=~ method is the only one that is UTF-8 safe
676
- ni,match = index_match(rawdata, Charref, i)
677
- if ni and ni == i # See? Ugly
678
- handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
679
- i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
680
- i -= 1 unless rawdata[i-1..i-1] == ";"
681
- next
682
- end
683
- ni,match = index_match(rawdata, Entityref, i)
684
- if ni and ni == i
685
- handle_entityref(match[1])
686
- i += match[0].length
687
- i -= 1 unless rawdata[i-1..i-1] == ";"
688
- next
689
- end
690
- else
691
- error('neither < nor & ??')
692
- end
693
- # We get here only if incomplete matches but
694
- # nothing else
695
- ni,match = index_match(rawdata,Incomplete,i)
696
- unless ni and ni == 0
697
- handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
698
- i += 1
699
- next
700
- end
701
- j = ni + match[0].length
702
- break if j == n # Really incomplete
703
- handle_data(rawdata[i...j])
704
- i = j
705
- end # end while
706
-
707
- if _end and i < n
708
- handle_data(rawdata[i...n])
709
- i = n
710
- end
711
-
712
- @rawdata = rawdata[i..-1]
713
- # @offset += i # FIXME BUGME another unused variable in SGMLParser?
714
- end
715
-
716
-
717
- # Internal -- parse processing instr, return length or -1 if not terminated
718
- def parse_pi(i)
719
- rawdata = @rawdata
720
- if rawdata[i...i+2] != '<?'
721
- error("unexpected call to parse_pi()")
722
- end
723
- ni,match = index_match(rawdata,Piclose,i+2)
724
- return nil unless match
725
- j = ni
726
- handle_pi(rawdata[i+2...j])
727
- j = (j + match[0].length)
728
- return j-i
729
- end
730
-
731
- def parse_comment(i)
732
- rawdata = @rawdata
733
- if rawdata[i...i+4] != "<!--"
734
- error("unexpected call to parse_comment()")
735
- end
736
- ni,match = index_match(rawdata, Commentclose,i)
737
- return nil unless match
738
- handle_comment(rawdata[i+4..(ni-1)])
739
- return ni+match[0].length # Length from i to just past the closing comment tag
740
- end
741
-
742
-
743
- def parse_starttag(i)
744
- @_starttag_text = nil
745
- start_pos = i
746
- rawdata = @rawdata
747
- ni,match = index_match(rawdata,Shorttagopen,i)
748
- if ni == i
749
- # SGML shorthand: <tag/data/ == <tag>data</tag>
750
- # XXX Can data contain &... (entity or char refs)?
751
- # XXX Can data contain < or > (tag characters)?
752
- # XXX Can there be whitespace before the first /?
753
- k,match = index_match(rawdata,Shorttag,i)
754
- return nil unless match
755
- tag, data = match[1], match[2]
756
- @_starttag_text = "<#{tag}/"
757
- tag.downcase!
758
- second_end = rawdata.index(Shorttagopen,k)
759
- finish_shorttag(tag, data)
760
- @_starttag_text = rawdata[start_pos...second_end+1]
761
- return k
762
- end
763
-
764
- j = rawdata.index(Endbracket, i+1)
765
- return nil unless j
766
- attrsd = []
767
- if rawdata[i...i+2] == '<>'
768
- # SGML shorthand: <> == <last open tag seen>
769
- k = j
770
- tag = @lasttag
771
- else
772
- ni,match = index_match(rawdata,Tagfind,i+1)
773
- unless match
774
- error('unexpected call to parse_starttag')
775
- end
776
- k = ni+match[0].length+1
777
- tag = match[0].downcase
778
- @lasttag = tag
779
- end
780
-
781
- while k < j
782
- break if rawdata.index(Endtagfind, k) == k
783
- ni,match = index_match(rawdata,Attrfind,k)
784
- break unless ni
785
- matched_length = match[0].length
786
- attrname, rest, attrvalue = match[1],match[2],match[3]
787
- if rest.nil? or rest.empty?
788
- attrvalue = '' # was: = attrname # Why the change?
789
- elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
790
- attrvalue = attrvalue[1...-1]
791
- end
792
- attrsd << [attrname.downcase, attrvalue]
793
- k += matched_length
794
- end
795
- if rawdata[j..j] == ">"
796
- j += 1
797
- end
798
- @_starttag_text = rawdata[start_pos...j]
799
- finish_starttag(tag, attrsd)
800
- return j
801
- end
802
-
803
- def parse_endtag(i)
804
- rawdata = @rawdata
805
- j, match = index_match(rawdata, /[<>]/,i+1)
806
- return nil unless j
807
- tag = rawdata[i+2...j].strip.downcase
808
- if rawdata[j..j] == ">"
809
- j += 1
810
- end
811
- finish_endtag(tag)
812
- return j
813
- end
814
-
815
- def output
816
- # Return processed HTML as a single string
817
- return @pieces.map{|p| p.to_s}.join
818
- end
819
-
820
- def error(message)
821
- raise BetterSGMLParserError.new(message)
822
- end
823
- def handle_pi(text)
824
- end
825
- def handle_decl(text)
826
- end
827
- end
828
-
829
- # Add some helper methods to make AttributeList (all of those damn attrs
830
- # and attrsD used by StrictFeedParser) act more like a Hash.
831
- # NOTE AttributeList is still Read-Only (AFAICT).
832
- # Monkey patching is terrible, and I have an addiction.
833
- module XML
834
- module SAX
835
- module AttributeList # in xml/sax.rb
836
- def [](key)
837
- getValue(key)
838
- end
839
-
840
- def each(&blk)
841
- (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
842
- end
843
-
844
- def each_key(&blk)
845
- (0...getLength).each{|pos| yield getName(pos) }
846
- end
847
-
848
- def each_value(&blk)
849
- (0...getLength).each{|pos| yield getValue(pos) }
850
- end
851
-
852
- def to_a # Rather use collect? grep for to_a.collect
853
- l = []
854
- each{|k,v| l << [k,v]}
855
- return l
856
- end
857
-
858
- def to_s
859
- l = []
860
- each{|k,v| l << "#{k} => #{v}"}
861
- "{ "+l.join(", ")+" }"
862
- end
863
- end
864
- end
865
- end
866
- # This adds a nice scrub method to Hpricot, so we don't need a _HTMLSanitizer class
867
- # http://underpantsgnome.com/2007/01/20/hpricot-scrub
868
- # I have modified it to check for attributes that are only allowed if they are in a certain tag
869
- module Hpricot
870
- Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
871
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
872
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
873
- 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
874
- 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
875
- 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
876
- 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
877
- 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
878
- 'ul', 'var'
879
- ]
880
-
881
- Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
882
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
883
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
884
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
885
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
886
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
887
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
888
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
889
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
890
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
891
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
892
- ]
893
-
894
- Unacceptable_Elements_With_End_Tag = ['script', 'applet']
895
-
896
- Acceptable_Css_Properties = ['azimuth', 'background-color',
897
- 'border-bottom-color', 'border-collapse', 'border-color',
898
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
899
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
900
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
901
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
902
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
903
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
904
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
905
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
906
- 'white-space', 'width'
907
- ]
908
-
909
- # survey of common keywords found in feeds
910
- Acceptable_Css_Keywords = ['auto', 'aqua', 'black', 'block', 'blue',
911
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
912
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
913
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
914
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
915
- 'transparent', 'underline', 'white', 'yellow'
916
- ]
917
-
918
- Mathml_Elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
919
- 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
920
- 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
921
- 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
922
- 'munderover', 'none'
923
- ]
924
-
925
- Mathml_Attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
926
- 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
927
- 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
928
- 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
929
- 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
930
- 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
931
- 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
932
- 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
933
- 'xlink:type', 'xmlns', 'xmlns:xlink'
934
- ]
935
-
936
- # svgtiny - foreignObject + linearGradient + radialGradient + stop
937
- Svg_Elements = ['a', 'animate', 'animateColor', 'animateMotion',
938
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
939
- 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
940
- 'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
941
- 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
942
- 'switch', 'text', 'title', 'use'
943
- ]
944
-
945
- # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
946
- Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
947
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
948
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
949
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
950
- 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
951
- 'font-size', 'font-stretch', 'font-style', 'font-variant',
952
- 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
953
- 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
954
- 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
955
- 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
956
- 'origin', 'overline-position', 'overline-thickness', 'panose-1',
957
- 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
958
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
959
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
960
- 'stop-color', 'stop-opacity', 'strikethrough-position',
961
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
962
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
963
- 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
964
- 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
965
- 'underline-position', 'underline-thickness', 'unicode',
966
- 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
967
- 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
968
- 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
969
- 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
970
- 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
971
- ]
972
-
973
- Svg_Attr_Map = nil
974
- Svg_Elem_Map = nil
975
-
976
- Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
977
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
978
- 'stroke-opacity'
979
- ]
980
-
981
- unless $compatible
982
- @@acceptable_tag_specific_attributes = {}
983
- @@mathml_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@mathml_attributes }
984
- @@svg_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@svg_attributes }
985
- end
986
-
987
- class Elements
988
- def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
989
- each { |x| x.strip(allowed_tags) }
990
- end
991
-
992
- def strip_attributes(safe=[])
993
- each { |x| x.strip_attributes(safe) }
994
- end
995
-
996
- def strip_style(ok_props=[], ok_keywords=[])
997
- each { |x| x.strip_style(ok_props, ok_keywords) }
998
- end
999
- end
1000
-
1001
- class Text
1002
- def strip(foo)
1003
- end
1004
- def strip_attributes(foo)
1005
- end
1006
- end
1007
- class Comment
1008
- def strip(foo)
1009
- end
1010
- def strip_attributes(foo)
1011
- end
1012
- end
1013
- class BogusETag
1014
- def strip(foo)
1015
- end
1016
- def strip_attributes(foo)
1017
- end
1018
- end
1019
-
1020
# Element-level sanitizing helpers.
class Elem
  # Asks every child node to decode its entities.
  def decode_entities
    children.each { |child| child.decode_entities }
  end

  # Replaces this element with the string form of its children (a no-op when
  # the element has no children).
  def cull
    swap(children.to_s) if children
  end

  # Culls the element when strip_removes? says so.
  #
  # +allowed_tags+ is accepted (and ignored) so that this method can be called
  # the way Elements#strip calls it, i.e. with one argument; previously that
  # call raised ArgumentError.
  def strip(allowed_tags=[])
    cull if strip_removes?
  end

  # Removes every attribute whose name is not listed in Acceptable_Attributes.
  #
  # +safe+ is accepted (and ignored) for the same reason as in #strip: the
  # whitelist actually applied is the Acceptable_Attributes constant.
  def strip_attributes(safe=[])
    return if attributes.nil?
    # Collect the names first so nothing is removed while iterating.
    doomed = []
    attributes.each do |atr|
      doomed << atr[0] unless Acceptable_Attributes.include?(atr[0])
    end
    doomed.each { |name| remove_attribute(name) }
  end

  # True when the element's 'type' attribute mentions script or css.
  def strip_removes?
    # I'm sure there are others that should be ripped instead of stripped
    attributes && attributes['type'] =~ /script|css/
  end
end
1052
- end
1053
-
1054
- module FeedParser
1055
- Version = "0.1aleph_naught"
1056
-
1057
- License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
1058
-
1059
- Redistribution and use in source and binary forms, with or without modification,
1060
- are permitted provided that the following conditions are met:
1061
-
1062
- * Redistributions of source code must retain the above copyright notice,
1063
- this list of conditions and the following disclaimer.
1064
- * Redistributions in binary form must reproduce the above copyright notice,
1065
- this list of conditions and the following disclaimer in the documentation
1066
- and/or other materials provided with the distribution.
1067
-
1068
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
1069
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1070
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1071
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
1072
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
1073
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
1074
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
1075
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
1076
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
1077
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1078
- POSSIBILITY OF SUCH DAMAGE."""
1079
-
1080
# Credits, each written as "Name <url>".
Author = "Jeff Hodges <http://somethingsimilar.com>"
Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
Contributors = [
  ["Jason Diamond", "http://injektilo.org/"],
  ["John Beimler", "http://john.beimler.org/"],
  ["Fazal Majid", "http://www.majid.info/mylos/weblog/"],
  ["Aaron Swartz", "http://aaronsw.com/"],
  ["Kevin Marks", "http://epeus.blogspot.com/"]
].map { |name, url| "#{name} <#{url}>" }
1088
# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
#
# The version comes from the Version constant. The previous code formatted
# with @version, which is nil at this point, so the header was sent as
# "UniversalFeedParser/ +http://feedparser.org/" with no version in it.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % Version
1092
-
1093
# HTTP "Accept" header to send to servers when downloading feeds. If you
# don't want to send an Accept header, set this to nil.
ACCEPT_HEADER = [
  "application/atom+xml",
  "application/rdf+xml",
  "application/rss+xml",
  "application/x-netcdf",
  "application/xml;q=0.9",
  "text/xml;q=0.2",
  "*/*;q=0.1"
].join(",")

# Carried over from the Python original: whether to run HTML markup through
# HTML Tidy (mxTidy <http://www.egenix.com/files/python/mxTidy.html> or
# utidylib <http://utidylib.berlios.de/>).
TIDY_MARKUP = false # FIXME untranslated

# Carried over from the Python original: Tidy interfaces in order of
# preference. Only meaningful if TIDY_MARKUP is true.
PREFERRED_TIDY_INTERFACES = %w[uTidy mxTidy] # FIXME untranslated
1106
-
1107
- # The original Python import. I'm using it to help translate
1108
- #import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
1109
-
1110
-
1111
-
1112
- # ---------- don't touch these ----------
1113
# Common parent of the three exception classes that follow it.
class ThingsNobodyCaresAboutButMe < Exception; end

class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe; end

class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe; end

class NonXMLContentType < ThingsNobodyCaresAboutButMe; end

# Stands apart from the family above: it inherits from Exception directly.
class UndeclaredNamespace < Exception; end
1123
-
1124
-
1125
# Maps the short version codes stored in @version to display names.
SUPPORTED_VERSIONS = Hash[[
  ['',        'unknown'],
  ['rss090',  'RSS 0.90'],
  ['rss091n', 'RSS 0.91 (Netscape)'],
  ['rss091u', 'RSS 0.91 (Userland)'],
  ['rss092',  'RSS 0.92'],
  ['rss093',  'RSS 0.93'],
  ['rss094',  'RSS 0.94'],
  ['rss20',   'RSS 2.0'],
  ['rss10',   'RSS 1.0'],
  ['rss',     'RSS (unknown version)'],
  ['atom01',  'Atom 0.1'],
  ['atom02',  'Atom 0.2'],
  ['atom03',  'Atom 0.3'],
  ['atom10',  'Atom 1.0'],
  ['atom',    'Atom (unknown version)'],
  ['cdf',     'CDF'],
  ['hotrss',  'Hot RSS']
]]
1143
# A Hash whose keys can be reached under several names.
#
# The same piece of information (say, "when was this feed last updated?") is
# called different things in different feed formats. @@keymap lists, for each
# legacy/alternative name, the key (or candidate keys) actually used for
# storage. #[] and #[]= consult it, so both the name a format-aware caller
# expects and the common name work. Keys can also be read and written as
# methods (dict.title, dict.title = 'x') through method_missing.
class FeedParserDict < Hash
  @@keymap = {'channel' => 'feed',
              'items' => 'entries',
              'guid' => 'id',
              'date' => 'updated',
              'date_parsed' => 'updated_parsed',
              'description' => ['subtitle', 'summary'],
              'url' => ['href'],
              'modified' => 'updated',
              'modified_parsed' => 'updated_parsed',
              'issued' => 'published',
              'issued_parsed' => 'published_parsed',
              'copyright' => 'rights',
              'copyright_detail' => 'rights_detail',
              'tagline' => 'subtitle',
              'tagline_detail' => 'subtitle_detail'}

  # Hash already has an #entries method (from Enumerable), so it has to be
  # overridden explicitly to return the 'entries' key.
  def entries
    return self['entries']
  end

  # Builds a dict from a list of [key, value] pairs (stored through #[]=, so
  # the key mapping applies) or from any Hash, including another
  # FeedParserDict (merged as-is). Anything else yields an empty dict.
  def initialize(pairs=nil)
    if pairs.is_a?(Array) and pairs[0].is_a?(Array) and pairs[0].length == 2
      pairs.each do |k, v|
        self[k] = v
      end
    elsif pairs.is_a?(Hash)
      self.merge!(pairs)
    end
  end

  # Reads +key+, following @@keymap.
  #
  # 'category' is the term of the first tag and 'categories' is the list of
  # [scheme, term] pairs of all tags; both are nil when there are no tags
  # (previously this raised NoMethodError on nil).
  def [](key)
    if key == 'category'
      first_tag = (self['tags'] || [])[0]
      return first_tag && first_tag['term']
    end
    if key == 'categories'
      tags = self['tags']
      return tags && tags.collect{|tag| [tag['scheme'],tag['term']]}
    end
    realkey = @@keymap[key] || key
    if realkey.class == Array
      # The block variable must not be called `key`: on Ruby 1.8 that would
      # overwrite the method argument used below.
      realkey.each{ |candidate| return self[candidate] if has_key?(candidate) }
    end
    # Note that the original key is preferred over the realkey we (might
    # have) found in @@keymap
    if has_key?(key)
      return super(key)
    end
    return super(realkey)
  end

  # Writes +value+ under the storage key for +key+ (the first candidate when
  # @@keymap lists several).
  def []=(key,value)
    if @@keymap.key?(key)
      key = @@keymap[key]
      if key.class == Array
        key = key[0]
      end
    end
    super(key,value)
  end

  # Attribute-style access: `dict.foo = v` stores under 'foo', `dict.foo`
  # reads 'foo'. Names ending in '!' or '?' or starting with '_' raise
  # NoMethodError.
  #
  # Characters are extracted with [i, 1] slices so the comparisons work on
  # every Ruby version; a bare String#[] index returns an Integer on 1.8, so
  # the setter branch was never taken and assignments were silently dropped.
  #
  # respond_to_missing? is intentionally not defined: it would make Marshal
  # believe the dict implements marshal_dump and break the
  # Marshal.load(Marshal.dump(dict)) deep copies.
  def method_missing(msym, *args)
    methodname = msym.to_s
    last_char = methodname[-1, 1]
    if last_char == '='
      return self[methodname[0..-2]] = args[0]
    elsif last_char != '!' and last_char != '?' and methodname[0, 1] != "_" # FIXME implement with private
      return self[methodname]
    else
      raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
    end
  end
end
1231
-
1232
-
1233
-
1234
-
1235
- module FeedParserMixin
1236
- attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
1237
-
1238
# Resets all parser state.
#
# baseuri  - initial base URI (stored as '' when nil)
# baselang - initial language; when given it is also recorded, with '_'
#            replaced by '-', as the feed-level 'language'
# encoding - character encoding name kept in @encoding
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
  $stderr << "initializing FeedParser\n" if $debug

  # namespace URI => the prefix used internally for it
  @namespaces = {
    '' => '',
    'http://backend.userland.com/rss' => '',
    'http://blogs.law.harvard.edu/tech/rss' => '',
    'http://purl.org/rss/1.0/' => '',
    'http://my.netscape.com/rdf/simple/0.9/' => '',
    'http://example.com/newformat#' => '',
    'http://example.com/necho' => '',
    'http://purl.org/echo/' => '',
    'uri/of/echo/namespace#' => '',
    'http://purl.org/pie/' => '',
    'http://purl.org/atom/ns#' => '',
    'http://www.w3.org/2005/Atom' => '',
    'http://purl.org/rss/1.0/modules/rss091#' => '',
    'http://webns.net/mvcb/' => 'admin',
    'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
    'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
    'http://media.tangent.org/rss/1.0/' => 'audio',
    'http://backend.userland.com/blogChannelModule' => 'blogChannel',
    'http://web.resource.org/cc/' => 'cc',
    'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    'http://purl.org/rss/1.0/modules/company' => 'co',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://purl.org/rss/1.0/modules/email/' => 'email',
    'http://purl.org/rss/1.0/modules/event/' => 'ev',
    'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
    'http://freshmeat.net/rss/fm/' => 'fm',
    'http://xmlns.com/foaf/0.1/' => 'foaf',
    'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
    'http://postneo.com/icbm/' => 'icbm',
    'http://purl.org/rss/1.0/modules/image/' => 'image',
    'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://purl.org/rss/1.0/modules/link/' => 'l',
    'http://search.yahoo.com/mrss' => 'media',
    'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
    'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
    'http://purl.org/rss/1.0/modules/reference/' => 'ref',
    'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
    'http://purl.org/rss/1.0/modules/search/' => 'search',
    'http://purl.org/rss/1.0/modules/slash/' => 'slash',
    'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
    'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
    'http://hacks.benhammersley.com/rss/streaming/' => 'str',
    'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
    'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
    'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
    'http://purl.org/rss/1.0/modules/threading/' => 'thr',
    'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
    'http://madskills.com/public/xml/rss/module/trackback/' => 'trackback',
    'http://wellformedweb.org/commentAPI/' => 'wfw',
    'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
    'http://www.w3.org/1999/xhtml' => 'xhtml',
    'http://www.w3.org/XML/1998/namespace' => 'xml',
    'http://www.w3.org/1999/xlink' => 'xlink',
    'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
  }
  # The same table keyed by lower-cased URI, for case-insensitive matching.
  @matchnamespaces = {}
  @namespaces.each_pair do |uri, prefix|
    @matchnamespaces[uri.downcase] = prefix
  end

  # Element-name lists consulted when an element is popped.
  @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
  @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @html_types = ['text/html', 'application/xhtml+xml']

  # Results.
  @feeddata = FeedParserDict.new # feed-level data
  @encoding = encoding           # character encoding
  @entries = []                  # list of entry-level data
  @version = ''                  # feed type/version, see SUPPORTED_VERSIONS
  @namespacesInUse = {}          # hash of namespaces defined by the feed

  # The following are used internally to track state;
  # this is really out of control and should be refactored.
  @infeed = false
  @inentry = false
  @incontent = 0 # a counter, not a flag: pushContent/popContent add and subtract
  @intextinput = false
  @inimage = false
  @inauthor = false
  @incontributor = false
  @inpublisher = false
  @insource = false
  @sourcedata = FeedParserDict.new
  @contentparams = FeedParserDict.new
  @summaryKey = nil
  @namespacemap = {}
  @elementstack = []
  @basestack = []
  @langstack = []
  @baseuri = baseuri || ''
  @lang = baselang || nil
  @feeddata['language'] = baselang.gsub('_','-') if baselang

  # Date parsers, listed in this fixed order.
  @date_handlers = [
    :_parse_date_rfc822,
    :_parse_date_hungarian,
    :_parse_date_greek,
    :_parse_date_mssql,
    :_parse_date_nate,
    :_parse_date_onblog,
    :_parse_date_w3dtf,
    :_parse_date_iso8601
  ]
  $stderr << "Leaving startup\n" if $debug
end
1345
-
1346
# Handles an opening tag.
#
# tag    - element name, possibly with a namespace prefix ("dc:creator")
# attrsd - attributes as a Hash, or as a list of [name, value] pairs (that is
#          how LooseFeedParser's SGMLParser delivers them)
#
# In order, it: normalizes the attributes, updates the base-URI and language
# stacks, records namespace declarations, passes the tag through verbatim when
# inside inline XHTML content, and otherwise dispatches to a
# _start_<prefix>_<name> handler, falling back to push().
def unknown_starttag(tag, attrsd)
  $stderr << "start #{tag} with #{attrsd}\n" if $debug
  attrsd = Hash[*attrsd.flatten] if attrsd.class == Array

  # Normalize: lower-case every attribute name, and lower-case the value of
  # 'rel' and 'type'. (The old test looked for 'rel'/'type' among the
  # *values*, so it never fired.) The caller's strings are left untouched.
  attrsD = {}
  attrsd.each do |old_k, value|
    k = old_k.downcase
    if ['rel', 'type'].include?(k) and value.respond_to?(:downcase)
      value = value.downcase
    end
    attrsD[k] = value
  end

  # track xml:base and xml:lang
  baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
  @baseuri = urljoin(@baseuri, baseuri)
  lang = attrsD['xml:lang'] || attrsD['lang']
  if lang == ''
    # xml:lang could be explicitly set to '', we need to capture that
    lang = nil
  elsif lang.nil?
    # if no xml:lang is specified, use parent lang
    lang = @lang
  end
  if lang and not lang.empty?
    if ['feed', 'rss', 'rdf:RDF'].include?(tag)
      @feeddata['language'] = lang.gsub('_','-')
    end
  end
  @lang = lang
  @basestack << @baseuri
  @langstack << lang

  # track namespaces
  attrsd.each do |attr_name, uri|
    if /^xmlns:/ =~ attr_name # name begins with xmlns:
      trackNamespace(attr_name[6..-1], uri)
    elsif attr_name == 'xmlns'
      trackNamespace(nil, uri)
    end
  end

  # track inline content
  if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
    # Note: probably shouldn't simply recreate localname here, but
    # our namespace handling isn't actually 100% correct in cases where
    # the feed redefines the default namespace (which is actually
    # the usual case for inline content, thanks Sam), so here we
    # cheat and just reconstruct the element based on localname
    # because that compensates for the bugs in our namespace handling.
    # This will horribly munge inline content with non-empty qnames,
    # but nobody actually does that, so I'm not fixing it.
    tag = tag.split(':')[-1]
    # One leading space per attribute, so an attribute-less tag comes out as
    # "<b>" rather than "<b >".
    attrsS = attrsd.to_a.collect{|l| " #{l[0]}=\"#{l[1]}\""}.join('')
    return handle_data("<#{tag}#{attrsS}>", false)
  end

  # match namespaces
  if /:/ =~ tag
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix and not prefix.empty?
    prefix = prefix + '_'
  end

  # special hack for better tracking of empty textinput/image elements in
  # illformed feeds: an un-prefixed tag that cannot belong to a textinput
  # (or image) ends that state. The textinput test used to read
  # `not prefix and not prefix.empty?`, which can never be true.
  unprefixed = (prefix.nil? or prefix.empty?)
  if unprefixed and not (['title', 'link', 'description','name'].include?(tag))
    @intextinput = false
  end
  if unprefixed and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?(tag))
    @inimage = false
  end

  # call special handler (if defined) or default handler
  # NOTE(review): the rescue also catches a NoMethodError raised *inside* an
  # existing handler and then falls back to push(); kept as-is on purpose.
  begin
    return send('_start_'+prefix+suffix, attrsD)
  rescue NoMethodError
    return push(prefix + suffix, true)
  end
end # End unknown_starttag
1439
-
1440
# Handles a closing tag: dispatches to a _end_<prefix>_<name> handler (falling
# back to pop()), echoes the tag when inside inline XHTML content, and pops
# the base-URI and language stacks.
def unknown_endtag(tag)
  $stderr << "end #{tag}\n" if $debug
  # match namespaces
  if tag.index(':')
    prefix, suffix = tag.split(':',2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix and not prefix.empty?
    prefix = prefix + '_'
  end

  # call special handler (if defined) or default handler
  begin
    send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
  rescue NoMethodError
    pop(prefix + suffix)
  end

  # track inline content
  # The type is switched to XHTML only when the declared type does NOT
  # already end in "xml" -- the same rule unknown_starttag applies. The `not`
  # was missing here, which inverted the rule.
  if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but it isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
    tag = tag.split(':')[-1]
    handle_data("</#{tag}>", false)
  end

  # track xml:base and xml:lang going out of scope
  if @basestack and not @basestack.empty?
    @basestack.pop
    # an empty or missing parent base leaves @baseuri as it is
    if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
      @baseuri = @basestack[-1]
    end
  end
  if @langstack and not @langstack.empty?
    @langstack.pop
    # unlike the base URI, a nil parent language IS restored
    if @langstack and not @langstack.empty?
      @lang = @langstack[-1]
    end
  end
end
1484
-
1485
# LooseParser only. Called for each character reference; for '&#160;' +ref+
# is '160'. The references for ", &, ', < and > are kept as references; any
# other is converted to the character it names. The result is appended to the
# element currently being collected; nothing happens when no element is open.
def handle_charref(ref)
  $stderr << "entering handle_charref with #{ref}\n" if $debug
  return if @elementstack.nil? or @elementstack.empty?
  # Work on a lower-cased copy: downcase! would alter the caller's string and
  # raises on a frozen one.
  ref = ref.downcase
  chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
  if chars.include?(ref)
    text = "&##{ref};"
  else
    if ref[0..0] == 'x'
      c = (ref[1..-1]).to_i(16)
    else
      c = ref.to_i
    end
    text = uconvert(unichr(c),'unicode')
  end
  @elementstack[-1][2] << text
end
1504
-
1505
# LooseParser only. Called for each entity reference; for '&copy;' +ref+ is
# 'copy'. lt, gt, quot, amp and apos stay as references, everything else goes
# through HTMLEntities. The text is appended to the element being collected.
def handle_entityref(ref)
  return if @elementstack.nil? or @elementstack.empty?
  $stderr << "entering handle_entityref with #{ref}\n" if $debug
  reference = "&#{ref};"
  unless ['lt', 'gt', 'quot', 'amp', 'apos'].include?(ref)
    reference = HTMLEntities::decode_entities(reference)
  end
  @elementstack[-1][2] << reference
end
1519
-
1520
# Called for each block of plain text, i.e. outside of any tag and not
# containing any character or entity references. Appends the text to the
# element being collected, passing it through to_xs first when +escape+ is
# set and the current content type is XHTML.
def handle_data(text, escape=true)
  return if @elementstack.nil? or @elementstack.empty?
  needs_escaping = (escape and @contentparams['type'] == 'application/xhtml+xml')
  text = text.to_xs if needs_escaping
  @elementstack[-1][2] << text
end
1529
-
1530
# Called for each comment, e.g. <!-- insert message here -->. Ignored.
def handle_comment(comment); end

# Takes one argument and does nothing.
def handle_pi(text); end

# Takes one argument and does nothing.
def handle_decl(text); end
1539
-
1540
# For LooseFeedParser. Handles the "<!...>" construct starting at index +i+
# of @rawdata and returns the index at which scanning should continue.
#
# A CDATA section has its content escaped with to_xs and passed to
# handle_data; any other declaration is skipped. The returned index never
# exceeds @rawdata.length and never moves backwards:
# * an unterminated CDATA section consumes the rest of the buffer (it used
#   to return a position 3 past the end);
# * a declaration with no closing '>' also consumes the rest of the buffer
#   (it used to return 1, i.e. nil.to_i + 1, which rewound the position).
# NOTE(review): assumes the caller treats "index == buffer length" as end of
# input -- confirm against the SGML parser's main loop.
def parse_declaration(i)
  $stderr << "entering parse_declaration\n" if $debug
  if @rawdata[i...i+9] == '<![CDATA['
    k = @rawdata.index(/\]\]>/u,i+9)
    k = @rawdata.length unless k
    handle_data(@rawdata[i+9...k].to_xs,false)
    return [k+3, @rawdata.length].min
  else
    k = @rawdata.index(/>/,i)
    return k ? k+1 : @rawdata.length
  end
end
1553
-
1554
# Returns the lower-cased content type, expanding the shorthands 'text',
# 'html' and 'xhtml' to their full MIME types.
#
# The argument is not modified (the old downcase! changed the caller's string
# and raised on frozen input).
def mapContentType(contentType)
  contentType = contentType.downcase
  case contentType
  when 'text'
    contentType = 'text/plain'
  when 'html'
    contentType = 'text/html'
  when 'xhtml'
    contentType = 'application/xhtml+xml'
  end
  return contentType
end
1566
-
1567
# Records a namespace declaration (+prefix+ is nil for the default
# namespace). While no version is set yet, three URIs set the feed version;
# any backend.userland.com/rss URI is folded into one canonical URI. A URI
# found in @matchnamespaces maps the declared prefix to the internal one.
def trackNamespace(prefix, uri)
  loweruri = uri.downcase.strip
  version_unset = (@version.nil? or @version.empty?)

  if version_unset and prefix.nil? and loweruri == 'http://my.netscape.com/rdf/simple/0.9/'
    @version = 'rss090'
  elsif version_unset and loweruri == 'http://purl.org/rss/1.0/'
    @version = 'rss10'
  elsif version_unset and loweruri == 'http://www.w3.org/2005/atom'
    @version = 'atom10'
  elsif /backend\.userland\.com\/rss/ =~ loweruri
    # match any backend.userland.com namespace
    uri = 'http://backend.userland.com/rss'
    loweruri = uri
  end

  unless @matchnamespaces.has_key?(loweruri)
    @namespacesInUse[prefix || ''] = uri
    return uri
  end
  internal = @matchnamespaces[loweruri]
  @namespacemap[prefix] = internal
  @namespacesInUse[internal] = uri
end
1588
-
1589
# Joins +uri+ onto the current base URI ('' when none is set).
def resolveURI(uri)
  base = @baseuri || ''
  urljoin(base, uri)
end
1592
-
1593
# Returns +data+ unchanged; +element+ is not used.
def decodeEntities(element, data)
  data
end
1596
-
1597
# Opens a frame on the element stack: [name, expecting-text flag, pieces].
def push(element, expectingText)
  frame = [element, expectingText, []]
  @elementstack << frame
end
1600
-
1601
# Closes the frame opened by push() for +element+ and returns its text.
#
# Returns nil without touching the stack when no frame is open or when the
# top frame belongs to a different element. For frames that do not expect
# text, the joined (and optionally stripped) text is returned immediately.
# Otherwise the text is base64-decoded, URI-resolved, sanitized and converted
# as applicable, stored in the current entry/feed/source context, and
# returned.
def pop(element, stripWhitespace=true)
  return if @elementstack.nil? or @elementstack.empty?
  return if @elementstack[-1][0] != element
  element, expectingText, pieces = @elementstack.pop
  # pieces is normally an Array, but _cdf_common replaces it with a String
  if pieces.class == Array
    output = pieces.join('')
  else
    output = pieces
  end
  if stripWhitespace
    output.strip!
  end
  return output if not expectingText

  # decode base64 content
  if @contentparams['base64']
    out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
    # keep the raw text when decoding produced nothing
    if not output.empty? and not out64.empty?
      output = out64
    end
  end

  # resolve relative URIs
  if @can_be_relative_uri.include?element and output and not output.empty?
    output = resolveURI(output)
  end

  # decode entities within embedded markup
  if not @contentparams['base64']
    output = decodeEntities(element, output)
  end

  # remove temporary cruft from contentparams
  @contentparams.delete('mode')
  @contentparams.delete('base64')

  # resolve relative URIs within embedded markup
  if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
    if @can_contain_relative_uris.include?element
      output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
    end
  end
  # sanitize embedded markup
  if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
    if @can_contain_dangerous_markup.include?element
      output = FeedParser.sanitizeHTML(output, @encoding)
    end
  end

  if @encoding and not @encoding.empty? and @encoding != 'utf-8'
    output = uconvert(output, @encoding, 'utf-8')
    # FIXME I turn everything into utf-8, not unicode, originally because REXML was being used but now because I haven't tested it out yet.
  end

  # categories/tags/keywords/whatever are handled in _end_category
  return output if element == 'category'

  # store output in appropriate place(s)
  if @inentry and not @insource
    if element == 'content'
      # an entry may carry several content blocks; each gets its own copy of
      # the content parameters plus the value
      @entries[-1][element] ||= []
      contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
      contentparams['value'] = output
      @entries[-1][element] << contentparams
    elsif element == 'link'
      @entries[-1][element] = output
      if output and not output.empty?
        # NOTE(review): assumes a 'links' list with at least one item already
        # exists for the entry -- confirm against _start_link
        @entries[-1]['links'][-1]['href'] = output
      end
    else
      # inside an entry, 'description' is stored as 'summary'
      element = 'summary' if element == 'description'
      @entries[-1][element] = output
      if @incontent != 0
        contentparams = Marshal.load(Marshal.dump(@contentparams))
        contentparams['value'] = output
        @entries[-1][element + '_detail'] = contentparams
      end
    end
  elsif (@infeed or @insource) and not @intextinput and not @inimage
    context = getContext()
    # at feed/source level, 'description' is stored as 'subtitle'
    element = 'subtitle' if element == 'description'
    context[element] = output
    if element == 'link'
      context['links'][-1]['href'] = output
    elsif @incontent != 0
      contentparams = Marshal.load(Marshal.dump(@contentparams))
      contentparams['value'] = output
      context[element + '_detail'] = contentparams
    end
  end
  return output
end
1693
-
1694
# Enters a content-bearing element: bumps the content depth, rebuilds
# @contentparams (type, language, base, base64 flag) and opens a frame.
def pushContent(tag, attrsD, defaultContentType, expectingText)
  @incontent += 1 # Yes, I hate this.
  @contentparams = FeedParserDict.new({
    'type' => mapContentType(attrsD['type'] || defaultContentType),
    'language' => @lang,
    'base' => @baseuri
  })
  @contentparams['base64'] = isBase64(attrsD, @contentparams)
  push(tag, expectingText)
end
1701
-
1702
# Leaves a content-bearing element: pops its frame, lowers the content depth,
# empties @contentparams and returns the popped text.
def popContent(tag)
  popped = pop(tag)
  @incontent -= 1
  @contentparams.clear
  popped
end
1708
-
1709
# Rewrites the namespace prefix of a qualified name ("prefix:local") to the
# prefix @namespacemap assigns to it; names without a colon, and prefixes
# unknown to the map, come back unchanged.
def mapToStandardPrefix(name)
  colonpos = name.index(':')
  if colonpos
    # name[0, colonpos] rather than name[0..colonpos-1]: with the colon at
    # index 0 the range form becomes 0..-1 and takes the whole string.
    prefix = name[0, colonpos]
    suffix = name[colonpos+1..-1]
    prefix = @namespacemap[prefix] || prefix
    name = prefix + ':' + suffix
  end
  return name
end
1719
-
1720
# Looks +name+ up in +attrsD+ after mapping its prefix with
# mapToStandardPrefix.
def getAttribute(attrsD, name)
  standard_name = mapToStandardPrefix(name)
  attrsD[standard_name]
end
1723
-
1724
# True when mode="base64" is declared, or when the content type is neither
# text/* nor an XML type (ending in +xml or /xml).
def isBase64(attrsD, contentparams)
  return true if attrsD['mode'] == 'base64'
  textual = /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
  textual ? false : true
end
1731
-
1732
# Normalizes a link target to 'href': the first non-nil of 'url', 'uri' and
# 'href' is stored under 'href' and the other two keys are dropped. The hash
# is modified in place and returned.
def itsAnHrefDamnIt(attrsD)
  target = attrsD['url'] || attrsD['uri'] || attrsD['href']
  return attrsD unless target
  ['url', 'uri'].each { |legacy_key| attrsD.delete(legacy_key) }
  attrsD['href'] = target
  attrsD
end
1741
-
1742
-
1743
# Stores +value+ under +key+ in the current context unless the key already
# holds a truthy value.
def _save(key, value)
  getContext()[key] ||= value
end
1747
-
1748
# <rss>: unless a version is already known, derives it from the 'version'
# attribute -- a listed 0.9x value, 'rss20' for anything starting with "2.",
# plain 'rss' otherwise.
def _start_rss(attrsD)
  return if @version and not @version.empty?

  declared = attrsD['version'] || ''
  known = {
    '0.91' => 'rss091u',
    '0.92' => 'rss092',
    '0.93' => 'rss093',
    '0.94' => 'rss094'
  }[declared]

  @version =
    if known and not known.empty?
      known
    elsif /^2\./ =~ declared
      'rss20'
    else
      'rss'
    end
end
1767
-
1768
# <dlhottitles>: sets the version to 'hotrss', overriding any earlier value.
def _start_dlhottitles(attrsD)
  @version = 'hotrss'
end
1771
-
1772
# <channel> (also <feedinfo>): enters feed context, then processes the
# attributes through _cdf_common.
def _start_channel(attrsD)
  @infeed = true
  _cdf_common(attrsD)
end
alias :_start_feedinfo :_start_channel
1777
-
1778
# Turns the 'lastmod' and 'href' attributes into element content: for each
# one present, the matching start handler is run, the attribute value is put
# in as the frame's text, and the end handler is run.
def _cdf_common(attrsD)
  if attrsD.has_key?('lastmod')
    _start_modified({})
    @elementstack[-1][-1] = attrsD['lastmod']
    _end_modified
  end
  return unless attrsD.has_key?('href')
  _start_link({})
  @elementstack[-1][-1] = attrsD['href']
  _end_link
end
1790
-
1791
# <feed>: enters feed context and, unless a version is already known, derives
# it from the 'version' attribute ('0.1'/'0.2'/'0.3' map to atom01/02/03,
# anything else to plain 'atom').
def _start_feed(attrsD)
  @infeed = true
  versionmap = {'0.1' => 'atom01',
                '0.2' => 'atom02',
                '0.3' => 'atom03'
  }

  if not @version or @version.empty?
    attr_version = attrsD['version']
    version = versionmap[attr_version]
    # Test the looked-up value, not @version: @version is known to be empty
    # here, so the old check always fell through to 'atom'.
    if version and not version.empty?
      @version = version
    else
      @version = 'atom'
    end
  end
end
1808
-
1809
# </channel> (also </feed>): leaves feed context.
def _end_channel
  @infeed = false
end
alias :_end_feed :_end_channel
1813
-
1814
# <image>: enters image context, opens a frame that expects no text, and
# makes sure the current context has an 'image' dict.
def _start_image(attrsD)
  @inimage = true
  push('image', false)
  getContext()['image'] ||= FeedParserDict.new
end

# </image>: closes the frame and leaves image context.
def _end_image
  pop('image')
  @inimage = false
end
1825
-
1826
# <textinput>/<textInput>: enters text-input context, opens a frame that
# expects no text, and makes sure the context has a 'textinput' dict.
def _start_textinput(attrsD)
  @intextinput = true
  push('textinput', false)
  getContext()['textinput'] ||= FeedParserDict.new
end
alias :_start_textInput :_start_textinput

# </textinput>/</textInput>: closes the frame and leaves text-input context.
def _end_textinput
  pop('textinput')
  @intextinput = false
end
alias :_end_textInput :_end_textinput
1839
-
1840
# <author> and its aliases: enters author context and opens a text frame
# named 'author'.
def _start_author(attrsD)
  @inauthor = true
  push('author', true)
end
[:_start_managingeditor, :_start_dc_author, :_start_dc_creator, :_start_itunes_author].each do |name|
  alias_method name, :_start_author
end

# </author> and its aliases: closes the frame, leaves author context and
# re-synchronizes the author detail.
def _end_author
  pop('author')
  @inauthor = false
  _sync_author_detail()
end
[:_end_managingeditor, :_end_dc_author, :_end_dc_creator, :_end_itunes_author].each do |name|
  alias_method name, :_end_author
end
1858
-
1859
# <itunes:owner>: enters publisher context and opens a 'publisher' frame that
# expects no text.
def _start_itunes_owner(attrsD)
  @inpublisher = true
  push('publisher', false)
end

# </itunes:owner>: closes the frame, leaves publisher context and
# synchronizes the 'publisher' detail.
def _end_itunes_owner
  pop('publisher')
  @inpublisher = false
  _sync_author_detail('publisher')
end
1869
-
1870
# <contributor>: enters contributor context, appends a fresh dict to the
# context's 'contributors' list (creating the list if needed) and opens a
# frame that expects no text.
def _start_contributor(attrsD)
  @incontributor = true
  context = getContext()
  context['contributors'] = [] unless context['contributors']
  context['contributors'] << FeedParserDict.new
  push('contributor', false)
end

# </contributor>: closes the frame and leaves contributor context.
def _end_contributor
  pop('contributor')
  @incontributor = false
end
1882
-
1883
# <dc:contributor>: like _start_contributor, except that the frame opened is
# 'name', so the element's text ends up as the contributor's name.
def _start_dc_contributor(attrsD)
  @incontributor = true
  context = getContext()
  context['contributors'] = [] unless context['contributors']
  context['contributors'] << FeedParserDict.new
  push('name', false)
end

# </dc:contributor>: finishes the name, then leaves contributor context.
def _end_dc_contributor
  _end_name
  @incontributor = false
end
1895
-
1896
# <name> (also <itunes:name>): opens a 'name' frame that expects no text.
def _start_name(attrsD)
  push('name', false)
end
alias :_start_itunes_name :_start_name

# </name> (also </itunes:name>): files the text under whichever context is
# active, checked in this order: publisher, author, contributor, text input.
def _end_name
  value = pop('name')
  case
  when @inpublisher
    _save_author('name', value, 'publisher')
  when @inauthor
    _save_author('name', value)
  when @incontributor
    _save_contributor('name', value)
  when @intextinput
    getContext()['textinput']['name'] = value
  end
end
alias :_end_itunes_name :_end_name
1915
-
1916
# <width>: opens a 'width' frame that expects no text.
def _start_width(attrsD)
  push('width', false)
end

# </width>: inside an image, stores the text, converted with to_i, as the
# image width.
def _end_width
  value = pop('width').to_i
  getContext()['image']['width'] = value if @inimage
end
1927
-
1928
# Start handler for height: pushes a 'height' element.
def _start_height(attrsD)
  push 'height', false
end
1931
-
1932
# End handler for height: converts the popped text with to_i and, when an
# image is open, stores it as the image's 'height'.
def _end_height
  pixels = pop('height').to_i
  getContext['image']['height'] = pixels if @inimage
end
1939
-
1940
# Start handler for url (aliased for homepage and uri): pushes an 'href'
# element with the second push argument set to true.
def _start_url(attrsD)
  push 'href', true
end
alias _start_homepage _start_url
alias _start_uri _start_url
1945
-
1946
# End handler for url (aliased for homepage and uri). Stores the popped href
# on the first open construct among: author, contributor, image ('href'),
# text input ('link'). Note there is no publisher branch here.
def _end_url
  href = pop('href')
  return _save_author('href', href) if @inauthor
  return _save_contributor('href', href) if @incontributor
  return getContext['image']['href'] = href if @inimage

  getContext['textinput']['link'] = href if @intextinput
end
alias _end_homepage _end_url
alias _end_uri _end_url
1962
-
1963
# Start handler for email (aliased for itunes:email): pushes an 'email'
# element.
def _start_email(attrsD)
  push 'email', false
end
alias _start_itunes_email _start_email
1967
-
1968
# End handler for email (aliased for itunes:email). Saves the popped address
# on the publisher, author or contributor - whichever is open, in that order.
def _end_email
  address = pop('email')
  return _save_author('email', address, 'publisher') if @inpublisher
  return _save_author('email', address) if @inauthor

  _save_contributor('email', address) if @incontributor
end
alias _end_itunes_email _end_email
1979
-
1980
# Returns the hash currently being populated: @sourcedata while inside a
# source element, otherwise the last entry while inside an entry, otherwise
# the feed-level data.
def getContext
  return @sourcedata if @insource
  return @entries[-1] if @inentry

  @feeddata
end
1990
-
1991
# Stores +value+ under +key+ in the "<prefix>_detail" hash of the current
# context (creating that hash if needed), then runs _sync_author_detail
# with its default key.
def _save_author(key, value, prefix='author')
  detail_key = prefix + '_detail'
  context = getContext
  (context[detail_key] ||= FeedParserDict.new)[key] = value
  _sync_author_detail
end
1997
-
1998
# Stores +value+ under +key+ on the most recent contributor of the current
# context; a one-element contributors list is created if none exists yet.
def _save_contributor(key, value)
  contributors = (getContext['contributors'] ||= [FeedParserDict.new])
  contributors[-1][key] = value
end
2003
-
2004
# Keeps context[key] (a display string) and context["#{key}_detail"] (a hash
# with 'name' and 'email') consistent with each other.
#
# * If the detail hash has content, the string is rebuilt from it:
#   "name (email)" when both are non-empty, otherwise whichever one is.
# * Otherwise the string is split into a name and an e-mail address and the
#   detail hash is created from the pieces. Nothing happens when the string
#   is missing, empty, or contains no e-mail address.
#
# key:: 'author' (default) or another prefix such as 'publisher'.
def _sync_author_detail(key='author')
  context = getContext()
  detail = context["#{key}_detail"]
  if detail and not detail.empty?
    name = detail['name']
    email = detail['email']

    # Both parts must be non-empty to produce the combined form (the second
    # test used to re-check the name, yielding strings like "Joe ()").
    if name and email and not (name.empty? or email.empty?)
      context[key] = "#{name} (#{email})"
    elsif name and not name.empty?
      context[key] = name
    elsif email and not email.empty?
      context[key] = email
    end
  else
    author = context[key].dup unless context[key].nil?
    return if not author or author.empty?
    emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
    # A plain name without an address leaves the context untouched instead
    # of raising NoMethodError on nil.
    return unless emailmatch
    email = emailmatch[1]
    # Strip the address, then any now-empty or surrounding parentheses.
    author.gsub!(email, '')
    author.gsub!("\(\)", '')
    author.strip!
    author.gsub!(/^\(/,'')
    author.gsub!(/\)$/,'')
    author.strip!
    context["#{key}_detail"] ||= FeedParserDict.new
    context["#{key}_detail"]['name'] = author
    context["#{key}_detail"]['email'] = email
  end
end
2034
-
2035
# Start handler for subtitle (aliased for tagline and itunes:subtitle):
# opens a 'subtitle' content element with default type text/plain.
def _start_subtitle(attrsD)
  pushContent 'subtitle', attrsD, 'text/plain', true
end
alias _start_tagline _start_subtitle
alias _start_itunes_subtitle _start_subtitle
2040
-
2041
# End handler for subtitle (aliased for tagline and itunes:subtitle).
def _end_subtitle
  popContent 'subtitle'
end
alias _end_tagline _end_subtitle
alias _end_itunes_subtitle _end_subtitle
2046
-
2047
# Start handler for rights (aliased for dc:rights and copyright): opens a
# 'rights' content element with default type text/plain.
def _start_rights(attrsD)
  pushContent 'rights', attrsD, 'text/plain', true
end
alias _start_dc_rights _start_rights
alias _start_copyright _start_rights
2052
-
2053
# End handler for rights (aliased for dc:rights and copyright).
def _end_rights
  popContent 'rights'
end
alias _end_dc_rights _end_rights
alias _end_copyright _end_rights
2058
-
2059
# Start handler for item (aliased for entry and product): appends a new
# entry hash, enters entry mode, resets @guidislink, and uses a non-empty
# rdf:about attribute as the entry id before running _cdf_common.
def _start_item(attrsD)
  @entries << FeedParserDict.new
  push 'item', false
  @inentry = true
  @guidislink = false
  about = getAttribute(attrsD, 'rdf:about')
  getContext['id'] = about if about && !about.empty?
  _cdf_common(attrsD)
end
alias _start_entry _start_item
alias _start_product _start_item
2073
-
2074
# End handler for item (aliased for entry): pops the element and leaves
# entry mode.
def _end_item
  pop 'item'
  @inentry = false
end
alias _end_entry _end_item
2079
-
2080
# Start handler for dc:language (aliased for language).
def _start_dc_language(attrsD)
  push 'language', true
end
alias _start_language _start_dc_language
2084
-
2085
# End handler for dc:language (aliased for language): remembers the popped
# value in @lang.
def _end_dc_language
  @lang = pop 'language'
end
alias _end_language _end_dc_language
2089
-
2090
# Start handler for dc:publisher (aliased for webMaster).
def _start_dc_publisher(attrsD)
  push 'publisher', true
end
alias _start_webmaster _start_dc_publisher
2094
-
2095
# End handler for dc:publisher (aliased for webMaster): pops the element and
# synchronises the 'publisher' string with its detail hash.
def _end_dc_publisher
  pop 'publisher'
  _sync_author_detail 'publisher'
end
alias _end_webmaster _end_dc_publisher
2100
-
2101
# Start handler for published (aliased for dcterms:issued and issued).
def _start_published(attrsD)
  push 'published', true
end
alias _start_dcterms_issued _start_published
alias _start_issued _start_published
2106
-
2107
# End handler for published (aliased for dcterms:issued and issued): saves
# the parsed form of the popped date string as 'published_parsed'.
def _end_published
  raw_date = pop('published')
  _save 'published_parsed', parse_date(raw_date)
end
alias _end_dcterms_issued _end_published
alias _end_issued _end_published
2113
-
2114
# Start handler for updated (aliased for modified, dcterms:modified,
# pubDate and dc:date).
def _start_updated(attrsD)
  push 'updated', true
end
alias _start_modified _start_updated
alias _start_dcterms_modified _start_updated
alias _start_pubdate _start_updated
alias _start_dc_date _start_updated
2121
-
2122
# End handler for updated (aliased for modified, dcterms:modified, pubDate
# and dc:date): saves the parsed form of the popped date string as
# 'updated_parsed'.
def _end_updated
  raw_date = pop('updated')
  _save 'updated_parsed', parse_date(raw_date)
end
alias _end_modified _end_updated
alias _end_dcterms_modified _end_updated
alias _end_pubdate _end_updated
alias _end_dc_date _end_updated
2130
-
2131
# Start handler for created (aliased for dcterms:created).
def _start_created(attrsD)
  push 'created', true
end
alias _start_dcterms_created _start_created
2135
-
2136
# End handler for created (aliased for dcterms:created): saves the parsed
# form of the popped date string as 'created_parsed'.
def _end_created
  raw_date = pop('created')
  _save 'created_parsed', parse_date(raw_date)
end
alias _end_dcterms_created _end_created
2141
-
2142
# Start handler for expirationDate: pushes an 'expired' element.
def _start_expirationdate(attrsD)
  push 'expired', true
end
2145
# End handler for expirationDate: saves the parsed form of the popped date
# string as 'expired_parsed'.
def _end_expirationdate
  raw_date = pop('expired')
  _save 'expired_parsed', parse_date(raw_date)
end
2148
-
2149
# Start handler for cc:license. The licence URL is carried in the
# rdf:resource attribute rather than in element text, so when it is present
# it is appended to the pieces of the element just pushed and the element is
# popped immediately.
def _start_cc_license(attrsD)
  push('license', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    # Use the instance variable directly, as _start_admin_errorreportsto
    # does, instead of depending on an `elementstack` reader method.
    @elementstack[-1][2] << value
    pop('license')
  end
end
2157
-
2158
# Start handler for creativeCommons:license: pushes a 'license' element.
def _start_creativecommons_license(attrsD)
  push 'license', true
end
2161
-
2162
# End handler for creativeCommons:license: pops the 'license' element.
def _end_creativecommons_license
  pop 'license'
end
2165
-
2166
# Adds a {term, scheme, label} tag to the current context's 'tags' list.
# The list is created if missing. Nothing is added when all three parts are
# nil or empty, or when an equal tag is already in the list.
def addTag(term, scheme, label)
  tags = (getContext['tags'] ||= [])
  return if [term, scheme, label].all? { |part| part.nil? || part.empty? }

  tag = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  tags << tag unless tags.include?(tag)
end
2178
-
2179
# Start handler for category (aliased for dc:subject and keywords): records
# a tag built from the term, scheme (or domain) and label attributes, then
# pushes a 'category' element.
def _start_category(attrsD)
  $stderr << "entering _start_category with #{attrsD}\n" if $debug

  scheme = attrsD['scheme'] || attrsD['domain']
  addTag(attrsD['term'], scheme, attrsD['label'])
  push 'category', true
end
alias _start_dc_subject _start_category
alias _start_keywords _start_category
2190
-
2191
# End handler for itunes:keywords: splits the popped text on whitespace and
# adds each word as a tag in the iTunes scheme.
def _end_itunes_keywords
  keywords = pop('itunes_keywords').split
  keywords.each { |keyword| addTag(keyword, 'http://www.itunes.com/', nil) }
end
2196
-
2197
# Start handler for itunes:category: adds the 'text' attribute as a tag in
# the iTunes scheme and pushes a 'category' element.
def _start_itunes_category(attrsD)
  category = attrsD['text']
  addTag(category, 'http://www.itunes.com/', nil)
  push 'category', true
end
2201
-
2202
# End handler for category (aliased for dc:subject, keywords and
# itunes:category). The element text is the tag's term: if the most recent
# tag has no term yet (it was created from attributes by the start handler)
# the text fills it in; otherwise the text becomes a new tag of its own.
# Empty text is ignored.
def _end_category
  value = pop('category')
  return if value.nil? or value.empty?
  tags = getContext()['tags']
  # `if ...:` (colon instead of `then`) stopped parsing in Ruby 1.9, and the
  # value was already checked above; a missing tag list is treated as empty.
  if tags and not tags.empty? and not tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
alias :_end_dc_subject :_end_category
alias :_end_keywords :_end_category
alias :_end_itunes_category :_end_category
2216
-
2217
# Start handler for cloud: stores the element's attributes, wrapped in a
# FeedParserDict, as the context's 'cloud'.
def _start_cloud(attrsD)
  cloud = FeedParserDict.new(attrsD)
  getContext['cloud'] = cloud
end
2220
-
2221
# Start handler for link (aliased for producturl).
#
# Defaults rel to 'alternate' and type to 'text/html', normalises and
# resolves the href, and records the attributes in the context's 'links'
# list. rel="enclosure" links are additionally handed to _start_enclosure.
# A link with an href that is an HTML-typed alternate becomes the context's
# 'link'; a link without an href is pushed so its text can be collected.
def _start_link(attrsD)
  attrsD['rel'] ||= 'alternate'
  attrsD['type'] ||= 'text/html'
  attrsD = itsAnHrefDamnIt(attrsD)
  has_href = attrsD.has_key?('href')
  attrsD['href'] = resolveURI(attrsD['href']) if has_href

  context = getContext
  (context['links'] ||= []) << FeedParserDict.new(attrsD)
  _start_enclosure(attrsD) if attrsD['rel'] == 'enclosure'

  unless has_href
    return push('link', @infeed || @inentry || @insource)
  end

  if attrsD['rel'] == 'alternate' and @html_types.include?(mapContentType(attrsD['type']))
    context['link'] = attrsD['href']
  end
end
alias _start_producturl _start_link
2245
-
2246
# End handler for link (aliased for producturl): copies the popped value to
# the open text input and/or image, if any.
def _end_link
  url = pop('link')
  context = getContext
  context['textinput']['link'] = url if @intextinput
  context['image']['link'] = url if @inimage
end
alias _end_producturl _end_link
2257
-
2258
# Start handler for guid: @guidislink becomes true when the ispermalink
# attribute is absent or equals 'true'; then an 'id' element is pushed.
def _start_guid(attrsD)
  permalink_flag = attrsD['ispermalink'] || 'true'
  @guidislink = (permalink_flag == 'true')
  push 'id', true
end
2262
-
2263
# End handler for guid. Saves 'guidislink' (true only when the guid is a
# permalink and the context has no 'link' yet) and, for permalink guids,
# offers the guid to _save as the 'link'.
def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink and not getContext().has_key?('link')))
  # `if @guidislink:` (trailing colon) is rejected by Ruby 1.9 and later.
  if @guidislink
    # guid acts as link, but only if 'ispermalink' is not present or is 'true',
    # and only if the item doesn't already have a link element
    # NOTE(review): the "doesn't already have a link" part presumably relies
    # on _save not overwriting existing keys - confirm in _save.
    _save('link', value)
  end
end
2272
-
2273
-
2274
# Start handler for title (aliased for dc:title and media:title): opens a
# 'title' content element, default type text/plain; the last pushContent
# argument is true only inside a feed, entry or source.
def _start_title(attrsD)
  in_container = @infeed || @inentry || @insource
  pushContent('title', attrsD, 'text/plain', in_container)
end
alias _start_dc_title _start_title
alias _start_media_title _start_title
2279
-
2280
# End handler for title (aliased for dc:title and media:title): also copies
# the title onto the open text input or, failing that, the open image.
def _end_title
  title = popContent('title')
  context = getContext
  return context['textinput']['title'] = title if @intextinput

  context['image']['title'] = title if @inimage
end
alias _end_dc_title _end_title
alias _end_media_title _end_title
2291
-
2292
# Start handler for description. If the context already has a 'summary',
# the description is treated as content (@summaryKey = 'content');
# otherwise a 'description' content element is opened, default type
# text/html.
def _start_description(attrsD)
  if getContext.has_key?('summary')
    @summaryKey = 'content'
    return _start_content(attrsD)
  end

  pushContent('description', attrsD, 'text/html', @infeed || @inentry || @insource)
end
2301
-
2302
# Start handler for abstract: opens a 'description' content element with
# default type text/plain.
def _start_abstract(attrsD)
  in_container = @infeed || @inentry || @insource
  pushContent('description', attrsD, 'text/plain', in_container)
end
2305
-
2306
# End handler for description (aliased for abstract). When the start
# handler rerouted the element to content (@summaryKey == 'content') this
# closes the content element; otherwise it pops the description and copies
# it onto the open text input or image. @summaryKey is always reset.
def _end_description
  if @summaryKey == 'content'
    _end_content()
  else
    value = popContent('description')
    context = getContext()
    if @intextinput
      context['textinput']['description'] = value
    elsif @inimage # was `elsif @inimage:` - a syntax error since Ruby 1.9
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
alias :_end_abstract :_end_description
2321
-
2322
# Start handler for info (aliased for feedburner:browserFriendly): opens an
# 'info' content element with default type text/plain.
def _start_info(attrsD)
  pushContent 'info', attrsD, 'text/plain', true
end
alias _start_feedburner_browserfriendly _start_info
2326
-
2327
# End handler for info (aliased for feedburner:browserFriendly).
def _end_info
  popContent 'info'
end
alias _end_feedburner_browserfriendly _end_info
2331
-
2332
# Start handler for generator: normalises and resolves the href attribute
# (when attributes are present), stores the attributes as
# 'generator_detail', and pushes a 'generator' element.
def _start_generator(attrsD)
  unless attrsD.nil? || attrsD == false || attrsD.empty?
    attrsD = itsAnHrefDamnIt(attrsD)
    attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')
  end
  getContext['generator_detail'] = FeedParserDict.new(attrsD)
  push 'generator', true
end
2342
-
2343
# End handler for generator: the popped text becomes the 'name' of an
# existing 'generator_detail'.
def _end_generator
  generator_name = pop('generator')
  context = getContext
  context['generator_detail']['name'] = generator_name if context.has_key?('generator_detail')
end
2350
-
2351
# Start handler for admin:generatorAgent. The generator URL is carried in
# the rdf:resource attribute: it is appended to the pieces of the element
# just pushed, the element is popped right away, and 'generator_detail' is
# set to a dict holding that URL as 'href'.
def _start_admin_generatoragent(attrsD)
  push('generator', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    # Use the instance variable directly, as _start_admin_errorreportsto
    # does, instead of depending on an `elementstack` reader method.
    @elementstack[-1][2] << value
  end
  pop('generator')
  getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
end
2360
-
2361
# Start handler for admin:errorReportsTo: the address is carried in the
# rdf:resource attribute, so it is appended to the pieces of the element
# just pushed and the element is popped immediately.
def _start_admin_errorreportsto(attrsD)
  push 'errorreportsto', true
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource if resource && !resource.empty?
  pop 'errorreportsto'
end
2369
-
2370
# Start handler for summary (aliased for itunes:summary). A second summary
# in the same context is treated as content; the first one opens a
# 'summary' content element with default type text/plain. @summaryKey
# records which path was taken for the end handler.
def _start_summary(attrsD)
  if getContext.has_key?('summary')
    @summaryKey = 'content'
    return _start_content(attrsD)
  end

  @summaryKey = 'summary'
  pushContent(@summaryKey, attrsD, 'text/plain', true)
end
alias _start_itunes_summary _start_summary
2381
-
2382
# End handler for summary (aliased for itunes:summary): closes whichever
# element the start handler opened (content or summary) and resets
# @summaryKey.
def _end_summary
  # was `if @summaryKey == 'content':` - the colon form is invalid since 1.9
  if @summaryKey == 'content'
    _end_content()
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
alias :_end_itunes_summary :_end_summary
2391
-
2392
# Start handler for enclosure: records the (href-normalised) attributes in
# the context's 'enclosures' list and, when the context has no id yet, uses
# a non-empty href as the id.
def _start_enclosure(attrsD)
  attrsD = itsAnHrefDamnIt(attrsD)
  (getContext['enclosures'] ||= []) << FeedParserDict.new(attrsD)
  href = attrsD['href']
  return unless href && !href.empty?

  context = getContext
  context['id'] = href unless context['id']
end
2404
-
2405
# Start handler for source: enters source mode, so getContext returns
# @sourcedata until the element ends.
def _start_source(attrsD)
  @insource = true
end
2408
-
2409
# End handler for source: leaves source mode, stores a deep copy (via a
# Marshal round trip) of the collected source data as the enclosing
# context's 'source', and clears the collector for the next source element.
def _end_source
  @insource = false
  snapshot = Marshal.load(Marshal.dump(@sourcedata))
  getContext['source'] = snapshot
  @sourcedata.clear
end
2414
-
2415
# Start handler for content: opens a 'content' content element (default
# type text/plain), remembers a non-empty src attribute in @contentparams,
# and pushes a 'content' element.
def _start_content(attrsD)
  pushContent('content', attrsD, 'text/plain', true)
  src = attrsD['src']
  # was `if src and not src.empty?:` - the colon form is invalid since 1.9
  if src and not src.empty?
    @contentparams['src'] = src
  end
  push('content', true)
end
2423
-
2424
# Start handler for prodlink: opens a 'content' content element with
# default type text/html.
def _start_prodlink(attrsD)
  pushContent 'content', attrsD, 'text/html', true
end
2427
-
2428
# Start handler for body (aliased for xhtml:body): opens a 'content'
# content element with default type application/xhtml+xml.
def _start_body(attrsD)
  pushContent 'content', attrsD, 'application/xhtml+xml', true
end
alias _start_xhtml_body _start_body
2432
-
2433
# Start handler for content:encoded (aliased for fullitem): opens a
# 'content' content element with default type text/html.
def _start_content_encoded(attrsD)
  pushContent 'content', attrsD, 'text/html', true
end
alias _start_fullitem _start_content_encoded
2437
-
2438
# End handler for content (aliased for body, xhtml:body, content:encoded,
# fullitem and prodlink). Pops the content and, when its mapped type is
# text/plain or one of @html_types, also saves it as the 'description'.
def _end_content
  # Read the type before popping: the decision is based on @contentparams
  # as it stands while the element is still open.
  copyToDescription = (['text/plain'] + @html_types).include? mapContentType(@contentparams['type'])
  value = popContent('content')
  if copyToDescription
    _save('description', value)
  end
end
# These aliases used to sit inside the method body (the closing `end` was
# misplaced), so none of them existed until _end_content had been called
# once; an element such as <body> closed before any <content> had no end
# handler.
alias :_end_body :_end_content
alias :_end_xhtml_body :_end_content
alias :_end_content_encoded :_end_content
alias :_end_fullitem :_end_content
alias :_end_prodlink :_end_content
2450
-
2451
# Start handler for itunes:image (aliased for itunes:link): pushes the
# element and sets the context's 'image' to a dict holding the href
# attribute.
def _start_itunes_image(attrsD)
  push 'itunes_image', false
  image = FeedParserDict.new({'href' => attrsD['href']})
  getContext['image'] = image
end
alias _start_itunes_link _start_itunes_image
2456
-
2457
# End handler for itunes:block: stores true in the context when the element
# text is exactly 'yes', false otherwise, and returns that boolean.
def _end_itunes_block
  value = pop('itunes_block', false)
  # The old `x = (cond) and true or false` parsed as
  # `(x = cond) and true or false`; the comparison alone already yields the
  # boolean that was stored and returned.
  getContext()['itunes_block'] = (value == 'yes')
end
2461
-
2462
# End handler for itunes:explicit: stores true in the context when the
# element text is exactly 'yes', false otherwise, and returns that boolean.
def _end_itunes_explicit
  value = pop('itunes_explicit', false)
  # The old `x = (cond) and true or false` parsed as
  # `(x = cond) and true or false`; the comparison alone already yields the
  # boolean that was stored and returned.
  getContext()['itunes_explicit'] = (value == 'yes')
end
2466
-
2467
-
2468
- # ISO-8601 date parsing routines written by Fazal Majid.
2469
- # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2470
- # parser is beyond the scope of feedparser and the current Time.iso8601
2471
- # method does not work.
2472
- # A single regular expression cannot parse ISO 8601 date formats into groups
2473
- # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2474
- # 0301-04-01), so we use templates instead.
2475
- # Please note the order in templates is significant because we need a
2476
- # greedy match.
2477
# Parses a variety of ISO-8601-compatible formats such as "20040105",
# "2004-01-05T10:20:30-05:00" or "2003-335" (year + ordinal day).
#
# Date templates are tried in order and the first match wins, so the order
# is significant. Fields absent from the input default to the current UTC
# year/month, or to day 1 when at least a century, year or month was given.
#
# dateString:: the text to parse.
# Returns a UTC Time, or nil when no template matches any text.
# Raises ArgumentError (from Time.utc / Date.ordinal) for out-of-range
# fields.
def _parse_date_iso8601(dateString)
  # [template, names of its capture groups]
  date_templates = [
    # month/day separator is optional so compact "YYYYMMDD" is accepted
    ['^(\d{4})-?([01]\d)-?([0123]\d)', ['year', 'month', 'day']],
    ['^(\d{4})-([01]\d)',              ['year', 'month']],
    ['^(\d{4})-?([0123]\d\d)',         ['year', 'ordinal']],
    ['^(\d\d)-?([01]\d)-?([0123]\d)',  ['year', 'month', 'day']],
    ['^(\d\d)-?([0123]\d\d)',          ['year', 'ordinal']],
    ['^(\d{4})',                       ['year']],
    ['-(\d\d)-?([01]\d)',              ['year', 'month']],
    ['-([0123]\d\d)',                  ['ordinal']],
    ['-(\d\d)',                        ['year']],
    ['--([01]\d)-?([0123]\d)',         ['month', 'day']],
    ['--([01]\d)',                     ['month']],
    ['---([0123]\d)',                  ['day']],
    # FIXME The century regexp may not work ('\d\d$' says "two digits at end
    # of line" but the time pattern is then appended to it).
    ['(\d\d$)',                        ['century']],
    ['',                               []]
  ]
  # Optional time part appended to every template. The outer group must be
  # non-capturing: only the six groups named in time_fields may capture,
  # otherwise every time field is shifted by one position.
  time_pattern = '(?:T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
  time_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']

  m = nil
  param_keys = []
  date_templates.each do |template, fields|
    $stderr << "Trying iso8601 regexp: #{template + time_pattern}\n" if $debug
    param_keys = fields + time_fields
    m = dateString.match(Regexp.new(template + time_pattern))
    break if m
  end
  # The final (empty) template matches the empty string at offset 0 for any
  # input; treat that as "no match".
  return if m.nil? or (m.begin(0).zero? and m.end(0).zero?)

  params = {}
  param_keys.each_with_index { |key, i| params[key] = m[i + 1] }

  ordinal = params['ordinal'].to_i unless params['ordinal'].nil?

  year = params['year']
  if year.nil? or year.empty?
    year = Time.now.utc.year
  elsif year.length == 2
    # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
    year = 100 * (Time.now.utc.year / 100) + year.to_i
  else
    year = year.to_i
  end

  month = params['month']
  if month.nil? or month.empty?
    # An ordinal day determines the month; otherwise use the current one.
    month = ordinal ? Date.ordinal(year, ordinal).month : Time.now.utc.month
  end
  month = month.to_i

  day = params['day']
  if day.nil? or day.empty?
    if ordinal
      day = Date.ordinal(year, ordinal).day
    elsif params['century'] or params['year'] or params['month']
      day = 1
    else
      day = Time.now.utc.day
    end
  else
    day = day.to_i
  end

  # special case of the century - is the first year of the 21st century
  # 2000 or 2001 ? The debate goes on...
  if params.has_key? 'century'
    year = (params['century'].to_i - 1) * 100 + 1
  end

  # in ISO 8601 most fields are optional; nil.to_i is 0
  hour = params['hour'].to_i
  minute = params['minute'].to_i
  second = params['second'].to_i
  utc = Time.utc(year, month, day, hour, minute, second)

  tz = params['tz']
  if tz and not tz.empty? and tz != 'Z'
    # Shift by the zone offset with Time arithmetic so day, month and year
    # boundaries roll over correctly. A zone behind UTC ("-05:00") means UTC
    # is later than the local time, hence the addition.
    offset = params['tzhour'].to_i * 3600 + params['tzmin'].to_i * 60
    case tz[0, 1] # first character as a String on every Ruby version
    when '-' then utc += offset
    when '+' then utc -= offset
    else return nil
    end
  end
  utc
end
2601
-
2602
# Parse a string according to the OnBlog 8-bit date format
# (8-bit date handling routines written by ytrewq1).
# The date is rewritten as a W3DTF string with a fixed +09:00 offset and
# handed to _parse_date_w3dtf. Returns nil when the format does not match.
def _parse_date_onblog(dateString)
  korean_year = u("년") # b3e2 in euc-kr
  korean_month = u("월") # bff9 in euc-kr
  korean_day = u("일") # c0cf in euc-kr

  pattern = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
  m = pattern.match(dateString)
  return unless m

  year, month, day, hour, minute, second = m.captures
  w3dtfdate = "#{year}-#{month}-#{day}T#{hour}:#{minute}:#{second}+09:00"
  $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
  _parse_date_w3dtf(w3dtfdate)
end
2620
-
2621
# Parse a string according to the Nate 8-bit date format
# (8-bit date handling routines written by ytrewq1).
# Twelve hours are added for PM times, the date is rewritten as a W3DTF
# string with a fixed +09:00 offset, and _parse_date_w3dtf does the rest.
# Returns nil when the format does not match.
def _parse_date_nate(dateString)
  korean_am = u("오전") # bfc0 c0fc in euc-kr
  korean_pm = u("오후") # bfc0 c8c4 in euc-kr

  pattern = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
  m = pattern.match(dateString)
  return unless m

  year, month, day, ampm, hour_text, minute, second = m.captures
  hour = hour_text.to_i
  hour += 12 if ampm == korean_pm
  hour = hour.to_s.rjust(2,'0')
  w3dtfdate = "#{year}-#{month}-#{day}T#{hour}:#{minute}:#{second}+09:00"
  $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
  _parse_date_w3dtf(w3dtfdate)
end
2640
-
2641
# Parses "YYYY-MM-DD hh:mm:ss[.fraction]" (MS SQL style) dates. The
# fractional seconds are dropped, the date is rewritten as a W3DTF string
# with a fixed +09:00 offset, and _parse_date_w3dtf does the rest.
# Returns nil when the format does not match.
def _parse_date_mssql(dateString)
  m = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/.match(dateString)
  return unless m

  year, month, day, hour, minute, second = m.captures
  w3dtfdate = "#{year}-#{month}-#{day}T#{hour}:#{minute}:#{second}+09:00"
  $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
  _parse_date_w3dtf(w3dtfdate)
end
2650
-
2651
# Parse a string according to a Greek 8-bit date format.
# Greek weekday and month abbreviations are translated to English and the
# resulting RFC 822 string is handed to _parse_date_rfc822.
# Returns nil when the format does not match or a name is not recognised.
def _parse_date_greek(dateString)
  # Unicode strings for Greek date strings
  greek_months = {
    u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
    u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
    u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
    u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
    u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
    u("Μάι") => u("May"), # ccdce9 in iso-8859-7
    u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
    u("Μαι") => u("May"), # cce1e9 in iso-8859-7
    u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
    u("Ιον") => u("Jun"), # c9efed in iso-8859-7
    u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
    u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
    u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
    u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
    u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
    u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
    u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
    u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
    u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
  }

  greek_wdays = {
    u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
    u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
    u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
    u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
    u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
    u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
    u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
  }

  greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/

  m = greek_date_format.match(dateString)
  return unless m
  # Hash#[] returns nil for an unknown key (it does not raise), so test the
  # lookups explicitly; the begin/rescue that used to wrap them never fired.
  wday = greek_wdays[m[1]]
  month = greek_months[m[3]]
  return nil unless wday and month
  rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
  $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
  return _parse_date_rfc822(rfc822date)
end
2700
-
2701
# Parse a string according to a Hungarian 8-bit date format.
# The month name is translated to its number, day and hour are zero-padded,
# and the resulting W3DTF string (seconds fixed at 00) is handed to
# _parse_date_w3dtf.
# Returns nil when the format does not match or the month is not recognised.
def _parse_date_hungarian(dateString)
  hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
  m = hungarian_date_format_re.match(dateString)
  return unless m

  # Unicode strings for Hungarian date strings
  hungarian_months = {
    u("január") => u("01"), # e1 in iso-8859-2
    u("februári") => u("02"), # e1 in iso-8859-2
    u("március") => u("03"), # e1 in iso-8859-2
    u("április") => u("04"), # e1 in iso-8859-2
    u("máujus") => u("05"), # e1 in iso-8859-2
    u("június") => u("06"), # fa in iso-8859-2
    u("július") => u("07"), # fa in iso-8859-2
    u("augusztus") => u("08"),
    u("szeptember") => u("09"),
    u("október") => u("10"), # f3 in iso-8859-2
    u("november") => u("11"),
    u("december") => u("12"),
  }
  # Hash#[] returns nil for an unknown key (it does not raise), so test the
  # lookup explicitly; the begin/rescue that used to wrap it never fired.
  month = hungarian_months[m[2]]
  return unless month
  day = m[3].rjust(2,'0')
  hour = m[4].rjust(2,'0')

  w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
  $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
2734
-
2735
# Splits +num+ into [remainder, carry] with respect to +modulus+, e.g.
# 61 seconds with modulus 60 gives [1, 1].
def rollover(num, modulus)
  remainder = num % modulus
  carry = num / modulus
  [remainder, carry]
end
2738
-
2739
# Returns num / modulus, unless that quotient is zero, in which case +num+
# itself is returned.
def set_self(num, modulus)
  quotient = num / modulus
  quotient == 0 ? num : quotient
end
2746
- # W3DTF-style date parsing
2747
- # FIXME shouldn't it be "W3CDTF"?
2748
# Parses a W3DTF date ("YYYY", "YYYY-MM", "YYYY-MM-DD" or
# "YYYY-MM-DDThh:mm:ss" followed by "Z" or "+hh:mm"/"-hh:mm") into a UTC
# Time.
#
# Ruby's own Time parsers reject out-of-range fields, so seconds, minutes,
# hours, days and months that overflow are rolled over by hand here
# (e.g. 10:14:61 becomes 10:15:01). Raises NoMethodError when the string
# does not start with a four-digit year.
def _parse_date_w3dtf(dateString)
  pattern = /^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/
  m = dateString.match(pattern)

  # Year, month and day: missing (nil) or zero fields become 1.
  year, month, day = m[1..3].map do |field|
    number = field.to_i
    number == 0 ? 1 : number
  end
  # Hour, minute and second: missing fields become 0.
  hour, minute, second = m[4..6].map { |field| field.to_i }
  zone = m[-1] # the timezone stays a String (or nil)

  # Carry overflow upwards: seconds -> minutes -> hours -> days.
  second, carry = rollover(second, 60)
  minute += carry
  minute, carry = rollover(minute, 60)
  hour += carry
  hour, carry = rollover(hour, 24)
  day += carry

  if month > 12
    month, carry = rollover(month, 12)
    month = 12 if month == 0
    year += carry
  end

  # Move surplus days into the following month(s).
  month_length = Time.days_in_month(month, year)
  while day > month_length
    day -= month_length
    month += 1
    if month > 12
      year += 1
      month = set_self(month, 12)
    end
    month_length = Time.days_in_month(month, year)
  end

  # The offset is added to the local time below, so its sign must be
  # inverted first: "-05:00" means UTC is five hours *later*.
  if zone.class == String
    if /^-/ =~ zone
      zone[0] = '+'
    elsif /^\+/ =~ zone
      zone[0] = '-'
    end
  end
  Time.utc(year, month, day, hour, minute, second) + Time.zone_offset(zone || "UTC")
end
2795
-
2796
- def _parse_date_rfc822(dateString)
2797
- # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
2798
- # These first few lines are to fix up the stupid proprietary format from Disney
2799
- unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
2800
- 'CT' => 'CST', 'MT' => 'MST',
2801
- 'PT' => 'PST'
2802
- }
2803
-
2804
- mon = dateString.split[2]
2805
- if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
2806
- dateString.sub!(mon,mon[0..2])
2807
- end
2808
- if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
2809
- dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
2810
- end
2811
- # Okay, the Disney date format should be fixed up now.
2812
- rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
2813
- if rfc.to_a.length > 1 and rfc.to_a.include? nil
2814
- dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
2815
- hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
2816
- tz ||= "GMT"
2817
- end
2818
- asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
2819
- if asctime_match.to_a.length > 1
2820
- # Month-abbr dayofmonth hour:minute:second year
2821
- dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
2822
- day.to_s.rjust(2,'0')
2823
- end
2824
- if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
2825
- ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
2826
- else
2827
- ds = dateString
2828
- end
2829
- t = Time.rfc2822(ds).utc
2830
- return t
2831
- end
2832
-
2833
# Parse a date in yyyy/mm/dd hh:mm:ss TTT format, optionally preceded by a
# day of the week (e.g. "Fri, 2006/09/15 08:19:53 EDT"), by delegating to
# Time.parse and converting the result to UTC.
# FIXME not in 4.1?
def _parse_date_perforce(aDateString)
  parsed = Time.parse(aDateString)
  parsed.utc
end
2839
-
2840
# Converts a Time into a nine-element array:
# [year, month, day of month, hour, minute, second, weekday, day of year,
# dst flag]. The weekday is shifted so that Monday is 0 and Sunday is 6;
# the DST flag is 1 or 0.
# NOTE error handling (e.g. for a nil argument) is left to parse_date.
def extract_tuple(atime)
  monday_based_wday = (atime.wday - 1) % 7
  dst_flag = atime.isdst ? 1 : 0
  [atime.year, atime.month, atime.mday,
   atime.hour, atime.min, atime.sec,
   monday_based_wday, atime.yday, dst_flag]
end
2851
-
2852
# Tries each handler named in @date_handlers in turn and returns the tuple
# (see extract_tuple) for the first one that produces a usable Time.
# A handler that raises, or that returns nil (which makes extract_tuple
# raise), is skipped. Returns nil when no handler succeeds.
def parse_date(dateString)
  @date_handlers.each do |handler|
    begin
      $stderr << "Trying date_handler #{handler}\n" if $debug
      datething = extract_tuple(send(handler,dateString))
      return datething
    rescue StandardError => e
      # StandardError rather than Exception, so Interrupt, SystemExit and
      # out-of-memory conditions are not swallowed here.
      $stderr << "#{handler} raised #{e}\n" if $debug
    end
  end
  return nil
end
2864
-
2865
- end # End FeedParserMixin
2866
-
2867
- class StrictFeedParser < XML::SAX::HandlerBase # expat
2868
- include FeedParserMixin
2869
-
2870
- attr_accessor :bozo, :entries, :feeddata, :exc
2871
# Sets up the shared parser state through startup, clears the bozo flag and
# the stored exception, then initialises the SAX handler base class.
def initialize(baseuri, baselang, encoding)
  $stderr << "trying StrictFeedParser\n" if $debug
  startup(baseuri, baselang, encoding)
  @bozo, @exc = false, nil
  super()
end
2878
-
2879
# Returns [system id, line number] as reported by the document locator.
def getPos
  system_id = @locator.getSystemId
  line_number = @locator.getLineNumber
  [system_id, line_number]
end
2882
-
2883
- def getAttrs(attrs)
2884
- ret = []
2885
- for i in 0..attrs.getLength
2886
- ret.push([attrs.getName(i), attrs.getValue(i)])
2887
- end
2888
- ret
2889
- end
2890
-
2891
- def setDocumentLocator(loc)
2892
- @locator = loc
2893
- end
2894
-
2895
- def startDoctypeDecl(name, pub_sys, long_name, uri)
2896
- #Nothing is done here. What could we do that is neat and useful?
2897
- end
2898
-
2899
- def startNamespaceDecl(prefix, uri)
2900
- trackNamespace(prefix, uri)
2901
- end
2902
-
2903
- def endNamespaceDecl(prefix)
2904
- end
2905
-
2906
- def startElement(name, attrs)
2907
- name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
2908
- namespaceuri = ($2 || '').downcase
2909
- name = $3
2910
- if /backend\.userland\.com\/rss/ =~ namespaceuri
2911
- # match any backend.userland.com namespace
2912
- namespaceuri = 'http://backend.userland.com/rss'
2913
- end
2914
- prefix = @matchnamespaces[namespaceuri]
2915
- # No need to raise UndeclaredNamespace, Expat does that for us with
2916
- "unbound prefix (XMLParserError)"
2917
- if prefix and not prefix.empty?
2918
- name = prefix + ':' + name
2919
- end
2920
- name.downcase!
2921
- unknown_starttag(name, attrs)
2922
- end
2923
-
2924
- def character(text, start, length)
2925
- #handle_data(CGI.unescapeHTML(text))
2926
- handle_data(text)
2927
- end
2928
- # expat provides "character" not "characters"!
2929
- alias :characters :character # Just in case.
22
+ gem 'character-encodings', ">=0.2.0"
23
+ gem 'htmltools', ">=1.10"
24
+ gem 'htmlentities', ">=4.0.0"
25
+ gem 'activesupport', ">=1.4.1"
26
+ gem 'rchardet', ">=1.0"
27
+ require 'xml/saxdriver' # calling expat through the xmlparser gem
2930
28
 
2931
- def startCdata(content)
2932
- handle_data(content)
2933
- end
29
+ require 'rchardet'
30
+ $chardet = true
2934
31
 
2935
- def endElement(name)
2936
- name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
2937
- namespaceuri = ($2 || '').downcase
2938
- prefix = @matchnamespaces[namespaceuri]
2939
- if prefix and not prefix.empty?
2940
- localname = prefix + ':' + name
2941
- end
2942
- name.downcase!
2943
- unknown_endtag(name)
2944
- end
32
+ require 'encoding/character/utf-8'
33
+ require 'html/sgml-parser'
34
+ require 'htmlentities'
35
+ require 'active_support'
36
+ require 'open-uri'
37
+ include OpenURI
2945
38
 
2946
- def comment(comment)
2947
- handle_comment(comment)
2948
- end
39
+ $debug = false
40
+ $compatible = true
2949
41
 
2950
- def entityDecl(*foo)
2951
- end
42
+ $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
43
+ require 'rfeedparser/forgiving_uri'
44
+ require 'rfeedparser/aliases'
45
+ require 'rfeedparser/encoding_helpers'
46
+ require 'rfeedparser/better_sgmlparser'
47
+ require 'rfeedparser/better_attributelist'
48
+ require 'rfeedparser/scrub'
49
+ require 'rfeedparser/time_helpers'
50
+ require 'rfeedparser/feedparserdict'
51
+ require 'rfeedparser/parser_mixin'
52
+ require 'rfeedparser/parsers'
53
+ require 'rfeedparser/markup_helpers'
2952
54
 
2953
- def unparsedEntityDecl(*foo)
2954
- end
2955
- def error(exc)
2956
- @bozo = true
2957
- @exc = exc
2958
- end
55
+ include FeedParserUtilities
2959
56
 
2960
- def fatalError(exc)
2961
- error(exc)
2962
- raise exc
2963
- end
2964
- end
2965
57
 
2966
- class LooseFeedParser < BetterSGMLParser
2967
- include FeedParserMixin
2968
- # We write the methods that were in BaseHTMLProcessor in the python code
2969
- # in here directly. We do this because if we inherited from
2970
- # BaseHTMLProcessor but then included from FeedParserMixin, the methods
2971
- # of Mixin would overwrite the methods we inherited from
2972
- # BaseHTMLProcessor. This is exactly the opposite of what we want to
2973
- # happen!
58
+ module FeedParser
59
+ Version = "0.9.9"
2974
60
 
2975
- attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse
61
+ License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
2976
62
 
2977
- Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
2978
- 'img', 'input', 'isindex', 'link', 'meta', 'param']
2979
- New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
2980
- alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.
2981
- def feed
2982
- @feeddata
2983
- end
2984
- def feed=(data)
2985
- @feeddata = data
2986
- end
63
+ Redistribution and use in source and binary forms, with or without modification,
64
+ are permitted provided that the following conditions are met:
2987
65
 
2988
- def initialize(baseuri, baselang, encoding)
2989
- startup(baseuri, baselang, encoding)
2990
- super() # Keep the parentheses! No touchy.
2991
- end
66
+ * Redistributions of source code must retain the above copyright notice,
67
+ this list of conditions and the following disclaimer.
68
+ * Redistributions in binary form must reproduce the above copyright notice,
69
+ this list of conditions and the following disclaimer in the documentation
70
+ and/or other materials provided with the distribution.
2992
71
 
2993
- def reset
2994
- @pieces = []
2995
- super
2996
- end
72
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
73
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
74
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
75
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
76
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
77
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
78
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
79
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
80
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82
+ POSSIBILITY OF SUCH DAMAGE."""
2997
83
 
2998
- def parse(data)
2999
- data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
3000
- data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
3001
- clean = tag[1..-3].strip
3002
- if Elements_No_End_Tag.include?clean
3003
- tag
3004
- else
3005
- '<'+clean+'></'+clean+'>'
3006
- end
3007
- end
84
+ Author = "Jeff Hodges <http://somethingsimilar.com>"
85
+ Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
86
+ Contributors = [ "Jason Diamond <http://injektilo.org/>",
87
+ "John Beimler <http://john.beimler.org/>",
88
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
+ "Aaron Swartz <http://aaronsw.com/>",
90
+ "Kevin Marks <http://epeus.blogspot.com/>"
91
+ ]
92
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
93
+ # If you are embedding feedparser in a larger application, you should
94
+ # change this to your application name and URL.
95
+ USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
3008
96
 
3009
- data.gsub!(/&#39;/, "'")
3010
- data.gsub!(/&#34;/, "'")
3011
- if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
3012
- data = uconvert(data,'utf-8',@encoding)
3013
- end
3014
- sgml_feed(data) # see the alias above
3015
- end
97
+ # HTTP "Accept" header to send to servers when downloading feeds. If you don't
98
+ # want to send an Accept header, set this to None.
99
+ ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
3016
100
 
3017
101
 
3018
- def decodeEntities(element, data)
3019
- data.gsub!('&#60;', '&lt;')
3020
- data.gsub!('&#x3c;', '&lt;')
3021
- data.gsub!('&#62;', '&gt;')
3022
- data.gsub!('&#x3e;', '&gt;')
3023
- data.gsub!('&#38;', '&amp;')
3024
- data.gsub!('&#x26;', '&amp;')
3025
- data.gsub!('&#34;', '&quot;')
3026
- data.gsub!('&#x22;', '&quot;')
3027
- data.gsub!('&#39;', '&apos;')
3028
- data.gsub!('&#x27;', '&apos;')
3029
- if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
3030
- data.gsub!('&lt;', '<')
3031
- data.gsub!('&gt;', '>')
3032
- data.gsub!('&amp;', '&')
3033
- data.gsub!('&quot;', '"')
3034
- data.gsub!('&apos;', "'")
3035
- end
3036
- return data
3037
- end
3038
- end
102
+ # If you want feedparser to automatically run HTML markup through HTML Tidy, set
103
+ # this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
104
+ # or utidylib <http://utidylib.berlios.de/>.
105
+ #TIDY_MARKUP = false #FIXME untranslated
3039
106
 
3040
- def FeedParser.resolveRelativeURIs(htmlSource, baseURI, encoding)
3041
- $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
3042
- relative_uris = [ ['a','href'],
3043
- ['applet','codebase'],
3044
- ['area','href'],
3045
- ['blockquote','cite'],
3046
- ['body','background'],
3047
- ['del','cite'],
3048
- ['form','action'],
3049
- ['frame','longdesc'],
3050
- ['frame','src'],
3051
- ['iframe','longdesc'],
3052
- ['iframe','src'],
3053
- ['head','profile'],
3054
- ['img','longdesc'],
3055
- ['img','src'],
3056
- ['img','usemap'],
3057
- ['input','src'],
3058
- ['input','usemap'],
3059
- ['ins','cite'],
3060
- ['link','href'],
3061
- ['object','classid'],
3062
- ['object','codebase'],
3063
- ['object','data'],
3064
- ['object','usemap'],
3065
- ['q','cite'],
3066
- ['script','src'],
3067
- ]
3068
- h = Hpricot(htmlSource)
3069
- relative_uris.each do |l|
3070
- ename, eattr = l
3071
- h.search(ename).each do |elem|
3072
- euri = elem.attributes[eattr]
3073
- if euri and not euri.empty? and URI.parse(euri).relative?
3074
- elem.attributes[eattr] = urljoin(baseURI, euri)
3075
- end
3076
- end
3077
- end
3078
- return h.to_html
3079
- end
107
+ # List of Python interfaces for HTML Tidy, in order of preference. Only useful
108
+ # if TIDY_MARKUP = true
109
+ #PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
3080
110
 
3081
- class SanitizerDoc < Hpricot::Doc
3082
-
3083
- def scrub
3084
- traverse_all_element do |e|
3085
- if e.elem?
3086
- if Acceptable_Elements.include?e.name
3087
- e.strip_attributes
3088
- else
3089
- if Unacceptable_Elements_With_End_Tag.include?e.name
3090
- e.inner_html = ''
3091
- end
3092
- e.swap(SanitizerDoc.new(e.children).scrub.to_html)
3093
- # This works because the children swapped in are brought in "after" the current element.
3094
- end
3095
- elsif e.doctype?
3096
- e.parent.children.delete(e)
3097
- elsif e.text?
3098
- ets = e.to_s
3099
- ets.gsub!(/&#39;/, "'")
3100
- ets.gsub!(/&#34;/, '"')
3101
- ets.gsub!(/\r/,'')
3102
- e.swap(ets)
3103
- else
3104
- end
3105
- end
3106
- # yes, that '/' should be there. It's a search method. See the Hpricot docs.
3107
111
 
3108
- unless $compatible # FIXME not properly recursive, see comment in recursive_strip
3109
- (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
3110
- end
3111
- return self
3112
- end
112
+ # ---------- don't touch these ----------
113
+ class ThingsNobodyCaresAboutButMe < Exception
3113
114
  end
3114
-
3115
- def SanitizerDoc(html)
3116
- FeedParser::SanitizerDoc.new(Hpricot.make(html))
115
+ class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe
3117
116
  end
3118
- module_function(:SanitizerDoc)
3119
- def self.sanitizeHTML(html,encoding)
3120
- # FIXME Tidy not yet supported
3121
- html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
3122
- h = SanitizerDoc(html)
3123
- h = h.scrub
3124
- return h.to_html.strip
117
+ class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe
3125
118
  end
3126
-
3127
-
3128
-
3129
- def self.getCharacterEncoding(feed, xml_data)
3130
- # Get the character encoding of the XML document
3131
- $stderr << "In getCharacterEncoding\n" if $debug
3132
- sniffed_xml_encoding = nil
3133
- xml_encoding = nil
3134
- true_encoding = nil
3135
- begin
3136
- http_headers = feed.meta
3137
- http_content_type = feed.meta['content-type'].split(';')[0]
3138
- encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
3139
- http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
3140
- http_encoding = nil if http_encoding.empty?
3141
- # FIXME Open-Uri returns iso8859-1 if there is no charset header,
3142
- # but that doesn't pass the tests. Open-Uri claims its following
3143
- # the right RFC. Are they wrong or do we need to change the tests?
3144
- rescue NoMethodError
3145
- http_headers = {}
3146
- http_content_type = nil
3147
- http_encoding = nil
3148
- end
3149
- # Must sniff for non-ASCII-compatible character encodings before
3150
- # searching for XML declaration. This heuristic is defined in
3151
- # section F of the XML specification:
3152
- # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3153
- begin
3154
- if xml_data[0..3] == "\x4c\x6f\xa7\x94"
3155
- # EBCDIC
3156
- xml_data = _ebcdic_to_ascii(xml_data)
3157
- elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
3158
- # UTF-16BE
3159
- sniffed_xml_encoding = 'utf-16be'
3160
- xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
3161
- elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
3162
- # UTF-16BE with BOM
3163
- sniffed_xml_encoding = 'utf-16be'
3164
- xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
3165
- elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
3166
- # UTF-16LE
3167
- sniffed_xml_encoding = 'utf-16le'
3168
- xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
3169
- elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
3170
- # UTF-16LE with BOM
3171
- sniffed_xml_encoding = 'utf-16le'
3172
- xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
3173
- elsif xml_data[0..3] == "\x00\x00\x00\x3c"
3174
- # UTF-32BE
3175
- sniffed_xml_encoding = 'utf-32be'
3176
- xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
3177
- elsif xml_data[0..3] == "\x3c\x00\x00\x00"
3178
- # UTF-32LE
3179
- sniffed_xml_encoding = 'utf-32le'
3180
- xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
3181
- elsif xml_data[0..3] == "\x00\x00\xfe\xff"
3182
- # UTF-32BE with BOM
3183
- sniffed_xml_encoding = 'utf-32be'
3184
- xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
3185
- elsif xml_data[0..3] == "\xff\xfe\x00\x00"
3186
- # UTF-32LE with BOM
3187
- sniffed_xml_encoding = 'utf-32le'
3188
- xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
3189
- elsif xml_data[0..2] == "\xef\xbb\xbf"
3190
- # UTF-8 with BOM
3191
- sniffed_xml_encoding = 'utf-8'
3192
- xml_data = xml_data[3..-1]
3193
- else
3194
- # ASCII-compatible
3195
- end
3196
- xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
3197
- rescue
3198
- xml_encoding_match = nil
3199
- end
3200
- if xml_encoding_match
3201
- xml_encoding = xml_encoding_match[1].downcase
3202
- xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
3203
- if sniffed_xml_encoding and xencodings.include?xml_encoding
3204
- xml_encoding = sniffed_xml_encoding
3205
- end
3206
- end
3207
-
3208
- acceptable_content_type = false
3209
- application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
3210
- text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
3211
-
3212
- if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
3213
- acceptable_content_type = true
3214
- true_encoding = http_encoding || xml_encoding || 'utf-8'
3215
- elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
3216
- acceptable_content_type = true
3217
- true_encoding = http_encoding || 'us-ascii'
3218
- elsif /^text\// =~ http_content_type
3219
- true_encoding = http_encoding || 'us-ascii'
3220
- elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
3221
- true_encoding = xml_encoding || 'iso-8859-1'
3222
- else
3223
- true_encoding = xml_encoding || 'utf-8'
3224
- end
3225
- return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
119
+ class NonXMLContentType < ThingsNobodyCaresAboutButMe
3226
120
  end
3227
-
3228
- def self.toUTF8(data, encoding)
3229
- =begin
3230
- Changes an XML data stream on the fly to specify a new encoding
3231
-
3232
- data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
3233
- encoding is a string recognized by encodings.aliases
3234
- =end
3235
- $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
3236
- # NOTE we must use double quotes when dealing with \x encodings!
3237
- if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
3238
- if $debug
3239
- $stderr << "stripping BOM\n"
3240
- if encoding != 'utf-16be'
3241
- $stderr << "string utf-16be instead\n"
3242
- end
3243
- end
3244
- encoding = 'utf-16be'
3245
- data = data[2..-1]
3246
- elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
3247
- if $debug
3248
- $stderr << "stripping BOM\n"
3249
- $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
3250
- end
3251
- encoding = 'utf-16le'
3252
- data = data[2..-1]
3253
- elsif (data[0..2] == "\xef\xbb\xbf")
3254
- if $debug
3255
- $stderr << "stripping BOM\n"
3256
- $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
3257
- end
3258
- encoding = 'utf-8'
3259
- data = data[3..-1]
3260
- elsif (data[0..3] == "\x00\x00\xfe\xff")
3261
- if $debug
3262
- $stderr << "stripping BOM\n"
3263
- if encoding != 'utf-32be'
3264
- $stderr << "trying utf-32be instead\n"
3265
- end
3266
- end
3267
- encoding = 'utf-32be'
3268
- data = data[4..-1]
3269
- elsif (data[0..3] == "\xff\xfe\x00\x00")
3270
- if $debug
3271
- $stderr << "stripping BOM\n"
3272
- if encoding != 'utf-32le'
3273
- $stderr << "trying utf-32le instead\n"
3274
- end
3275
- end
3276
- encoding = 'utf-32le'
3277
- data = data[4..-1]
3278
- end
3279
- begin
3280
- newdata = uconvert(data, encoding, 'utf-8')
3281
- rescue => details
3282
- end
3283
- $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
3284
- declmatch = /^<\?xml[^>]*?>/
3285
- newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
3286
- if declmatch =~ newdata
3287
- newdata.sub!(declmatch, newdecl)
3288
- else
3289
- newdata = newdecl + "\n" + newdata
3290
- end
3291
- return newdata
121
+ class UndeclaredNamespace < Exception
3292
122
  end
3293
123
 
3294
- def self.stripDoctype(data)
3295
- =begin
3296
- Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3297
-
3298
- rss_version may be 'rss091n' or None
3299
- stripped_data is the same XML document, minus the DOCTYPE
3300
- =end
3301
- entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
3302
- data = data.gsub(entity_pattern,'')
3303
-
3304
- doctype_pattern = /<!DOCTYPE(.*?)>/m
3305
- doctype_results = data.scan(doctype_pattern)
3306
- if doctype_results and doctype_results[0]
3307
- doctype = doctype_results[0][0]
3308
- else
3309
- doctype = ''
3310
- end
3311
-
3312
- if /netscape/ =~ doctype.downcase
3313
- version = 'rss091n'
3314
- else
3315
- version = nil
3316
- end
3317
- data = data.sub(doctype_pattern, '')
3318
- return version, data
3319
- end
3320
124
 
3321
- def parse(*args); FeedParser.parse(*args); end
3322
- def FeedParser.parse(furi, options={})
125
+ SUPPORTED_VERSIONS = {'' => 'unknown',
126
+ 'rss090' => 'RSS 0.90',
127
+ 'rss091n' => 'RSS 0.91 (Netscape)',
128
+ 'rss091u' => 'RSS 0.91 (Userland)',
129
+ 'rss092' => 'RSS 0.92',
130
+ 'rss093' => 'RSS 0.93',
131
+ 'rss094' => 'RSS 0.94',
132
+ 'rss20' => 'RSS 2.0',
133
+ 'rss10' => 'RSS 1.0',
134
+ 'rss' => 'RSS (unknown version)',
135
+ 'atom01' => 'Atom 0.1',
136
+ 'atom02' => 'Atom 0.2',
137
+ 'atom03' => 'Atom 0.3',
138
+ 'atom10' => 'Atom 1.0',
139
+ 'atom' => 'Atom (unknown version)',
140
+ 'cdf' => 'CDF',
141
+ 'hotrss' => 'Hot RSS'
142
+ }
143
+
144
+ def parse(furi, options = {})
3323
145
  # Parse a feed from a URL, file, stream or string
3324
146
  $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
147
+ strictklass = options[:strict] || StrictFeedParser
148
+ looseklass = options[:loose] || LooseFeedParser
3325
149
  result = FeedParserDict.new
3326
150
  result['feed'] = FeedParserDict.new
3327
151
  result['entries'] = []
@@ -3331,13 +155,12 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3331
155
  end
3332
156
  result['bozo'] = false
3333
157
  handlers = options[:handlers]
3334
-
3335
158
  if handlers.class != Array # FIXME why does this happen?
3336
159
  handlers = [handlers]
3337
160
  end
3338
161
 
3339
162
  begin
3340
- if URI::parse(furi).class == URI::Generic
163
+ if File.exists?furi
3341
164
  f = open(furi) # OpenURI doesn't behave well when passing HTTP options to a file.
3342
165
  else
3343
166
  # And when you do pass them, make sure they aren't just nil (this still true?)
@@ -3504,7 +327,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3504
327
  if use_strict_parser
3505
328
  # initialize the SAX parser
3506
329
  saxparser = XML::SAX::Helpers::ParserFactory.makeParser("XML::Parser::SAXDriver")
3507
- feedparser = StrictFeedParser.new(baseuri, baselang, 'utf-8')
330
+ feedparser = strictklass.new(baseuri, baselang, 'utf-8')
3508
331
  saxparser.setDocumentHandler(feedparser)
3509
332
  saxparser.setDTDHandler(feedparser)
3510
333
  saxparser.setEntityResolver(feedparser)
@@ -3525,7 +348,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3525
348
  end
3526
349
  end
3527
350
  if not use_strict_parser
3528
- feedparser = LooseFeedParser.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
351
+ feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
3529
352
  feedparser.parse(data)
3530
353
  $stderr << "Using LooseFeed\n\n" if $debug
3531
354
  end
@@ -3535,6 +358,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3535
358
  result['namespaces'] = feedparser.namespacesInUse
3536
359
  return result
3537
360
  end
361
+ module_function(:parse)
3538
362
  end # End FeedParser module
3539
363
 
3540
364
  class Serializer
@@ -3574,7 +398,7 @@ class TextSerializer < Serializer
3574
398
  end
3575
399
  end
3576
400
 
3577
- class PprintSerializer < Serializer # FIXME ? use pp instead?
401
+ class PprintSerializer < Serializer # FIXME use pp instead
3578
402
  def write(stream = $stdout)
3579
403
  stream << @results['href'].to_s + "\n\n"
3580
404
  pp(@results)
@@ -3582,87 +406,88 @@ class PprintSerializer < Serializer # FIXME ? use pp instead?
3582
406
  end
3583
407
  end
3584
408
 
3585
-
3586
- require 'optparse'
3587
- require 'ostruct'
3588
- options = OpenStruct.new
3589
- options.etag = options.modified = options.agent = options.referrer = nil
3590
- options.content_language = options.content_location = options.ctype = nil
3591
- options.format = 'pprint'
3592
- options.compatible = $compatible
3593
- options.verbose = false
3594
-
3595
- opts = OptionParser.new do |opts|
3596
- opts.banner
3597
- opts.separator ""
3598
- opts.on("-A", "--user-agent [AGENT]",
409
+ if $0 == __FILE__
410
+ require 'optparse'
411
+ require 'ostruct'
412
+ options = OpenStruct.new
413
+ options.etag = options.modified = options.agent = options.referrer = nil
414
+ options.content_language = options.content_location = options.ctype = nil
415
+ options.format = 'pprint'
416
+ options.compatible = $compatible
417
+ options.verbose = false
418
+
419
+ opts = OptionParser.new do |opts|
420
+ opts.banner
421
+ opts.separator ""
422
+ opts.on("-A", "--user-agent [AGENT]",
3599
423
  "User-Agent for HTTP URLs") {|agent|
3600
- options.agent = agent
3601
- }
424
+ options.agent = agent
425
+ }
3602
426
 
3603
- opts.on("-e", "--referrer [URL]",
427
+ opts.on("-e", "--referrer [URL]",
3604
428
  "Referrer for HTTP URLs") {|referrer|
3605
- options.referrer = referrer
3606
- }
429
+ options.referrer = referrer
430
+ }
3607
431
 
3608
- opts.on("-t", "--etag [TAG]",
432
+ opts.on("-t", "--etag [TAG]",
3609
433
  "ETag/If-None-Match for HTTP URLs") {|etag|
3610
- options.etag = etag
3611
- }
434
+ options.etag = etag
435
+ }
3612
436
 
3613
- opts.on("-m", "--last-modified [DATE]",
437
+ opts.on("-m", "--last-modified [DATE]",
3614
438
  "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
3615
- options.modified = modified
3616
- }
439
+ options.modified = modified
440
+ }
3617
441
 
3618
- opts.on("-f", "--format [FORMAT]", [:text, :pprint],
442
+ opts.on("-f", "--format [FORMAT]", [:text, :pprint],
3619
443
  "output resutls in FORMAT (text, pprint)") {|format|
3620
- options.format = format
3621
- }
444
+ options.format = format
445
+ }
3622
446
 
3623
- opts.on("-v", "--[no-]verbose",
447
+ opts.on("-v", "--[no-]verbose",
3624
448
  "write debugging information to stderr") {|v|
3625
- options.verbose = v
3626
- }
449
+ options.verbose = v
450
+ }
3627
451
 
3628
- opts.on("-c", "--[no-]compatible",
452
+ opts.on("-c", "--[no-]compatible",
3629
453
  "strip element attributes like feedparser.py 4.1 (default)") {|comp|
3630
- options.compatible = comp
3631
- }
3632
- opts.on("-l", "--content-location [LOCATION]",
454
+ options.compatible = comp
455
+ }
456
+ opts.on("-l", "--content-location [LOCATION]",
3633
457
  "default Content-Location HTTP header") {|loc|
3634
- options.content_location = loc
3635
- }
3636
- opts.on("-a", "--content-language [LANG]",
458
+ options.content_location = loc
459
+ }
460
+ opts.on("-a", "--content-language [LANG]",
3637
461
  "default Content-Language HTTP header") {|lang|
3638
- options.content_language = lang
3639
- }
3640
- opts.on("-t", "--content-type [TYPE]",
462
+ options.content_language = lang
463
+ }
464
+ opts.on("-t", "--content-type [TYPE]",
3641
465
  "default Content-type HTTP header") {|ctype|
3642
- options.ctype = ctype
3643
- }
3644
- end
466
+ options.ctype = ctype
467
+ }
468
+ end
3645
469
 
3646
- opts.parse!(ARGV)
3647
- $debug = true if options.verbose
3648
- $compatible = options.compatible unless options.compatible.nil?
470
+ opts.parse!(ARGV)
471
+ $debug = true if options.verbose
472
+ $compatible = options.compatible unless options.compatible.nil?
3649
473
 
3650
- if options.format == :text
3651
- serializer = TextSerializer
3652
- else
3653
- serializer = PprintSerializer
3654
- end
3655
- args = *ARGV.dup
3656
- unless args.nil?
3657
- args.each do |url| # opts.parse! removes everything but the urls from the command line
3658
- results = FeedParser.parse(url, :etag => options.etag,
3659
- :modified => options.modified,
3660
- :agent => options.agent,
3661
- :referrer => options.referrer,
3662
- :content_location => options.content_location,
3663
- :content_language => options.content_language,
3664
- :content_type => options.ctype
3665
- )
3666
- serializer.new(results).write($stdout)
474
+ if options.format == :text
475
+ serializer = TextSerializer
476
+ else
477
+ serializer = PprintSerializer
478
+ end
479
+ args = *ARGV.dup
480
+ unless args.nil?
481
+ args.each do |url| # opts.parse! removes everything but the urls from the command line
482
+ results = FeedParser.parse(url, :etag => options.etag,
483
+ :modified => options.modified,
484
+ :agent => options.agent,
485
+ :referrer => options.referrer,
486
+ :content_location => options.content_location,
487
+ :content_language => options.content_language,
488
+ :content_type => options.ctype
489
+ )
490
+ serializer.new(results).write($stdout)
491
+ end
3667
492
  end
3668
493
  end