rfeedparser 0.9.8 → 0.9.9

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rfeedparser.rb CHANGED
@@ -14,3314 +14,138 @@ require 'stringio'
14
14
  require 'uri'
15
15
  require 'cgi' # escaping html
16
16
  require 'time'
17
- require 'xml/saxdriver' # calling expat
18
17
  require 'pp'
19
18
  require 'rubygems'
20
19
  require 'base64'
21
20
  require 'iconv'
22
- gem 'hpricot', ">=0.5"
23
- gem 'character-encodings', ">=0.2.0"
24
- gem 'htmltools', ">=1.10"
25
- gem 'htmlentities', ">=4.0.0"
26
- gem 'activesupport', ">=1.4.2"
27
- gem 'rchardet', ">=1.0"
28
-
29
- require 'rchardet'
30
- $chardet = true
31
-
32
- require 'hpricot'
33
- require 'encoding/character/utf-8'
34
- require 'html/sgml-parser'
35
- require 'htmlentities'
36
- require 'active_support'
37
- require 'open-uri'
38
- include OpenURI
39
-
40
- $debug = false
41
- $compatible = true
42
-
43
- Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
44
- # ascii codec
45
- '646' => 'ascii',
46
- 'ansi_x3.4_1968' => 'ascii',
47
- 'ansi_x3_4_1968' => 'ascii', # some email headers use this non-standard name
48
- 'ansi_x3.4_1986' => 'ascii',
49
- 'cp367' => 'ascii',
50
- 'csascii' => 'ascii',
51
- 'ibm367' => 'ascii',
52
- 'iso646_us' => 'ascii',
53
- 'iso_646.irv_1991' => 'ascii',
54
- 'iso_ir_6' => 'ascii',
55
- 'us' => 'ascii',
56
- 'us_ascii' => 'ascii',
57
-
58
- # big5 codec
59
- 'big5_tw' => 'big5',
60
- 'csbig5' => 'big5',
61
-
62
- # big5hkscs codec
63
- 'big5_hkscs' => 'big5hkscs',
64
- 'hkscs' => 'big5hkscs',
65
-
66
- # cp037 codec
67
- '037' => 'cp037',
68
- 'csibm037' => 'cp037',
69
- 'ebcdic_cp_ca' => 'cp037',
70
- 'ebcdic_cp_nl' => 'cp037',
71
- 'ebcdic_cp_us' => 'cp037',
72
- 'ebcdic_cp_wt' => 'cp037',
73
- 'ibm037' => 'cp037',
74
- 'ibm039' => 'cp037',
75
-
76
- # cp1026 codec
77
- '1026' => 'cp1026',
78
- 'csibm1026' => 'cp1026',
79
- 'ibm1026' => 'cp1026',
80
-
81
- # cp1140 codec
82
- '1140' => 'cp1140',
83
- 'ibm1140' => 'cp1140',
84
-
85
- # cp1250 codec
86
- '1250' => 'cp1250',
87
- 'windows_1250' => 'cp1250',
88
-
89
- # cp1251 codec
90
- '1251' => 'cp1251',
91
- 'windows_1251' => 'cp1251',
92
-
93
- # cp1252 codec
94
- '1252' => 'cp1252',
95
- 'windows_1252' => 'cp1252',
96
-
97
- # cp1253 codec
98
- '1253' => 'cp1253',
99
- 'windows_1253' => 'cp1253',
100
-
101
- # cp1254 codec
102
- '1254' => 'cp1254',
103
- 'windows_1254' => 'cp1254',
104
-
105
- # cp1255 codec
106
- '1255' => 'cp1255',
107
- 'windows_1255' => 'cp1255',
108
-
109
- # cp1256 codec
110
- '1256' => 'cp1256',
111
- 'windows_1256' => 'cp1256',
112
-
113
- # cp1257 codec
114
- '1257' => 'cp1257',
115
- 'windows_1257' => 'cp1257',
116
-
117
- # cp1258 codec
118
- '1258' => 'cp1258',
119
- 'windows_1258' => 'cp1258',
120
-
121
- # cp424 codec
122
- '424' => 'cp424',
123
- 'csibm424' => 'cp424',
124
- 'ebcdic_cp_he' => 'cp424',
125
- 'ibm424' => 'cp424',
126
-
127
- # cp437 codec
128
- '437' => 'cp437',
129
- 'cspc8codepage437' => 'cp437',
130
- 'ibm437' => 'cp437',
131
-
132
- # cp500 codec
133
- '500' => 'cp500',
134
- 'csibm500' => 'cp500',
135
- 'ebcdic_cp_be' => 'cp500',
136
- 'ebcdic_cp_ch' => 'cp500',
137
- 'ibm500' => 'cp500',
138
-
139
- # cp775 codec
140
- '775' => 'cp775',
141
- 'cspc775baltic' => 'cp775',
142
- 'ibm775' => 'cp775',
143
-
144
- # cp850 codec
145
- '850' => 'cp850',
146
- 'cspc850multilingual' => 'cp850',
147
- 'ibm850' => 'cp850',
148
-
149
- # cp852 codec
150
- '852' => 'cp852',
151
- 'cspcp852' => 'cp852',
152
- 'ibm852' => 'cp852',
153
-
154
- # cp855 codec
155
- '855' => 'cp855',
156
- 'csibm855' => 'cp855',
157
- 'ibm855' => 'cp855',
158
-
159
- # cp857 codec
160
- '857' => 'cp857',
161
- 'csibm857' => 'cp857',
162
- 'ibm857' => 'cp857',
163
-
164
- # cp860 codec
165
- '860' => 'cp860',
166
- 'csibm860' => 'cp860',
167
- 'ibm860' => 'cp860',
168
-
169
- # cp861 codec
170
- '861' => 'cp861',
171
- 'cp_is' => 'cp861',
172
- 'csibm861' => 'cp861',
173
- 'ibm861' => 'cp861',
174
-
175
- # cp862 codec
176
- '862' => 'cp862',
177
- 'cspc862latinhebrew' => 'cp862',
178
- 'ibm862' => 'cp862',
179
-
180
- # cp863 codec
181
- '863' => 'cp863',
182
- 'csibm863' => 'cp863',
183
- 'ibm863' => 'cp863',
184
-
185
- # cp864 codec
186
- '864' => 'cp864',
187
- 'csibm864' => 'cp864',
188
- 'ibm864' => 'cp864',
189
-
190
- # cp865 codec
191
- '865' => 'cp865',
192
- 'csibm865' => 'cp865',
193
- 'ibm865' => 'cp865',
194
-
195
- # cp866 codec
196
- '866' => 'cp866',
197
- 'csibm866' => 'cp866',
198
- 'ibm866' => 'cp866',
199
-
200
- # cp869 codec
201
- '869' => 'cp869',
202
- 'cp_gr' => 'cp869',
203
- 'csibm869' => 'cp869',
204
- 'ibm869' => 'cp869',
205
-
206
- # cp932 codec
207
- '932' => 'cp932',
208
- 'ms932' => 'cp932',
209
- 'mskanji' => 'cp932',
210
- 'ms_kanji' => 'cp932',
211
-
212
- # cp949 codec
213
- '949' => 'cp949',
214
- 'ms949' => 'cp949',
215
- 'uhc' => 'cp949',
216
-
217
- # cp950 codec
218
- '950' => 'cp950',
219
- 'ms950' => 'cp950',
220
-
221
- # euc_jp codec
222
- 'euc_jp' => 'euc-jp',
223
- 'eucjp' => 'euc-jp',
224
- 'ujis' => 'euc-jp',
225
- 'u_jis' => 'euc-jp',
226
-
227
- # euc_kr codec
228
- 'euc_kr' => 'euc-kr',
229
- 'euckr' => 'euc-kr',
230
- 'korean' => 'euc-kr',
231
- 'ksc5601' => 'euc-kr',
232
- 'ks_c_5601' => 'euc-kr',
233
- 'ks_c_5601_1987' => 'euc-kr',
234
- 'ksx1001' => 'euc-kr',
235
- 'ks_x_1001' => 'euc-kr',
236
-
237
- # gb18030 codec
238
- 'gb18030_2000' => 'gb18030',
239
-
240
- # gb2312 codec
241
- 'chinese' => 'gb2312',
242
- 'csiso58gb231280' => 'gb2312',
243
- 'euc_cn' => 'gb2312',
244
- 'euccn' => 'gb2312',
245
- 'eucgb2312_cn' => 'gb2312',
246
- 'gb2312_1980' => 'gb2312',
247
- 'gb2312_80' => 'gb2312',
248
- 'iso_ir_58' => 'gb2312',
249
-
250
- # gbk codec
251
- '936' => 'gbk',
252
- 'cp936' => 'gbk',
253
- 'ms936' => 'gbk',
254
-
255
- # hp-roman8 codec
256
- 'hp_roman8' => 'hp-roman8',
257
- 'roman8' => 'hp-roman8',
258
- 'r8' => 'hp-roman8',
259
- 'csHPRoman8' => 'hp-roman8',
260
-
261
- # iso2022_jp codec
262
- 'iso2022_jp' => 'iso-2022-jp',
263
- 'csiso2022jp' => 'iso-2022-jp',
264
- 'iso2022jp' => 'iso-2022-jp',
265
- 'iso_2022_jp' => 'iso-2022-jp',
266
-
267
- # iso2022_jp_1 codec
268
- 'iso2002_jp_1' => 'iso-2022-jp-1',
269
- 'iso2022jp_1' => 'iso-2022-jp-1',
270
- 'iso_2022_jp_1' => 'iso-2022-jp-1',
271
-
272
- # iso2022_jp_2 codec
273
- 'iso2022_jp_2' => 'iso-2002-jp-2',
274
- 'iso2022jp_2' => 'iso-2022-jp-2',
275
- 'iso_2022_jp_2' => 'iso-2022-jp-2',
276
-
277
- # iso2022_jp_3 codec
278
- 'iso2002_jp_3' => 'iso-2022-jp-3',
279
- 'iso2022jp_3' => 'iso-2022-jp-3',
280
- 'iso_2022_jp_3' => 'iso-2022-jp-3',
281
-
282
- # iso2022_kr codec
283
- 'iso2022_kr' => 'iso-2022-kr',
284
- 'csiso2022kr' => 'iso-2022-kr',
285
- 'iso2022kr' => 'iso-2022-kr',
286
- 'iso_2022_kr' => 'iso-2022-kr',
287
-
288
- # iso8859_10 codec
289
- 'iso8859_10' => 'iso-8859-10',
290
- 'csisolatin6' => 'iso-8859-10',
291
- 'iso_8859_10' => 'iso-8859-10',
292
- 'iso_8859_10_1992' => 'iso-8859-10',
293
- 'iso_ir_157' => 'iso-8859-10',
294
- 'l6' => 'iso-8859-10',
295
- 'latin6' => 'iso-8859-10',
296
-
297
- # iso8859_13 codec
298
- 'iso8859_13' => 'iso-8859-13',
299
- 'iso_8859_13' => 'iso-8859-13',
300
-
301
- # iso8859_14 codec
302
- 'iso8859_14' => 'iso-8859-14',
303
- 'iso_8859_14' => 'iso-8859-14',
304
- 'iso_8859_14_1998' => 'iso-8859-14',
305
- 'iso_celtic' => 'iso-8859-14',
306
- 'iso_ir_199' => 'iso-8859-14',
307
- 'l8' => 'iso-8859-14',
308
- 'latin8' => 'iso-8859-14',
309
-
310
- # iso8859_15 codec
311
- 'iso8859_15' => 'iso-8859-15',
312
- 'iso_8859_15' => 'iso-8859-15',
313
-
314
- # iso8859_1 codec
315
- 'latin_1' => 'iso-8859-1',
316
- 'cp819' => 'iso-8859-1',
317
- 'csisolatin1' => 'iso-8859-1',
318
- 'ibm819' => 'iso-8859-1',
319
- 'iso8859' => 'iso-8859-1',
320
- 'iso_8859_1' => 'iso-8859-1',
321
- 'iso_8859_1_1987' => 'iso-8859-1',
322
- 'iso_ir_100' => 'iso-8859-1',
323
- 'l1' => 'iso-8859-1',
324
- 'latin' => 'iso-8859-1',
325
- 'latin1' => 'iso-8859-1',
326
-
327
- # iso8859_2 codec
328
- 'iso8859_2' => 'iso-8859-2',
329
- 'csisolatin2' => 'iso-8859-2',
330
- 'iso_8859_2' => 'iso-8859-2',
331
- 'iso_8859_2_1987' => 'iso-8859-2',
332
- 'iso_ir_101' => 'iso-8859-2',
333
- 'l2' => 'iso-8859-2',
334
- 'latin2' => 'iso-8859-2',
335
-
336
- # iso8859_3 codec
337
- 'iso8859_3' => 'iso-8859-3',
338
- 'csisolatin3' => 'iso-8859-3',
339
- 'iso_8859_3' => 'iso-8859-3',
340
- 'iso_8859_3_1988' => 'iso-8859-3',
341
- 'iso_ir_109' => 'iso-8859-3',
342
- 'l3' => 'iso-8859-3',
343
- 'latin3' => 'iso-8859-3',
344
-
345
- # iso8859_4 codec
346
- 'iso8849_4' => 'iso-8859-4',
347
- 'csisolatin4' => 'iso-8859-4',
348
- 'iso_8859_4' => 'iso-8859-4',
349
- 'iso_8859_4_1988' => 'iso-8859-4',
350
- 'iso_ir_110' => 'iso-8859-4',
351
- 'l4' => 'iso-8859-4',
352
- 'latin4' => 'iso-8859-4',
353
-
354
- # iso8859_5 codec
355
- 'iso8859_5' => 'iso-8859-5',
356
- 'csisolatincyrillic' => 'iso-8859-5',
357
- 'cyrillic' => 'iso-8859-5',
358
- 'iso_8859_5' => 'iso-8859-5',
359
- 'iso_8859_5_1988' => 'iso-8859-5',
360
- 'iso_ir_144' => 'iso-8859-5',
361
-
362
- # iso8859_6 codec
363
- 'iso8859_6' => 'iso-8859-6',
364
- 'arabic' => 'iso-8859-6',
365
- 'asmo_708' => 'iso-8859-6',
366
- 'csisolatinarabic' => 'iso-8859-6',
367
- 'ecma_114' => 'iso-8859-6',
368
- 'iso_8859_6' => 'iso-8859-6',
369
- 'iso_8859_6_1987' => 'iso-8859-6',
370
- 'iso_ir_127' => 'iso-8859-6',
371
-
372
- # iso8859_7 codec
373
- 'iso8859_7' => 'iso-8859-7',
374
- 'csisolatingreek' => 'iso-8859-7',
375
- 'ecma_118' => 'iso-8859-7',
376
- 'elot_928' => 'iso-8859-7',
377
- 'greek' => 'iso-8859-7',
378
- 'greek8' => 'iso-8859-7',
379
- 'iso_8859_7' => 'iso-8859-7',
380
- 'iso_8859_7_1987' => 'iso-8859-7',
381
- 'iso_ir_126' => 'iso-8859-7',
382
-
383
- # iso8859_8 codec
384
- 'iso8859_9' => 'iso8859_8',
385
- 'csisolatinhebrew' => 'iso-8859-8',
386
- 'hebrew' => 'iso-8859-8',
387
- 'iso_8859_8' => 'iso-8859-8',
388
- 'iso_8859_8_1988' => 'iso-8859-8',
389
- 'iso_ir_138' => 'iso-8859-8',
390
-
391
- # iso8859_9 codec
392
- 'iso8859_9' => 'iso-8859-9',
393
- 'csisolatin5' => 'iso-8859-9',
394
- 'iso_8859_9' => 'iso-8859-9',
395
- 'iso_8859_9_1989' => 'iso-8859-9',
396
- 'iso_ir_148' => 'iso-8859-9',
397
- 'l5' => 'iso-8859-9',
398
- 'latin5' => 'iso-8859-9',
399
-
400
- # iso8859_11 codec
401
- 'iso8859_11' => 'iso-8859-11',
402
- 'thai' => 'iso-8859-11',
403
- 'iso_8859_11' => 'iso-8859-11',
404
- 'iso_8859_11_2001' => 'iso-8859-11',
405
-
406
- # iso8859_16 codec
407
- 'iso8859_16' => 'iso-8859-16',
408
- 'iso_8859_16' => 'iso-8859-16',
409
- 'iso_8859_16_2001' => 'iso-8859-16',
410
- 'iso_ir_226' => 'iso-8859-16',
411
- 'l10' => 'iso-8859-16',
412
- 'latin10' => 'iso-8859-16',
413
-
414
- # cskoi8r codec
415
- 'koi8_r' => 'cskoi8r',
416
-
417
- # mac_cyrillic codec
418
- 'mac_cyrillic' => 'maccyrillic',
419
-
420
- # shift_jis codec
421
- 'csshiftjis' => 'shift_jis',
422
- 'shiftjis' => 'shift_jis',
423
- 'sjis' => 'shift_jis',
424
- 's_jis' => 'shift_jis',
425
-
426
- # shift_jisx0213 codec
427
- 'shiftjisx0213' => 'shift_jisx0213',
428
- 'sjisx0213' => 'shift_jisx0213',
429
- 's_jisx0213' => 'shift_jisx0213',
430
-
431
- # utf_16 codec
432
- 'utf_16' => 'utf-16',
433
- 'u16' => 'utf-16',
434
- 'utf16' => 'utf-16',
435
-
436
- # utf_16_be codec
437
- 'utf_16_be' => 'utf-16be',
438
- 'unicodebigunmarked' => 'utf-16be',
439
- 'utf_16be' => 'utf-16be',
440
-
441
- # utf_16_le codec
442
- 'utf_16_le' => 'utf-16le',
443
- 'unicodelittleunmarked' => 'utf-16le',
444
- 'utf_16le' => 'utf-16le',
445
-
446
- # utf_7 codec
447
- 'utf_7' => 'utf-7',
448
- 'u7' => 'utf-7',
449
- 'utf7' => 'utf-7',
450
-
451
- # utf_8 codec
452
- 'utf_8' => 'utf-8',
453
- 'u8' => 'utf-8',
454
- 'utf' => 'utf-8',
455
- 'utf8' => 'utf-8',
456
- 'utf8_ucs2' => 'utf-8',
457
- 'utf8_ucs4' => 'utf-8',
458
- }
459
-
460
- def unicode(data, from_encoding)
461
- # Takes a single string and converts it from the encoding in
462
- # from_encoding to unicode.
463
- uconvert(data, from_encoding, 'unicode')
464
- end
465
-
466
- def uconvert(data, from_encoding, to_encoding = 'utf-8')
467
- from_encoding = Encoding_Aliases[from_encoding] || from_encoding
468
- to_encoding = Encoding_Aliases[to_encoding] || to_encoding
469
- Iconv.iconv(to_encoding, from_encoding, data)[0]
470
- end
471
-
472
- def unichr(i)
473
- [i].pack('U*')
474
- end
475
-
476
- def index_match(stri,regexp, offset)
477
- if offset == 241
478
- end
479
- i = stri.index(regexp, offset)
480
-
481
- return nil, nil unless i
482
-
483
- full = stri[i..-1].match(regexp)
484
- return i, full
485
- end
486
-
487
- def _ebcdic_to_ascii(s)
488
- return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
489
- end
490
-
491
- def urljoin(base, uri)
492
- urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
493
- uri = uri.sub(urifixer, '\1\3')
494
- begin
495
- return URI.join(base, uri).to_s
496
- rescue URI::BadURIError => e
497
- if URI.parse(base).relative?
498
- return URI::parse(uri).to_s
499
- end
500
- end
501
- end
502
-
503
- def py2rtime(pytuple)
504
- Time.utc(pytuple[0..5])
505
- end
506
-
507
- # http://intertwingly.net/stories/2005/09/28/xchar.rb
508
- module XChar
509
- # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
510
- CP1252 = {
511
- 128 => 8364, # euro sign
512
- 130 => 8218, # single low-9 quotation mark
513
- 131 => 402, # latin small letter f with hook
514
- 132 => 8222, # double low-9 quotation mark
515
- 133 => 8230, # horizontal ellipsis
516
- 134 => 8224, # dagger
517
- 135 => 8225, # double dagger
518
- 136 => 710, # modifier letter circumflex accent
519
- 137 => 8240, # per mille sign
520
- 138 => 352, # latin capital letter s with caron
521
- 139 => 8249, # single left-pointing angle quotation mark
522
- 140 => 338, # latin capital ligature oe
523
- 142 => 381, # latin capital letter z with caron
524
- 145 => 8216, # left single quotation mark
525
- 146 => 8217, # right single quotation mark
526
- 147 => 8220, # left double quotation mark
527
- 148 => 8221, # right double quotation mark
528
- 149 => 8226, # bullet
529
- 150 => 8211, # en dash
530
- 151 => 8212, # em dash
531
- 152 => 732, # small tilde
532
- 153 => 8482, # trade mark sign
533
- 154 => 353, # latin small letter s with caron
534
- 155 => 8250, # single right-pointing angle quotation mark
535
- 156 => 339, # latin small ligature oe
536
- 158 => 382, # latin small letter z with caron
537
- 159 => 376} # latin capital letter y with diaeresis
538
-
539
- # http://www.w3.org/TR/REC-xml/#dt-chardata
540
- PREDEFINED = {
541
- 38 => '&', # ampersand
542
- 60 => '<', # left angle bracket
543
- 62 => '>'} # right angle bracket
544
-
545
- # http://www.w3.org/TR/REC-xml/#charsets
546
- VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
547
- (0xE000..0xFFFD), (0x10000..0x10FFFF)]
548
- end
549
-
550
- class Fixnum
551
- # xml escaped version of chr
552
- def xchr
553
- n = XChar::CP1252[self] || self
554
- n = 42 unless XChar::VALID.find {|range| range.include? n}
555
- XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
556
- end
557
- end
558
-
559
- class String
560
- alias :old_index :index
561
- def to_xs
562
- unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
563
- rescue
564
- unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
565
- end
566
- end
567
-
568
- class BetterSGMLParserError < Exception; end;
569
- class BetterSGMLParser < HTML::SGMLParser
570
- # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
571
- # This makes things work.
572
- Interesting = /[&<]/u
573
- Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
574
- '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
575
- '![^<>]*)?', 64) # 64 is the unicode flag
576
-
577
- Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
578
- Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
579
-
580
- Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
581
- Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
582
- Endtagopen = /<\//u # Matching the Python SGMLParser
583
- Endbracket = /[<>]/u
584
- Declopen = /<!/u
585
- Piopenbegin = /^<\?/u
586
- Piclose = />/u
587
-
588
- Commentopen = /<!--/u
589
- Commentclose = /--\s*>/u
590
- Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
591
- Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
592
- '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
593
- 64)
594
- Endtagfind = /\s*\/\s*>/u
595
- def initialize(verbose=false)
596
- super(verbose)
597
- end
598
- def feed(*args)
599
- super(*args)
600
- end
601
-
602
- def goahead(_end)
603
- rawdata = @rawdata # woo, utf-8 magic
604
- i = 0
605
- n = rawdata.length
606
- while i < n
607
- if @nomoretags
608
- # handle_data_range does nothing more than set a "Range" that is never used. wtf?
609
- handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
610
- i = n
611
- break
612
- end
613
- j = rawdata.index(Interesting, i)
614
- j = n unless j
615
- handle_data(rawdata[i...j]) if i < j
616
- i = j
617
- break if (i == n)
618
- if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
619
- if rawdata.index(Starttagopen,i) == i
620
- if @literal
621
- handle_data(rawdata[i..i])
622
- i = i+1
623
- next
624
- end
625
- k = parse_starttag(i)
626
- break unless k
627
- i = k
628
- next
629
- end
630
- if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
631
- k = parse_endtag(i)
632
- break unless k
633
- i = k
634
- @literal = false
635
- next
636
- end
637
- if @literal
638
- if n > (i+1)
639
- handle_data("<")
640
- i = i+1
641
- else
642
- #incomplete
643
- break
644
- end
645
- next
646
- end
647
- if rawdata.index(Commentopen,i) == i
648
- k = parse_comment(i)
649
- break unless k
650
- i = k
651
- next
652
- end
653
- if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
654
- k = parse_pi(i)
655
- break unless k
656
- i += k
657
- next
658
- end
659
- if rawdata.index(Declopen,i) == i
660
- # This is some sort of declaration; in "HTML as
661
- # deployed," this should only be the document type
662
- # declaration ("<!DOCTYPE html...>").
663
- k = parse_declaration(i)
664
- break unless k
665
- i = k
666
- next
667
- end
668
- elsif rawdata[i..i] == '&'
669
- if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
670
- handle_data(rawdata[i..i])
671
- i += 1
672
- next
673
- end
674
-
675
- # the Char must come first as its #=~ method is the only one that is UTF-8 safe
676
- ni,match = index_match(rawdata, Charref, i)
677
- if ni and ni == i # See? Ugly
678
- handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
679
- i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
680
- i -= 1 unless rawdata[i-1..i-1] == ";"
681
- next
682
- end
683
- ni,match = index_match(rawdata, Entityref, i)
684
- if ni and ni == i
685
- handle_entityref(match[1])
686
- i += match[0].length
687
- i -= 1 unless rawdata[i-1..i-1] == ";"
688
- next
689
- end
690
- else
691
- error('neither < nor & ??')
692
- end
693
- # We get here only if incomplete matches but
694
- # nothing else
695
- ni,match = index_match(rawdata,Incomplete,i)
696
- unless ni and ni == 0
697
- handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
698
- i += 1
699
- next
700
- end
701
- j = ni + match[0].length
702
- break if j == n # Really incomplete
703
- handle_data(rawdata[i...j])
704
- i = j
705
- end # end while
706
-
707
- if _end and i < n
708
- handle_data(rawdata[i...n])
709
- i = n
710
- end
711
-
712
- @rawdata = rawdata[i..-1]
713
- # @offset += i # FIXME BUGME another unused variable in SGMLParser?
714
- end
715
-
716
-
717
- # Internal -- parse processing instr, return length or -1 if not terminated
718
- def parse_pi(i)
719
- rawdata = @rawdata
720
- if rawdata[i...i+2] != '<?'
721
- error("unexpected call to parse_pi()")
722
- end
723
- ni,match = index_match(rawdata,Piclose,i+2)
724
- return nil unless match
725
- j = ni
726
- handle_pi(rawdata[i+2...j])
727
- j = (j + match[0].length)
728
- return j-i
729
- end
730
-
731
- def parse_comment(i)
732
- rawdata = @rawdata
733
- if rawdata[i...i+4] != "<!--"
734
- error("unexpected call to parse_comment()")
735
- end
736
- ni,match = index_match(rawdata, Commentclose,i)
737
- return nil unless match
738
- handle_comment(rawdata[i+4..(ni-1)])
739
- return ni+match[0].length # Length from i to just past the closing comment tag
740
- end
741
-
742
-
743
- def parse_starttag(i)
744
- @_starttag_text = nil
745
- start_pos = i
746
- rawdata = @rawdata
747
- ni,match = index_match(rawdata,Shorttagopen,i)
748
- if ni == i
749
- # SGML shorthand: <tag/data/ == <tag>data</tag>
750
- # XXX Can data contain &... (entity or char refs)?
751
- # XXX Can data contain < or > (tag characters)?
752
- # XXX Can there be whitespace before the first /?
753
- k,match = index_match(rawdata,Shorttag,i)
754
- return nil unless match
755
- tag, data = match[1], match[2]
756
- @_starttag_text = "<#{tag}/"
757
- tag.downcase!
758
- second_end = rawdata.index(Shorttagopen,k)
759
- finish_shorttag(tag, data)
760
- @_starttag_text = rawdata[start_pos...second_end+1]
761
- return k
762
- end
763
-
764
- j = rawdata.index(Endbracket, i+1)
765
- return nil unless j
766
- attrsd = []
767
- if rawdata[i...i+2] == '<>'
768
- # SGML shorthand: <> == <last open tag seen>
769
- k = j
770
- tag = @lasttag
771
- else
772
- ni,match = index_match(rawdata,Tagfind,i+1)
773
- unless match
774
- error('unexpected call to parse_starttag')
775
- end
776
- k = ni+match[0].length+1
777
- tag = match[0].downcase
778
- @lasttag = tag
779
- end
780
-
781
- while k < j
782
- break if rawdata.index(Endtagfind, k) == k
783
- ni,match = index_match(rawdata,Attrfind,k)
784
- break unless ni
785
- matched_length = match[0].length
786
- attrname, rest, attrvalue = match[1],match[2],match[3]
787
- if rest.nil? or rest.empty?
788
- attrvalue = '' # was: = attrname # Why the change?
789
- elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
790
- attrvalue = attrvalue[1...-1]
791
- end
792
- attrsd << [attrname.downcase, attrvalue]
793
- k += matched_length
794
- end
795
- if rawdata[j..j] == ">"
796
- j += 1
797
- end
798
- @_starttag_text = rawdata[start_pos...j]
799
- finish_starttag(tag, attrsd)
800
- return j
801
- end
802
-
803
- def parse_endtag(i)
804
- rawdata = @rawdata
805
- j, match = index_match(rawdata, /[<>]/,i+1)
806
- return nil unless j
807
- tag = rawdata[i+2...j].strip.downcase
808
- if rawdata[j..j] == ">"
809
- j += 1
810
- end
811
- finish_endtag(tag)
812
- return j
813
- end
814
-
815
- def output
816
- # Return processed HTML as a single string
817
- return @pieces.map{|p| p.to_s}.join
818
- end
819
-
820
- def error(message)
821
- raise BetterSGMLParserError.new(message)
822
- end
823
- def handle_pi(text)
824
- end
825
- def handle_decl(text)
826
- end
827
- end
828
-
829
- # Add some helper methods to make AttributeList (all of those damn attrs
830
- # and attrsD used by StrictFeedParser) act more like a Hash.
831
- # NOTE AttributeList is still Read-Only (AFAICT).
832
- # Monkey patching is terrible, and I have an addiction.
833
- module XML
834
- module SAX
835
- module AttributeList # in xml/sax.rb
836
- def [](key)
837
- getValue(key)
838
- end
839
-
840
- def each(&blk)
841
- (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
842
- end
843
-
844
- def each_key(&blk)
845
- (0...getLength).each{|pos| yield getName(pos) }
846
- end
847
-
848
- def each_value(&blk)
849
- (0...getLength).each{|pos| yield getValue(pos) }
850
- end
851
-
852
- def to_a # Rather use collect? grep for to_a.collect
853
- l = []
854
- each{|k,v| l << [k,v]}
855
- return l
856
- end
857
-
858
- def to_s
859
- l = []
860
- each{|k,v| l << "#{k} => #{v}"}
861
- "{ "+l.join(", ")+" }"
862
- end
863
- end
864
- end
865
- end
866
- # This adds a nice scrub method to Hpricot, so we don't need a _HTMLSanitizer class
867
- # http://underpantsgnome.com/2007/01/20/hpricot-scrub
868
- # I have modified it to check for attributes that are only allowed if they are in a certain tag
869
- module Hpricot
870
- Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
871
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
872
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
873
- 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
874
- 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
875
- 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
876
- 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
877
- 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
878
- 'ul', 'var'
879
- ]
880
-
881
- Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
882
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
883
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
884
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
885
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
886
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
887
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
888
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
889
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
890
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
891
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
892
- ]
893
-
894
- Unacceptable_Elements_With_End_Tag = ['script', 'applet']
895
-
896
- Acceptable_Css_Properties = ['azimuth', 'background-color',
897
- 'border-bottom-color', 'border-collapse', 'border-color',
898
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
899
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
900
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
901
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
902
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
903
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
904
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
905
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
906
- 'white-space', 'width'
907
- ]
908
-
909
- # survey of common keywords found in feeds
910
- Acceptable_Css_Keywords = ['auto', 'aqua', 'black', 'block', 'blue',
911
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
912
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
913
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
914
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
915
- 'transparent', 'underline', 'white', 'yellow'
916
- ]
917
-
918
- Mathml_Elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
919
- 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
920
- 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
921
- 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
922
- 'munderover', 'none'
923
- ]
924
-
925
- Mathml_Attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
926
- 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
927
- 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
928
- 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
929
- 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
930
- 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
931
- 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
932
- 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
933
- 'xlink:type', 'xmlns', 'xmlns:xlink'
934
- ]
935
-
936
- # svgtiny - foreignObject + linearGradient + radialGradient + stop
937
- Svg_Elements = ['a', 'animate', 'animateColor', 'animateMotion',
938
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
939
- 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
940
- 'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
941
- 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
942
- 'switch', 'text', 'title', 'use'
943
- ]
944
-
945
- # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
946
- Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
947
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
948
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
949
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
950
- 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
951
- 'font-size', 'font-stretch', 'font-style', 'font-variant',
952
- 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
953
- 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
954
- 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
955
- 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
956
- 'origin', 'overline-position', 'overline-thickness', 'panose-1',
957
- 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
958
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
959
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
960
- 'stop-color', 'stop-opacity', 'strikethrough-position',
961
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
962
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
963
- 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
964
- 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
965
- 'underline-position', 'underline-thickness', 'unicode',
966
- 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
967
- 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
968
- 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
969
- 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
970
- 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
971
- ]
972
-
973
- Svg_Attr_Map = nil
974
- Svg_Elem_Map = nil
975
-
976
- Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
977
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
978
- 'stroke-opacity'
979
- ]
980
-
981
- unless $compatible
982
- @@acceptable_tag_specific_attributes = {}
983
- @@mathml_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@mathml_attributes }
984
- @@svg_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@svg_attributes }
985
- end
986
-
987
- class Elements
988
- def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
989
- each { |x| x.strip(allowed_tags) }
990
- end
991
-
992
- def strip_attributes(safe=[])
993
- each { |x| x.strip_attributes(safe) }
994
- end
995
-
996
- def strip_style(ok_props=[], ok_keywords=[])
997
- each { |x| x.strip_style(ok_props, ok_keywords) }
998
- end
999
- end
1000
-
1001
- class Text
1002
- def strip(foo)
1003
- end
1004
- def strip_attributes(foo)
1005
- end
1006
- end
1007
- class Comment
1008
- def strip(foo)
1009
- end
1010
- def strip_attributes(foo)
1011
- end
1012
- end
1013
- class BogusETag
1014
- def strip(foo)
1015
- end
1016
- def strip_attributes(foo)
1017
- end
1018
- end
1019
-
1020
class Elem
  # Recursively decode entities in all child nodes.
  def decode_entities
    children.each { |child| child.decode_entities }
  end

  # Replace this element with the markup of its children (i.e. drop the
  # element itself but keep its contents).
  def cull
    swap(children.to_s) if children
  end

  # Strip this element when it declares a script/CSS payload.
  def strip
    cull if strip_removes?
  end

  # Drop every attribute that is not in the Acceptable_Attributes
  # whitelist.
  # NOTE(review): this removes attributes while iterating over the same
  # collection — appears to work with Hpricot's attribute store, but
  # worth confirming.
  def strip_attributes
    return if attributes.nil?
    attributes.each do |pair|
      remove_attribute(pair[0]) unless Acceptable_Attributes.include?(pair[0])
    end
  end

  # True when the element's type attribute marks it as script or CSS —
  # content that should be ripped out rather than kept.
  def strip_removes?
    attributes && attributes['type'] =~ /script|css/
  end
end
1052
- end
1053
-
1054
- module FeedParser
1055
# Library version string (carried over from the Python original).
Version = "0.1aleph_naught"

License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""

Author = "Jeff Hodges <http://somethingsimilar.com>"
Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
Contributors = [ "Jason Diamond <http://injektilo.org/>",
  "John Beimler <http://john.beimler.org/>",
  "Fazal Majid <http://www.majid.info/mylos/weblog/>",
  "Aaron Swartz <http://aaronsw.com/>",
  "Kevin Marks <http://epeus.blogspot.com/>"
]

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
# BUG FIX: the original interpolated @version, which is nil at module
# scope, yielding "UniversalFeedParser/ +..."; interpolate the Version
# constant instead.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % Version

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to nil.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
1096
-
1097
-
1098
# If you want feedparser to automatically run HTML markup through HTML
# Tidy, set this to true. Requires mxTidy or utidylib.
TIDY_MARKUP = false # FIXME untranslated

# Tidy interfaces from the Python original, in order of preference.
# Only consulted when TIDY_MARKUP is true.
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] # FIXME untranslated
1106
-
1107
- # The original Python import. I'm using it to help translate
1108
- #import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
1109
-
1110
-
1111
-
1112
- # ---------- don't touch these ----------
1113
# ---------- don't touch these ----------
# Parser-signalling exceptions.
# NOTE(review): these subclass Exception rather than StandardError, so a
# bare `rescue` will NOT catch them; kept as-is since callers may rely
# on exactly that.
class ThingsNobodyCaresAboutButMe < Exception; end
class CharacterEncodingOverride  < ThingsNobodyCaresAboutButMe; end
class CharacterEncodingUnknown   < ThingsNobodyCaresAboutButMe; end
class NonXMLContentType          < ThingsNobodyCaresAboutButMe; end
class UndeclaredNamespace        < Exception; end
1123
-
1124
-
1125
# Human-readable names for every feed format token this parser can emit.
SUPPORTED_VERSIONS = {
  ''        => 'unknown',
  'rss090'  => 'RSS 0.90',
  'rss091n' => 'RSS 0.91 (Netscape)',
  'rss091u' => 'RSS 0.91 (Userland)',
  'rss092'  => 'RSS 0.92',
  'rss093'  => 'RSS 0.93',
  'rss094'  => 'RSS 0.94',
  'rss20'   => 'RSS 2.0',
  'rss10'   => 'RSS 1.0',
  'rss'     => 'RSS (unknown version)',
  'atom01'  => 'Atom 0.1',
  'atom02'  => 'Atom 0.2',
  'atom03'  => 'Atom 0.3',
  'atom10'  => 'Atom 1.0',
  'atom'    => 'Atom (unknown version)',
  'cdf'     => 'CDF',
  'hotrss'  => 'Hot RSS'
}
1143
class FeedParserDict < Hash
=begin
    The naming of a certain common attribute (such as, "When was the last
    time this feed was updated?") can have many different names depending
    on the type of feed we are handling. This class allows us to use
    both the attribute name a person, who has knowledge of the kind of
    feed being parsed, expects, as well as allowing a developer to rely
    on one name to contain the proper attribute no matter what kind of
    feed is being parsed. @@keymaps is a Hash that contains information
    on what certain attributes "really is" in each feed type. It does so
    by providing a common name that will map to any feed type in the keys,
    with possible "correct" attributes in the its values. the #[] and #[]=
    methods check with keymaps to see what attribute the developer "really
    means" if they've asked for one which happens to be in @@keymap's keys.
=end
  # Alias table: "legacy" key => canonical key (or a list of candidate
  # canonical keys, tried in order by #[]).
  @@keymap = {'channel' => 'feed',
    'items' => 'entries',
    'guid' => 'id',
    'date' => 'updated',
    'date_parsed' => 'updated_parsed',
    'description' => ['subtitle', 'summary'],
    'url' => ['href'],
    'modified' => 'updated',
    'modified_parsed' => 'updated_parsed',
    'issued' => 'published',
    'issued_parsed' => 'published_parsed',
    'copyright' => 'rights',
    'copyright_detail' => 'rights_detail',
    'tagline' => 'subtitle',
    'tagline_detail' => 'subtitle_detail'}

  # Must be explicit: Hash already defines #entries, which would
  # otherwise shadow the stored 'entries' value.
  def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
    return self['entries']
  end

  # We could include the [] rewrite in new using Hash.new's fancy pants block thing
  # but we'd still have to overwrite []= and such.
  # Accepts a list of [key, value] pairs or a Hash to seed the dict.
  def initialize(pairs=nil)
    if pairs.class == Array and pairs[0].class == Array and pairs[0].length == 2
      pairs.each do |l|
        k,v = l
        self[k] = v
      end
    elsif pairs.class == Hash
      self.merge!(pairs)
    end
  end

  # Lookup with keymap fallback. 'category'/'categories' are synthesized
  # from the 'tags' list (NOTE(review): raises if 'tags' is absent).
  def [](key)
    if key == 'category'
      return self['tags'][0]['term']
    end
    if key == 'categories'
      return self['tags'].collect{|tag| [tag['scheme'],tag['term']]}
    end
    realkey = @@keymap[key] || key
    if realkey.class == Array
      realkey.each{ |key| return self[key] if has_key?key }
    end
    # Note that the original key is preferred over the realkey we (might
    # have) found in @@keymaps
    if has_key?(key)
      return super(key)
    end
    return super(realkey)
  end

  # Store under the canonical key; for multi-candidate aliases the first
  # candidate is used.
  def []=(key,value)
    if @@keymap.key?key
      key = @@keymap[key]
      if key.class == Array
        key = key[0]
      end
    end
    super(key,value)
  end

  # Attribute-style access: d.foo == d['foo'], d.foo = v == d['foo'] = v.
  # Names ending in '!'/'?' or starting with '_' are rejected.
  def method_missing(msym, *args)
    methodname = msym.to_s
    if methodname[-1] == '='
      return self[methodname[0..-2]] = args[0]
    elsif methodname[-1] != '!' and methodname[-1] != '?' and methodname[0] != "_" # FIXME implement with private
      return self[methodname]
    else
      raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
    end
  end
end
1231
-
1232
-
1233
-
1234
-
1235
- module FeedParserMixin
1236
- attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
1237
-
1238
# Initialize all per-parse state: the namespace tables, the lists that
# drive relative-URI resolution and HTML sanitization, the feed/entry
# accumulators, and the in-element tracking flags.
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
  $stderr << "initializing FeedParser\n" if $debug

  # Known namespace URI => canonical element prefix ('' = no prefix).
  @namespaces = {'' => '',
    'http://backend.userland.com/rss' => '',
    'http://blogs.law.harvard.edu/tech/rss' => '',
    'http://purl.org/rss/1.0/' => '',
    'http://my.netscape.com/rdf/simple/0.9/' => '',
    'http://example.com/newformat#' => '',
    'http://example.com/necho' => '',
    'http://purl.org/echo/' => '',
    'uri/of/echo/namespace#' => '',
    'http://purl.org/pie/' => '',
    'http://purl.org/atom/ns#' => '',
    'http://www.w3.org/2005/Atom' => '',
    'http://purl.org/rss/1.0/modules/rss091#' => '',
    'http://webns.net/mvcb/' => 'admin',
    'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
    'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
    'http://media.tangent.org/rss/1.0/' => 'audio',
    'http://backend.userland.com/blogChannelModule' => 'blogChannel',
    'http://web.resource.org/cc/' => 'cc',
    'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    'http://purl.org/rss/1.0/modules/company' => 'co',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://purl.org/rss/1.0/modules/email/' => 'email',
    'http://purl.org/rss/1.0/modules/event/' => 'ev',
    'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
    'http://freshmeat.net/rss/fm/' => 'fm',
    'http://xmlns.com/foaf/0.1/' => 'foaf',
    'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
    'http://postneo.com/icbm/' => 'icbm',
    'http://purl.org/rss/1.0/modules/image/' => 'image',
    'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://purl.org/rss/1.0/modules/link/' => 'l',
    'http://search.yahoo.com/mrss' => 'media',
    'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
    'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
    'http://purl.org/rss/1.0/modules/reference/' => 'ref',
    'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
    'http://purl.org/rss/1.0/modules/search/' => 'search',
    'http://purl.org/rss/1.0/modules/slash/' => 'slash',
    'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
    'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
    'http://hacks.benhammersley.com/rss/streaming/' => 'str',
    'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
    'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
    'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
    'http://purl.org/rss/1.0/modules/threading/' => 'thr',
    'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
    'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
    'http://wellformedweb.org/commentAPI/' => 'wfw',
    'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
    'http://www.w3.org/1999/xhtml' => 'xhtml',
    'http://www.w3.org/XML/1998/namespace' => 'xml',
    'http://www.w3.org/1999/xlink' => 'xlink',
    'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
  }
  # Case-insensitive lookup copy of @namespaces.
  @matchnamespaces = {}
  @namespaces.each do |l|
    @matchnamespaces[l[0].downcase] = l[1]
  end
  # Elements whose text content is itself a (possibly relative) URI.
  @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
  # Elements whose embedded markup may contain relative URIs to resolve.
  @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  # Elements whose embedded markup must be run through sanitizeHTML.
  @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @html_types = ['text/html', 'application/xhtml+xml']
  @feeddata = FeedParserDict.new # feed-level data
  @encoding = encoding # character encoding
  @entries = [] # list of entry-level data
  @version = '' # feed type/version see SUPPORTED_VERSIONS
  @namespacesInUse = {} # hash of namespaces defined by the feed

  # the following are used internally to track state;
  # this is really out of control and should be refactored
  @infeed = false
  @inentry = false
  @incontent = 0 # Yes, this needs to be zero until I work out popContent and pushContent
  @intextinput = false
  @inimage = false
  @inauthor = false
  @incontributor = false
  @inpublisher = false
  @insource = false
  @sourcedata = FeedParserDict.new
  @contentparams = FeedParserDict.new
  @summaryKey = nil
  @namespacemap = {}
  @elementstack = []
  @basestack = []
  @langstack = []
  @baseuri = baseuri || ''
  @lang = baselang || nil
  if baselang
    @feeddata['language'] = baselang.gsub('_','-')
  end
  # Date-parsing strategies tried in order by the date handlers.
  @date_handlers = [:_parse_date_rfc822,
    :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
    :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
  ]
  $stderr << "Leaving startup\n" if $debug # My addition
end
1345
-
1346
# Generic start-tag handler: normalizes attributes, tracks xml:base /
# xml:lang / namespace declarations and inline XHTML content, then
# dispatches to a tag-specific _start_* method (or pushes a generic
# element).
def unknown_starttag(tag, attrsd)
  $stderr << "start #{tag} with #{attrsd}\n" if $debug
  # normalize attrs
  attrsD = {}
  attrsd = Hash[*attrsd.flatten] if attrsd.class == Array # Magic! Asterisk!
  # LooseFeedParser needs the above because SGMLParser sends attrs as a
  # list of lists (like [['type','text/html'],['mode','escaped']])

  attrsd.each do |old_k, value|
    k = old_k.downcase # Downcase all keys
    attrsD[k] = value
    # BUG FIX: the original tested ['rel','type'].include?(value); per its
    # own comment (and feedparser.py) the *key* decides whether the value
    # is downcased.
    if ['rel', 'type'].include?(k)
      attrsD[k].downcase! # Downcase the value if the key is 'rel' or 'type'
    end
  end

  # track xml:base and xml:lang
  baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
  @baseuri = urljoin(@baseuri, baseuri)
  lang = attrsD['xml:lang'] || attrsD['lang']
  if lang == ''
    # xml:lang could be explicitly set to '', we need to capture that
    lang = nil
  elsif lang.nil?
    # if no xml:lang is specified, use parent lang
    lang = @lang
  end
  if lang and not lang.empty?
    if ['feed', 'rss', 'rdf:RDF'].include?(tag)
      @feeddata['language'] = lang.gsub('_', '-')
    end
  end
  @lang = lang
  @basestack << @baseuri
  @langstack << lang

  # track namespaces declared on this element
  attrsd.each do |prefix, uri|
    if /^xmlns:/ =~ prefix # prefix begins with xmlns:
      trackNamespace(prefix[6..-1], uri)
    elsif prefix == 'xmlns' # FIX: dropped Ruby-1.8-only trailing colon
      trackNamespace(nil, uri)
    end
  end

  # track inline content
  if @incontent != 0 and @contentparams.has_key?('type') and not (/xml$/ =~ (@contentparams['type'] || 'xml'))
    # element declared itself as escaped markup, but isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
    # Note: probably shouldn't simply recreate localname here, but
    # our namespace handling isn't actually 100% correct in cases where
    # the feed redefines the default namespace (which is actually
    # the usual case for inline content, thanks Sam), so here we
    # cheat and just reconstruct the element based on localname
    # because that compensates for the bugs in our namespace handling.
    # This will horribly munge inline content with non-empty qnames,
    # but nobody actually does that, so I'm not fixing it.
    tag = tag.split(':')[-1]
    attrsA = attrsd.to_a.collect { |pair| "#{pair[0]}=\"#{pair[1]}\"" }
    attrsS = ' ' + attrsA.join(' ')
    return handle_data("<#{tag}#{attrsS}>", false)
  end

  # match namespaces
  if /:/ =~ tag
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix and not prefix.empty?
    prefix = prefix + '_'
  end

  # special hack for better tracking of empty textinput/image elements in
  # illformed feeds.
  # BUG FIX: the original condition `not prefix and not prefix.empty?`
  # can never be true; mirror the nil-or-empty test used for @inimage.
  if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'name'].include?(tag))
    @intextinput = false
  end
  if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?(tag))
    @inimage = false
  end

  # call special handler (if defined) or default handler
  # NOTE(review): this rescue also swallows NoMethodErrors raised *inside*
  # a handler, not just missing handlers — preexisting behavior, kept.
  begin
    return send('_start_' + prefix + suffix, attrsD)
  rescue NoMethodError
    return push(prefix + suffix, true)
  end
end # End unknown_starttag
1439
-
1440
# Generic end-tag handler: dispatches to a tag-specific _end_* method
# (or pops the generic element), then unwinds inline-content tracking
# and the xml:base / xml:lang scope stacks.
def unknown_endtag(tag)
  $stderr << "end #{tag}\n" if $debug
  # match namespaces
  if tag.index(':')
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix and not prefix.empty?
    prefix = prefix + '_'
  end

  # call special handler (if defined) or default handler
  begin
    send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
  rescue NoMethodError => details
    pop(prefix + suffix)
  end

  # track inline content
  # BUG FIX: the matching check in unknown_starttag (and in feedparser.py)
  # negates this regexp; without the `not` the type was rewritten for
  # content that already *is* xml.
  if @incontent != 0 and @contentparams.has_key?('type') and not (/xml$/ =~ (@contentparams['type'] || 'xml'))
    # element declared itself as escaped markup, but it isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
    tag = tag.split(':')[-1]
    handle_data("</#{tag}>", false)
  end

  # track xml:base and xml:lang going out of scope
  if @basestack and not @basestack.empty?
    @basestack.pop
    if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
      @baseuri = @basestack[-1]
    end
  end
  if @langstack and not @langstack.empty?
    @langstack.pop
    if @langstack and not @langstack.empty?
      @lang = @langstack[-1]
    end
  end
end
1484
-
1485
# LooseParserOnly. Called for each numeric character reference; for
# '&#160;', ref is '160'. XML-special characters stay as references,
# everything else is converted to its literal character.
def handle_charref(ref)
  $stderr << "entering handle_charref with #{ref}\n" if $debug
  return if @elementstack.nil? or @elementstack.empty?
  ref.downcase!
  # Keep the markup-significant characters escaped.
  specials = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
  if specials.include?(ref)
    text = "&##{ref};"
  else
    codepoint = (ref[0..0] == 'x') ? ref[1..-1].to_i(16) : ref.to_i
    text = uconvert(unichr(codepoint), 'unicode')
  end
  @elementstack[-1][2] << text
end
1504
-
1505
# LooseParserOnly. Called for each named entity reference; for '&copy;',
# ref is 'copy'. The five XML-safe entities stay escaped; everything
# else is decoded to its literal character via HTMLEntities.
def handle_entityref(ref)
  return if @elementstack.nil? or @elementstack.empty?
  $stderr << "entering handle_entityref with #{ref}\n" if $debug
  text = if ['lt', 'gt', 'quot', 'amp', 'apos'].include?(ref)
           "&#{ref};"
         else
           HTMLEntities::decode_entities("&#{ref};")
         end
  @elementstack[-1][2] << text
end
1519
-
1520
# Called for each run of plain text (outside any tag, containing no
# character or entity references); buffers it onto the innermost open
# element, XML-escaping it first when we are inside inline XHTML.
def handle_data(text, escape=true)
  return if @elementstack.nil? or @elementstack.empty?
  text = text.to_xs if escape and @contentparams['type'] == 'application/xhtml+xml'
  @elementstack[-1][2] << text
end
1529
-
1530
def handle_comment(comment)
  # called for each comment, e.g. <!-- insert message here -->; ignored.
end

def handle_pi(text)
  # processing instructions are ignored
end

def handle_decl(text)
  # markup declarations are ignored (but see parse_declaration for the
  # loose parser's CDATA handling)
end
1539
-
1540
# For LooseFeedParser. Handles a '<!' declaration starting at index i of
# @rawdata: CDATA sections are buffered (escaped) via handle_data; any
# other declaration is skipped. Returns the index just past the
# declaration (falls back to i.to_i semantics when no '>' is found).
def parse_declaration(i)
  $stderr << "entering parse_declaration\n" if $debug
  if @rawdata[i...i+9] == '<![CDATA['
    close = @rawdata.index(/\]\]>/u, i+9) || @rawdata.length
    handle_data(@rawdata[i+9...close].to_xs, false)
    close + 3
  else
    @rawdata.index(/>/, i).to_i + 1
  end
end
1553
-
1554
# Expand the Atom shorthand type names ('text', 'html', 'xhtml') to
# their full MIME types; any other value passes through unchanged.
# NOTE: downcases the argument in place, as the original did.
def mapContentType(contentType)
  contentType.downcase!
  shorthand = {
    'text'  => 'text/plain',
    'html'  => 'text/html',
    'xhtml' => 'application/xhtml+xml'
  }
  shorthand[contentType] || contentType
end
1566
-
1567
# Record a namespace declaration. Certain well-known URIs also pin the
# feed version (only while it is still undetermined); recognized URIs
# are mapped to our canonical prefixes, unknown ones kept verbatim.
def trackNamespace(prefix, uri)
  loweruri = uri.downcase.strip
  undetermined = @version.nil? || @version.empty?
  if prefix.nil? and loweruri == 'http://my.netscape.com/rdf/simple/0.9/' and undetermined
    @version = 'rss090'
  elsif loweruri == 'http://purl.org/rss/1.0/' and undetermined
    @version = 'rss10'
  elsif loweruri == 'http://www.w3.org/2005/atom' and undetermined
    @version = 'atom10'
  elsif loweruri =~ /backend\.userland\.com\/rss/
    # match any backend.userland.com namespace
    uri = 'http://backend.userland.com/rss'
    loweruri = uri
  end
  if @matchnamespaces.has_key?(loweruri)
    @namespacemap[prefix] = @matchnamespaces[loweruri]
    @namespacesInUse[@matchnamespaces[loweruri]] = uri
  else
    @namespacesInUse[prefix || ''] = uri
  end
end
1588
-
1589
# Resolve a possibly-relative URI against the current xml:base.
def resolveURI(uri)
  urljoin(@baseuri || '', uri)
end

# Hook for subclasses; the strict parser performs no extra entity
# decoding, so the data passes through untouched.
def decodeEntities(element, data)
  data
end

# Open a new element: remember its name, whether we expect text content,
# and an empty buffer for the text pieces.
def push(element, expectingText)
  @elementstack << [element, expectingText, []]
end
1600
-
1601
# Close the element most recently opened by #push (if it matches),
# post-process its accumulated text (base64 decode, relative-URI
# resolution, entity decoding, HTML sanitization, charset conversion)
# and store the result in the feed- or entry-level dictionary.
# Returns the processed text.
def pop(element, stripWhitespace=true)
  return if @elementstack.nil? or @elementstack.empty?
  # only pop if the top of the stack is what we are closing
  return if @elementstack[-1][0] != element
  element, expectingText, pieces = @elementstack.pop
  if pieces.class == Array
    output = pieces.join('')
  else
    output = pieces
  end
  if stripWhitespace
    output.strip!
  end
  return output if not expectingText

  # decode base64 content
  if @contentparams['base64']
    out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
    # keep the raw text if decoding produced nothing
    if not output.empty? and not out64.empty?
      output = out64
    end
  end

  # resolve relative URIs
  if @can_be_relative_uri.include?element and output and not output.empty?
    output = resolveURI(output)
  end

  # decode entities within embedded markup
  if not @contentparams['base64']
    output = decodeEntities(element, output)
  end

  # remove temporary cruft from contentparams
  @contentparams.delete('mode')
  @contentparams.delete('base64')

  # resolve relative URIs within embedded markup
  if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
    if @can_contain_relative_uris.include?element
      output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
    end
  end
  # sanitize embedded markup
  if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
    if @can_contain_dangerous_markup.include?element
      output = FeedParser.sanitizeHTML(output, @encoding)
    end
  end

  if @encoding and not @encoding.empty? and @encoding != 'utf-8'
    output = uconvert(output, @encoding, 'utf-8')
    # FIXME I turn everything into utf-8, not unicode, originally because
    # REXML was being used but now because I haven't tested it out yet.
  end

  # categories/tags/keywords/whatever are handled in _end_category
  return output if element == 'category'

  # store output in appropriate place(s)
  if @inentry and not @insource
    if element == 'content'
      # an entry may carry several content blocks; keep them all
      @entries[-1][element] ||= []
      contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
      contentparams['value'] = output
      @entries[-1][element] << contentparams
    elsif element == 'link'
      @entries[-1][element] = output
      if output and not output.empty?
        @entries[-1]['links'][-1]['href'] = output
      end
    else
      element = 'summary' if element == 'description'
      @entries[-1][element] = output
      if @incontent != 0
        contentparams = Marshal.load(Marshal.dump(@contentparams))
        contentparams['value'] = output
        @entries[-1][element + '_detail'] = contentparams
      end
    end
  elsif (@infeed or @insource) and not @intextinput and not @inimage
    context = getContext()
    element = 'subtitle' if element == 'description'
    context[element] = output
    if element == 'link'
      context['links'][-1]['href'] = output
    elsif @incontent != 0
      contentparams = Marshal.load(Marshal.dump(@contentparams))
      contentparams['value'] = output
      context[element + '_detail'] = contentparams
    end
  end
  return output
end
1693
-
1694
# Open a content-bearing element: bump the inline-content counter and
# capture its type/language/base (plus whether the payload is base64).
def pushContent(tag, attrsD, defaultContentType, expectingText)
  @incontent += 1 # Yes, I hate this.
  contentType = mapContentType(attrsD['type'] || defaultContentType)
  @contentparams = FeedParserDict.new({'type' => contentType, 'language' => @lang, 'base' => @baseuri})
  @contentparams['base64'] = isBase64(attrsD, @contentparams)
  push(tag, expectingText)
end

# Close a content-bearing element, returning its processed text and
# resetting the content-tracking state.
def popContent(tag)
  value = pop(tag)
  @incontent -= 1
  @contentparams.clear
  value
end
1708
-
1709
# Rewrite a qualified name so its prefix uses our canonical namespace
# prefix (per @namespacemap); unqualified names pass through.
def mapToStandardPrefix(name)
  colonpos = name.index(':')
  return name unless colonpos
  prefix = name[0..colonpos-1]
  suffix = name[colonpos+1..-1]
  prefix = @namespacemap[prefix] || prefix
  prefix + ':' + suffix
end

# Attribute lookup keyed by the standardized (canonical-prefix) name.
def getAttribute(attrsD, name)
  attrsD[mapToStandardPrefix(name)]
end
1723
-
1724
# True when the element's payload should be treated as base64 data:
# either mode="base64" was declared, or the content type is not a
# text/* or *xml type.
def isBase64(attrsD, contentparams)
  return true if attrsD['mode'] == 'base64'
  return false if /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
  true
end
1731
-
1732
# Normalize the various link-ish attributes ('url', 'uri', 'href') onto
# a single 'href' key, preferring 'url', then 'uri', then 'href'.
# Mutates and returns attrsD.
def itsAnHrefDamnIt(attrsD)
  href = attrsD['url'] || attrsD['uri'] || attrsD['href']
  return attrsD unless href
  attrsD.delete('url')
  attrsD.delete('uri')
  attrsD['href'] = href
  attrsD
end
1741
-
1742
-
1743
# Store a value in the current context only if the key is not already
# set (first value wins).
def _save(key, value)
  getContext()[key] ||= value
end
1747
-
1748
# <rss> start tag: pin the feed version from its version attribute,
# unless a version has already been determined.
def _start_rss(attrsD)
  versionmap = { '0.91' => 'rss091u', '0.92' => 'rss092',
                 '0.93' => 'rss093', '0.94' => 'rss094' }
  return unless @version.nil? || @version.empty?
  declared = attrsD['version'] || ''
  mapped = versionmap[declared]
  @version = if mapped && !mapped.empty?
               mapped
             elsif declared =~ /^2\./
               'rss20'
             else
               'rss'
             end
end
1767
-
1768
# <dlhottitles> marks the (obscure) Hot RSS format.
def _start_dlhottitles(attrsD)
  @version = 'hotrss'
end
1771
-
1772
# <channel> (RSS) / <feedinfo> (CDF) start: enter feed scope and replay
# any CDF shortcut attributes as regular elements.
def _start_channel(attrsD)
  @infeed = true
  _cdf_common(attrsD)
end
alias :_start_feedinfo :_start_channel

# CDF puts lastmod/href on the element itself; synthesize the matching
# start/data/end sequences so they are stored like normal child elements.
def _cdf_common(attrsD)
  if attrsD.has_key?('lastmod')
    _start_modified({})
    @elementstack[-1][-1] = attrsD['lastmod']
    _end_modified
  end
  if attrsD.has_key?('href')
    _start_link({})
    @elementstack[-1][-1] = attrsD['href']
    _end_link
  end
end
1790
-
1791
# <feed> start tag (Atom): enter feed scope and pin the Atom version
# from the version attribute, unless already determined.
def _start_feed(attrsD)
  @infeed = true
  versionmap = {'0.1' => 'atom01',
    '0.2' => 'atom02',
    '0.3' => 'atom03'
  }

  if not @version or @version.empty?
    attr_version = attrsD['version']
    version = versionmap[attr_version]
    # BUG FIX: the original tested `@version` here — which the guard
    # above just proved empty — so the mapped atom01/02/03 value was
    # never recorded. Test the looked-up local `version` instead
    # (mirrors _start_rss and feedparser.py).
    if version and not version.empty?
      @version = version
    else
      @version = 'atom'
    end
  end
end
1808
-
1809
# Leaving the channel/feed element ends feed scope.
def _end_channel
  @infeed = false
end
alias :_end_feed :_end_channel
1813
-
1814
# <image> scope: child url/width/height handlers store into
# context['image'] while @inimage is set.
def _start_image(attrsD)
  @inimage = true
  push('image', false)
  getContext()['image'] ||= FeedParserDict.new
end

def _end_image
  pop('image')
  @inimage = false
end

# <textinput>/<textInput> scope: child name/link handlers store into
# context['textinput'] while @intextinput is set.
def _start_textinput(attrsD)
  @intextinput = true
  push('textinput', false)
  getContext()['textinput'] ||= FeedParserDict.new
end
alias :_start_textInput :_start_textinput

def _end_textinput
  pop('textinput')
  @intextinput = false
end
alias :_end_textInput :_end_textinput
1839
-
1840
# Author scope (shared by RSS managingEditor, Dublin Core and iTunes
# author elements): child name/email/url handlers fill author_detail.
def _start_author(attrsD)
  @inauthor = true
  push('author', true)
end
alias :_start_managingeditor :_start_author
alias :_start_dc_author :_start_author
alias :_start_dc_creator :_start_author
alias :_start_itunes_author :_start_author

def _end_author
  pop('author')
  @inauthor = false
  _sync_author_detail()
end
alias :_end_managingeditor :_end_author
alias :_end_dc_author :_end_author
alias :_end_dc_creator :_end_author
alias :_end_itunes_author :_end_author

# iTunes <owner> scope: stored under the 'publisher' key.
def _start_itunes_owner(attrsD)
  @inpublisher = true
  push('publisher', false)
end

def _end_itunes_owner
  pop('publisher')
  @inpublisher = false
  _sync_author_detail('publisher')
end
1869
-
1870
# Atom <contributor> scope: append a fresh dict that child name/email/
# url handlers will fill.
def _start_contributor(attrsD)
  @incontributor = true
  context = getContext()
  context['contributors'] ||= []
  context['contributors'] << FeedParserDict.new
  push('contributor', false)
end

def _end_contributor
  pop('contributor')
  @incontributor = false
end

# dc:contributor carries only a name, so it is pushed as a 'name'
# element directly.
def _start_dc_contributor(attrsD)
  @incontributor = true
  context = getContext()
  context['contributors'] ||= []
  context['contributors'] << FeedParserDict.new
  push('name', false)
end

def _end_dc_contributor
  _end_name
  @incontributor = false
end
1895
-
1896
# <name> element: where the value lands depends on which scope
# (publisher / author / contributor / textinput) we are currently in.
def _start_name(attrsD)
  push('name', false)
end
alias :_start_itunes_name :_start_name

def _end_name
  value = pop('name')
  if @inpublisher
    _save_author('name', value, 'publisher')
  elsif @inauthor
    _save_author('name', value)
  elsif @incontributor
    _save_contributor('name', value)
  elsif @intextinput
    getContext()['textinput']['name'] = value
  end
end
alias :_end_itunes_name :_end_name
1915
-
1916
# <width>/<height>: numeric image dimensions, only meaningful inside an
# <image> scope.
def _start_width(attrsD)
  push('width', false)
end

def _end_width
  value = pop('width').to_i
  getContext()['image']['width'] = value if @inimage
end

def _start_height(attrsD)
  push('height', false)
end

def _end_height
  value = pop('height').to_i
  getContext()['image']['height'] = value if @inimage
end
1939
-
1940
- def _start_url(attrsD)
1941
- push('href', true)
1942
- end
1943
- alias :_start_homepage :_start_url
1944
- alias :_start_uri :_start_url
1945
-
1946
- def _end_url
1947
- value = pop('href')
1948
- if @inauthor
1949
- _save_author('href', value)
1950
- elsif @incontributor
1951
- _save_contributor('href', value)
1952
- elsif @inimage
1953
- context = getContext()
1954
- context['image']['href'] = value
1955
- elsif @intextinput
1956
- context = getContext()
1957
- context['textinput']['link'] = value
1958
- end
1959
- end
1960
- alias :_end_homepage :_end_url
1961
- alias :_end_uri :_end_url
1962
-
1963
- def _start_email(attrsD)
1964
- push('email', false)
1965
- end
1966
- alias :_start_itunes_email :_start_email
1967
-
1968
- def _end_email
1969
- value = pop('email')
1970
- if @inpublisher
1971
- _save_author('email', value, 'publisher')
1972
- elsif @inauthor
1973
- _save_author('email', value)
1974
- elsif @incontributor
1975
- _save_contributor('email', value)
1976
- end
1977
- end
1978
- alias :_end_itunes_email :_end_email
1979
-
1980
- def getContext
1981
- if @insource
1982
- context = @sourcedata
1983
- elsif @inentry
1984
- context = @entries[-1]
1985
- else
1986
- context = @feeddata
1987
- end
1988
- return context
1989
- end
1990
-
1991
- def _save_author(key, value, prefix='author')
1992
- context = getContext()
1993
- context[prefix + '_detail'] ||= FeedParserDict.new
1994
- context[prefix + '_detail'][key] = value
1995
- _sync_author_detail()
1996
- end
1997
-
1998
- def _save_contributor(key, value)
1999
- context = getContext
2000
- context['contributors'] ||= [FeedParserDict.new]
2001
- context['contributors'][-1][key] = value
2002
- end
2003
-
2004
- def _sync_author_detail(key='author')
2005
- context = getContext()
2006
- detail = context["#{key}_detail"]
2007
- if detail and not detail.empty?
2008
- name = detail['name']
2009
- email = detail['email']
2010
-
2011
- if name and email and not (name.empty? or name.empty?)
2012
- context[key] = "#{name} (#{email})"
2013
- elsif name and not name.empty?
2014
- context[key] = name
2015
- elsif email and not email.empty?
2016
- context[key] = email
2017
- end
2018
- else
2019
- author = context[key].dup unless context[key].nil?
2020
- return if not author or author.empty?
2021
- emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
2022
- email = emailmatch[1]
2023
- author.gsub!(email, '')
2024
- author.gsub!("\(\)", '')
2025
- author.strip!
2026
- author.gsub!(/^\(/,'')
2027
- author.gsub!(/\)$/,'')
2028
- author.strip!
2029
- context["#{key}_detail"] ||= FeedParserDict.new
2030
- context["#{key}_detail"]['name'] = author
2031
- context["#{key}_detail"]['email'] = email
2032
- end
2033
- end
2034
-
2035
- def _start_subtitle(attrsD)
2036
- pushContent('subtitle', attrsD, 'text/plain', true)
2037
- end
2038
- alias :_start_tagline :_start_subtitle
2039
- alias :_start_itunes_subtitle :_start_subtitle
2040
-
2041
- def _end_subtitle
2042
- popContent('subtitle')
2043
- end
2044
- alias :_end_tagline :_end_subtitle
2045
- alias :_end_itunes_subtitle :_end_subtitle
2046
-
2047
- def _start_rights(attrsD)
2048
- pushContent('rights', attrsD, 'text/plain', true)
2049
- end
2050
- alias :_start_dc_rights :_start_rights
2051
- alias :_start_copyright :_start_rights
2052
-
2053
- def _end_rights
2054
- popContent('rights')
2055
- end
2056
- alias :_end_dc_rights :_end_rights
2057
- alias :_end_copyright :_end_rights
2058
-
2059
- def _start_item(attrsD)
2060
- @entries << FeedParserDict.new
2061
- push('item', false)
2062
- @inentry = true
2063
- @guidislink = false
2064
- id = getAttribute(attrsD, 'rdf:about')
2065
- if id and not id.empty?
2066
- context = getContext()
2067
- context['id'] = id
2068
- end
2069
- _cdf_common(attrsD)
2070
- end
2071
- alias :_start_entry :_start_item
2072
- alias :_start_product :_start_item
2073
-
2074
- def _end_item
2075
- pop('item')
2076
- @inentry = false
2077
- end
2078
- alias :_end_entry :_end_item
2079
-
2080
- def _start_dc_language(attrsD)
2081
- push('language', true)
2082
- end
2083
- alias :_start_language :_start_dc_language
2084
-
2085
- def _end_dc_language
2086
- @lang = pop('language')
2087
- end
2088
- alias :_end_language :_end_dc_language
2089
-
2090
- def _start_dc_publisher(attrsD)
2091
- push('publisher', true)
2092
- end
2093
- alias :_start_webmaster :_start_dc_publisher
2094
-
2095
- def _end_dc_publisher
2096
- pop('publisher')
2097
- _sync_author_detail('publisher')
2098
- end
2099
- alias :_end_webmaster :_end_dc_publisher
2100
-
2101
- def _start_published(attrsD)
2102
- push('published', true)
2103
- end
2104
- alias :_start_dcterms_issued :_start_published
2105
- alias :_start_issued :_start_published
2106
-
2107
- def _end_published
2108
- value = pop('published')
2109
- _save('published_parsed', parse_date(value))
2110
- end
2111
- alias :_end_dcterms_issued :_end_published
2112
- alias :_end_issued :_end_published
2113
-
2114
- def _start_updated(attrsD)
2115
- push('updated', true)
2116
- end
2117
- alias :_start_modified :_start_updated
2118
- alias :_start_dcterms_modified :_start_updated
2119
- alias :_start_pubdate :_start_updated
2120
- alias :_start_dc_date :_start_updated
2121
-
2122
- def _end_updated
2123
- value = pop('updated')
2124
- _save('updated_parsed', parse_date(value))
2125
- end
2126
- alias :_end_modified :_end_updated
2127
- alias :_end_dcterms_modified :_end_updated
2128
- alias :_end_pubdate :_end_updated
2129
- alias :_end_dc_date :_end_updated
2130
-
2131
- def _start_created(attrsD)
2132
- push('created', true)
2133
- end
2134
- alias :_start_dcterms_created :_start_created
2135
-
2136
- def _end_created
2137
- value = pop('created')
2138
- _save('created_parsed', parse_date(value))
2139
- end
2140
- alias :_end_dcterms_created :_end_created
2141
-
2142
- def _start_expirationdate(attrsD)
2143
- push('expired', true)
2144
- end
2145
- def _end_expirationdate
2146
- _save('expired_parsed', parse_date(pop('expired')))
2147
- end
2148
-
2149
- def _start_cc_license(attrsD)
2150
- push('license', true)
2151
- value = getAttribute(attrsD, 'rdf:resource')
2152
- if value and not value.empty?
2153
- elementstack[-1][2] << value
2154
- pop('license')
2155
- end
2156
- end
2157
-
2158
- def _start_creativecommons_license(attrsD)
2159
- push('license', true)
2160
- end
2161
-
2162
- def _end_creativecommons_license
2163
- pop('license')
2164
- end
2165
-
2166
- def addTag(term, scheme, label)
2167
- context = getContext()
2168
- context['tags'] ||= []
2169
- tags = context['tags']
2170
- if (term.nil? or term.empty?) and (scheme.nil? or scheme.empty?) and (label.nil? or label.empty?)
2171
- return
2172
- end
2173
- value = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
2174
- if not tags.include?value
2175
- context['tags'] << FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
2176
- end
2177
- end
2178
-
2179
- def _start_category(attrsD)
2180
- $stderr << "entering _start_category with #{attrsD}\n" if $debug
2181
-
2182
- term = attrsD['term']
2183
- scheme = attrsD['scheme'] || attrsD['domain']
2184
- label = attrsD['label']
2185
- addTag(term, scheme, label)
2186
- push('category', true)
2187
- end
2188
- alias :_start_dc_subject :_start_category
2189
- alias :_start_keywords :_start_category
2190
-
2191
- def _end_itunes_keywords
2192
- pop('itunes_keywords').split.each do |term|
2193
- addTag(term, 'http://www.itunes.com/', nil)
2194
- end
2195
- end
2196
-
2197
- def _start_itunes_category(attrsD)
2198
- addTag(attrsD['text'], 'http://www.itunes.com/', nil)
2199
- push('category', true)
2200
- end
2201
-
2202
- def _end_category
2203
- value = pop('category')
2204
- return if value.nil? or value.empty?
2205
- context = getContext()
2206
- tags = context['tags']
2207
- if value and not value.empty? and not tags.empty? and not tags[-1]['term']:
2208
- tags[-1]['term'] = value
2209
- else
2210
- addTag(value, nil, nil)
2211
- end
2212
- end
2213
- alias :_end_dc_subject :_end_category
2214
- alias :_end_keywords :_end_category
2215
- alias :_end_itunes_category :_end_category
2216
-
2217
- def _start_cloud(attrsD)
2218
- getContext()['cloud'] = FeedParserDict.new(attrsD)
2219
- end
2220
-
2221
- def _start_link(attrsD)
2222
- attrsD['rel'] ||= 'alternate'
2223
- attrsD['type'] ||= 'text/html'
2224
- attrsD = itsAnHrefDamnIt(attrsD)
2225
- if attrsD.has_key? 'href'
2226
- attrsD['href'] = resolveURI(attrsD['href'])
2227
- end
2228
- expectingText = @infeed || @inentry || @insource
2229
- context = getContext()
2230
- context['links'] ||= []
2231
- context['links'] << FeedParserDict.new(attrsD)
2232
- if attrsD['rel'] == 'enclosure'
2233
- _start_enclosure(attrsD)
2234
- end
2235
- if attrsD.has_key? 'href'
2236
- expectingText = false
2237
- if (attrsD['rel'] == 'alternate') and @html_types.include?mapContentType(attrsD['type'])
2238
- context['link'] = attrsD['href']
2239
- end
2240
- else
2241
- push('link', expectingText)
2242
- end
2243
- end
2244
- alias :_start_producturl :_start_link
2245
-
2246
- def _end_link
2247
- value = pop('link')
2248
- context = getContext()
2249
- if @intextinput
2250
- context['textinput']['link'] = value
2251
- end
2252
- if @inimage
2253
- context['image']['link'] = value
2254
- end
2255
- end
2256
- alias :_end_producturl :_end_link
2257
-
2258
- def _start_guid(attrsD)
2259
- @guidislink = ((attrsD['ispermalink'] || 'true') == 'true')
2260
- push('id', true)
2261
- end
2262
-
2263
- def _end_guid
2264
- value = pop('id')
2265
- _save('guidislink', (@guidislink and not getContext().has_key?('link')))
2266
- if @guidislink:
2267
- # guid acts as link, but only if 'ispermalink' is not present or is 'true',
2268
- # and only if the item doesn't already have a link element
2269
- _save('link', value)
2270
- end
2271
- end
2272
-
2273
-
2274
- def _start_title(attrsD)
2275
- pushContent('title', attrsD, 'text/plain', @infeed || @inentry || @insource)
2276
- end
2277
- alias :_start_dc_title :_start_title
2278
- alias :_start_media_title :_start_title
2279
-
2280
- def _end_title
2281
- value = popContent('title')
2282
- context = getContext()
2283
- if @intextinput
2284
- context['textinput']['title'] = value
2285
- elsif @inimage
2286
- context['image']['title'] = value
2287
- end
2288
- end
2289
- alias :_end_dc_title :_end_title
2290
- alias :_end_media_title :_end_title
2291
-
2292
- def _start_description(attrsD)
2293
- context = getContext()
2294
- if context.has_key?('summary')
2295
- @summaryKey = 'content'
2296
- _start_content(attrsD)
2297
- else
2298
- pushContent('description', attrsD, 'text/html', @infeed || @inentry || @insource)
2299
- end
2300
- end
2301
-
2302
- def _start_abstract(attrsD)
2303
- pushContent('description', attrsD, 'text/plain', @infeed || @inentry || @insource)
2304
- end
2305
-
2306
- def _end_description
2307
- if @summaryKey == 'content'
2308
- _end_content()
2309
- else
2310
- value = popContent('description')
2311
- context = getContext()
2312
- if @intextinput
2313
- context['textinput']['description'] = value
2314
- elsif @inimage:
2315
- context['image']['description'] = value
2316
- end
2317
- end
2318
- @summaryKey = nil
2319
- end
2320
- alias :_end_abstract :_end_description
2321
-
2322
- def _start_info(attrsD)
2323
- pushContent('info', attrsD, 'text/plain', true)
2324
- end
2325
- alias :_start_feedburner_browserfriendly :_start_info
2326
-
2327
- def _end_info
2328
- popContent('info')
2329
- end
2330
- alias :_end_feedburner_browserfriendly :_end_info
2331
-
2332
- def _start_generator(attrsD)
2333
- if attrsD and not attrsD.empty?
2334
- attrsD = itsAnHrefDamnIt(attrsD)
2335
- if attrsD.has_key?('href')
2336
- attrsD['href'] = resolveURI(attrsD['href'])
2337
- end
2338
- end
2339
- getContext()['generator_detail'] = FeedParserDict.new(attrsD)
2340
- push('generator', true)
2341
- end
2342
-
2343
- def _end_generator
2344
- value = pop('generator')
2345
- context = getContext()
2346
- if context.has_key?('generator_detail')
2347
- context['generator_detail']['name'] = value
2348
- end
2349
- end
2350
-
2351
- def _start_admin_generatoragent(attrsD)
2352
- push('generator', true)
2353
- value = getAttribute(attrsD, 'rdf:resource')
2354
- if value and not value.empty?
2355
- elementstack[-1][2] << value
2356
- end
2357
- pop('generator')
2358
- getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
2359
- end
2360
-
2361
- def _start_admin_errorreportsto(attrsD)
2362
- push('errorreportsto', true)
2363
- value = getAttribute(attrsD, 'rdf:resource')
2364
- if value and not value.empty?
2365
- @elementstack[-1][2] << value
2366
- end
2367
- pop('errorreportsto')
2368
- end
2369
-
2370
- def _start_summary(attrsD)
2371
- context = getContext()
2372
- if context.has_key?'summary'
2373
- @summaryKey = 'content'
2374
- _start_content(attrsD)
2375
- else
2376
- @summaryKey = 'summary'
2377
- pushContent(@summaryKey, attrsD, 'text/plain', true)
2378
- end
2379
- end
2380
- alias :_start_itunes_summary :_start_summary
2381
-
2382
- def _end_summary
2383
- if @summaryKey == 'content':
2384
- _end_content()
2385
- else
2386
- popContent(@summaryKey || 'summary')
2387
- end
2388
- @summaryKey = nil
2389
- end
2390
- alias :_end_itunes_summary :_end_summary
2391
-
2392
- def _start_enclosure(attrsD)
2393
- attrsD = itsAnHrefDamnIt(attrsD)
2394
- getContext()['enclosures'] ||= []
2395
- getContext()['enclosures'] << FeedParserDict.new(attrsD)
2396
- href = attrsD['href']
2397
- if href and not href.empty?
2398
- context = getContext()
2399
- if not context['id']
2400
- context['id'] = href
2401
- end
2402
- end
2403
- end
2404
-
2405
- def _start_source(attrsD)
2406
- @insource = true
2407
- end
2408
-
2409
- def _end_source
2410
- @insource = false
2411
- getContext()['source'] = Marshal.load(Marshal.dump(@sourcedata))
2412
- @sourcedata.clear()
2413
- end
2414
-
2415
- def _start_content(attrsD)
2416
- pushContent('content', attrsD, 'text/plain', true)
2417
- src = attrsD['src']
2418
- if src and not src.empty?:
2419
- @contentparams['src'] = src
2420
- end
2421
- push('content', true)
2422
- end
2423
-
2424
- def _start_prodlink(attrsD)
2425
- pushContent('content', attrsD, 'text/html', true)
2426
- end
2427
-
2428
- def _start_body(attrsD)
2429
- pushContent('content', attrsD, 'application/xhtml+xml', true)
2430
- end
2431
- alias :_start_xhtml_body :_start_body
2432
-
2433
- def _start_content_encoded(attrsD)
2434
- pushContent('content', attrsD, 'text/html', true)
2435
- end
2436
- alias :_start_fullitem :_start_content_encoded
2437
-
2438
- def _end_content
2439
- copyToDescription = (['text/plain'] + @html_types).include? mapContentType(@contentparams['type'])
2440
- value = popContent('content')
2441
- if copyToDescription
2442
- _save('description', value)
2443
- end
2444
- alias :_end_body :_end_content
2445
- alias :_end_xhtml_body :_end_content
2446
- alias :_end_content_encoded :_end_content
2447
- alias :_end_fullitem :_end_content
2448
- alias :_end_prodlink :_end_content
2449
- end
2450
-
2451
- def _start_itunes_image(attrsD)
2452
- push('itunes_image', false)
2453
- getContext()['image'] = FeedParserDict.new({'href' => attrsD['href']})
2454
- end
2455
- alias :_start_itunes_link :_start_itunes_image
2456
-
2457
- def _end_itunes_block
2458
- value = pop('itunes_block', false)
2459
- getContext()['itunes_block'] = (value == 'yes') and true or false
2460
- end
2461
-
2462
- def _end_itunes_explicit
2463
- value = pop('itunes_explicit', false)
2464
- getContext()['itunes_explicit'] = (value == 'yes') and true or false
2465
- end
2466
-
2467
-
2468
- # ISO-8601 date parsing routines written by Fazal Majid.
2469
- # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2470
- # parser is beyond the scope of feedparser and the current Time.iso8601
2471
- # method does not work.
2472
- # A single regular expression cannot parse ISO 8601 date formats into groups
2473
- # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2474
- # 0301-04-01), so we use templates instead.
2475
- # Please note the order in templates is significant because we need a
2476
- # greedy match.
2477
- def _parse_date_iso8601(dateString)
2478
- # Parse a variety of ISO-8601-compatible formats like 20040105
2479
-
2480
- # What I'm about to show you may be the ugliest code in all of
2481
- # rfeedparser.
2482
- # FIXME The century regexp maybe not work ('\d\d$' says "two numbers at
2483
- # end of line" but we then attach more of a regexp.
2484
- iso8601_regexps = [ '^(\d{4})-?([01]\d)-([0123]\d)',
2485
- '^(\d{4})-([01]\d)',
2486
- '^(\d{4})-?([0123]\d\d)',
2487
- '^(\d\d)-?([01]\d)-?([0123]\d)',
2488
- '^(\d\d)-?([0123]\d\d)',
2489
- '^(\d{4})',
2490
- '-(\d\d)-?([01]\d)',
2491
- '-([0123]\d\d)',
2492
- '-(\d\d)',
2493
- '--([01]\d)-?([0123]\d)',
2494
- '--([01]\d)',
2495
- '---([0123]\d)',
2496
- '(\d\d$)',
2497
- ''
2498
- ]
2499
- iso8601_values = { '^(\d{4})-?([01]\d)-([0123]\d)' => ['year', 'month', 'day'],
2500
- '^(\d{4})-([01]\d)' => ['year','month'],
2501
- '^(\d{4})-?([0123]\d\d)' => ['year', 'ordinal'],
2502
- '^(\d\d)-?([01]\d)-?([0123]\d)' => ['year','month','day'],
2503
- '^(\d\d)-?([0123]\d\d)' => ['year','ordinal'],
2504
- '^(\d{4})' => ['year'],
2505
- '-(\d\d)-?([01]\d)' => ['year','month'],
2506
- '-([0123]\d\d)' => ['ordinal'],
2507
- '-(\d\d)' => ['year'],
2508
- '--([01]\d)-?([0123]\d)' => ['month','day'],
2509
- '--([01]\d)' => ['month'],
2510
- '---([0123]\d)' => ['day'],
2511
- '(\d\d$)' => ['century'],
2512
- '' => []
2513
- }
2514
- add_to_all = '(T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
2515
- add_to_all_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']
2516
- # NOTE We use '(?:' to prevent grouping of optional matches (ones trailed
2517
- # by '?'). The second ':' *are* matched.
2518
- m = nil
2519
- param_keys = []
2520
- iso8601_regexps.each do |s|
2521
- $stderr << "Trying iso8601 regexp: #{s+add_to_all}\n" if $debug
2522
- param_keys = iso8601_values[s] + add_to_all_fields
2523
- m = dateString.match(Regexp.new(s+add_to_all))
2524
- break if m
2525
- end
2526
- return if m.nil? or (m.begin(0).zero? and m.end(0).zero?)
2527
-
2528
- param_values = m.to_a
2529
- param_values = param_values[1..-1]
2530
- params = {}
2531
- param_keys.each_with_index do |key,i|
2532
- params[key] = param_values[i]
2533
- end
2534
21
 
2535
- ordinal = params['ordinal'].to_i unless params['ordinal'].nil?
2536
- year = params['year'] || '--'
2537
- if year.nil? or year.empty? or year == '--' # FIXME When could the regexp ever return a year equal to '--'?
2538
- year = Time.now.utc.year
2539
- elsif year.length == 2
2540
- # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
2541
- year = 100 * (Time.now.utc.year / 100) + year.to_i
2542
- else
2543
- year = year.to_i
2544
- end
2545
-
2546
- month = params['month'] || '-'
2547
- if month.nil? or month.empty? or month == '-'
2548
- # ordinals are NOT normalized by mktime, we simulate them
2549
- # by setting month=1, day=ordinal
2550
- if ordinal
2551
- month = DateTime.ordinal(year,ordinal).month
2552
- else
2553
- month = Time.now.utc.month
2554
- end
2555
- end
2556
- month = month.to_i unless month.nil?
2557
- day = params['day']
2558
- if day.nil? or day.empty?
2559
- # see above
2560
- if ordinal
2561
- day = DateTime.ordinal(year,ordinal).day
2562
- elsif params['century'] or params['year'] or params['month']
2563
- day = 1
2564
- else
2565
- day = Time.now.utc.day
2566
- end
2567
- else
2568
- day = day.to_i
2569
- end
2570
- # special case of the century - is the first year of the 21st century
2571
- # 2000 or 2001 ? The debate goes on...
2572
- if params.has_key? 'century'
2573
- year = (params['century'].to_i - 1) * 100 + 1
2574
- end
2575
- # in ISO 8601 most fields are optional
2576
- hour = params['hour'].to_i
2577
- minute = params['minute'].to_i
2578
- second = params['second'].to_i
2579
- weekday = nil
2580
- # daylight savings is complex, but not needed for feedparser's purposes
2581
- # as time zones, if specified, include mention of whether it is active
2582
- # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
2583
- # and most implementations have DST bugs
2584
- tm = [second, minute, hour, day, month, year, nil, ordinal, false, nil]
2585
- tz = params['tz']
2586
- if tz and not tz.empty? and tz != 'Z'
2587
- # FIXME does this cross over days?
2588
- if tz[0] == '-'
2589
- tm[3] += params['tzhour'].to_i
2590
- tm[4] += params['tzmin'].to_i
2591
- elsif tz[0] == '+'
2592
- tm[3] -= params['tzhour'].to_i
2593
- tm[4] -= params['tzmin'].to_i
2594
- else
2595
- return nil
2596
- end
2597
- end
2598
- return Time.utc(*tm) # Magic!
2599
-
2600
- end
2601
-
2602
- def _parse_date_onblog(dateString)
2603
- # Parse a string according to the OnBlog 8-bit date format
2604
- # 8-bit date handling routes written by ytrewq1
2605
- korean_year = u("년") # b3e2 in euc-kr
2606
- korean_month = u("월") # bff9 in euc-kr
2607
- korean_day = u("일") # c0cf in euc-kr
2608
-
2609
-
2610
- korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
2611
-
2612
-
2613
- m = korean_onblog_date_re.match(dateString)
2614
- return unless m
2615
- w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
2616
-
2617
- $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
2618
- return _parse_date_w3dtf(w3dtfdate)
2619
- end
2620
-
2621
- def _parse_date_nate(dateString)
2622
- # Parse a string according to the Nate 8-bit date format
2623
- # 8-bit date handling routes written by ytrewq1
2624
- korean_am = u("오전") # bfc0 c0fc in euc-kr
2625
- korean_pm = u("오후") # bfc0 c8c4 in euc-kr
2626
-
2627
- korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
2628
- m = korean_nate_date_re.match(dateString)
2629
- return unless m
2630
- hour = m[5].to_i
2631
- ampm = m[4]
2632
- if ampm == korean_pm
2633
- hour += 12
2634
- end
2635
- hour = hour.to_s.rjust(2,'0')
2636
- w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
2637
- $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
2638
- return _parse_date_w3dtf(w3dtfdate)
2639
- end
2640
-
2641
- def _parse_date_mssql(dateString)
2642
- mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
2643
-
2644
- m = mssql_date_re.match(dateString)
2645
- return unless m
2646
- w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
2647
- $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
2648
- return _parse_date_w3dtf(w3dtfdate)
2649
- end
2650
-
2651
- def _parse_date_greek(dateString)
2652
- # Parse a string according to a Greek 8-bit date format
2653
- # Unicode strings for Greek date strings
2654
- greek_months = {
2655
- u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
2656
- u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
2657
- u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
2658
- u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
2659
- u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
2660
- u("Μάι") => u("May"), # ccdce9 in iso-8859-7
2661
- u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
2662
- u("Μαι") => u("May"), # cce1e9 in iso-8859-7
2663
- u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
2664
- u("Ιον") => u("Jun"), # c9efed in iso-8859-7
2665
- u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
2666
- u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
2667
- u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
2668
- u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
2669
- u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
2670
- u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
2671
- u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
2672
- u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
2673
- u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
2674
- }
2675
-
2676
- greek_wdays = {
2677
- u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
2678
- u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
2679
- u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
2680
- u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
2681
- u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
2682
- u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
2683
- u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
2684
- }
2685
-
2686
- greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
2687
-
2688
- m = greek_date_format.match(dateString)
2689
- return unless m
2690
- begin
2691
- wday = greek_wdays[m[1]]
2692
- month = greek_months[m[3]]
2693
- rescue
2694
- return nil
2695
- end
2696
- rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
2697
- $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
2698
- return _parse_date_rfc822(rfc822date)
2699
- end
2700
-
2701
- def _parse_date_hungarian(dateString)
2702
- # Parse a string according to a Hungarian 8-bit date format.
2703
- hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
2704
- m = hungarian_date_format_re.match(dateString)
2705
- return unless m
2706
-
2707
- # Unicode strings for Hungarian date strings
2708
- hungarian_months = {
2709
- u("január") => u("01"), # e1 in iso-8859-2
2710
- u("februári") => u("02"), # e1 in iso-8859-2
2711
- u("március") => u("03"), # e1 in iso-8859-2
2712
- u("április") => u("04"), # e1 in iso-8859-2
2713
- u("máujus") => u("05"), # e1 in iso-8859-2
2714
- u("június") => u("06"), # fa in iso-8859-2
2715
- u("július") => u("07"), # fa in iso-8859-2
2716
- u("augusztus") => u("08"),
2717
- u("szeptember") => u("09"),
2718
- u("október") => u("10"), # f3 in iso-8859-2
2719
- u("november") => u("11"),
2720
- u("december") => u("12"),
2721
- }
2722
- begin
2723
- month = hungarian_months[m[2]]
2724
- day = m[3].rjust(2,'0')
2725
- hour = m[4].rjust(2,'0')
2726
- rescue
2727
- return
2728
- end
2729
-
2730
- w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
2731
- $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
2732
- return _parse_date_w3dtf(w3dtfdate)
2733
- end
2734
-
2735
- def rollover(num, modulus)
2736
- return num % modulus, num / modulus
2737
- end
2738
-
2739
- def set_self(num, modulus)
2740
- r = num / modulus
2741
- if r == 0
2742
- return num
2743
- end
2744
- return r
2745
- end
2746
- # W3DTF-style date parsing
2747
- # FIXME shouldn't it be "W3CDTF"?
2748
- def _parse_date_w3dtf(dateString)
2749
- # Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias for xmlschema
2750
- # Whatever it is, it doesn't work. This has been fixed in Ruby 1.9 and
2751
- # in Ruby on Rails, but not really. They don't fix the 25 hour or 61 minute or 61 second rollover and fail in other ways.
2752
-
2753
- m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)
2754
-
2755
- w3 = m[1..3].map{|s| s=s.to_i; s += 1 if s == 0;s} # Map the year, month and day to integers and, if they were nil, set them to 1
2756
- w3 += m[4..6].map{|s| s.to_i} # Map the hour, minute and second to integers
2757
- w3 << m[-1] # Leave the timezone as a String
2758
-
2759
- # FIXME this next bit needs some serious refactoring
2760
- # Rollover times. 0 minutes and 61 seconds -> 1 minute and 1 second
2761
- w3[5],r = rollover(w3[5], 60) # rollover seconds
2762
- w3[4] += r
2763
- w3[4],r = rollover(w3[4], 60) # rollover minutes
2764
- w3[3] += r
2765
- w3[3],r = rollover(w3[3], 24) # rollover hours
2766
-
2767
- w3[2] = w3[2] + r
2768
- if w3[1] > 12
2769
- w3[1],r = rollover(w3[1],12)
2770
- w3[1] = 12 if w3[1] == 0
2771
- w3[0] += r
2772
- end
2773
-
2774
- num_days = Time.days_in_month(w3[1], w3[0])
2775
- while w3[2] > num_days
2776
- w3[2] -= num_days
2777
- w3[1] += 1
2778
- if w3[1] > 12
2779
- w3[0] += 1
2780
- w3[1] = set_self(w3[1], 12)
2781
- end
2782
- num_days = Time.days_in_month(w3[1], w3[0])
2783
- end
2784
-
2785
-
2786
- unless w3[6].class != String
2787
- if /^-/ =~ w3[6] # Zone offset goes backwards
2788
- w3[6][0] = '+'
2789
- elsif /^\+/ =~ w3[6]
2790
- w3[6][0] = '-'
2791
- end
2792
- end
2793
- return Time.utc(w3[0], w3[1], w3[2] , w3[3], w3[4], w3[5])+Time.zone_offset(w3[6] || "UTC")
2794
- end
2795
-
2796
- def _parse_date_rfc822(dateString)
2797
- # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
2798
- # These first few lines are to fix up the stupid proprietary format from Disney
2799
- unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
2800
- 'CT' => 'CST', 'MT' => 'MST',
2801
- 'PT' => 'PST'
2802
- }
2803
-
2804
- mon = dateString.split[2]
2805
- if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
2806
- dateString.sub!(mon,mon[0..2])
2807
- end
2808
- if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
2809
- dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
2810
- end
2811
- # Okay, the Disney date format should be fixed up now.
2812
- rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
2813
- if rfc.to_a.length > 1 and rfc.to_a.include? nil
2814
- dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
2815
- hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
2816
- tz ||= "GMT"
2817
- end
2818
- asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
2819
- if asctime_match.to_a.length > 1
2820
- # Month-abbr dayofmonth hour:minute:second year
2821
- dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
2822
- day.to_s.rjust(2,'0')
2823
- end
2824
- if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
2825
- ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
2826
- else
2827
- ds = dateString
2828
- end
2829
- t = Time.rfc2822(ds).utc
2830
- return t
2831
- end
2832
-
2833
- def _parse_date_perforce(aDateString) # FIXME not in 4.1?
2834
- # Parse a date in yyyy/mm/dd hh:mm:ss TTT format
2835
- # Note that there is a day of the week at the beginning
2836
- # Ex. Fri, 2006/09/15 08:19:53 EDT
2837
- return Time.parse(aDateString).utc
2838
- end
2839
-
2840
- def extract_tuple(atime)
2841
- # NOTE leave the error handling to parse_date
2842
- t = [atime.year, atime.month, atime.mday, atime.hour,
2843
- atime.min, atime.sec, (atime.wday-1) % 7, atime.yday,
2844
- atime.isdst
2845
- ]
2846
- # yay for modulus! yaaaaaay! its 530 am and i should be sleeping! yaay!
2847
- t[0..-2].map!{|s| s.to_i}
2848
- t[-1] = t[-1] ? 1 : 0
2849
- return t
2850
- end
2851
-
2852
- def parse_date(dateString)
2853
- @date_handlers.each do |handler|
2854
- begin
2855
- $stderr << "Trying date_handler #{handler}\n" if $debug
2856
- datething = extract_tuple(send(handler,dateString))
2857
- return datething
2858
- rescue Exception => e
2859
- $stderr << "#{handler} raised #{e}\n" if $debug
2860
- end
2861
- end
2862
- return nil
2863
- end
2864
-
2865
- end # End FeedParserMixin
2866
-
2867
- class StrictFeedParser < XML::SAX::HandlerBase # expat
2868
- include FeedParserMixin
2869
-
2870
- attr_accessor :bozo, :entries, :feeddata, :exc
2871
- def initialize(baseuri, baselang, encoding)
2872
- $stderr << "trying StrictFeedParser\n" if $debug
2873
- startup(baseuri, baselang, encoding)
2874
- @bozo = false
2875
- @exc = nil
2876
- super()
2877
- end
2878
-
2879
- def getPos
2880
- [@locator.getSystemId, @locator.getLineNumber]
2881
- end
2882
-
2883
- def getAttrs(attrs)
2884
- ret = []
2885
- for i in 0..attrs.getLength
2886
- ret.push([attrs.getName(i), attrs.getValue(i)])
2887
- end
2888
- ret
2889
- end
2890
-
2891
- def setDocumentLocator(loc)
2892
- @locator = loc
2893
- end
2894
-
2895
- def startDoctypeDecl(name, pub_sys, long_name, uri)
2896
- #Nothing is done here. What could we do that is neat and useful?
2897
- end
2898
-
2899
- def startNamespaceDecl(prefix, uri)
2900
- trackNamespace(prefix, uri)
2901
- end
2902
-
2903
- def endNamespaceDecl(prefix)
2904
- end
2905
-
2906
- def startElement(name, attrs)
2907
- name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
2908
- namespaceuri = ($2 || '').downcase
2909
- name = $3
2910
- if /backend\.userland\.com\/rss/ =~ namespaceuri
2911
- # match any backend.userland.com namespace
2912
- namespaceuri = 'http://backend.userland.com/rss'
2913
- end
2914
- prefix = @matchnamespaces[namespaceuri]
2915
- # No need to raise UndeclaredNamespace, Expat does that for us with
2916
- "unbound prefix (XMLParserError)"
2917
- if prefix and not prefix.empty?
2918
- name = prefix + ':' + name
2919
- end
2920
- name.downcase!
2921
- unknown_starttag(name, attrs)
2922
- end
2923
-
2924
- def character(text, start, length)
2925
- #handle_data(CGI.unescapeHTML(text))
2926
- handle_data(text)
2927
- end
2928
- # expat provides "character" not "characters"!
2929
- alias :characters :character # Just in case.
22
+ gem 'character-encodings', ">=0.2.0"
23
+ gem 'htmltools', ">=1.10"
24
+ gem 'htmlentities', ">=4.0.0"
25
+ gem 'activesupport', ">=1.4.1"
26
+ gem 'rchardet', ">=1.0"
27
+ require 'xml/saxdriver' # calling expat through the xmlparser gem
2930
28
 
2931
- def startCdata(content)
2932
- handle_data(content)
2933
- end
29
+ require 'rchardet'
30
+ $chardet = true
2934
31
 
2935
- def endElement(name)
2936
- name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
2937
- namespaceuri = ($2 || '').downcase
2938
- prefix = @matchnamespaces[namespaceuri]
2939
- if prefix and not prefix.empty?
2940
- localname = prefix + ':' + name
2941
- end
2942
- name.downcase!
2943
- unknown_endtag(name)
2944
- end
32
+ require 'encoding/character/utf-8'
33
+ require 'html/sgml-parser'
34
+ require 'htmlentities'
35
+ require 'active_support'
36
+ require 'open-uri'
37
+ include OpenURI
2945
38
 
2946
- def comment(comment)
2947
- handle_comment(comment)
2948
- end
39
+ $debug = false
40
+ $compatible = true
2949
41
 
2950
- def entityDecl(*foo)
2951
- end
42
+ $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
43
+ require 'rfeedparser/forgiving_uri'
44
+ require 'rfeedparser/aliases'
45
+ require 'rfeedparser/encoding_helpers'
46
+ require 'rfeedparser/better_sgmlparser'
47
+ require 'rfeedparser/better_attributelist'
48
+ require 'rfeedparser/scrub'
49
+ require 'rfeedparser/time_helpers'
50
+ require 'rfeedparser/feedparserdict'
51
+ require 'rfeedparser/parser_mixin'
52
+ require 'rfeedparser/parsers'
53
+ require 'rfeedparser/markup_helpers'
2952
54
 
2953
- def unparsedEntityDecl(*foo)
2954
- end
2955
- def error(exc)
2956
- @bozo = true
2957
- @exc = exc
2958
- end
55
+ include FeedParserUtilities
2959
56
 
2960
- def fatalError(exc)
2961
- error(exc)
2962
- raise exc
2963
- end
2964
- end
2965
57
 
2966
- class LooseFeedParser < BetterSGMLParser
2967
- include FeedParserMixin
2968
- # We write the methods that were in BaseHTMLProcessor in the python code
2969
- # in here directly. We do this because if we inherited from
2970
- # BaseHTMLProcessor but then included from FeedParserMixin, the methods
2971
- # of Mixin would overwrite the methods we inherited from
2972
- # BaseHTMLProcessor. This is exactly the opposite of what we want to
2973
- # happen!
58
+ module FeedParser
59
+ Version = "0.9.9"
2974
60
 
2975
- attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse
61
+ License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
2976
62
 
2977
- Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
2978
- 'img', 'input', 'isindex', 'link', 'meta', 'param']
2979
- New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
2980
- alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.
2981
- def feed
2982
- @feeddata
2983
- end
2984
- def feed=(data)
2985
- @feeddata = data
2986
- end
63
+ Redistribution and use in source and binary forms, with or without modification,
64
+ are permitted provided that the following conditions are met:
2987
65
 
2988
- def initialize(baseuri, baselang, encoding)
2989
- startup(baseuri, baselang, encoding)
2990
- super() # Keep the parentheses! No touchy.
2991
- end
66
+ * Redistributions of source code must retain the above copyright notice,
67
+ this list of conditions and the following disclaimer.
68
+ * Redistributions in binary form must reproduce the above copyright notice,
69
+ this list of conditions and the following disclaimer in the documentation
70
+ and/or other materials provided with the distribution.
2992
71
 
2993
- def reset
2994
- @pieces = []
2995
- super
2996
- end
72
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
73
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
74
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
75
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
76
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
77
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
78
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
79
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
80
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82
+ POSSIBILITY OF SUCH DAMAGE."""
2997
83
 
2998
- def parse(data)
2999
- data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
3000
- data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
3001
- clean = tag[1..-3].strip
3002
- if Elements_No_End_Tag.include?clean
3003
- tag
3004
- else
3005
- '<'+clean+'></'+clean+'>'
3006
- end
3007
- end
84
+ Author = "Jeff Hodges <http://somethingsimilar.com>"
85
+ Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
86
+ Contributors = [ "Jason Diamond <http://injektilo.org/>",
87
+ "John Beimler <http://john.beimler.org/>",
88
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
+ "Aaron Swartz <http://aaronsw.com/>",
90
+ "Kevin Marks <http://epeus.blogspot.com/>"
91
+ ]
92
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
93
+ # If you are embedding feedparser in a larger application, you should
94
+ # change this to your application name and URL.
95
+ USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
3008
96
 
3009
- data.gsub!(/&#39;/, "'")
3010
- data.gsub!(/&#34;/, "'")
3011
- if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
3012
- data = uconvert(data,'utf-8',@encoding)
3013
- end
3014
- sgml_feed(data) # see the alias above
3015
- end
97
+ # HTTP "Accept" header to send to servers when downloading feeds. If you don't
98
+ # want to send an Accept header, set this to None.
99
+ ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
3016
100
 
3017
101
 
3018
- def decodeEntities(element, data)
3019
- data.gsub!('&#60;', '&lt;')
3020
- data.gsub!('&#x3c;', '&lt;')
3021
- data.gsub!('&#62;', '&gt;')
3022
- data.gsub!('&#x3e;', '&gt;')
3023
- data.gsub!('&#38;', '&amp;')
3024
- data.gsub!('&#x26;', '&amp;')
3025
- data.gsub!('&#34;', '&quot;')
3026
- data.gsub!('&#x22;', '&quot;')
3027
- data.gsub!('&#39;', '&apos;')
3028
- data.gsub!('&#x27;', '&apos;')
3029
- if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
3030
- data.gsub!('&lt;', '<')
3031
- data.gsub!('&gt;', '>')
3032
- data.gsub!('&amp;', '&')
3033
- data.gsub!('&quot;', '"')
3034
- data.gsub!('&apos;', "'")
3035
- end
3036
- return data
3037
- end
3038
- end
102
+ # If you want feedparser to automatically run HTML markup through HTML Tidy, set
103
+ # this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
104
+ # or utidylib <http://utidylib.berlios.de/>.
105
+ #TIDY_MARKUP = false #FIXME untranslated
3039
106
 
3040
- def FeedParser.resolveRelativeURIs(htmlSource, baseURI, encoding)
3041
- $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
3042
- relative_uris = [ ['a','href'],
3043
- ['applet','codebase'],
3044
- ['area','href'],
3045
- ['blockquote','cite'],
3046
- ['body','background'],
3047
- ['del','cite'],
3048
- ['form','action'],
3049
- ['frame','longdesc'],
3050
- ['frame','src'],
3051
- ['iframe','longdesc'],
3052
- ['iframe','src'],
3053
- ['head','profile'],
3054
- ['img','longdesc'],
3055
- ['img','src'],
3056
- ['img','usemap'],
3057
- ['input','src'],
3058
- ['input','usemap'],
3059
- ['ins','cite'],
3060
- ['link','href'],
3061
- ['object','classid'],
3062
- ['object','codebase'],
3063
- ['object','data'],
3064
- ['object','usemap'],
3065
- ['q','cite'],
3066
- ['script','src'],
3067
- ]
3068
- h = Hpricot(htmlSource)
3069
- relative_uris.each do |l|
3070
- ename, eattr = l
3071
- h.search(ename).each do |elem|
3072
- euri = elem.attributes[eattr]
3073
- if euri and not euri.empty? and URI.parse(euri).relative?
3074
- elem.attributes[eattr] = urljoin(baseURI, euri)
3075
- end
3076
- end
3077
- end
3078
- return h.to_html
3079
- end
107
+ # List of Python interfaces for HTML Tidy, in order of preference. Only useful
108
+ # if TIDY_MARKUP = true
109
+ #PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
3080
110
 
3081
- class SanitizerDoc < Hpricot::Doc
3082
-
3083
- def scrub
3084
- traverse_all_element do |e|
3085
- if e.elem?
3086
- if Acceptable_Elements.include?e.name
3087
- e.strip_attributes
3088
- else
3089
- if Unacceptable_Elements_With_End_Tag.include?e.name
3090
- e.inner_html = ''
3091
- end
3092
- e.swap(SanitizerDoc.new(e.children).scrub.to_html)
3093
- # This works because the children swapped in are brought in "after" the current element.
3094
- end
3095
- elsif e.doctype?
3096
- e.parent.children.delete(e)
3097
- elsif e.text?
3098
- ets = e.to_s
3099
- ets.gsub!(/&#39;/, "'")
3100
- ets.gsub!(/&#34;/, '"')
3101
- ets.gsub!(/\r/,'')
3102
- e.swap(ets)
3103
- else
3104
- end
3105
- end
3106
- # yes, that '/' should be there. It's a search method. See the Hpricot docs.
3107
111
 
3108
- unless $compatible # FIXME not properly recursive, see comment in recursive_strip
3109
- (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
3110
- end
3111
- return self
3112
- end
112
+ # ---------- don't touch these ----------
113
+ class ThingsNobodyCaresAboutButMe < Exception
3113
114
  end
3114
-
3115
- def SanitizerDoc(html)
3116
- FeedParser::SanitizerDoc.new(Hpricot.make(html))
115
+ class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe
3117
116
  end
3118
- module_function(:SanitizerDoc)
3119
- def self.sanitizeHTML(html,encoding)
3120
- # FIXME Tidy not yet supported
3121
- html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
3122
- h = SanitizerDoc(html)
3123
- h = h.scrub
3124
- return h.to_html.strip
117
+ class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe
3125
118
  end
3126
-
3127
-
3128
-
3129
- def self.getCharacterEncoding(feed, xml_data)
3130
- # Get the character encoding of the XML document
3131
- $stderr << "In getCharacterEncoding\n" if $debug
3132
- sniffed_xml_encoding = nil
3133
- xml_encoding = nil
3134
- true_encoding = nil
3135
- begin
3136
- http_headers = feed.meta
3137
- http_content_type = feed.meta['content-type'].split(';')[0]
3138
- encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
3139
- http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
3140
- http_encoding = nil if http_encoding.empty?
3141
- # FIXME Open-Uri returns iso8859-1 if there is no charset header,
3142
- # but that doesn't pass the tests. Open-Uri claims its following
3143
- # the right RFC. Are they wrong or do we need to change the tests?
3144
- rescue NoMethodError
3145
- http_headers = {}
3146
- http_content_type = nil
3147
- http_encoding = nil
3148
- end
3149
- # Must sniff for non-ASCII-compatible character encodings before
3150
- # searching for XML declaration. This heuristic is defined in
3151
- # section F of the XML specification:
3152
- # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3153
- begin
3154
- if xml_data[0..3] == "\x4c\x6f\xa7\x94"
3155
- # EBCDIC
3156
- xml_data = _ebcdic_to_ascii(xml_data)
3157
- elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
3158
- # UTF-16BE
3159
- sniffed_xml_encoding = 'utf-16be'
3160
- xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
3161
- elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
3162
- # UTF-16BE with BOM
3163
- sniffed_xml_encoding = 'utf-16be'
3164
- xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
3165
- elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
3166
- # UTF-16LE
3167
- sniffed_xml_encoding = 'utf-16le'
3168
- xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
3169
- elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
3170
- # UTF-16LE with BOM
3171
- sniffed_xml_encoding = 'utf-16le'
3172
- xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
3173
- elsif xml_data[0..3] == "\x00\x00\x00\x3c"
3174
- # UTF-32BE
3175
- sniffed_xml_encoding = 'utf-32be'
3176
- xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
3177
- elsif xml_data[0..3] == "\x3c\x00\x00\x00"
3178
- # UTF-32LE
3179
- sniffed_xml_encoding = 'utf-32le'
3180
- xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
3181
- elsif xml_data[0..3] == "\x00\x00\xfe\xff"
3182
- # UTF-32BE with BOM
3183
- sniffed_xml_encoding = 'utf-32be'
3184
- xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
3185
- elsif xml_data[0..3] == "\xff\xfe\x00\x00"
3186
- # UTF-32LE with BOM
3187
- sniffed_xml_encoding = 'utf-32le'
3188
- xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
3189
- elsif xml_data[0..2] == "\xef\xbb\xbf"
3190
- # UTF-8 with BOM
3191
- sniffed_xml_encoding = 'utf-8'
3192
- xml_data = xml_data[3..-1]
3193
- else
3194
- # ASCII-compatible
3195
- end
3196
- xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
3197
- rescue
3198
- xml_encoding_match = nil
3199
- end
3200
- if xml_encoding_match
3201
- xml_encoding = xml_encoding_match[1].downcase
3202
- xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
3203
- if sniffed_xml_encoding and xencodings.include?xml_encoding
3204
- xml_encoding = sniffed_xml_encoding
3205
- end
3206
- end
3207
-
3208
- acceptable_content_type = false
3209
- application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
3210
- text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
3211
-
3212
- if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
3213
- acceptable_content_type = true
3214
- true_encoding = http_encoding || xml_encoding || 'utf-8'
3215
- elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
3216
- acceptable_content_type = true
3217
- true_encoding = http_encoding || 'us-ascii'
3218
- elsif /^text\// =~ http_content_type
3219
- true_encoding = http_encoding || 'us-ascii'
3220
- elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
3221
- true_encoding = xml_encoding || 'iso-8859-1'
3222
- else
3223
- true_encoding = xml_encoding || 'utf-8'
3224
- end
3225
- return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
119
+ class NonXMLContentType < ThingsNobodyCaresAboutButMe
3226
120
  end
3227
-
3228
- def self.toUTF8(data, encoding)
3229
- =begin
3230
- Changes an XML data stream on the fly to specify a new encoding
3231
-
3232
- data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
3233
- encoding is a string recognized by encodings.aliases
3234
- =end
3235
- $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
3236
- # NOTE we must use double quotes when dealing with \x encodings!
3237
- if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
3238
- if $debug
3239
- $stderr << "stripping BOM\n"
3240
- if encoding != 'utf-16be'
3241
- $stderr << "string utf-16be instead\n"
3242
- end
3243
- end
3244
- encoding = 'utf-16be'
3245
- data = data[2..-1]
3246
- elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
3247
- if $debug
3248
- $stderr << "stripping BOM\n"
3249
- $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
3250
- end
3251
- encoding = 'utf-16le'
3252
- data = data[2..-1]
3253
- elsif (data[0..2] == "\xef\xbb\xbf")
3254
- if $debug
3255
- $stderr << "stripping BOM\n"
3256
- $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
3257
- end
3258
- encoding = 'utf-8'
3259
- data = data[3..-1]
3260
- elsif (data[0..3] == "\x00\x00\xfe\xff")
3261
- if $debug
3262
- $stderr << "stripping BOM\n"
3263
- if encoding != 'utf-32be'
3264
- $stderr << "trying utf-32be instead\n"
3265
- end
3266
- end
3267
- encoding = 'utf-32be'
3268
- data = data[4..-1]
3269
- elsif (data[0..3] == "\xff\xfe\x00\x00")
3270
- if $debug
3271
- $stderr << "stripping BOM\n"
3272
- if encoding != 'utf-32le'
3273
- $stderr << "trying utf-32le instead\n"
3274
- end
3275
- end
3276
- encoding = 'utf-32le'
3277
- data = data[4..-1]
3278
- end
3279
- begin
3280
- newdata = uconvert(data, encoding, 'utf-8')
3281
- rescue => details
3282
- end
3283
- $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
3284
- declmatch = /^<\?xml[^>]*?>/
3285
- newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
3286
- if declmatch =~ newdata
3287
- newdata.sub!(declmatch, newdecl)
3288
- else
3289
- newdata = newdecl + "\n" + newdata
3290
- end
3291
- return newdata
121
+ class UndeclaredNamespace < Exception
3292
122
  end
3293
123
 
3294
- def self.stripDoctype(data)
3295
- =begin
3296
- Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3297
-
3298
- rss_version may be 'rss091n' or None
3299
- stripped_data is the same XML document, minus the DOCTYPE
3300
- =end
3301
- entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
3302
- data = data.gsub(entity_pattern,'')
3303
-
3304
- doctype_pattern = /<!DOCTYPE(.*?)>/m
3305
- doctype_results = data.scan(doctype_pattern)
3306
- if doctype_results and doctype_results[0]
3307
- doctype = doctype_results[0][0]
3308
- else
3309
- doctype = ''
3310
- end
3311
-
3312
- if /netscape/ =~ doctype.downcase
3313
- version = 'rss091n'
3314
- else
3315
- version = nil
3316
- end
3317
- data = data.sub(doctype_pattern, '')
3318
- return version, data
3319
- end
3320
124
 
3321
- def parse(*args); FeedParser.parse(*args); end
3322
- def FeedParser.parse(furi, options={})
125
+ SUPPORTED_VERSIONS = {'' => 'unknown',
126
+ 'rss090' => 'RSS 0.90',
127
+ 'rss091n' => 'RSS 0.91 (Netscape)',
128
+ 'rss091u' => 'RSS 0.91 (Userland)',
129
+ 'rss092' => 'RSS 0.92',
130
+ 'rss093' => 'RSS 0.93',
131
+ 'rss094' => 'RSS 0.94',
132
+ 'rss20' => 'RSS 2.0',
133
+ 'rss10' => 'RSS 1.0',
134
+ 'rss' => 'RSS (unknown version)',
135
+ 'atom01' => 'Atom 0.1',
136
+ 'atom02' => 'Atom 0.2',
137
+ 'atom03' => 'Atom 0.3',
138
+ 'atom10' => 'Atom 1.0',
139
+ 'atom' => 'Atom (unknown version)',
140
+ 'cdf' => 'CDF',
141
+ 'hotrss' => 'Hot RSS'
142
+ }
143
+
144
+ def parse(furi, options = {})
3323
145
  # Parse a feed from a URL, file, stream or string
3324
146
  $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
147
+ strictklass = options[:strict] || StrictFeedParser
148
+ looseklass = options[:loose] || LooseFeedParser
3325
149
  result = FeedParserDict.new
3326
150
  result['feed'] = FeedParserDict.new
3327
151
  result['entries'] = []
@@ -3331,13 +155,12 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3331
155
  end
3332
156
  result['bozo'] = false
3333
157
  handlers = options[:handlers]
3334
-
3335
158
  if handlers.class != Array # FIXME why does this happen?
3336
159
  handlers = [handlers]
3337
160
  end
3338
161
 
3339
162
  begin
3340
- if URI::parse(furi).class == URI::Generic
163
+ if File.exists?furi
3341
164
  f = open(furi) # OpenURI doesn't behave well when passing HTTP options to a file.
3342
165
  else
3343
166
  # And when you do pass them, make sure they aren't just nil (this still true?)
@@ -3504,7 +327,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3504
327
  if use_strict_parser
3505
328
  # initialize the SAX parser
3506
329
  saxparser = XML::SAX::Helpers::ParserFactory.makeParser("XML::Parser::SAXDriver")
3507
- feedparser = StrictFeedParser.new(baseuri, baselang, 'utf-8')
330
+ feedparser = strictklass.new(baseuri, baselang, 'utf-8')
3508
331
  saxparser.setDocumentHandler(feedparser)
3509
332
  saxparser.setDTDHandler(feedparser)
3510
333
  saxparser.setEntityResolver(feedparser)
@@ -3525,7 +348,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3525
348
  end
3526
349
  end
3527
350
  if not use_strict_parser
3528
- feedparser = LooseFeedParser.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
351
+ feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
3529
352
  feedparser.parse(data)
3530
353
  $stderr << "Using LooseFeed\n\n" if $debug
3531
354
  end
@@ -3535,6 +358,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3535
358
  result['namespaces'] = feedparser.namespacesInUse
3536
359
  return result
3537
360
  end
361
+ module_function(:parse)
3538
362
  end # End FeedParser module
3539
363
 
3540
364
  class Serializer
@@ -3574,7 +398,7 @@ class TextSerializer < Serializer
3574
398
  end
3575
399
  end
3576
400
 
3577
- class PprintSerializer < Serializer # FIXME ? use pp instead?
401
+ class PprintSerializer < Serializer # FIXME use pp instead
3578
402
  def write(stream = $stdout)
3579
403
  stream << @results['href'].to_s + "\n\n"
3580
404
  pp(@results)
@@ -3582,87 +406,88 @@ class PprintSerializer < Serializer # FIXME ? use pp instead?
3582
406
  end
3583
407
  end
3584
408
 
3585
-
3586
- require 'optparse'
3587
- require 'ostruct'
3588
- options = OpenStruct.new
3589
- options.etag = options.modified = options.agent = options.referrer = nil
3590
- options.content_language = options.content_location = options.ctype = nil
3591
- options.format = 'pprint'
3592
- options.compatible = $compatible
3593
- options.verbose = false
3594
-
3595
- opts = OptionParser.new do |opts|
3596
- opts.banner
3597
- opts.separator ""
3598
- opts.on("-A", "--user-agent [AGENT]",
409
+ if $0 == __FILE__
410
+ require 'optparse'
411
+ require 'ostruct'
412
+ options = OpenStruct.new
413
+ options.etag = options.modified = options.agent = options.referrer = nil
414
+ options.content_language = options.content_location = options.ctype = nil
415
+ options.format = 'pprint'
416
+ options.compatible = $compatible
417
+ options.verbose = false
418
+
419
+ opts = OptionParser.new do |opts|
420
+ opts.banner
421
+ opts.separator ""
422
+ opts.on("-A", "--user-agent [AGENT]",
3599
423
  "User-Agent for HTTP URLs") {|agent|
3600
- options.agent = agent
3601
- }
424
+ options.agent = agent
425
+ }
3602
426
 
3603
- opts.on("-e", "--referrer [URL]",
427
+ opts.on("-e", "--referrer [URL]",
3604
428
  "Referrer for HTTP URLs") {|referrer|
3605
- options.referrer = referrer
3606
- }
429
+ options.referrer = referrer
430
+ }
3607
431
 
3608
- opts.on("-t", "--etag [TAG]",
432
+ opts.on("-t", "--etag [TAG]",
3609
433
  "ETag/If-None-Match for HTTP URLs") {|etag|
3610
- options.etag = etag
3611
- }
434
+ options.etag = etag
435
+ }
3612
436
 
3613
- opts.on("-m", "--last-modified [DATE]",
437
+ opts.on("-m", "--last-modified [DATE]",
3614
438
  "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
3615
- options.modified = modified
3616
- }
439
+ options.modified = modified
440
+ }
3617
441
 
3618
- opts.on("-f", "--format [FORMAT]", [:text, :pprint],
442
+ opts.on("-f", "--format [FORMAT]", [:text, :pprint],
3619
443
  "output resutls in FORMAT (text, pprint)") {|format|
3620
- options.format = format
3621
- }
444
+ options.format = format
445
+ }
3622
446
 
3623
- opts.on("-v", "--[no-]verbose",
447
+ opts.on("-v", "--[no-]verbose",
3624
448
  "write debugging information to stderr") {|v|
3625
- options.verbose = v
3626
- }
449
+ options.verbose = v
450
+ }
3627
451
 
3628
- opts.on("-c", "--[no-]compatible",
452
+ opts.on("-c", "--[no-]compatible",
3629
453
  "strip element attributes like feedparser.py 4.1 (default)") {|comp|
3630
- options.compatible = comp
3631
- }
3632
- opts.on("-l", "--content-location [LOCATION]",
454
+ options.compatible = comp
455
+ }
456
+ opts.on("-l", "--content-location [LOCATION]",
3633
457
  "default Content-Location HTTP header") {|loc|
3634
- options.content_location = loc
3635
- }
3636
- opts.on("-a", "--content-language [LANG]",
458
+ options.content_location = loc
459
+ }
460
+ opts.on("-a", "--content-language [LANG]",
3637
461
  "default Content-Language HTTP header") {|lang|
3638
- options.content_language = lang
3639
- }
3640
- opts.on("-t", "--content-type [TYPE]",
462
+ options.content_language = lang
463
+ }
464
+ opts.on("-t", "--content-type [TYPE]",
3641
465
  "default Content-type HTTP header") {|ctype|
3642
- options.ctype = ctype
3643
- }
3644
- end
466
+ options.ctype = ctype
467
+ }
468
+ end
3645
469
 
3646
- opts.parse!(ARGV)
3647
- $debug = true if options.verbose
3648
- $compatible = options.compatible unless options.compatible.nil?
470
+ opts.parse!(ARGV)
471
+ $debug = true if options.verbose
472
+ $compatible = options.compatible unless options.compatible.nil?
3649
473
 
3650
- if options.format == :text
3651
- serializer = TextSerializer
3652
- else
3653
- serializer = PprintSerializer
3654
- end
3655
- args = *ARGV.dup
3656
- unless args.nil?
3657
- args.each do |url| # opts.parse! removes everything but the urls from the command line
3658
- results = FeedParser.parse(url, :etag => options.etag,
3659
- :modified => options.modified,
3660
- :agent => options.agent,
3661
- :referrer => options.referrer,
3662
- :content_location => options.content_location,
3663
- :content_language => options.content_language,
3664
- :content_type => options.ctype
3665
- )
3666
- serializer.new(results).write($stdout)
474
+ if options.format == :text
475
+ serializer = TextSerializer
476
+ else
477
+ serializer = PprintSerializer
478
+ end
479
+ args = *ARGV.dup
480
+ unless args.nil?
481
+ args.each do |url| # opts.parse! removes everything but the urls from the command line
482
+ results = FeedParser.parse(url, :etag => options.etag,
483
+ :modified => options.modified,
484
+ :agent => options.agent,
485
+ :referrer => options.referrer,
486
+ :content_location => options.content_location,
487
+ :content_language => options.content_language,
488
+ :content_type => options.ctype
489
+ )
490
+ serializer.new(results).write($stdout)
491
+ end
3667
492
  end
3668
493
  end