rfeedparser 0.9.9 → 0.9.85

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,138 +14,3323 @@ require 'stringio'
14
14
  require 'uri'
15
15
  require 'cgi' # escaping html
16
16
  require 'time'
17
+ require 'xml/saxdriver' # calling expat
17
18
  require 'pp'
18
19
  require 'rubygems'
19
20
  require 'base64'
20
21
  require 'iconv'
22
+ gem 'hpricot', ">=0.5"
23
+ gem 'character-encodings', ">=0.2.0"
24
+ gem 'htmltools', ">=1.10"
25
+ gem 'htmlentities', ">=4.0.0"
26
+ gem 'activesupport', ">=1.4.2"
27
+ gem 'rchardet', ">=1.0"
28
+
29
+ require 'rchardet'
30
+ $chardet = true
31
+
32
+ require 'hpricot'
33
+ require 'encoding/character/utf-8'
34
+ require 'html/sgml-parser'
35
+ require 'htmlentities'
36
+ require 'active_support'
37
+ require 'open-uri'
38
+ include OpenURI
39
+
40
+ $debug = false
41
+ $compatible = true
42
+
43
Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
  # Maps lowercased, normalized encoding names to the canonical name that
  # Iconv understands (see uconvert). Fixed several typo'd entries from the
  # original port: iso2002_jp_1/3 (-> iso2022_jp_*), iso-2002-jp-2 (-> -2022-),
  # iso8849_4 (-> iso8859_4), and the iso8859_8 alias that was keyed as
  # 'iso8859_9' with the bare (undashed) value 'iso8859_8'.
  'unicode' => 'utf-16',
  # MacOSX does not have Unicode as a separate encoding nor even
  # aliased. My Ubuntu box has it as a separate encoding but I cannot
  # for the life of me figure out where the source code for UNICODE.so
  # is (supposedly, in libc6 .deb but that's a damn lie), so I don't
  # know what it expects. After some extensive research, I've decided
  # to alias it to utf-16 much like Python does when it is built with
  # --enable-unicode=ucs2. This could be seriously wrong. I have no idea.

  # ascii codec
  '646' => 'ascii',
  'ansi_x3.4_1968' => 'ascii',
  'ansi_x3_4_1968' => 'ascii', # some email headers use this non-standard name
  'ansi_x3.4_1986' => 'ascii',
  'cp367' => 'ascii',
  'csascii' => 'ascii',
  'ibm367' => 'ascii',
  'iso646_us' => 'ascii',
  'iso_646.irv_1991' => 'ascii',
  'iso_ir_6' => 'ascii',
  'us' => 'ascii',
  'us_ascii' => 'ascii',

  # big5 codec
  'big5_tw' => 'big5',
  'csbig5' => 'big5',

  # big5hkscs codec
  'big5_hkscs' => 'big5hkscs',
  'hkscs' => 'big5hkscs',

  # cp037 codec
  '037' => 'cp037',
  'csibm037' => 'cp037',
  'ebcdic_cp_ca' => 'cp037',
  'ebcdic_cp_nl' => 'cp037',
  'ebcdic_cp_us' => 'cp037',
  'ebcdic_cp_wt' => 'cp037',
  'ibm037' => 'cp037',
  'ibm039' => 'cp037',

  # cp1026 codec
  '1026' => 'cp1026',
  'csibm1026' => 'cp1026',
  'ibm1026' => 'cp1026',

  # cp1140 codec
  '1140' => 'cp1140',
  'ibm1140' => 'cp1140',

  # cp1250 codec
  '1250' => 'cp1250',
  'windows_1250' => 'cp1250',

  # cp1251 codec
  '1251' => 'cp1251',
  'windows_1251' => 'cp1251',

  # cp1252 codec
  '1252' => 'cp1252',
  'windows_1252' => 'cp1252',

  # cp1253 codec
  '1253' => 'cp1253',
  'windows_1253' => 'cp1253',

  # cp1254 codec
  '1254' => 'cp1254',
  'windows_1254' => 'cp1254',

  # cp1255 codec
  '1255' => 'cp1255',
  'windows_1255' => 'cp1255',

  # cp1256 codec
  '1256' => 'cp1256',
  'windows_1256' => 'cp1256',

  # cp1257 codec
  '1257' => 'cp1257',
  'windows_1257' => 'cp1257',

  # cp1258 codec
  '1258' => 'cp1258',
  'windows_1258' => 'cp1258',

  # cp424 codec
  '424' => 'cp424',
  'csibm424' => 'cp424',
  'ebcdic_cp_he' => 'cp424',
  'ibm424' => 'cp424',

  # cp437 codec
  '437' => 'cp437',
  'cspc8codepage437' => 'cp437',
  'ibm437' => 'cp437',

  # cp500 codec
  '500' => 'cp500',
  'csibm500' => 'cp500',
  'ebcdic_cp_be' => 'cp500',
  'ebcdic_cp_ch' => 'cp500',
  'ibm500' => 'cp500',

  # cp775 codec
  '775' => 'cp775',
  'cspc775baltic' => 'cp775',
  'ibm775' => 'cp775',

  # cp850 codec
  '850' => 'cp850',
  'cspc850multilingual' => 'cp850',
  'ibm850' => 'cp850',

  # cp852 codec
  '852' => 'cp852',
  'cspcp852' => 'cp852',
  'ibm852' => 'cp852',

  # cp855 codec
  '855' => 'cp855',
  'csibm855' => 'cp855',
  'ibm855' => 'cp855',

  # cp857 codec
  '857' => 'cp857',
  'csibm857' => 'cp857',
  'ibm857' => 'cp857',

  # cp860 codec
  '860' => 'cp860',
  'csibm860' => 'cp860',
  'ibm860' => 'cp860',

  # cp861 codec
  '861' => 'cp861',
  'cp_is' => 'cp861',
  'csibm861' => 'cp861',
  'ibm861' => 'cp861',

  # cp862 codec
  '862' => 'cp862',
  'cspc862latinhebrew' => 'cp862',
  'ibm862' => 'cp862',

  # cp863 codec
  '863' => 'cp863',
  'csibm863' => 'cp863',
  'ibm863' => 'cp863',

  # cp864 codec
  '864' => 'cp864',
  'csibm864' => 'cp864',
  'ibm864' => 'cp864',

  # cp865 codec
  '865' => 'cp865',
  'csibm865' => 'cp865',
  'ibm865' => 'cp865',

  # cp866 codec
  '866' => 'cp866',
  'csibm866' => 'cp866',
  'ibm866' => 'cp866',

  # cp869 codec
  '869' => 'cp869',
  'cp_gr' => 'cp869',
  'csibm869' => 'cp869',
  'ibm869' => 'cp869',

  # cp932 codec
  '932' => 'cp932',
  'ms932' => 'cp932',
  'mskanji' => 'cp932',
  'ms_kanji' => 'cp932',

  # cp949 codec
  '949' => 'cp949',
  'ms949' => 'cp949',
  'uhc' => 'cp949',

  # cp950 codec
  '950' => 'cp950',
  'ms950' => 'cp950',

  # euc_jp codec
  'euc_jp' => 'euc-jp',
  'eucjp' => 'euc-jp',
  'ujis' => 'euc-jp',
  'u_jis' => 'euc-jp',

  # euc_kr codec
  'euc_kr' => 'euc-kr',
  'euckr' => 'euc-kr',
  'korean' => 'euc-kr',
  'ksc5601' => 'euc-kr',
  'ks_c_5601' => 'euc-kr',
  'ks_c_5601_1987' => 'euc-kr',
  'ksx1001' => 'euc-kr',
  'ks_x_1001' => 'euc-kr',

  # gb18030 codec
  'gb18030_2000' => 'gb18030',

  # gb2312 codec
  'chinese' => 'gb2312',
  'csiso58gb231280' => 'gb2312',
  'euc_cn' => 'gb2312',
  'euccn' => 'gb2312',
  'eucgb2312_cn' => 'gb2312',
  'gb2312_1980' => 'gb2312',
  'gb2312_80' => 'gb2312',
  'iso_ir_58' => 'gb2312',

  # gbk codec
  '936' => 'gbk',
  'cp936' => 'gbk',
  'ms936' => 'gbk',

  # hp-roman8 codec
  'hp_roman8' => 'hp-roman8',
  'roman8' => 'hp-roman8',
  'r8' => 'hp-roman8',
  'csHPRoman8' => 'hp-roman8', # NOTE(review): mixed-case key; if lookups are lowercased first this entry never matches — confirm against callers

  # iso2022_jp codec
  'iso2022_jp' => 'iso-2022-jp',
  'csiso2022jp' => 'iso-2022-jp',
  'iso2022jp' => 'iso-2022-jp',
  'iso_2022_jp' => 'iso-2022-jp',

  # iso2022_jp_1 codec
  'iso2022_jp_1' => 'iso-2022-jp-1', # was typo'd 'iso2002_jp_1'
  'iso2022jp_1' => 'iso-2022-jp-1',
  'iso_2022_jp_1' => 'iso-2022-jp-1',

  # iso2022_jp_2 codec
  'iso2022_jp_2' => 'iso-2022-jp-2', # value was typo'd 'iso-2002-jp-2'
  'iso2022jp_2' => 'iso-2022-jp-2',
  'iso_2022_jp_2' => 'iso-2022-jp-2',

  # iso2022_jp_3 codec
  'iso2022_jp_3' => 'iso-2022-jp-3', # was typo'd 'iso2002_jp_3'
  'iso2022jp_3' => 'iso-2022-jp-3',
  'iso_2022_jp_3' => 'iso-2022-jp-3',

  # iso2022_kr codec
  'iso2022_kr' => 'iso-2022-kr',
  'csiso2022kr' => 'iso-2022-kr',
  'iso2022kr' => 'iso-2022-kr',
  'iso_2022_kr' => 'iso-2022-kr',

  # iso8859_10 codec
  'iso8859_10' => 'iso-8859-10',
  'csisolatin6' => 'iso-8859-10',
  'iso_8859_10' => 'iso-8859-10',
  'iso_8859_10_1992' => 'iso-8859-10',
  'iso_ir_157' => 'iso-8859-10',
  'l6' => 'iso-8859-10',
  'latin6' => 'iso-8859-10',

  # iso8859_13 codec
  'iso8859_13' => 'iso-8859-13',
  'iso_8859_13' => 'iso-8859-13',

  # iso8859_14 codec
  'iso8859_14' => 'iso-8859-14',
  'iso_8859_14' => 'iso-8859-14',
  'iso_8859_14_1998' => 'iso-8859-14',
  'iso_celtic' => 'iso-8859-14',
  'iso_ir_199' => 'iso-8859-14',
  'l8' => 'iso-8859-14',
  'latin8' => 'iso-8859-14',

  # iso8859_15 codec
  'iso8859_15' => 'iso-8859-15',
  'iso_8859_15' => 'iso-8859-15',

  # iso8859_1 codec
  'latin_1' => 'iso-8859-1',
  'cp819' => 'iso-8859-1',
  'csisolatin1' => 'iso-8859-1',
  'ibm819' => 'iso-8859-1',
  'iso8859' => 'iso-8859-1',
  'iso_8859_1' => 'iso-8859-1',
  'iso_8859_1_1987' => 'iso-8859-1',
  'iso_ir_100' => 'iso-8859-1',
  'l1' => 'iso-8859-1',
  'latin' => 'iso-8859-1',
  'latin1' => 'iso-8859-1',

  # iso8859_2 codec
  'iso8859_2' => 'iso-8859-2',
  'csisolatin2' => 'iso-8859-2',
  'iso_8859_2' => 'iso-8859-2',
  'iso_8859_2_1987' => 'iso-8859-2',
  'iso_ir_101' => 'iso-8859-2',
  'l2' => 'iso-8859-2',
  'latin2' => 'iso-8859-2',

  # iso8859_3 codec
  'iso8859_3' => 'iso-8859-3',
  'csisolatin3' => 'iso-8859-3',
  'iso_8859_3' => 'iso-8859-3',
  'iso_8859_3_1988' => 'iso-8859-3',
  'iso_ir_109' => 'iso-8859-3',
  'l3' => 'iso-8859-3',
  'latin3' => 'iso-8859-3',

  # iso8859_4 codec
  'iso8859_4' => 'iso-8859-4', # was typo'd 'iso8849_4'
  'csisolatin4' => 'iso-8859-4',
  'iso_8859_4' => 'iso-8859-4',
  'iso_8859_4_1988' => 'iso-8859-4',
  'iso_ir_110' => 'iso-8859-4',
  'l4' => 'iso-8859-4',
  'latin4' => 'iso-8859-4',

  # iso8859_5 codec
  'iso8859_5' => 'iso-8859-5',
  'csisolatincyrillic' => 'iso-8859-5',
  'cyrillic' => 'iso-8859-5',
  'iso_8859_5' => 'iso-8859-5',
  'iso_8859_5_1988' => 'iso-8859-5',
  'iso_ir_144' => 'iso-8859-5',

  # iso8859_6 codec
  'iso8859_6' => 'iso-8859-6',
  'arabic' => 'iso-8859-6',
  'asmo_708' => 'iso-8859-6',
  'csisolatinarabic' => 'iso-8859-6',
  'ecma_114' => 'iso-8859-6',
  'iso_8859_6' => 'iso-8859-6',
  'iso_8859_6_1987' => 'iso-8859-6',
  'iso_ir_127' => 'iso-8859-6',

  # iso8859_7 codec
  'iso8859_7' => 'iso-8859-7',
  'csisolatingreek' => 'iso-8859-7',
  'ecma_118' => 'iso-8859-7',
  'elot_928' => 'iso-8859-7',
  'greek' => 'iso-8859-7',
  'greek8' => 'iso-8859-7',
  'iso_8859_7' => 'iso-8859-7',
  'iso_8859_7_1987' => 'iso-8859-7',
  'iso_ir_126' => 'iso-8859-7',

  # iso8859_8 codec
  'iso8859_8' => 'iso-8859-8', # was 'iso8859_9' => 'iso8859_8' (wrong key, undashed value)
  'csisolatinhebrew' => 'iso-8859-8',
  'hebrew' => 'iso-8859-8',
  'iso_8859_8' => 'iso-8859-8',
  'iso_8859_8_1988' => 'iso-8859-8',
  'iso_ir_138' => 'iso-8859-8',

  # iso8859_9 codec
  'iso8859_9' => 'iso-8859-9',
  'csisolatin5' => 'iso-8859-9',
  'iso_8859_9' => 'iso-8859-9',
  'iso_8859_9_1989' => 'iso-8859-9',
  'iso_ir_148' => 'iso-8859-9',
  'l5' => 'iso-8859-9',
  'latin5' => 'iso-8859-9',

  # iso8859_11 codec
  'iso8859_11' => 'iso-8859-11',
  'thai' => 'iso-8859-11',
  'iso_8859_11' => 'iso-8859-11',
  'iso_8859_11_2001' => 'iso-8859-11',

  # iso8859_16 codec
  'iso8859_16' => 'iso-8859-16',
  'iso_8859_16' => 'iso-8859-16',
  'iso_8859_16_2001' => 'iso-8859-16',
  'iso_ir_226' => 'iso-8859-16',
  'l10' => 'iso-8859-16',
  'latin10' => 'iso-8859-16',

  # cskoi8r codec
  'koi8_r' => 'cskoi8r',

  # mac_cyrillic codec
  'mac_cyrillic' => 'maccyrillic',

  # shift_jis codec
  'csshiftjis' => 'shift_jis',
  'shiftjis' => 'shift_jis',
  'sjis' => 'shift_jis',
  's_jis' => 'shift_jis',

  # shift_jisx0213 codec
  'shiftjisx0213' => 'shift_jisx0213',
  'sjisx0213' => 'shift_jisx0213',
  's_jisx0213' => 'shift_jisx0213',

  # utf_16 codec
  'utf_16' => 'utf-16',
  'u16' => 'utf-16',
  'utf16' => 'utf-16',

  # utf_16_be codec
  'utf_16_be' => 'utf-16be',
  'unicodebigunmarked' => 'utf-16be',
  'utf_16be' => 'utf-16be',

  # utf_16_le codec
  'utf_16_le' => 'utf-16le',
  'unicodelittleunmarked' => 'utf-16le',
  'utf_16le' => 'utf-16le',

  # utf_7 codec
  'utf_7' => 'utf-7',
  'u7' => 'utf-7',
  'utf7' => 'utf-7',

  # utf_8 codec
  'utf_8' => 'utf-8',
  'u8' => 'utf-8',
  'utf' => 'utf-8',
  'utf8' => 'utf-8',
  'utf8_ucs2' => 'utf-8',
  'utf8_ucs4' => 'utf-8',
}
468
+
469
# Convert +data+ from +from_encoding+ into the parser's internal "unicode"
# representation. 'unicode' is resolved through Encoding_Aliases (to utf-16)
# by uconvert.
def unicode(data, from_encoding)
  uconvert(data, from_encoding, 'unicode')
end
474
+
475
# Transcode +data+ between two encodings via Iconv, normalizing both
# encoding names through the Encoding_Aliases table first (so Python-style
# names such as 'latin_1' or 'utf8' work). Returns the converted string.
def uconvert(data, from_encoding, to_encoding = 'utf-8')
  src = Encoding_Aliases[from_encoding] || from_encoding
  dst = Encoding_Aliases[to_encoding] || to_encoding
  Iconv.iconv(dst, src, data).first
end
480
+
481
# Ruby equivalent of Python's unichr(): turn codepoint +i+ into a
# one-character UTF-8 string.
def unichr(i)
  Array(i).pack('U*')
end
484
+
485
# Find the first match of +regexp+ in +stri+ at or after +offset+.
# Returns [index, MatchData] on success, or [nil, nil] when there is no
# match. (Removed a leftover empty debugging branch `if offset == 241`.)
def index_match(stri, regexp, offset)
  i = stri.index(regexp, offset)
  return nil, nil unless i

  # Re-match at the found position so callers get a full MatchData
  # (String#index alone only yields the position).
  return i, stri[i..-1].match(regexp)
end
495
+
496
# Transcode an EBCDIC (ebcdic-cp-be) byte string to ISO-8859-1 via Iconv.
def _ebcdic_to_ascii(s)
  Iconv.iconv("iso88591", "ebcdic-cp-be", s).first
end
499
+
500
# Resolve +uri+ against +base+, tolerating sloppy feed URLs.
# Returns the joined URL string, the bare +uri+ when +base+ is relative,
# or nil when the two cannot be combined.
def urljoin(base, uri)
  # Collapse redundant slashes right after the scheme ("http:////x" -> "http://x").
  # FIX: the original class [A-Za-z0-9+-.] contained the range '+'-'.'
  # (which also admits ','); '-' is now escaped so only + - . are allowed.
  urifixer = /^([A-Za-z][A-Za-z0-9+\-.]*:\/\/)(\/*)(.*?)/u
  uri = uri.sub(urifixer, '\1\3')
  begin
    return URI.join(base, uri).to_s
  rescue URI::Error
    # FIX: rescue the whole URI::Error family — URI.join raises
    # InvalidURIError (not just BadURIError) on malformed input.
    if URI.parse(base).relative?
      return URI.parse(uri).to_s
    end
  end
end
511
+
512
# Convert a Python-style time tuple (year, month, day, hour, min, sec, ...)
# into a UTC Time. Returns nil when no tuple is available.
# FIX: Time.utc was being handed the Array itself (TypeError); the first
# six elements must be splatted into separate arguments.
def py2rtime(pytuple)
  return nil if pytuple.nil? || pytuple.empty?
  Time.utc(*pytuple[0..5])
end
515
+
516
+ # http://intertwingly.net/stories/2005/09/28/xchar.rb
517
# Lookup tables for XML-safe character escaping, used by Integer#xchr below.
# http://intertwingly.net/stories/2005/09/28/xchar.rb
module XChar
  # Windows-1252 "gremlins": bytes 128-159 that are control characters in
  # ISO-8859-1 but printable glyphs in CP1252. Maps each byte to the Unicode
  # codepoint of the glyph it was meant to be.
  # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
  CP1252 = {
    128 => 8364, # euro sign
    130 => 8218, # single low-9 quotation mark
    131 => 402, # latin small letter f with hook
    132 => 8222, # double low-9 quotation mark
    133 => 8230, # horizontal ellipsis
    134 => 8224, # dagger
    135 => 8225, # double dagger
    136 => 710, # modifier letter circumflex accent
    137 => 8240, # per mille sign
    138 => 352, # latin capital letter s with caron
    139 => 8249, # single left-pointing angle quotation mark
    140 => 338, # latin capital ligature oe
    142 => 381, # latin capital letter z with caron
    145 => 8216, # left single quotation mark
    146 => 8217, # right single quotation mark
    147 => 8220, # left double quotation mark
    148 => 8221, # right double quotation mark
    149 => 8226, # bullet
    150 => 8211, # en dash
    151 => 8212, # em dash
    152 => 732, # small tilde
    153 => 8482, # trade mark sign
    154 => 353, # latin small letter s with caron
    155 => 8250, # single right-pointing angle quotation mark
    156 => 339, # latin small ligature oe
    158 => 382, # latin small letter z with caron
    159 => 376} # latin capital letter y with diaeresis

  # Codepoints that must be emitted as predefined XML entities.
  # http://www.w3.org/TR/REC-xml/#dt-chardata
  PREDEFINED = {
    38 => '&', # ampersand
    60 => '<', # left angle bracket
    62 => '>'} # right angle bracket

  # Codepoint ranges legal in XML 1.0 character data.
  # http://www.w3.org/TR/REC-xml/#charsets
  VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
    (0xE000..0xFFFD), (0x10000..0x10FFFF)]
end
558
+
559
# Reopened Integer rather than Fixnum: Fixnum was deprecated and removed in
# Ruby 3.2, and on older VMs Fixnum inherits from Integer, so `42.xchr`
# behaves identically everywhere.
class Integer
  # XML-escaped version of #chr: remaps CP1252 gremlin bytes to their real
  # Unicode codepoints, replaces XML-invalid codepoints with '*' (42),
  # emits the predefined entities for & < >, and renders any non-ASCII
  # codepoint as a numeric character reference.
  def xchr
    n = XChar::CP1252[self] || self
    n = 42 unless XChar::VALID.find {|range| range.include? n}
    XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
  end
end
567
+
568
class String
  # Preserve the original #index before any parser monkey-patching.
  alias :old_index :index

  # XML-escape the receiver one codepoint at a time via Integer#xchr.
  # Tries to read the bytes as UTF-8 first; if the string is not valid
  # UTF-8, falls back to treating each byte as ISO-8859-1 / Windows-1252.
  def to_xs
    begin
      unpack('U*').map {|codepoint| codepoint.xchr}.join
    rescue
      unpack('C*').map {|byte| byte.xchr}.join
    end
  end
end
576
+
577
# Raised by BetterSGMLParser#error on malformed input.
# NOTE(review): subclasses Exception rather than StandardError, so a bare
# `rescue` will NOT catch it — confirm callers rescue it explicitly.
class BetterSGMLParserError < Exception; end;

# A port of Python's sgmllib parser loop on top of HTML::SGMLParser
# (htmltools gem), with feedparser.py's regexps substituted so that
# tag/charref scanning matches the Python feedparser behavior.
class BetterSGMLParser < HTML::SGMLParser
  # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
  # This makes things work.
  Interesting = /[&<]/u
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
    '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
    '![^<>]*)?', 64) # 64 is the unicode flag

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
  Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u

  # NOTE(review): these two patterns start with a literal apostrophe
  # (/'</), so they only match the sequence '< — likely a porting slip
  # from Python's quoted pattern strings; as written the SGML shorttag
  # branch in parse_starttag can essentially never fire. Confirm intent
  # before "fixing": goahead's behavior currently depends on them not
  # matching ordinary tags.
  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
  Endtagopen = /<\//u # Matching the Python SGMLParser
  Endbracket = /[<>]/u
  Declopen = /<!/u
  Piopenbegin = /^<\?/u
  Piclose = />/u

  Commentopen = /<!--/u
  Commentclose = /--\s*>/u
  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
      '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
      64)
  Endtagfind = /\s*\/\s*>/u

  def initialize(verbose=false)
    super(verbose)
  end

  def feed(*args)
    super(*args)
  end

  # Main scanning loop: consume @rawdata up to (at most) +_end+, dispatching
  # to handle_data / parse_starttag / parse_endtag / parse_comment /
  # parse_pi / parse_declaration / handle_charref / handle_entityref.
  # Any unconsumed tail is left in @rawdata for the next feed() call.
  def goahead(_end)
    rawdata = @rawdata # woo, utf-8 magic
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        # handle_data_range does nothing more than set a "Range" that is never used. wtf?
        handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
        i = n
        break
      end
      # Skip ahead to the next '<' or '&'; everything before it is plain data.
      j = rawdata.index(Interesting, i)
      j = n unless j
      handle_data(rawdata[i...j]) if i < j
      i = j
      break if (i == n)
      if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
        # NOTE(review): Starttagopen is not defined in this class — it is
        # presumably inherited from HTML::SGMLParser; verify it exists there.
        if rawdata.index(Starttagopen,i) == i
          if @literal
            handle_data(rawdata[i..i])
            i = i+1
            next
          end
          k = parse_starttag(i)
          break unless k
          i = k
          next
        end
        if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if @literal
          if n > (i+1)
            handle_data("<")
            i = i+1
          else
            #incomplete
            break
          end
          next
        end
        if rawdata.index(Commentopen,i) == i
          k = parse_comment(i)
          break unless k
          i = k
          next
        end
        if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
          k = parse_pi(i)
          break unless k
          # parse_pi returns a length, not an absolute position (unlike the
          # other parse_* methods), hence += here.
          i += k
          next
        end
        if rawdata.index(Declopen,i) == i
          # This is some sort of declaration; in "HTML as
          # deployed," this should only be the document type
          # declaration ("<!DOCTYPE html...>").
          k = parse_declaration(i)
          break unless k
          i = k
          next
        end
      elsif rawdata[i..i] == '&'
        if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
          handle_data(rawdata[i..i])
          i += 1
          next
        end

        # the Char must come first as its #=~ method is the only one that is UTF-8 safe
        ni,match = index_match(rawdata, Charref, i)
        if ni and ni == i # See? Ugly
          handle_charref(match[1]) # match[1] is just the first group we captured (with parentheses)
          i += match[0].length # match[0] is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
          # The regexps consume one delimiter char past the reference; give
          # it back unless it was the terminating semicolon.
          i -= 1 unless rawdata[i-1..i-1] == ";"
          next
        end
        ni,match = index_match(rawdata, Entityref, i)
        if ni and ni == i
          handle_entityref(match[1])
          i += match[0].length
          i -= 1 unless rawdata[i-1..i-1] == ";"
          next
        end
      else
        error('neither < nor & ??')
      end
      # We get here only if incomplete matches but
      # nothing else
      ni,match = index_match(rawdata,Incomplete,i)
      unless ni and ni == 0
        handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
        i += 1
        next
      end
      j = ni + match[0].length
      break if j == n # Really incomplete
      handle_data(rawdata[i...j])
      i = j
    end # end while

    if _end and i < n
      handle_data(rawdata[i...n])
      i = n
    end

    # Keep whatever we could not parse for the next round.
    @rawdata = rawdata[i..-1]
    # @offset += i # FIXME BUGME another unused variable in SGMLParser?
  end

  # Internal -- parse processing instr, return length or -1 if not terminated
  # NOTE(review): actually returns nil (not -1) when unterminated.
  def parse_pi(i)
    rawdata = @rawdata
    if rawdata[i...i+2] != '<?'
      error("unexpected call to parse_pi()")
    end
    ni,match = index_match(rawdata,Piclose,i+2)
    return nil unless match
    j = ni
    handle_pi(rawdata[i+2...j])
    j = (j + match[0].length)
    return j-i
  end

  # Internal -- parse a <!-- comment -->; returns the absolute index just
  # past the closing delimiter, or nil if the comment is unterminated.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i...i+4] != "<!--"
      error("unexpected call to parse_comment()")
    end
    ni,match = index_match(rawdata, Commentclose,i)
    return nil unless match
    handle_comment(rawdata[i+4..(ni-1)])
    return ni+match[0].length # Length from i to just past the closing comment tag
  end

  # Internal -- parse a start tag (and its attributes) beginning at +i+;
  # returns the index just past the tag, or nil when the tag is incomplete.
  # Records the exact source text in @_starttag_text.
  def parse_starttag(i)
    @_starttag_text = nil
    start_pos = i
    rawdata = @rawdata
    ni,match = index_match(rawdata,Shorttagopen,i)
    if ni == i
      # SGML shorthand: <tag/data/ == <tag>data</tag>
      # XXX Can data contain &... (entity or char refs)?
      # XXX Can data contain < or > (tag characters)?
      # XXX Can there be whitespace before the first /?
      k,match = index_match(rawdata,Shorttag,i)
      return nil unless match
      tag, data = match[1], match[2]
      @_starttag_text = "<#{tag}/"
      tag.downcase!
      second_end = rawdata.index(Shorttagopen,k)
      finish_shorttag(tag, data)
      @_starttag_text = rawdata[start_pos...second_end+1]
      return k
    end

    j = rawdata.index(Endbracket, i+1)
    return nil unless j
    attrsd = []
    if rawdata[i...i+2] == '<>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      ni,match = index_match(rawdata,Tagfind,i+1)
      unless match
        error('unexpected call to parse_starttag')
      end
      k = ni+match[0].length+1
      tag = match[0].downcase
      @lasttag = tag
    end

    # Collect attribute name/value pairs until the closing bracket.
    while k < j
      break if rawdata.index(Endtagfind, k) == k
      ni,match = index_match(rawdata,Attrfind,k)
      break unless ni
      matched_length = match[0].length
      attrname, rest, attrvalue = match[1],match[2],match[3]
      if rest.nil? or rest.empty?
        attrvalue = '' # was: = attrname # Why the change?
      # NOTE(review): the second disjunct compares ?" (an integer char
      # literal on 1.8) against one-character String slices — on 1.8 that
      # comparison is always false, so only the single-quote branch strips
      # quotes. Confirm on the targeted Ruby before relying on it.
      elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
        attrvalue = attrvalue[1...-1]
      end
      attrsd << [attrname.downcase, attrvalue]
      k += matched_length
    end
    if rawdata[j..j] == ">"
      j += 1
    end
    @_starttag_text = rawdata[start_pos...j]
    finish_starttag(tag, attrsd)
    return j
  end

  # Internal -- parse an end tag beginning at +i+; returns the index just
  # past it, or nil when the closing bracket has not arrived yet.
  def parse_endtag(i)
    rawdata = @rawdata
    j, match = index_match(rawdata, /[<>]/,i+1)
    return nil unless j
    tag = rawdata[i+2...j].strip.downcase
    if rawdata[j..j] == ">"
      j += 1
    end
    finish_endtag(tag)
    return j
  end

  def output
    # Return processed HTML as a single string
    return @pieces.map{|p| p.to_s}.join
  end

  def error(message)
    raise BetterSGMLParserError.new(message)
  end

  # Processing instructions and declarations are ignored by default;
  # subclasses may override.
  def handle_pi(text)
  end

  def handle_decl(text)
  end
end
837
+
838
+ # Add some helper methods to make AttributeList (all of those damn attrs
839
+ # and attrsD used by StrictFeedParser) act more like a Hash.
840
+ # NOTE AttributeList is still Read-Only (AFAICT).
841
+ # Monkey patching is terrible, and I have an addiction.
842
# Hash-like, read-only convenience wrappers layered onto the SAX
# AttributeList interface (getLength / getName / getValue) from xml/sax.rb,
# so StrictFeedParser can treat attrs/attrsD like a Hash.
# NOTE: still monkey patching, still read-only.
module XML
  module SAX
    module AttributeList # in xml/sax.rb
      # Hash-style lookup by attribute name.
      def [](key)
        getValue(key)
      end

      # Yield [name, value] pairs in document order.
      def each(&blk)
        0.upto(getLength - 1) { |idx| yield [getName(idx), getValue(idx)] }
      end

      # Yield each attribute name in document order.
      def each_key(&blk)
        0.upto(getLength - 1) { |idx| yield getName(idx) }
      end

      # Yield each attribute value in document order.
      def each_value(&blk)
        0.upto(getLength - 1) { |idx| yield getValue(idx) }
      end

      # Materialize the list as an Array of [name, value] pairs.
      def to_a
        pairs = []
        each { |name, value| pairs << [name, value] }
        pairs
      end

      # Render as a Hash-ish debug string, e.g. "{ href => x, rel => nofollow }".
      def to_s
        parts = []
        each { |name, value| parts << "#{name} => #{value}" }
        "{ " + parts.join(", ") + " }"
      end
    end
  end
end
875
+ # This adds a nice scrub method to Hpricot, so we don't need a _HTMLSanitizer class
876
+ # http://underpantsgnome.com/2007/01/20/hpricot-scrub
877
+ # I have modified it to check for attributes that are only allowed if they are in a certain tag
878
+ module Hpricot
879
+ Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
880
+ 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
881
+ 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
882
+ 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
883
+ 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
884
+ 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
885
+ 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
886
+ 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
887
+ 'ul', 'var'
888
+ ]
889
+
890
+ Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
891
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
892
+ 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
893
+ 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
894
+ 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
895
+ 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
896
+ 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
897
+ 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
898
+ 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
899
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
900
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
901
+ ]
902
+
903
+ Unacceptable_Elements_With_End_Tag = ['script', 'applet']
904
+
905
+ Acceptable_Css_Properties = ['azimuth', 'background-color',
906
+ 'border-bottom-color', 'border-collapse', 'border-color',
907
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
908
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
909
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
910
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
911
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
912
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
913
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
914
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
915
+ 'white-space', 'width'
916
+ ]
917
+
918
+ # survey of common keywords found in feeds
919
+ Acceptable_Css_Keywords = ['auto', 'aqua', 'black', 'block', 'blue',
920
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
921
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
922
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
923
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
924
+ 'transparent', 'underline', 'white', 'yellow'
925
+ ]
926
+
927
+ Mathml_Elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
928
+ 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
929
+ 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
930
+ 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
931
+ 'munderover', 'none'
932
+ ]
933
+
934
+ Mathml_Attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
935
+ 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
936
+ 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
937
+ 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
938
+ 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
939
+ 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
940
+ 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
941
+ 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
942
+ 'xlink:type', 'xmlns', 'xmlns:xlink'
943
+ ]
944
+
945
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop
946
+ Svg_Elements = ['a', 'animate', 'animateColor', 'animateMotion',
947
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
948
+ 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
949
+ 'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
950
+ 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
951
+ 'switch', 'text', 'title', 'use'
952
+ ]
953
+
954
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
955
+ Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
956
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
957
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
958
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
959
+ 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
960
+ 'font-size', 'font-stretch', 'font-style', 'font-variant',
961
+ 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
962
+ 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
963
+ 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
964
+ 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
965
+ 'origin', 'overline-position', 'overline-thickness', 'panose-1',
966
+ 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
967
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
968
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
969
+ 'stop-color', 'stop-opacity', 'strikethrough-position',
970
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
971
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
972
+ 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
973
+ 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
974
+ 'underline-position', 'underline-thickness', 'unicode',
975
+ 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
976
+ 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
977
+ 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
978
+ 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
979
+ 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
980
+ ]
981
+
982
+ Svg_Attr_Map = nil
983
+ Svg_Elem_Map = nil
984
+
985
+ Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
986
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
987
+ 'stroke-opacity'
988
+ ]
989
+
990
+ unless $compatible
991
+ @@acceptable_tag_specific_attributes = {}
992
+ @@mathml_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@mathml_attributes }
993
+ @@svg_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@svg_attributes }
994
+ end
995
+
996
+ class Elements
997
+ def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
998
+ each { |x| x.strip(allowed_tags) }
999
+ end
1000
+
1001
+ def strip_attributes(safe=[])
1002
+ each { |x| x.strip_attributes(safe) }
1003
+ end
1004
+
1005
+ def strip_style(ok_props=[], ok_keywords=[])
1006
+ each { |x| x.strip_style(ok_props, ok_keywords) }
1007
+ end
1008
+ end
1009
+
1010
+ class Text
1011
+ def strip(foo)
1012
+ end
1013
+ def strip_attributes(foo)
1014
+ end
1015
+ end
1016
+ class Comment
1017
+ def strip(foo)
1018
+ end
1019
+ def strip_attributes(foo)
1020
+ end
1021
+ end
1022
# Hpricot::BogusETag — a stray end tag has nothing to strip; no-op
# hooks keep the sanitizer's node walk uniform.
class BogusETag
  def strip(foo)
  end

  def strip_attributes(foo)
  end
end
1028
+
1029
# Hpricot::Elem — per-element hooks used by the HTML sanitizer.
class Elem
  # Recursively ask every child node to decode its entities.
  def decode_entities
    children.each { |child| child.decode_entities }
  end

  # Replace this element with the serialized form of its children,
  # i.e. drop the tag itself but keep its content.
  def cull
    swap(children.to_s) if children
  end

  # Unwrap elements that merely carry script/CSS payloads.
  def strip
    cull if strip_removes?
  end

  # Remove every attribute whose name is not globally whitelisted.
  def strip_attributes
    return if attributes.nil?
    attributes.each do |pair|
      name = pair[0]
      remove_attribute(name) unless Acceptable_Attributes.include?(name)
    end
  end

  # True when the type attribute marks script or CSS content.
  # NOTE(review): other types should probably be ripped too.
  def strip_removes?
    attributes && attributes['type'] =~ /script|css/
  end
end
1061
+ end
1062
+
1063
+ module FeedParser
1064
+ Version = "0.1aleph_naught"
1065
+
1066
+ License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
1067
+
1068
+ Redistribution and use in source and binary forms, with or without modification,
1069
+ are permitted provided that the following conditions are met:
1070
+
1071
+ * Redistributions of source code must retain the above copyright notice,
1072
+ this list of conditions and the following disclaimer.
1073
+ * Redistributions in binary form must reproduce the above copyright notice,
1074
+ this list of conditions and the following disclaimer in the documentation
1075
+ and/or other materials provided with the distribution.
1076
+
1077
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
1078
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1079
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1080
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
1081
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
1082
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
1083
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
1084
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
1085
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
1086
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1087
+ POSSIBILITY OF SUCH DAMAGE."""
1088
+
1089
+ Author = "Jeff Hodges <http://somethingsimilar.com>"
1090
+ Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
1091
+ Contributors = [ "Jason Diamond <http://injektilo.org/>",
1092
+ "John Beimler <http://john.beimler.org/>",
1093
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
1094
+ "Aaron Swartz <http://aaronsw.com/>",
1095
+ "Kevin Marks <http://epeus.blogspot.com/>"
1096
+ ]
1097
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
1098
+ # If you are embedding feedparser in a larger application, you should
1099
+ # change this to your application name and URL.
1100
+ USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
1101
+
1102
+ # HTTP "Accept" header to send to servers when downloading feeds. If you don't
1103
+ # want to send an Accept header, set this to None.
1104
+ ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
1105
+
1106
+
1107
+ # If you want feedparser to automatically run HTML markup through HTML Tidy, set
1108
+ # this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
1109
+ # or utidylib <http://utidylib.berlios.de/>.
1110
+ TIDY_MARKUP = false #FIXME untranslated
1111
+
1112
+ # List of Python interfaces for HTML Tidy, in order of preference. Only useful
1113
+ # if TIDY_MARKUP = true
1114
+ PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
1115
+
1116
+ # The original Python import. I'm using it to help translate
1117
+ #import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
1118
+
1119
+
1120
+
1121
+ # ---------- don't touch these ----------
1122
+ class ThingsNobodyCaresAboutButMe < Exception
1123
+ end
1124
+ class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe
1125
+ end
1126
+ class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe
1127
+ end
1128
+ class NonXMLContentType < ThingsNobodyCaresAboutButMe
1129
+ end
1130
+ class UndeclaredNamespace < Exception
1131
+ end
1132
+
1133
+
1134
+ SUPPORTED_VERSIONS = {'' => 'unknown',
1135
+ 'rss090' => 'RSS 0.90',
1136
+ 'rss091n' => 'RSS 0.91 (Netscape)',
1137
+ 'rss091u' => 'RSS 0.91 (Userland)',
1138
+ 'rss092' => 'RSS 0.92',
1139
+ 'rss093' => 'RSS 0.93',
1140
+ 'rss094' => 'RSS 0.94',
1141
+ 'rss20' => 'RSS 2.0',
1142
+ 'rss10' => 'RSS 1.0',
1143
+ 'rss' => 'RSS (unknown version)',
1144
+ 'atom01' => 'Atom 0.1',
1145
+ 'atom02' => 'Atom 0.2',
1146
+ 'atom03' => 'Atom 0.3',
1147
+ 'atom10' => 'Atom 1.0',
1148
+ 'atom' => 'Atom (unknown version)',
1149
+ 'cdf' => 'CDF',
1150
+ 'hotrss' => 'Hot RSS'
1151
+ }
1152
class FeedParserDict < Hash
  # The naming of a common attribute (e.g. "when was this feed last
  # updated?") differs between feed formats.  @@keymap maps a
  # developer-friendly alias to the canonical key (or an Array of
  # candidate keys) actually stored in the hash; #[] and #[]= consult
  # it so either name works, no matter which feed type was parsed.
  @@keymap = {'channel' => 'feed',
              'items' => 'entries',
              'guid' => 'id',
              'date' => 'updated',
              'date_parsed' => 'updated_parsed',
              'description' => ['subtitle', 'summary'],
              'url' => ['href'],
              'modified' => 'updated',
              'modified_parsed' => 'updated_parsed',
              'issued' => 'published',
              'issued_parsed' => 'published_parsed',
              'copyright' => 'rights',
              'copyright_detail' => 'rights_detail',
              'tagline' => 'subtitle',
              'tagline_detail' => 'subtitle_detail'}

  # Hash#entries exists and would otherwise shadow the 'entries' key.
  def entries
    return self['entries']
  end

  # Seed from a list of [key, value] pairs or from a Hash; anything else
  # (including nil) yields an empty dict.
  def initialize(pairs=nil)
    if pairs.class == Array and pairs[0].class == Array and pairs[0].length == 2
      pairs.each do |pair|
        k, v = pair
        self[k] = v
      end
    elsif pairs.class == Hash
      self.merge!(pairs)
    end
  end

  # Read with keymap translation.  The literal key is preferred over the
  # canonical one when both exist.
  def [](key)
    if key == 'category'
      return self['tags'][0]['term']
    end
    if key == 'categories'
      return self['tags'].collect { |tag| [tag['scheme'], tag['term']] }
    end
    realkey = @@keymap[key] || key
    if realkey.class == Array
      # FIX(review): inner block variable used to shadow `key`.
      realkey.each { |candidate| return self[candidate] if has_key?(candidate) }
    end
    if has_key?(key)
      return super(key)
    end
    return super(realkey)
  end

  # Write with keymap translation; an Array mapping stores under its
  # first (primary) canonical key.
  def []=(key, value)
    if @@keymap.key?(key)
      key = @@keymap[key]
      key = key[0] if key.class == Array
    end
    super(key, value)
  end

  # Unknown messages become hash reads/writes: d.title / d.title = v.
  # Bang, predicate and underscore-prefixed names still raise.
  def method_missing(msym, *args)
    methodname = msym.to_s
    # FIX(review): was `methodname[-1] == '='`, which under Ruby 1.8
    # compares a Fixnum byte to a String and is always false; end_with?
    # behaves correctly on 1.8.7+ and 1.9+.
    if methodname.end_with?('=')
      return self[methodname[0..-2]] = args[0]
    elsif !methodname.end_with?('!', '?') and !methodname.start_with?('_') # FIXME implement with private
      return self[methodname]
    else
      raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
    end
  end
end
1240
+
1241
+
1242
+
1243
+
1244
+ module FeedParserMixin
1245
+ attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
1246
+
1247
+ def startup(baseuri=nil, baselang=nil, encoding='utf-8')
1248
+ $stderr << "initializing FeedParser\n" if $debug
1249
+
1250
+ @namespaces = {'' => '',
1251
+ 'http://backend.userland.com/rss' => '',
1252
+ 'http://blogs.law.harvard.edu/tech/rss' => '',
1253
+ 'http://purl.org/rss/1.0/' => '',
1254
+ 'http://my.netscape.com/rdf/simple/0.9/' => '',
1255
+ 'http://example.com/newformat#' => '',
1256
+ 'http://example.com/necho' => '',
1257
+ 'http://purl.org/echo/' => '',
1258
+ 'uri/of/echo/namespace#' => '',
1259
+ 'http://purl.org/pie/' => '',
1260
+ 'http://purl.org/atom/ns#' => '',
1261
+ 'http://www.w3.org/2005/Atom' => '',
1262
+ 'http://purl.org/rss/1.0/modules/rss091#' => '',
1263
+ 'http://webns.net/mvcb/' => 'admin',
1264
+ 'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
1265
+ 'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
1266
+ 'http://media.tangent.org/rss/1.0/' => 'audio',
1267
+ 'http://backend.userland.com/blogChannelModule' => 'blogChannel',
1268
+ 'http://web.resource.org/cc/' => 'cc',
1269
+ 'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
1270
+ 'http://purl.org/rss/1.0/modules/company' => 'co',
1271
+ 'http://purl.org/rss/1.0/modules/content/' => 'content',
1272
+ 'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
1273
+ 'http://purl.org/dc/elements/1.1/' => 'dc',
1274
+ 'http://purl.org/dc/terms/' => 'dcterms',
1275
+ 'http://purl.org/rss/1.0/modules/email/' => 'email',
1276
+ 'http://purl.org/rss/1.0/modules/event/' => 'ev',
1277
+ 'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
1278
+ 'http://freshmeat.net/rss/fm/' => 'fm',
1279
+ 'http://xmlns.com/foaf/0.1/' => 'foaf',
1280
+ 'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
1281
+ 'http://postneo.com/icbm/' => 'icbm',
1282
+ 'http://purl.org/rss/1.0/modules/image/' => 'image',
1283
+ 'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
1284
+ 'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
1285
+ 'http://purl.org/rss/1.0/modules/link/' => 'l',
1286
+ 'http://search.yahoo.com/mrss' => 'media',
1287
+ 'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
1288
+ 'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
1289
+ 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
1290
+ 'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
1291
+ 'http://purl.org/rss/1.0/modules/reference/' => 'ref',
1292
+ 'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
1293
+ 'http://purl.org/rss/1.0/modules/search/' => 'search',
1294
+ 'http://purl.org/rss/1.0/modules/slash/' => 'slash',
1295
+ 'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
1296
+ 'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
1297
+ 'http://hacks.benhammersley.com/rss/streaming/' => 'str',
1298
+ 'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
1299
+ 'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
1300
+ 'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
1301
+ 'http://purl.org/rss/1.0/modules/threading/' => 'thr',
1302
+ 'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
1303
+ 'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
1304
+ 'http://wellformedweb.org/commentAPI/' => 'wfw',
1305
+ 'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
1306
+ 'http://www.w3.org/1999/xhtml' => 'xhtml',
1307
+ 'http://www.w3.org/XML/1998/namespace' => 'xml',
1308
+ 'http://www.w3.org/1999/xlink' => 'xlink',
1309
+ 'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
1310
+ }
1311
+ @matchnamespaces = {}
1312
+ @namespaces.each do |l|
1313
+ @matchnamespaces[l[0].downcase] = l[1]
1314
+ end
1315
+ @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
1316
+ @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
1317
+ @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
1318
+ @html_types = ['text/html', 'application/xhtml+xml']
1319
+ @feeddata = FeedParserDict.new # feed-level data
1320
+ @encoding = encoding # character encoding
1321
+ @entries = [] # list of entry-level data
1322
+ @version = '' # feed type/version see SUPPORTED_VERSIOSN
1323
+ @namespacesInUse = {} # hash of namespaces defined by the feed
1324
+
1325
+ # the following are used internall to track state;
1326
+ # this is really out of control and should be refactored
1327
+ @infeed = false
1328
+ @inentry = false
1329
+ @incontent = 0 # Yes, this needs to be zero until I work out popContent and pushContent
1330
+ @intextinput = false
1331
+ @inimage = false
1332
+ @inauthor = false
1333
+ @incontributor = false
1334
+ @inpublisher = false
1335
+ @insource = false
1336
+ @sourcedata = FeedParserDict.new
1337
+ @contentparams = FeedParserDict.new
1338
+ @summaryKey = nil
1339
+ @namespacemap = {}
1340
+ @elementstack = []
1341
+ @basestack = []
1342
+ @langstack = []
1343
+ @baseuri = baseuri || ''
1344
+ @lang = baselang || nil
1345
+ if baselang
1346
+ @feeddata['language'] = baselang.gsub('_','-')
1347
+ end
1348
+ @date_handlers = [:_parse_date_rfc822,
1349
+ :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
1350
+ :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
1351
+ ]
1352
+ $stderr << "Leaving startup\n" if $debug # My addition
1353
+ end
1354
+
1355
+ def unknown_starttag(tag, attrsd)
1356
+ $stderr << "start #{tag} with #{attrsd}\n" if $debug
1357
+ # normalize attrs
1358
+ attrsD = {}
1359
+ attrsd = Hash[*attrsd.flatten] if attrsd.class == Array # Magic! Asterisk!
1360
+ # LooseFeedParser needs the above because SGMLParser sends attrs as a
1361
+ # list of lists (like [['type','text/html'],['mode','escaped']])
1362
+
1363
+ attrsd.each do |old_k,value|
1364
+ # There has to be a better, non-ugly way of doing this
1365
+ k = old_k.downcase # Downcase all keys
1366
+ attrsD[k] = value
1367
+ if ['rel','type'].include?value
1368
+ attrsD[k].downcase! # Downcase the value if the key is 'rel' or 'type'
1369
+ end
1370
+ end
1371
+
1372
+ # track xml:base and xml:lang
1373
+ baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
1374
+ @baseuri = urljoin(@baseuri, baseuri)
1375
+ lang = attrsD['xml:lang'] || attrsD['lang']
1376
+ if lang == '' # FIXME This next bit of code is right? Wtf?
1377
+ # xml:lang could be explicitly set to '', we need to capture that
1378
+ lang = nil
1379
+ elsif lang.nil?
1380
+ # if no xml:lang is specified, use parent lang
1381
+ lang = @lang
1382
+ end
1383
+ if lang and not lang.empty? # Seriously, this cannot be correct
1384
+ if ['feed', 'rss', 'rdf:RDF'].include?tag
1385
+ @feeddata['language'] = lang.gsub('_','-')
1386
+ end
1387
+ end
1388
+ @lang = lang
1389
+ @basestack << @baseuri
1390
+ @langstack << lang
1391
+
1392
+ # track namespaces
1393
+ attrsd.each do |prefix, uri|
1394
+ if /^xmlns:/ =~ prefix # prefix begins with xmlns:
1395
+ trackNamespace(prefix[6..-1], uri)
1396
+ elsif prefix == 'xmlns':
1397
+ trackNamespace(nil, uri)
1398
+ end
1399
+ end
1400
+
1401
+ # track inline content
1402
+ if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
1403
+ # element declared itself as escaped markup, but isn't really
1404
+
1405
+ @contentparams['type'] = 'application/xhtml+xml'
1406
+ end
1407
+ if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
1408
+ # Note: probably shouldn't simply recreate localname here, but
1409
+ # our namespace handling isn't actually 100% correct in cases where
1410
+ # the feed redefines the default namespace (which is actually
1411
+ # the usual case for inline content, thanks Sam), so here we
1412
+ # cheat and just reconstruct the element based on localname
1413
+ # because that compensates for the bugs in our namespace handling.
1414
+ # This will horribly munge inline content with non-empty qnames,
1415
+ # but nobody actually does that, so I'm not fixing it.
1416
+ tag = tag.split(':')[-1]
1417
+ attrsA = attrsd.to_a.collect{|l| "#{l[0]}=\"#{l[1]}\""}
1418
+ attrsS = ' '+attrsA.join(' ')
1419
+ return handle_data("<#{tag}#{attrsS}>", escape=false)
1420
+ end
1421
+
1422
+ # match namespaces
1423
+ if /:/ =~ tag
1424
+ prefix, suffix = tag.split(':', 2)
1425
+ else
1426
+ prefix, suffix = '', tag
1427
+ end
1428
+ prefix = @namespacemap[prefix] || prefix
1429
+ if prefix and not prefix.empty?
1430
+ prefix = prefix + '_'
1431
+ end
1432
+
1433
+ # special hack for better tracking of empty textinput/image elements in illformed feeds
1434
+ if (not prefix and not prefix.empty?) and not (['title', 'link', 'description','name'].include?tag)
1435
+ @intextinput = false
1436
+ end
1437
+ if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?tag)
1438
+ @inimage = false
1439
+ end
1440
+
1441
+ # call special handler (if defined) or default handler
1442
+ begin
1443
+ return send('_start_'+prefix+suffix, attrsD)
1444
+ rescue NoMethodError
1445
+ return push(prefix + suffix, true)
1446
+ end
1447
+ end # End unknown_starttag
1448
+
1449
+ def unknown_endtag(tag)
1450
+ $stderr << "end #{tag}\n" if $debug
1451
+ # match namespaces
1452
+ if tag.index(':')
1453
+ prefix, suffix = tag.split(':',2)
1454
+ else
1455
+ prefix, suffix = '', tag
1456
+ end
1457
+ prefix = @namespacemap[prefix] || prefix
1458
+ if prefix and not prefix.empty?
1459
+ prefix = prefix + '_'
1460
+ end
1461
+
1462
+ # call special handler (if defined) or default handler
1463
+ begin
1464
+ send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
1465
+ rescue NoMethodError => details
1466
+ pop(prefix + suffix)
1467
+ end
1468
+
1469
+ # track inline content
1470
+ if @incontent != 0 and @contentparams.has_key?'type' and /xml$/ =~ (@contentparams['type'] || 'xml')
1471
+ # element declared itself as escaped markup, but it isn't really
1472
+ @contentparams['type'] = 'application/xhtml+xml'
1473
+ end
1474
+ if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
1475
+ tag = tag.split(':')[-1]
1476
+ handle_data("</#{tag}>", escape=false)
1477
+ end
1478
+
1479
+ # track xml:base and xml:lang going out of scope
1480
+ if @basestack and not @basestack.empty?
1481
+ @basestack.pop
1482
+ if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
1483
+ @baseuri = @basestack[-1]
1484
+ end
1485
+ end
1486
+ if @langstack and not @langstack.empty?
1487
+ @langstack.pop
1488
+ if @langstack and not @langstack.empty? # and @langstack[-1] and not @langstack.empty?
1489
+ @lang = @langstack[-1]
1490
+ end
1491
+ end
1492
+ end
1493
+
1494
+ def handle_charref(ref)
1495
+ # LooseParserOnly
1496
+ # called for each character reference, e.g. for '&#160;', ref will be '160'
1497
+ $stderr << "entering handle_charref with #{ref}\n" if $debug
1498
+ return if @elementstack.nil? or @elementstack.empty?
1499
+ ref.downcase!
1500
+ chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
1501
+ if chars.include?ref
1502
+ text = "&##{ref};"
1503
+ else
1504
+ if ref[0..0] == 'x'
1505
+ c = (ref[1..-1]).to_i(16)
1506
+ else
1507
+ c = ref.to_i
1508
+ end
1509
+ text = uconvert(unichr(c),'unicode')
1510
+ end
1511
+ @elementstack[-1][2] << text
1512
+ end
1513
+
1514
+ def handle_entityref(ref)
1515
+ # LooseParserOnly
1516
+ # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
1517
+
1518
+ return if @elementstack.nil? or @elementstack.empty?
1519
+ $stderr << "entering handle_entityref with #{ref}\n" if $debug
1520
+ ents = ['lt', 'gt', 'quot', 'amp', 'apos']
1521
+ if ents.include?ref
1522
+ text = "&#{ref};"
1523
+ else
1524
+ text = HTMLEntities::decode_entities("&#{ref};")
1525
+ end
1526
+ @elementstack[-1][2] << text
1527
+ end
1528
+
1529
+ def handle_data(text, escape=true)
1530
+ # called for each block of plain text, i.e. outside of any tag and
1531
+ # not containing any character or entity references
1532
+ return if @elementstack.nil? or @elementstack.empty?
1533
+ if escape and @contentparams['type'] == 'application/xhtml+xml'
1534
+ text = text.to_xs
1535
+ end
1536
+ @elementstack[-1][2] << text
1537
+ end
1538
+
1539
+ def handle_comment(comment)
1540
+ # called for each comment, e.g. <!-- insert message here -->
1541
+ end
1542
+
1543
+ def handle_pi(text)
1544
+ end
1545
+
1546
+ def handle_decl(text)
1547
+ end
1548
+
1549
+ def parse_declaration(i)
1550
+ # for LooseFeedParser
1551
+ $stderr << "entering parse_declaration\n" if $debug
1552
+ if @rawdata[i...i+9] == '<![CDATA['
1553
+ k = @rawdata.index(/\]\]>/u,i+9)
1554
+ k = @rawdata.length unless k
1555
+ handle_data(@rawdata[i+9...k].to_xs,false)
1556
+ return k+3
1557
+ else
1558
+ k = @rawdata.index(/>/,i).to_i
1559
+ return k+1
1560
+ end
1561
+ end
1562
+
1563
+ def mapContentType(contentType)
1564
+ contentType.downcase!
1565
+ case contentType
1566
+ when 'text'
1567
+ contentType = 'text/plain'
1568
+ when 'html'
1569
+ contentType = 'text/html'
1570
+ when 'xhtml'
1571
+ contentType = 'application/xhtml+xml'
1572
+ end
1573
+ return contentType
1574
+ end
1575
+
1576
+ def trackNamespace(prefix, uri)
1577
+
1578
+ loweruri = uri.downcase.strip
1579
+ if [prefix, loweruri] == [nil, 'http://my.netscape.com/rdf/simple/0.9/'] and (@version.nil? or @version.empty?)
1580
+ @version = 'rss090'
1581
+ elsif loweruri == 'http://purl.org/rss/1.0/' and (@version.nil? or @version.empty?)
1582
+ @version = 'rss10'
1583
+ elsif loweruri == 'http://www.w3.org/2005/atom' and (@version.nil? or @version.empty?)
1584
+ @version = 'atom10'
1585
+ elsif /backend\.userland\.com\/rss/ =~ loweruri
1586
+ # match any backend.userland.com namespace
1587
+ uri = 'http://backend.userland.com/rss'
1588
+ loweruri = uri
1589
+ end
1590
+ if @matchnamespaces.has_key? loweruri
1591
+ @namespacemap[prefix] = @matchnamespaces[loweruri]
1592
+ @namespacesInUse[@matchnamespaces[loweruri]] = uri
1593
+ else
1594
+ @namespacesInUse[prefix || ''] = uri
1595
+ end
1596
+ end
1597
+
1598
+ def resolveURI(uri)
1599
+ return urljoin(@baseuri || '', uri)
1600
+ end
1601
+
1602
+ def decodeEntities(element, data)
1603
+ return data
1604
+ end
1605
+
1606
+ def push(element, expectingText)
1607
+ @elementstack << [element, expectingText, []]
1608
+ end
1609
+
1610
+ def pop(element, stripWhitespace=true)
1611
+ return if @elementstack.nil? or @elementstack.empty?
1612
+ return if @elementstack[-1][0] != element
1613
+ element, expectingText, pieces = @elementstack.pop
1614
+ if pieces.class == Array
1615
+ output = pieces.join('')
1616
+ else
1617
+ output = pieces
1618
+ end
1619
+ if stripWhitespace
1620
+ output.strip!
1621
+ end
1622
+ return output if not expectingText
1623
+
1624
+ # decode base64 content
1625
+ if @contentparams['base64']
1626
+ out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
1627
+ if not output.empty? and not out64.empty?
1628
+ output = out64
1629
+ end
1630
+ end
1631
+
1632
+ # resolve relative URIs
1633
+ if @can_be_relative_uri.include?element and output and not output.empty?
1634
+ output = resolveURI(output)
1635
+ end
1636
+
1637
+ # decode entities within embedded markup
1638
+ if not @contentparams['base64']
1639
+ output = decodeEntities(element, output)
1640
+ end
1641
+
1642
+ # remove temporary cruft from contentparams
1643
+ @contentparams.delete('mode')
1644
+ @contentparams.delete('base64')
1645
+
1646
+ # resolve relative URIs within embedded markup
1647
+ if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
1648
+ if @can_contain_relative_uris.include?element
1649
+ output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
1650
+ end
1651
+ end
1652
+ # sanitize embedded markup
1653
+ if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
1654
+ if @can_contain_dangerous_markup.include?element
1655
+ output = FeedParser.sanitizeHTML(output, @encoding)
1656
+ end
1657
+ end
1658
+
1659
+ if @encoding and not @encoding.empty? and @encoding != 'utf-8'
1660
+ output = uconvert(output, @encoding, 'utf-8')
1661
+ # FIXME I turn everything into utf-8, not unicode, originally because REXML was being used but now beause I haven't tested it out yet.
1662
+ end
1663
+
1664
+ # categories/tags/keywords/whatever are handled in _end_category
1665
+ return output if element == 'category'
1666
+
1667
+ # store output in appropriate place(s)
1668
+ if @inentry and not @insource
1669
+ if element == 'content'
1670
+ @entries[-1][element] ||= []
1671
+ contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
1672
+ contentparams['value'] = output
1673
+ @entries[-1][element] << contentparams
1674
+ elsif element == 'link'
1675
+ @entries[-1][element] = output
1676
+ if output and not output.empty?
1677
+ @entries[-1]['links'][-1]['href'] = output
1678
+ end
1679
+ else
1680
+ element = 'summary' if element == 'description'
1681
+ @entries[-1][element] = output
1682
+ if @incontent != 0
1683
+ contentparams = Marshal.load(Marshal.dump(@contentparams))
1684
+ contentparams['value'] = output
1685
+ @entries[-1][element + '_detail'] = contentparams
1686
+ end
1687
+ end
1688
+ elsif (@infeed or @insource) and not @intextinput and not @inimage
1689
+ context = getContext()
1690
+ element = 'subtitle' if element == 'description'
1691
+ context[element] = output
1692
+ if element == 'link'
1693
+ context['links'][-1]['href'] = output
1694
+ elsif @incontent != 0
1695
+ contentparams = Marshal.load(Marshal.dump(@contentparams))
1696
+ contentparams['value'] = output
1697
+ context[element + '_detail'] = contentparams
1698
+ end
1699
+ end
1700
+ return output
1701
+ end
1702
+
1703
+ def pushContent(tag, attrsD, defaultContentType, expectingText)
1704
+ @incontent += 1 # Yes, I hate this.
1705
+ type = mapContentType(attrsD['type'] || defaultContentType)
1706
+ @contentparams = FeedParserDict.new({'type' => type,'language' => @lang,'base' => @baseuri})
1707
+ @contentparams['base64'] = isBase64(attrsD, @contentparams)
1708
+ push(tag, expectingText)
1709
+ end
1710
+
1711
+ def popContent(tag)
1712
+ value = pop(tag)
1713
+ @incontent -= 1
1714
+ @contentparams.clear
1715
+ return value
1716
+ end
1717
+
1718
+ def mapToStandardPrefix(name)
1719
+ colonpos = name.index(':')
1720
+ if colonpos
1721
+ prefix = name[0..colonpos-1]
1722
+ suffix = name[colonpos+1..-1]
1723
+ prefix = @namespacemap[prefix] || prefix
1724
+ name = prefix + ':' + suffix
1725
+ end
1726
+ return name
1727
+ end
1728
+
1729
+ def getAttribute(attrsD, name)
1730
+ return attrsD[mapToStandardPrefix(name)]
1731
+ end
1732
+
1733
+ def isBase64(attrsD, contentparams)
1734
+ return true if (attrsD['mode'] == 'base64')
1735
+ if /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
1736
+ return false
1737
+ end
1738
+ return true
1739
+ end
1740
+
1741
+ def itsAnHrefDamnIt(attrsD)
1742
+ href= attrsD['url'] || attrsD['uri'] || attrsD['href']
1743
+ if href
1744
+ attrsD.delete('url')
1745
+ attrsD.delete('uri')
1746
+ attrsD['href'] = href
1747
+ end
1748
+ return attrsD
1749
+ end
1750
+
1751
+
1752
+ def _save(key, value)
1753
+ context = getContext()
1754
+ context[key] ||= value
1755
+ end
1756
+
1757
+ def _start_rss(attrsD)
1758
+ versionmap = {'0.91' => 'rss091u',
1759
+ '0.92' => 'rss092',
1760
+ '0.93' => 'rss093',
1761
+ '0.94' => 'rss094'
1762
+ }
1763
+
1764
+ if not @version or @version.empty?
1765
+ attr_version = attrsD['version'] || ''
1766
+ version = versionmap[attr_version]
1767
+ if version and not version.empty?
1768
+ @version = version
1769
+ elsif /^2\./ =~ attr_version
1770
+ @version = 'rss20'
1771
+ else
1772
+ @version = 'rss'
1773
+ end
1774
+ end
1775
+ end
1776
+
1777
+ def _start_dlhottitles(attrsD)
1778
+ @version = 'hotrss'
1779
+ end
1780
+
1781
+ def _start_channel(attrsD)
1782
+ @infeed = true
1783
+ _cdf_common(attrsD)
1784
+ end
1785
+ alias :_start_feedinfo :_start_channel
1786
+
1787
+ def _cdf_common(attrsD)
1788
+ if attrsD.has_key?'lastmod'
1789
+ _start_modified({})
1790
+ @elementstack[-1][-1] = attrsD['lastmod']
1791
+ _end_modified
1792
+ end
1793
+ if attrsD.has_key?'href'
1794
+ _start_link({})
1795
+ @elementstack[-1][-1] = attrsD['href']
1796
+ _end_link
1797
+ end
1798
+ end
1799
+
1800
+ def _start_feed(attrsD)
1801
+ @infeed = true
1802
+ versionmap = {'0.1' => 'atom01',
1803
+ '0.2' => 'atom02',
1804
+ '0.3' => 'atom03'
1805
+ }
1806
+
1807
+ if not @version or @version.empty?
1808
+ attr_version = attrsD['version']
1809
+ version = versionmap[attr_version]
1810
+ if @version and not @version.empty?
1811
+ @version = version
1812
+ else
1813
+ @version = 'atom'
1814
+ end
1815
+ end
1816
+ end
1817
+
1818
+ def _end_channel
1819
+ @infeed = false
1820
+ end
1821
+ alias :_end_feed :_end_channel
1822
+
1823
+ def _start_image(attrsD)
1824
+ @inimage = true
1825
+ push('image', false)
1826
+ context = getContext()
1827
+ context['image'] ||= FeedParserDict.new
1828
+ end
1829
+
1830
+ def _end_image
1831
+ pop('image')
1832
+ @inimage = false
1833
+ end
1834
+
1835
+ def _start_textinput(attrsD)
1836
+ @intextinput = true
1837
+ push('textinput', false)
1838
+ context = getContext()
1839
+ context['textinput'] ||= FeedParserDict.new
1840
+ end
1841
+ alias :_start_textInput :_start_textinput
1842
+
1843
+ def _end_textinput
1844
+ pop('textinput')
1845
+ @intextinput = false
1846
+ end
1847
+ alias :_end_textInput :_end_textinput
1848
+
1849
# Author / contributor / owner handlers. The @inauthor, @inpublisher and
# @incontributor flags tell the shared <name>/<url>/<email> handlers which
# structure a nested value belongs to.

def _start_author(attrs)
  @inauthor = true
  push('author', true)
end
alias :_start_managingeditor :_start_author
alias :_start_dc_author :_start_author
alias :_start_dc_creator :_start_author
alias :_start_itunes_author :_start_author

def _end_author
  pop('author')
  @inauthor = false
  _sync_author_detail()
end
alias :_end_managingeditor :_end_author
alias :_end_dc_author :_end_author
alias :_end_dc_creator :_end_author
alias :_end_itunes_author :_end_author

def _start_itunes_owner(attrs)
  @inpublisher = true
  push('publisher', false)
end

def _end_itunes_owner
  pop('publisher')
  @inpublisher = false
  _sync_author_detail('publisher')
end

def _start_contributor(attrs)
  @incontributor = true
  context = getContext()
  context['contributors'] ||= []
  context['contributors'] << FeedParserDict.new
  push('contributor', false)
end

def _end_contributor
  pop('contributor')
  @incontributor = false
end

def _start_dc_contributor(attrs)
  @incontributor = true
  context = getContext()
  context['contributors'] ||= []
  context['contributors'] << FeedParserDict.new
  push('name', false)
end

def _end_dc_contributor
  _end_name
  @incontributor = false
end

def _start_name(attrs)
  push('name', false)
end
alias :_start_itunes_name :_start_name

def _end_name
  # A <name> means different things depending on the enclosing element.
  value = pop('name')
  if @inpublisher
    _save_author('name', value, 'publisher')
  elsif @inauthor
    _save_author('name', value)
  elsif @incontributor
    _save_contributor('name', value)
  elsif @intextinput
    getContext()['textinput']['name'] = value
  end
end
alias :_end_itunes_name :_end_name
1924
+
1925
# Handlers for image/textinput geometry and for <url>/<email> values that
# attach to whichever structure is currently open.

def _start_width(attrs)
  push('width', false)
end

def _end_width
  pixels = pop('width').to_i
  getContext()['image']['width'] = pixels if @inimage
end

def _start_height(attrs)
  push('height', false)
end

def _end_height
  pixels = pop('height').to_i
  getContext()['image']['height'] = pixels if @inimage
end

def _start_url(attrs)
  push('href', true)
end
alias :_start_homepage :_start_url
alias :_start_uri :_start_url

def _end_url
  value = pop('href')
  if @inauthor
    _save_author('href', value)
  elsif @incontributor
    _save_contributor('href', value)
  elsif @inimage
    getContext()['image']['href'] = value
  elsif @intextinput
    getContext()['textinput']['link'] = value
  end
end
alias :_end_homepage :_end_url
alias :_end_uri :_end_url

def _start_email(attrs)
  push('email', false)
end
alias :_start_itunes_email :_start_email

def _end_email
  value = pop('email')
  if @inpublisher
    _save_author('email', value, 'publisher')
  elsif @inauthor
    _save_author('email', value)
  elsif @incontributor
    _save_contributor('email', value)
  end
end
alias :_end_itunes_email :_end_email
1988
+
1989
def getContext
  # Returns the dict currently being populated.
  # Precedence: source > entry > feed.
  return @sourcedata if @insource
  return @entries[-1] if @inentry
  @feeddata
end
1999
+
2000
# Store one field of the structured author (or publisher) detail, then
# resync the flat string form.
def _save_author(key, value, prefix='author')
  detail_key = prefix + '_detail'
  context = getContext()
  context[detail_key] ||= FeedParserDict.new
  context[detail_key][key] = value
  _sync_author_detail()
end

# Store one field on the most recently opened contributor record.
def _save_contributor(key, value)
  context = getContext()
  context['contributors'] ||= [FeedParserDict.new]
  context['contributors'][-1][key] = value
end
2012
+
2013
# Keep the flat string form (context['author']) and the structured form
# (context['author_detail']) of an author in sync, whichever was set first.
def _sync_author_detail(key='author')
  context = getContext()
  detail = context["#{key}_detail"]
  if detail and not detail.empty?
    name = detail['name']
    email = detail['email']
    # FIX: the original tested name.empty? twice; the second operand must be
    # email.empty?, otherwise "name (email)" was built with an empty email.
    if name and email and not (name.empty? or email.empty?)
      context[key] = "#{name} (#{email})"
    elsif name and not name.empty?
      context[key] = name
    elsif email and not email.empty?
      context[key] = email
    end
  else
    # No structured detail yet: derive it from the flat string, extracting
    # an embedded email address when one is present.
    author = context[key].dup unless context[key].nil?
    return if not author or author.empty?
    emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
    # FIX: guard the match -- the original called emailmatch[1]
    # unconditionally and raised NoMethodError for author strings without
    # an email address (upstream feedparser only strips when one matched).
    email = nil
    if emailmatch
      email = emailmatch[1]
      author.gsub!(email, '')
      author.gsub!("\(\)", '')
      author.strip!
      author.gsub!(/^\(/, '')
      author.gsub!(/\)$/, '')
      author.strip!
    end
    context["#{key}_detail"] ||= FeedParserDict.new
    context["#{key}_detail"]['name'] = author
    context["#{key}_detail"]['email'] = email
  end
end
2043
+
2044
# Subtitle, rights and item/entry element handlers.

def _start_subtitle(attrs)
  pushContent('subtitle', attrs, 'text/plain', true)
end
alias :_start_tagline :_start_subtitle
alias :_start_itunes_subtitle :_start_subtitle

def _end_subtitle
  popContent('subtitle')
end
alias :_end_tagline :_end_subtitle
alias :_end_itunes_subtitle :_end_subtitle

def _start_rights(attrs)
  pushContent('rights', attrs, 'text/plain', true)
end
alias :_start_dc_rights :_start_rights
alias :_start_copyright :_start_rights

def _end_rights
  popContent('rights')
end
alias :_end_dc_rights :_end_rights
alias :_end_copyright :_end_rights

def _start_item(attrs)
  # Each item/entry gets its own dict; @inentry redirects getContext to it.
  @entries << FeedParserDict.new
  push('item', false)
  @inentry = true
  @guidislink = false
  about = getAttribute(attrs, 'rdf:about')
  getContext()['id'] = about if about and not about.empty?
  _cdf_common(attrs)
end
alias :_start_entry :_start_item
alias :_start_product :_start_item

def _end_item
  pop('item')
  @inentry = false
end
alias :_end_entry :_end_item
2088
+
2089
# Language, publisher and the various date element handlers. All dates are
# normalized through parse_date into *_parsed fields.

def _start_dc_language(attrs)
  push('language', true)
end
alias :_start_language :_start_dc_language

def _end_dc_language
  @lang = pop('language')
end
alias :_end_language :_end_dc_language

def _start_dc_publisher(attrs)
  push('publisher', true)
end
alias :_start_webmaster :_start_dc_publisher

def _end_dc_publisher
  pop('publisher')
  _sync_author_detail('publisher')
end
alias :_end_webmaster :_end_dc_publisher

def _start_published(attrs)
  push('published', true)
end
alias :_start_dcterms_issued :_start_published
alias :_start_issued :_start_published

def _end_published
  raw = pop('published')
  _save('published_parsed', parse_date(raw))
end
alias :_end_dcterms_issued :_end_published
alias :_end_issued :_end_published

def _start_updated(attrs)
  push('updated', true)
end
alias :_start_modified :_start_updated
alias :_start_dcterms_modified :_start_updated
alias :_start_pubdate :_start_updated
alias :_start_dc_date :_start_updated

def _end_updated
  raw = pop('updated')
  _save('updated_parsed', parse_date(raw))
end
alias :_end_modified :_end_updated
alias :_end_dcterms_modified :_end_updated
alias :_end_pubdate :_end_updated
alias :_end_dc_date :_end_updated

def _start_created(attrs)
  push('created', true)
end
alias :_start_dcterms_created :_start_created

def _end_created
  raw = pop('created')
  _save('created_parsed', parse_date(raw))
end
alias :_end_dcterms_created :_end_created

def _start_expirationdate(attrs)
  push('expired', true)
end

def _end_expirationdate
  _save('expired_parsed', parse_date(pop('expired')))
end
2157
+
2158
# Creative Commons license handlers.
def _start_cc_license(attrsD)
  push('license', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    # FIX: was bare `elementstack` (an undefined method); the parser stack
    # lives in the @elementstack instance variable, as used by
    # _start_admin_errorreportsto elsewhere in this class.
    @elementstack[-1][2] << value
  end
  # FIX: pop unconditionally. The original popped inside the `if`, leaving
  # the element stack unbalanced whenever rdf:resource was absent; upstream
  # feedparser's _start_cc_license always pops.
  pop('license')
end

def _start_creativecommons_license(attrsD)
  push('license', true)
end

def _end_creativecommons_license
  pop('license')
end
2174
+
2175
# Category/tag handlers.

# Record a tag on the current context, skipping all-blank and duplicate tags.
def addTag(term, scheme, label)
  context = getContext()
  context['tags'] ||= []
  tags = context['tags']
  return if [term, scheme, label].all? { |part| part.nil? or part.empty? }
  candidate = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  unless tags.include? candidate
    context['tags'] << FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  end
end

def _start_category(attrs)
  $stderr << "entering _start_category with #{attrs}\n" if $debug
  addTag(attrs['term'], attrs['scheme'] || attrs['domain'], attrs['label'])
  push('category', true)
end
alias :_start_dc_subject :_start_category
alias :_start_keywords :_start_category

def _end_itunes_keywords
  # Space-separated keyword list; each word becomes its own iTunes tag.
  pop('itunes_keywords').split.each do |keyword|
    addTag(keyword, 'http://www.itunes.com/', nil)
  end
end

def _start_itunes_category(attrs)
  addTag(attrs['text'], 'http://www.itunes.com/', nil)
  push('category', true)
end

def _end_category
  value = pop('category')
  return if value.nil? or value.empty?
  context = getContext()
  tags = context['tags']
  # Fill in the term of the tag opened by _start_category if it is still
  # blank; otherwise record a brand-new tag.
  if value and not value.empty? and not tags.empty? and not tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
alias :_end_dc_subject :_end_category
alias :_end_keywords :_end_category
alias :_end_itunes_category :_end_category
2225
+
2226
def _start_cloud(attrs)
  getContext()['cloud'] = FeedParserDict.new(attrs)
end

def _start_link(attrs)
  attrs['rel'] ||= 'alternate'
  attrs['type'] ||= 'text/html'
  attrs = itsAnHrefDamnIt(attrs)
  attrs['href'] = resolveURI(attrs['href']) if attrs.has_key? 'href'
  expectingText = @infeed || @inentry || @insource
  context = getContext()
  context['links'] ||= []
  context['links'] << FeedParserDict.new(attrs)
  _start_enclosure(attrs) if attrs['rel'] == 'enclosure'
  if attrs.has_key? 'href'
    expectingText = false
    # An alternate link with an HTML-ish type becomes the canonical 'link'.
    if attrs['rel'] == 'alternate' and @html_types.include? mapContentType(attrs['type'])
      context['link'] = attrs['href']
    end
  else
    push('link', expectingText)
  end
end
alias :_start_producturl :_start_link

def _end_link
  value = pop('link')
  context = getContext()
  context['textinput']['link'] = value if @intextinput
  context['image']['link'] = value if @inimage
end
alias :_end_producturl :_end_link

def _start_guid(attrs)
  # A guid doubles as the link unless ispermalink="false".
  @guidislink = ((attrs['ispermalink'] || 'true') == 'true')
  push('id', true)
end

def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink and not getContext().has_key?('link')))
  if @guidislink
    # guid acts as link, but only if 'ispermalink' is not present or is
    # 'true', and only if the item doesn't already have a link element.
    _save('link', value)
  end
end
2281
+
2282
+
2283
def _start_title(attrs)
  pushContent('title', attrs, 'text/plain', @infeed || @inentry || @insource)
end
alias :_start_dc_title :_start_title
alias :_start_media_title :_start_title

def _end_title
  value = popContent('title')
  context = getContext()
  if @intextinput
    context['textinput']['title'] = value
  elsif @inimage
    context['image']['title'] = value
  end
end
alias :_end_dc_title :_end_title
alias :_end_media_title :_end_title

def _start_description(attrs)
  if getContext().has_key?('summary')
    # A summary already exists, so this description is the full content.
    @summaryKey = 'content'
    _start_content(attrs)
  else
    pushContent('description', attrs, 'text/html', @infeed || @inentry || @insource)
  end
end

def _start_abstract(attrs)
  pushContent('description', attrs, 'text/plain', @infeed || @inentry || @insource)
end

def _end_description
  if @summaryKey == 'content'
    _end_content()
  else
    value = popContent('description')
    context = getContext()
    if @intextinput
      context['textinput']['description'] = value
    elsif @inimage
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
alias :_end_abstract :_end_description

def _start_info(attrs)
  pushContent('info', attrs, 'text/plain', true)
end
alias :_start_feedburner_browserfriendly :_start_info

def _end_info
  popContent('info')
end
alias :_end_feedburner_browserfriendly :_end_info
2340
+
2341
# Generator and webmaster/admin RDF handlers.
def _start_generator(attrsD)
  if attrsD and not attrsD.empty?
    attrsD = itsAnHrefDamnIt(attrsD)
    if attrsD.has_key?('href')
      attrsD['href'] = resolveURI(attrsD['href'])
    end
  end
  getContext()['generator_detail'] = FeedParserDict.new(attrsD)
  push('generator', true)
end

def _end_generator
  value = pop('generator')
  context = getContext()
  if context.has_key?('generator_detail')
    context['generator_detail']['name'] = value
  end
end

def _start_admin_generatoragent(attrsD)
  push('generator', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    # FIX: was bare `elementstack` (an undefined method); the parser stack
    # lives in the @elementstack instance variable, exactly as
    # _start_admin_errorreportsto below already does.
    @elementstack[-1][2] << value
  end
  pop('generator')
  getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
end

def _start_admin_errorreportsto(attrsD)
  push('errorreportsto', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    @elementstack[-1][2] << value
  end
  pop('errorreportsto')
end
2378
+
2379
def _start_summary(attrs)
  if getContext().has_key?('summary')
    # A summary already exists; treat this one as full content.
    @summaryKey = 'content'
    _start_content(attrs)
  else
    @summaryKey = 'summary'
    pushContent(@summaryKey, attrs, 'text/plain', true)
  end
end
alias :_start_itunes_summary :_start_summary

def _end_summary
  if @summaryKey == 'content'
    _end_content()
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
alias :_end_itunes_summary :_end_summary

def _start_enclosure(attrs)
  attrs = itsAnHrefDamnIt(attrs)
  context = getContext()
  context['enclosures'] ||= []
  context['enclosures'] << FeedParserDict.new(attrs)
  href = attrs['href']
  if href and not href.empty?
    # The enclosure's href doubles as the entry id when none is set yet.
    context['id'] = href unless context['id']
  end
end

def _start_source(attrs)
  @insource = true
end

def _end_source
  @insource = false
  # Deep-copy so later mutation of @sourcedata cannot affect the entry.
  getContext()['source'] = Marshal.load(Marshal.dump(@sourcedata))
  @sourcedata.clear()
end
2423
+
2424
def _start_content(attrs)
  pushContent('content', attrs, 'text/plain', true)
  src = attrs['src']
  @contentparams['src'] = src if src and not src.empty?
  # NOTE(review): this push duplicates the pushContent above; upstream
  # feedparser's _start_content does not do it -- confirm before removing.
  push('content', true)
end

def _start_prodlink(attrs)
  pushContent('content', attrs, 'text/html', true)
end

def _start_body(attrs)
  pushContent('content', attrs, 'application/xhtml+xml', true)
end
alias :_start_xhtml_body :_start_body

def _start_content_encoded(attrs)
  pushContent('content', attrs, 'text/html', true)
end
alias :_start_fullitem :_start_content_encoded
2446
+
2447
def _end_content
  # Plain-text or HTML content is mirrored into 'description' as well.
  # NB: @contentparams must be read BEFORE popContent clears it.
  copyToDescription = (['text/plain'] + @html_types).include? mapContentType(@contentparams['type'])
  value = popContent('content')
  _save('description', value) if copyToDescription
end
# FIX: these aliases were inside the method body, so they only executed --
# and thus _end_body/_end_xhtml_body/etc. only became defined -- after the
# first call to _end_content; dispatch on those end-tags failed until then.
# They belong at class scope.
alias :_end_body :_end_content
alias :_end_xhtml_body :_end_content
alias :_end_content_encoded :_end_content
alias :_end_fullitem :_end_content
alias :_end_prodlink :_end_content
2459
+
2460
def _start_itunes_image(attrs)
  push('itunes_image', false)
  getContext()['image'] = FeedParserDict.new({'href' => attrs['href']})
end
alias :_start_itunes_link :_start_itunes_image

def _end_itunes_block
  value = pop('itunes_block', false)
  # Coerce the yes/no flag to a strict boolean.
  getContext()['itunes_block'] = (value == 'yes') ? true : false
end

def _end_itunes_explicit
  value = pop('itunes_explicit', false)
  getContext()['itunes_explicit'] = (value == 'yes') ? true : false
end
2475
+
2476
+
2477
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and the current Time.iso8601
# method does not work.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
def _parse_date_iso8601(dateString)
  # Parse a variety of ISO-8601-compatible formats like 20040105.
  # Returns a Time in UTC, or nil if the string cannot be parsed.

  # Each entry pairs a date-part regexp with the names of its capture
  # groups. Order is significant: the first pattern that matches wins, and
  # the empty pattern at the end always matches (caught by the
  # zero-length-match check below).
  templates = [
    ['^(\d{4})-?([01]\d)-([0123]\d)',   ['year', 'month', 'day']],
    ['^(\d{4})-([01]\d)',               ['year', 'month']],
    ['^(\d{4})-?([0123]\d\d)',          ['year', 'ordinal']],
    ['^(\d\d)-?([01]\d)-?([0123]\d)',   ['year', 'month', 'day']],
    ['^(\d\d)-?([0123]\d\d)',           ['year', 'ordinal']],
    ['^(\d{4})',                        ['year']],
    ['-(\d\d)-?([01]\d)',               ['year', 'month']],
    ['-([0123]\d\d)',                   ['ordinal']],
    ['-(\d\d)',                         ['year']],
    ['--([01]\d)-?([0123]\d)',          ['month', 'day']],
    ['--([01]\d)',                      ['month']],
    ['---([0123]\d)',                   ['day']],
    ['(\d\d$)',                         ['century']],
    ['',                                []]
  ]
  # Optional time-of-day suffix appended to every template.
  # FIX: the leading group must be non-capturing ('(?:') -- the original
  # made it capturing, which shifted every time field name by one, so
  # 'hour' received the whole "T12:30:45Z" fragment and any timestamp
  # carrying a time part was mis-parsed (and then rejected by the timezone
  # branch below).
  time_tail = '(?:T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
  time_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']

  m = nil
  param_keys = []
  templates.each do |pattern, fields|
    $stderr << "Trying iso8601 regexp: #{pattern + time_tail}\n" if $debug
    param_keys = fields + time_fields
    m = dateString.match(Regexp.new(pattern + time_tail))
    break if m
  end
  # The '' template matches anything with length zero; treat that as failure.
  return if m.nil? or (m.begin(0).zero? and m.end(0).zero?)

  params = {}
  m.to_a[1..-1].each_with_index do |value, i|
    params[param_keys[i]] = value
  end

  ordinal = params['ordinal'].to_i unless params['ordinal'].nil?

  year = params['year'] || '--'
  if year.nil? or year.empty? or year == '--'
    year = Time.now.utc.year
  elsif year.length == 2
    # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
    year = 100 * (Time.now.utc.year / 100) + year.to_i
  else
    year = year.to_i
  end

  month = params['month'] || '-'
  if month.nil? or month.empty? or month == '-'
    # Ordinal dates (day-of-year) are simulated via DateTime.ordinal.
    month = ordinal ? DateTime.ordinal(year, ordinal).month : Time.now.utc.month
  end
  month = month.to_i unless month.nil?

  day = params['day']
  if day.nil? or day.empty?
    if ordinal
      day = DateTime.ordinal(year, ordinal).day
    elsif params['century'] or params['year'] or params['month']
      day = 1
    else
      day = Time.now.utc.day
    end
  else
    day = day.to_i
  end

  # Special case of the century - is the first year of the 21st century
  # 2000 or 2001? The debate goes on...
  if params.has_key? 'century'
    year = (params['century'].to_i - 1) * 100 + 1
  end

  # In ISO 8601 most fields are optional (nil.to_i == 0).
  hour = params['hour'].to_i
  minute = params['minute'].to_i
  second = params['second'].to_i

  # Time.utc's 10-argument form is
  # (sec, min, hour, day, month, year, wday, yday, isdst, tz);
  # the last four are ignored.
  tm = [second, minute, hour, day, month, year, nil, ordinal, false, nil]

  tz = params['tz']
  if tz and not tz.empty? and tz != 'Z'
    # FIX: the original adjusted tm[3]/tm[4], which in this reversed layout
    # are DAY and MONTH (the indices were copied from the Python original's
    # unreversed array); hour and minute live at tm[2]/tm[1].
    # FIXME does this cross over days?
    if tz[0, 1] == '-'
      tm[2] += params['tzhour'].to_i
      tm[1] += params['tzmin'].to_i
    elsif tz[0, 1] == '+'
      tm[2] -= params['tzhour'].to_i
      tm[1] -= params['tzmin'].to_i
    else
      return nil
    end
  end
  return Time.utc(*tm)
end
38
2610
 
39
- $debug = false
40
- $compatible = true
2611
def _parse_date_onblog(dateString)
  # Parse a string according to the OnBlog 8-bit date format.
  # 8-bit date handling routines written by ytrewq1.
  korean_year  = u("년") # b3e2 in euc-kr
  korean_month = u("월") # bff9 in euc-kr
  korean_day   = u("일") # c0cf in euc-kr

  onblog_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
  m = onblog_re.match(dateString)
  return unless m

  # Rewrite as W3DTF with the Korean +09:00 offset and delegate.
  w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
  $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
  _parse_date_w3dtf(w3dtfdate)
end
62
2629
 
63
- Redistribution and use in source and binary forms, with or without modification,
64
- are permitted provided that the following conditions are met:
2630
def _parse_date_nate(dateString)
  # Parse a string according to the Nate 8-bit date format.
  # 8-bit date handling routines written by ytrewq1.
  korean_am = u("오전") # bfc0 c0fc in euc-kr
  korean_pm = u("오후") # bfc0 c8c4 in euc-kr

  nate_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
  m = nate_re.match(dateString)
  return unless m

  # Convert the 12-hour clock to 24-hour.
  hour = m[5].to_i
  hour += 12 if m[4] == korean_pm
  hour = hour.to_s.rjust(2, '0')

  w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
  $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
  _parse_date_w3dtf(w3dtfdate)
end
71
2649
 
72
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
73
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
74
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
75
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
76
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
77
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
78
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
79
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
80
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82
- POSSIBILITY OF SUCH DAMAGE."""
2650
def _parse_date_mssql(dateString)
  # Parse an MS SQL-style timestamp, e.g. "2004-07-08 23:56:58.0".
  # NOTE(review): assumes a +09:00 offset like the Korean-portal parsers
  # above -- confirm this is intentional before changing.
  mssql_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
  m = mssql_re.match(dateString)
  return unless m

  w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
  $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
  _parse_date_w3dtf(w3dtfdate)
end
96
2659
 
97
- # HTTP "Accept" header to send to servers when downloading feeds. If you don't
98
- # want to send an Accept header, set this to None.
99
- ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
2660
def _parse_date_greek(dateString)
  # Parse a string according to a Greek 8-bit date format: translate the
  # Greek weekday/month names to English and delegate to the RFC822 parser.
  greek_months = {
    u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
    u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
    u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
    u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
    u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
    u("Μάι") => u("May"), # ccdce9 in iso-8859-7
    u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
    u("Μαι") => u("May"), # cce1e9 in iso-8859-7
    u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
    u("Ιον") => u("Jun"), # c9efed in iso-8859-7
    u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
    u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
    u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
    u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
    u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
    u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
    u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
    u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
    u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
  }

  greek_wdays = {
    u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
    u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
    u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
    u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
    u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
    u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
    u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
  }

  greek_re = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
  m = greek_re.match(dateString)
  return unless m

  begin
    wday = greek_wdays[m[1]]
    month = greek_months[m[3]]
  rescue
    return nil
  end

  rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
  $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
  _parse_date_rfc822(rfc822date)
end
110
2709
 
2710
def _parse_date_hungarian(dateString)
  # Parse a string according to a Hungarian 8-bit date format.
  hungarian_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
  m = hungarian_re.match(dateString)
  return unless m

  # Hungarian month names mapped to two-digit month numbers.
  hungarian_months = {
    u("január") => u("01"), # e1 in iso-8859-2
    u("februári") => u("02"), # e1 in iso-8859-2
    u("március") => u("03"), # e1 in iso-8859-2
    u("április") => u("04"), # e1 in iso-8859-2
    u("máujus") => u("05"), # e1 in iso-8859-2
    u("június") => u("06"), # fa in iso-8859-2
    u("július") => u("07"), # fa in iso-8859-2
    u("augusztus") => u("08"),
    u("szeptember") => u("09"),
    u("október") => u("10"), # f3 in iso-8859-2
    u("november") => u("11"),
    u("december") => u("12"),
  }

  begin
    month = hungarian_months[m[2]]
    day = m[3].rjust(2, '0')
    hour = m[4].rjust(2, '0')
  rescue
    return
  end

  w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
  $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
  _parse_date_w3dtf(w3dtfdate)
end
2743
+
2744
# Returns [remainder, quotient] of num by modulus; used to normalize
# out-of-range time fields (e.g. 61 seconds -> 1 second, carry 1 minute).
def rollover(num, modulus)
  [num % modulus, num / modulus]
end

# Returns num unchanged when it divides to zero, otherwise the quotient.
def set_self(num, modulus)
  quotient = num / modulus
  quotient.zero? ? num : quotient
end
2755
# W3DTF-style date parsing
# FIXME shouldn't it be "W3CDTF"?
def _parse_date_w3dtf(dateString)
  # Ruby's Time docs claim w3cdtf is an alias for iso8601/xmlschema, but
  # those parsers do not normalize out-of-range fields (a 25th hour or 61st
  # minute/second), so the rollover is done by hand here.
  m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)

  # Year/month/day as integers; absent parts (nil -> 0) default to 1.
  fields = m[1..3].map { |part| part = part.to_i; part += 1 if part == 0; part }
  # Hour/minute/second as integers (nil -> 0).
  fields += m[4..6].map { |part| part.to_i }
  # The timezone stays a String (or nil).
  fields << m[-1]

  # Normalize: seconds into minutes, minutes into hours, hours into days.
  fields[5], carry = rollover(fields[5], 60)
  fields[4] += carry
  fields[4], carry = rollover(fields[4], 60)
  fields[3] += carry
  fields[3], carry = rollover(fields[3], 24)

  fields[2] = fields[2] + carry
  if fields[1] > 12
    fields[1], carry = rollover(fields[1], 12)
    fields[1] = 12 if fields[1] == 0
    fields[0] += carry
  end

  # Roll excess days into months (Time.days_in_month is ActiveSupport's).
  num_days = Time.days_in_month(fields[1], fields[0])
  while fields[2] > num_days
    fields[2] -= num_days
    fields[1] += 1
    if fields[1] > 12
      fields[0] += 1
      fields[1] = set_self(fields[1], 12)
    end
    num_days = Time.days_in_month(fields[1], fields[0])
  end

  # Flip the offset's sign so that ADDING Time.zone_offset converts the
  # local timestamp to UTC.
  if fields[6].is_a?(String)
    if /^-/ =~ fields[6]
      fields[6][0] = '+'
    elsif /^\+/ =~ fields[6]
      fields[6][0] = '-'
    end
  end
  Time.utc(fields[0], fields[1], fields[2], fields[3], fields[4], fields[5]) + Time.zone_offset(fields[6] || "UTC")
end
2804
+
2805
# Parses an RFC 822/1123/2822 or asctime-style date string and returns a
# UTC Time. Normalizes two quirks first: over-long month names are
# truncated to their 3-letter abbreviation, and two-letter "Disney"
# timezone abbreviations (AT/ET/CT/MT/PT) are mapped to standard ones.
# NOTE(review): this method mutates +dateString+ in place (sub!, []=).
def _parse_date_rfc822(dateString)
  # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
  # These first few lines are to fix up the stupid proprietary format from Disney
  unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
                        'CT' => 'CST', 'MT' => 'MST',
                        'PT' => 'PST'
  }

  mon = dateString.split[2]
  if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
    dateString.sub!(mon,mon[0..2])
  end
  if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
    dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
  end
  # Okay, the Disney date format should be fixed up now.
  rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
  # NOTE(review): this guard requires that some capture group be nil, i.e.
  # it only rebuilds the string for PARTIAL RFC822 matches (missing
  # time/zone); a fully matched date falls through to Time.rfc2822 as-is.
  # Confirm that reading — `include? nil` looks inverted at first glance.
  if rfc.to_a.length > 1 and rfc.to_a.include? nil
    dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
    hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
    tz ||= "GMT"
  end
  asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
  if asctime_match.to_a.length > 1
    # Month-abbr dayofmonth hour:minute:second year
    dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
    # NOTE(review): the result of rjust below is discarded — presumably the
    # padded value was meant to be assigned back to +day+.
    day.to_s.rjust(2,'0')
  end
  if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
    ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
  else
    ds = dateString
  end
  t = Time.rfc2822(ds).utc
  return t
end
2841
+
2842
# Parses a Perforce-style timestamp: a day-of-week prefix followed by
# "yyyy/mm/dd hh:mm:ss TTT", e.g. "Fri, 2006/09/15 08:19:53 EDT".
# Returns the moment converted to UTC. # FIXME not in 4.1?
def _parse_date_perforce(aDateString)
  Time.parse(aDateString).utc
end
2848
+
2849
# Converts a Time into a feedparser-style 9-element tuple:
# [year, month, mday, hour, min, sec, weekday (Monday=0), yday, isdst (0/1)]
# NOTE leave the error handling to parse_date, which rescues per handler.
def extract_tuple(atime)
  t = [atime.year, atime.month, atime.mday, atime.hour,
       atime.min, atime.sec, (atime.wday-1) % 7, atime.yday,
       atime.isdst
  ]
  # BUGFIX: the original called map! on t[0..-2], but Array#[] returns a
  # NEW array, so the destructive map mutated a temporary slice and the
  # intended integer coercion never touched +t+. Coerce in place instead.
  # (Output is unchanged for Time inputs, whose accessors are already
  # Integers; the coercion now actually applies for duck-typed inputs.)
  (0..t.length - 2).each { |i| t[i] = t[i].to_i }
  # Normalize the DST flag from true/false/nil to 1/0.
  t[-1] = t[-1] ? 1 : 0
  return t
end
2860
+
2861
# Tries each registered date handler (@date_handlers, a list of method
# symbols on this object) against +dateString+ in order. The first handler
# that parses successfully has its Time converted to a 9-tuple via
# extract_tuple and returned. Returns nil when every handler fails.
def parse_date(dateString)
  @date_handlers.each do |handler|
    begin
      $stderr << "Trying date_handler #{handler}\n" if $debug
      datething = extract_tuple(send(handler,dateString))
      return datething
    rescue Exception => e
      # Handlers signal "not my format" by raising; swallow and try the
      # next one. NOTE(review): rescuing Exception (not StandardError) is
      # very broad — presumably deliberate here, but it will also eat
      # Interrupt/SystemExit raised mid-parse.
      $stderr << "#{handler} raised #{e}\n" if $debug
    end
  end
  return nil
end
2873
+
2874
+ end # End FeedParserMixin
2875
+
2876
# SAX handler driving the strict (expat-backed) XML parse. The actual
# feed-building callbacks (unknown_starttag, unknown_endtag, handle_data,
# trackNamespace, startup, ...) come from FeedParserMixin; this class only
# adapts the SAX event interface to those callbacks and records errors.
class StrictFeedParser < XML::SAX::HandlerBase # expat
  include FeedParserMixin

  attr_accessor :bozo, :entries, :feeddata, :exc
  def initialize(baseuri, baselang, encoding)
    $stderr << "trying StrictFeedParser\n" if $debug
    startup(baseuri, baselang, encoding)
    @bozo = false    # set true once any parse error is seen
    @exc = nil       # last exception recorded by #error
    super()
  end

  # [system id, line number] from the SAX locator, for error reporting.
  def getPos
    [@locator.getSystemId, @locator.getLineNumber]
  end

  # Converts a SAX attribute list into an array of [name, value] pairs.
  # NOTE(review): `0..attrs.getLength` is an inclusive range, so the last
  # iteration indexes one past the final attribute — confirm the expat
  # driver returns nil there rather than raising.
  def getAttrs(attrs)
    ret = []
    for i in 0..attrs.getLength
      ret.push([attrs.getName(i), attrs.getValue(i)])
    end
    ret
  end

  def setDocumentLocator(loc)
    @locator = loc
  end

  def startDoctypeDecl(name, pub_sys, long_name, uri)
    #Nothing is done here. What could we do that is neat and useful?
  end

  def startNamespaceDecl(prefix, uri)
    trackNamespace(prefix, uri)
  end

  def endNamespaceDecl(prefix)
  end

  # Expat reports namespaced element names as "uri;localname"; split that,
  # map the URI to our canonical prefix, and hand a "prefix:name" string
  # to the mixin's unknown_starttag.
  def startElement(name, attrs)
    name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
    namespaceuri = ($2 || '').downcase
    name = $3
    if /backend\.userland\.com\/rss/ =~ namespaceuri
      # match any backend.userland.com namespace
      namespaceuri = 'http://backend.userland.com/rss'
    end
    prefix = @matchnamespaces[namespaceuri]
    # No need to raise UndeclaredNamespace, Expat does that for us with
    "unbound prefix (XMLParserError)"
    # NOTE(review): the bare string literal above is a no-op expression —
    # it appears to be the tail of the preceding comment, missing its "#".
    if prefix and not prefix.empty?
      name = prefix + ':' + name
    end
    name.downcase!
    unknown_starttag(name, attrs)
  end

  def character(text, start, length)
    #handle_data(CGI.unescapeHTML(text))
    handle_data(text)
  end
  # expat provides "character" not "characters"!
  alias :characters :character # Just in case.

  def startCdata(content)
    handle_data(content)
  end

  # NOTE(review): unlike startElement, this passes the raw +name+ (not the
  # regex's $3, nor the prefixed +localname+ computed below) to
  # unknown_endtag — confirm whether that asymmetry is intentional.
  def endElement(name)
    name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
    namespaceuri = ($2 || '').downcase
    prefix = @matchnamespaces[namespaceuri]
    if prefix and not prefix.empty?
      localname = prefix + ':' + name
    end
    name.downcase!
    unknown_endtag(name)
  end

  def comment(comment)
    handle_comment(comment)
  end

  def entityDecl(*foo)
  end

  def unparsedEntityDecl(*foo)
  end

  # Non-fatal parse error: flag the feed as bozo and remember the exception.
  def error(exc)
    @bozo = true
    @exc = exc
  end

  # Fatal error: record it like #error, then re-raise so the caller can
  # fall back to the loose parser.
  def fatalError(exc)
    error(exc)
    raise exc
  end
end
115
- class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe
2974
+
2975
# Fallback SGML/HTML-based feed parser, used when the strict XML parse
# fails. Pre-normalizes the markup (self-closing tags, numeric entities,
# encoding) before handing it to BetterSGMLParser's feed loop.
class LooseFeedParser < BetterSGMLParser
  include FeedParserMixin
  # We write the methods that were in BaseHTMLProcessor in the python code
  # in here directly. We do this because if we inherited from
  # BaseHTMLProcessor but then included from FeedParserMixin, the methods
  # of Mixin would overwrite the methods we inherited from
  # BaseHTMLProcessor. This is exactly the opposite of what we want to
  # happen!

  attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse

  # HTML void elements: never synthesize an end tag for these.
  Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']
  New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
  alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.
  # Reader for the parsed feed data (shadowing SGMLParser#feed, see alias).
  def feed
    @feeddata
  end
  def feed=(data)
    @feeddata = data
  end

  def initialize(baseuri, baselang, encoding)
    startup(baseuri, baselang, encoding)
    super() # Keep the parentheses! No touchy.
  end

  def reset
    @pieces = []
    super
  end

  # Normalizes +data+ and runs it through the SGML parser.
  def parse(data)
    # Escape "<!" sequences that do not begin a DOCTYPE, comment or CDATA.
    data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
    # Expand self-closing tags (<foo/>) into <foo></foo>, except for void
    # elements that legitimately take no end tag.
    data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
      clean = tag[1..-3].strip
      if Elements_No_End_Tag.include?clean
        tag
      else
        '<'+clean+'></'+clean+'>'
      end
    end

    data.gsub!(/&#39;/, "'")
    # BUGFIX: &#34; is the numeric entity for a DOUBLE quote; it was
    # previously replaced with a single quote ("'"), corrupting quoted
    # text. (SanitizerDoc#scrub and python feedparser both map it to '"'.)
    data.gsub!(/&#34;/, '"')
    if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
      data = uconvert(data,'utf-8',@encoding)
    end
    sgml_feed(data) # see the alias above
  end


  # Expands a fixed set of numeric character references; when the current
  # content type is not XML, also expands the five XML named entities.
  def decodeEntities(element, data)
    data.gsub!('&#60;', '&lt;')
    data.gsub!('&#x3c;', '&lt;')
    data.gsub!('&#62;', '&gt;')
    data.gsub!('&#x3e;', '&gt;')
    data.gsub!('&#38;', '&amp;')
    data.gsub!('&#x26;', '&amp;')
    data.gsub!('&#34;', '&quot;')
    data.gsub!('&#x22;', '&quot;')
    data.gsub!('&#39;', '&apos;')
    data.gsub!('&#x27;', '&apos;')
    if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
      data.gsub!('&lt;', '<')
      data.gsub!('&gt;', '>')
      data.gsub!('&amp;', '&')
      data.gsub!('&quot;', '"')
      data.gsub!('&apos;', "'")
    end
    return data
  end
end
117
- class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe
3048
+
3049
# Rewrites every relative URI in +htmlSource+ (for the attribute/element
# pairs listed below) to an absolute URI joined against +baseURI+, and
# returns the resulting HTML string.
# NOTE(review): the +encoding+ parameter is accepted but never used here.
def FeedParser.resolveRelativeURIs(htmlSource, baseURI, encoding)
  $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
  # [element, attribute] pairs whose values may contain URIs.
  relative_uris = [ ['a','href'],
    ['applet','codebase'],
    ['area','href'],
    ['blockquote','cite'],
    ['body','background'],
    ['del','cite'],
    ['form','action'],
    ['frame','longdesc'],
    ['frame','src'],
    ['iframe','longdesc'],
    ['iframe','src'],
    ['head','profile'],
    ['img','longdesc'],
    ['img','src'],
    ['img','usemap'],
    ['input','src'],
    ['input','usemap'],
    ['ins','cite'],
    ['link','href'],
    ['object','classid'],
    ['object','codebase'],
    ['object','data'],
    ['object','usemap'],
    ['q','cite'],
    ['script','src'],
  ]
  h = Hpricot(htmlSource)
  relative_uris.each do |l|
    ename, eattr = l
    h.search(ename).each do |elem|
      euri = elem.attributes[eattr]
      # NOTE(review): URI.parse raises URI::InvalidURIError on malformed
      # values, which would propagate out of this method — confirm callers
      # expect that (or whether a rescue is wanted here).
      if euri and not euri.empty? and URI.parse(euri).relative?
        elem.attributes[eattr] = urljoin(baseURI, euri)
      end
    end
  end
  return h.to_html
end
119
- class NonXMLContentType < ThingsNobodyCaresAboutButMe
3089
+
3090
# An Hpricot document that can scrub itself of disallowed markup.
# Whitelisting is driven by the module-level Acceptable_Elements and
# Unacceptable_Elements_With_End_Tag constants (defined elsewhere in
# this file).
class SanitizerDoc < Hpricot::Doc

  # Walks every element: allowed elements keep only allowed attributes
  # (strip_attributes); disallowed ones are replaced by their (recursively
  # scrubbed) children; DOCTYPEs are dropped; text nodes get a few numeric
  # entities normalized. Returns self.
  def scrub
    traverse_all_element do |e|
      if e.elem?
        if Acceptable_Elements.include?e.name
          e.strip_attributes
        else
          # Elements like <script>/<style> must lose their contents too,
          # not just their tags.
          if Unacceptable_Elements_With_End_Tag.include?e.name
            e.inner_html = ''
          end
          e.swap(SanitizerDoc.new(e.children).scrub.to_html)
          # This works because the children swapped in are brought in "after" the current element.
        end
      elsif e.doctype?
        e.parent.children.delete(e)
      elsif e.text?
        ets = e.to_s
        ets.gsub!(/&#39;/, "'")
        ets.gsub!(/&#34;/, '"')
        ets.gsub!(/\r/,'')
        e.swap(ets)
      else
        # comments/PIs etc. pass through untouched
      end
    end
    # yes, that '/' should be there. It's a search method. See the Hpricot docs.

    unless $compatible # FIXME not properly recursive, see comment in recursive_strip
      # NOTE(review): +tag+ is not defined in this scope, and @config is
      # never assigned on this class — this branch (only reached when
      # $compatible is false) would raise NameError. Looks like leftover
      # code from recursive_strip; confirm before enabling.
      (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
    end
    return self
  end
end
121
- class UndeclaredNamespace < Exception
3123
+
3124
# Convenience constructor: wraps raw HTML in a FeedParser::SanitizerDoc
# (via Hpricot.make) so it can be scrubbed.
def SanitizerDoc(html)
  FeedParser::SanitizerDoc.new(Hpricot.make(html))
end
module_function(:SanitizerDoc)
3128
# Sanitizes an HTML fragment: escapes stray "<!" sequences that are not a
# DOCTYPE/comment/CDATA opener, scrubs disallowed elements and attributes
# via SanitizerDoc#scrub, and returns the stripped HTML string.
# NOTE(review): the +encoding+ parameter is currently unused.
def self.sanitizeHTML(html,encoding)
  # FIXME Tidy not yet supported
  html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
  h = SanitizerDoc(html)
  h = h.scrub
  return h.to_html.strip
end
123
3135
 
124
3136
 
125
- SUPPORTED_VERSIONS = {'' => 'unknown',
126
- 'rss090' => 'RSS 0.90',
127
- 'rss091n' => 'RSS 0.91 (Netscape)',
128
- 'rss091u' => 'RSS 0.91 (Userland)',
129
- 'rss092' => 'RSS 0.92',
130
- 'rss093' => 'RSS 0.93',
131
- 'rss094' => 'RSS 0.94',
132
- 'rss20' => 'RSS 2.0',
133
- 'rss10' => 'RSS 1.0',
134
- 'rss' => 'RSS (unknown version)',
135
- 'atom01' => 'Atom 0.1',
136
- 'atom02' => 'Atom 0.2',
137
- 'atom03' => 'Atom 0.3',
138
- 'atom10' => 'Atom 1.0',
139
- 'atom' => 'Atom (unknown version)',
140
- 'cdf' => 'CDF',
141
- 'hotrss' => 'Hot RSS'
142
- }
143
-
144
- def parse(furi, options = {})
3137
+
3138
# Determines the character encoding of an XML document fetched as +feed+
# (an open-uri handle, or anything without #meta for local/string input)
# with raw bytes +xml_data+. Returns a 5-tuple:
#   [true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding,
#    acceptable_content_type]
def self.getCharacterEncoding(feed, xml_data)
  # Get the character encoding of the XML document
  $stderr << "In getCharacterEncoding\n" if $debug
  sniffed_xml_encoding = nil
  xml_encoding = nil
  true_encoding = nil
  begin
    # Pull charset information out of the HTTP Content-Type header.
    http_headers = feed.meta
    http_content_type = feed.meta['content-type'].split(';')[0]
    encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
    http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
    http_encoding = nil if http_encoding.empty?
    # FIXME Open-Uri returns iso8859-1 if there is no charset header,
    # but that doesn't pass the tests. Open-Uri claims its following
    # the right RFC. Are they wrong or do we need to change the tests?
  rescue NoMethodError
    # +feed+ has no #meta (file/string input): no HTTP-level hints.
    http_headers = {}
    http_content_type = nil
    http_encoding = nil
  end
  # Must sniff for non-ASCII-compatible character encodings before
  # searching for XML declaration. This heuristic is defined in
  # section F of the XML specification:
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
  begin
    if xml_data[0..3] == "\x4c\x6f\xa7\x94"
      # EBCDIC
      xml_data = _ebcdic_to_ascii(xml_data)
    elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
      # UTF-16BE
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
    elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
      # UTF-16BE with BOM
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
      # UTF-16LE
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
    elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
      # UTF-16LE with BOM
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\x00\x3c"
      # UTF-32BE
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x00\x00"
      # UTF-32LE
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\xfe\xff"
      # UTF-32BE with BOM
      sniffed_xml_encoding = 'utf-32be'
      # NOTE(review): 'utf-32BE' differs in case from every other branch —
      # presumably uconvert/iconv is case-insensitive here; confirm.
      xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
    elsif xml_data[0..3] == "\xff\xfe\x00\x00"
      # UTF-32LE with BOM
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
    elsif xml_data[0..2] == "\xef\xbb\xbf"
      # UTF-8 with BOM
      sniffed_xml_encoding = 'utf-8'
      xml_data = xml_data[3..-1]
    else
      # ASCII-compatible
    end
    # With the data now ASCII-compatible, look for an encoding attribute
    # in the XML declaration.
    xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
  rescue
    xml_encoding_match = nil
  end
  if xml_encoding_match
    xml_encoding = xml_encoding_match[1].downcase
    # For BOM-ambiguous declared encodings, trust the sniffed byte order.
    xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
    if sniffed_xml_encoding and xencodings.include?xml_encoding
      xml_encoding = sniffed_xml_encoding
    end
  end

  # Decide the authoritative encoding based on the media type, per
  # RFC 3023 precedence rules (HTTP charset beats in-document declaration
  # for XML media types; text/* defaults to us-ascii).
  acceptable_content_type = false
  application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
  text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

  if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || xml_encoding || 'utf-8'
  elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || 'us-ascii'
  elsif /^text\// =~ http_content_type
    true_encoding = http_encoding || 'us-ascii'
  elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
    true_encoding = xml_encoding || 'iso-8859-1'
  else
    true_encoding = xml_encoding || 'utf-8'
  end
  return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
end
3236
+
3237
# Changes an XML data stream on the fly to specify a new encoding.
#
# +data+ is a raw sequence of bytes (not Unicode) that is presumed to be
# in +encoding+ already; +encoding+ is a string recognized by
# encodings.aliases. Any byte-order mark overrides the caller's claimed
# encoding and is stripped. Returns the data re-encoded as UTF-8 with a
# matching XML declaration prepended or substituted.
def self.toUTF8(data, encoding)
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
  # NOTE we must use double quotes when dealing with \x encodings!
  if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
    # UTF-16BE BOM
    if $debug
      $stderr << "stripping BOM\n"
      if encoding != 'utf-16be'
        $stderr << "string utf-16be instead\n"
      end
    end
    encoding = 'utf-16be'
    data = data[2..-1]
  elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
    # UTF-16LE BOM
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
    end
    encoding = 'utf-16le'
    data = data[2..-1]
  elsif (data[0..2] == "\xef\xbb\xbf")
    # UTF-8 BOM
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
    end
    encoding = 'utf-8'
    data = data[3..-1]
  elsif (data[0..3] == "\x00\x00\xfe\xff")
    # UTF-32BE BOM
    if $debug
      $stderr << "stripping BOM\n"
      if encoding != 'utf-32be'
        $stderr << "trying utf-32be instead\n"
      end
    end
    encoding = 'utf-32be'
    data = data[4..-1]
  elsif (data[0..3] == "\xff\xfe\x00\x00")
    # UTF-32LE BOM
    if $debug
      $stderr << "stripping BOM\n"
      if encoding != 'utf-32le'
        $stderr << "trying utf-32le instead\n"
      end
    end
    encoding = 'utf-32le'
    data = data[4..-1]
  end
  begin
    newdata = uconvert(data, encoding, 'utf-8')
  rescue => details
    # NOTE(review): a failed conversion is swallowed here, leaving
    # +newdata+ nil, so the regex match below would raise NoMethodError —
    # confirm whether callers rely on that, or whether this should re-raise.
  end
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
  # Replace (or prepend) the XML declaration so it matches the new encoding.
  declmatch = /^<\?xml[^>]*?>/
  newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
  if declmatch =~ newdata
    newdata.sub!(declmatch, newdecl)
  else
    newdata = newdecl + "\n" + newdata
  end
  return newdata
end
3302
+
3303
# Strips the DOCTYPE (and all <!ENTITY ...> declarations) from an XML
# document.
#
# Returns [rss_version, stripped_data]: rss_version is 'rss091n' when the
# DOCTYPE mentions Netscape (the RSS 0.91 Netscape DTD), otherwise nil;
# stripped_data is the same document minus entities and the first DOCTYPE.
def self.stripDoctype(data)
  entity_re = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
  doctype_re = /<!DOCTYPE(.*?)>/m

  cleaned = data.gsub(entity_re, '')

  first_doctype = cleaned.scan(doctype_re).first
  doctype = first_doctype ? first_doctype[0] : ''

  version = doctype.downcase =~ /netscape/ ? 'rss091n' : nil

  return version, cleaned.sub(doctype_re, '')
end
3329
+
3330
# Module-level convenience wrapper: forwards all arguments to
# FeedParser.parse unchanged.
def parse(*args)
  FeedParser.parse(*args)
end
3331
+ def FeedParser.parse(furi, options={})
145
3332
  # Parse a feed from a URL, file, stream or string
146
3333
  $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
147
- strictklass = options[:strict] || StrictFeedParser
148
- looseklass = options[:loose] || LooseFeedParser
149
3334
  result = FeedParserDict.new
150
3335
  result['feed'] = FeedParserDict.new
151
3336
  result['entries'] = []
@@ -155,12 +3340,13 @@ POSSIBILITY OF SUCH DAMAGE."""
155
3340
  end
156
3341
  result['bozo'] = false
157
3342
  handlers = options[:handlers]
3343
+
158
3344
  if handlers.class != Array # FIXME why does this happen?
159
3345
  handlers = [handlers]
160
3346
  end
161
3347
 
162
3348
  begin
163
- if File.exists?furi
3349
+ if URI::parse(furi).class == URI::Generic
164
3350
  f = open(furi) # OpenURI doesn't behave well when passing HTTP options to a file.
165
3351
  else
166
3352
  # And when you do pass them, make sure they aren't just nil (this still true?)
@@ -327,7 +3513,7 @@ POSSIBILITY OF SUCH DAMAGE."""
327
3513
  if use_strict_parser
328
3514
  # initialize the SAX parser
329
3515
  saxparser = XML::SAX::Helpers::ParserFactory.makeParser("XML::Parser::SAXDriver")
330
- feedparser = strictklass.new(baseuri, baselang, 'utf-8')
3516
+ feedparser = StrictFeedParser.new(baseuri, baselang, 'utf-8')
331
3517
  saxparser.setDocumentHandler(feedparser)
332
3518
  saxparser.setDTDHandler(feedparser)
333
3519
  saxparser.setEntityResolver(feedparser)
@@ -348,7 +3534,7 @@ POSSIBILITY OF SUCH DAMAGE."""
348
3534
  end
349
3535
  end
350
3536
  if not use_strict_parser
351
- feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
3537
+ feedparser = LooseFeedParser.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
352
3538
  feedparser.parse(data)
353
3539
  $stderr << "Using LooseFeed\n\n" if $debug
354
3540
  end
@@ -358,7 +3544,6 @@ POSSIBILITY OF SUCH DAMAGE."""
358
3544
  result['namespaces'] = feedparser.namespacesInUse
359
3545
  return result
360
3546
  end
361
- module_function(:parse)
362
3547
  end # End FeedParser module
363
3548
 
364
3549
  class Serializer
@@ -398,7 +3583,7 @@ class TextSerializer < Serializer
398
3583
  end
399
3584
  end
400
3585
 
401
- class PprintSerializer < Serializer # FIXME use pp instead
3586
+ class PprintSerializer < Serializer # FIXME ? use pp instead?
402
3587
  def write(stream = $stdout)
403
3588
  stream << @results['href'].to_s + "\n\n"
404
3589
  pp(@results)
@@ -406,88 +3591,87 @@ class PprintSerializer < Serializer # FIXME use pp instead
406
3591
  end
407
3592
  end
408
3593
 
409
# ---------------------------------------------------------------------------
# Command-line driver: parses options, fetches each URL remaining on the
# command line with FeedParser.parse, and serializes each result to stdout
# in either text or pretty-printed form.
# NOTE(review): the short switch "-t" is registered twice (for --etag and
# later for --content-type); OptionParser will route "-t" to the later
# registration, silently shadowing --etag — confirm which is intended.
# ---------------------------------------------------------------------------
require 'optparse'
require 'ostruct'
options = OpenStruct.new
options.etag = options.modified = options.agent = options.referrer = nil
options.content_language = options.content_location = options.ctype = nil
options.format = 'pprint'
options.compatible = $compatible
options.verbose = false

opts = OptionParser.new do |opts|
  opts.banner
  opts.separator ""
  opts.on("-A", "--user-agent [AGENT]",
    "User-Agent for HTTP URLs") {|agent|
    options.agent = agent
  }

  opts.on("-e", "--referrer [URL]",
    "Referrer for HTTP URLs") {|referrer|
    options.referrer = referrer
  }

  opts.on("-t", "--etag [TAG]",
    "ETag/If-None-Match for HTTP URLs") {|etag|
    options.etag = etag
  }

  opts.on("-m", "--last-modified [DATE]",
    "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
    options.modified = modified
  }

  opts.on("-f", "--format [FORMAT]", [:text, :pprint],
    "output resutls in FORMAT (text, pprint)") {|format|
    options.format = format
  }

  opts.on("-v", "--[no-]verbose",
    "write debugging information to stderr") {|v|
    options.verbose = v
  }

  opts.on("-c", "--[no-]compatible",
    "strip element attributes like feedparser.py 4.1 (default)") {|comp|
    options.compatible = comp
  }
  opts.on("-l", "--content-location [LOCATION]",
    "default Content-Location HTTP header") {|loc|
    options.content_location = loc
  }
  opts.on("-a", "--content-language [LANG]",
    "default Content-Language HTTP header") {|lang|
    options.content_language = lang
  }
  opts.on("-t", "--content-type [TYPE]",
    "default Content-type HTTP header") {|ctype|
    options.ctype = ctype
  }
end

opts.parse!(ARGV)
# Propagate CLI flags into the module-level globals the parsers consult.
$debug = true if options.verbose
$compatible = options.compatible unless options.compatible.nil?

if options.format == :text
  serializer = TextSerializer
else
  serializer = PprintSerializer
end
args = *ARGV.dup
unless args.nil?
  args.each do |url| # opts.parse! removes everything but the urls from the command line
    results = FeedParser.parse(url, :etag => options.etag,
                  :modified => options.modified,
                  :agent => options.agent,
                  :referrer => options.referrer,
                  :content_location => options.content_location,
                  :content_language => options.content_language,
                  :content_type => options.ctype
                  )
    serializer.new(results).write($stdout)
  end
end