rfeedparser 0.9.9 → 0.9.85

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,138 +14,3323 @@ require 'stringio'
14
14
  require 'uri'
15
15
  require 'cgi' # escaping html
16
16
  require 'time'
17
+ require 'xml/saxdriver' # calling expat
17
18
  require 'pp'
18
19
  require 'rubygems'
19
20
  require 'base64'
20
21
  require 'iconv'
22
+ gem 'hpricot', ">=0.5"
23
+ gem 'character-encodings', ">=0.2.0"
24
+ gem 'htmltools', ">=1.10"
25
+ gem 'htmlentities', ">=4.0.0"
26
+ gem 'activesupport', ">=1.4.2"
27
+ gem 'rchardet', ">=1.0"
28
+
29
+ require 'rchardet'
30
+ $chardet = true
31
+
32
+ require 'hpricot'
33
+ require 'encoding/character/utf-8'
34
+ require 'html/sgml-parser'
35
+ require 'htmlentities'
36
+ require 'active_support'
37
+ require 'open-uri'
38
+ include OpenURI
39
+
40
+ $debug = false
41
+ $compatible = true
42
+
43
+ Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
44
+ 'unicode' => 'utf-16',
45
+ # MacOSX does not have Unicode as a separate encoding nor even
46
+ # aliased. My Ubuntu box has it as a separate encoding but I cannot
47
+ # for the life of me figure out where the source code for UNICODE.so
48
+ # is (supposedly, in libc6 .deb but that's a damn lie), so I don't
49
+ # know what it expects. After some extensive research, I've decided
50
+ # to alias it to utf-16 much like Python does when it is built with
51
+ # --enable-unicode=ucs2. This could be seriously wrong. I have no idea.
52
+
53
+ # ascii codec
54
+ '646' => 'ascii',
55
+ 'ansi_x3.4_1968' => 'ascii',
56
+ 'ansi_x3_4_1968' => 'ascii', # some email headers use this non-standard name
57
+ 'ansi_x3.4_1986' => 'ascii',
58
+ 'cp367' => 'ascii',
59
+ 'csascii' => 'ascii',
60
+ 'ibm367' => 'ascii',
61
+ 'iso646_us' => 'ascii',
62
+ 'iso_646.irv_1991' => 'ascii',
63
+ 'iso_ir_6' => 'ascii',
64
+ 'us' => 'ascii',
65
+ 'us_ascii' => 'ascii',
66
+
67
+ # big5 codec
68
+ 'big5_tw' => 'big5',
69
+ 'csbig5' => 'big5',
70
+
71
+ # big5hkscs codec
72
+ 'big5_hkscs' => 'big5hkscs',
73
+ 'hkscs' => 'big5hkscs',
74
+
75
+ # cp037 codec
76
+ '037' => 'cp037',
77
+ 'csibm037' => 'cp037',
78
+ 'ebcdic_cp_ca' => 'cp037',
79
+ 'ebcdic_cp_nl' => 'cp037',
80
+ 'ebcdic_cp_us' => 'cp037',
81
+ 'ebcdic_cp_wt' => 'cp037',
82
+ 'ibm037' => 'cp037',
83
+ 'ibm039' => 'cp037',
84
+
85
+ # cp1026 codec
86
+ '1026' => 'cp1026',
87
+ 'csibm1026' => 'cp1026',
88
+ 'ibm1026' => 'cp1026',
89
+
90
+ # cp1140 codec
91
+ '1140' => 'cp1140',
92
+ 'ibm1140' => 'cp1140',
93
+
94
+ # cp1250 codec
95
+ '1250' => 'cp1250',
96
+ 'windows_1250' => 'cp1250',
97
+
98
+ # cp1251 codec
99
+ '1251' => 'cp1251',
100
+ 'windows_1251' => 'cp1251',
101
+
102
+ # cp1252 codec
103
+ '1252' => 'cp1252',
104
+ 'windows_1252' => 'cp1252',
105
+
106
+ # cp1253 codec
107
+ '1253' => 'cp1253',
108
+ 'windows_1253' => 'cp1253',
109
+
110
+ # cp1254 codec
111
+ '1254' => 'cp1254',
112
+ 'windows_1254' => 'cp1254',
113
+
114
+ # cp1255 codec
115
+ '1255' => 'cp1255',
116
+ 'windows_1255' => 'cp1255',
117
+
118
+ # cp1256 codec
119
+ '1256' => 'cp1256',
120
+ 'windows_1256' => 'cp1256',
121
+
122
+ # cp1257 codec
123
+ '1257' => 'cp1257',
124
+ 'windows_1257' => 'cp1257',
125
+
126
+ # cp1258 codec
127
+ '1258' => 'cp1258',
128
+ 'windows_1258' => 'cp1258',
129
+
130
+ # cp424 codec
131
+ '424' => 'cp424',
132
+ 'csibm424' => 'cp424',
133
+ 'ebcdic_cp_he' => 'cp424',
134
+ 'ibm424' => 'cp424',
135
+
136
+ # cp437 codec
137
+ '437' => 'cp437',
138
+ 'cspc8codepage437' => 'cp437',
139
+ 'ibm437' => 'cp437',
140
+
141
+ # cp500 codec
142
+ '500' => 'cp500',
143
+ 'csibm500' => 'cp500',
144
+ 'ebcdic_cp_be' => 'cp500',
145
+ 'ebcdic_cp_ch' => 'cp500',
146
+ 'ibm500' => 'cp500',
147
+
148
+ # cp775 codec
149
+ '775' => 'cp775',
150
+ 'cspc775baltic' => 'cp775',
151
+ 'ibm775' => 'cp775',
152
+
153
+ # cp850 codec
154
+ '850' => 'cp850',
155
+ 'cspc850multilingual' => 'cp850',
156
+ 'ibm850' => 'cp850',
157
+
158
+ # cp852 codec
159
+ '852' => 'cp852',
160
+ 'cspcp852' => 'cp852',
161
+ 'ibm852' => 'cp852',
162
+
163
+ # cp855 codec
164
+ '855' => 'cp855',
165
+ 'csibm855' => 'cp855',
166
+ 'ibm855' => 'cp855',
167
+
168
+ # cp857 codec
169
+ '857' => 'cp857',
170
+ 'csibm857' => 'cp857',
171
+ 'ibm857' => 'cp857',
172
+
173
+ # cp860 codec
174
+ '860' => 'cp860',
175
+ 'csibm860' => 'cp860',
176
+ 'ibm860' => 'cp860',
177
+
178
+ # cp861 codec
179
+ '861' => 'cp861',
180
+ 'cp_is' => 'cp861',
181
+ 'csibm861' => 'cp861',
182
+ 'ibm861' => 'cp861',
183
+
184
+ # cp862 codec
185
+ '862' => 'cp862',
186
+ 'cspc862latinhebrew' => 'cp862',
187
+ 'ibm862' => 'cp862',
188
+
189
+ # cp863 codec
190
+ '863' => 'cp863',
191
+ 'csibm863' => 'cp863',
192
+ 'ibm863' => 'cp863',
193
+
194
+ # cp864 codec
195
+ '864' => 'cp864',
196
+ 'csibm864' => 'cp864',
197
+ 'ibm864' => 'cp864',
198
+
199
+ # cp865 codec
200
+ '865' => 'cp865',
201
+ 'csibm865' => 'cp865',
202
+ 'ibm865' => 'cp865',
203
+
204
+ # cp866 codec
205
+ '866' => 'cp866',
206
+ 'csibm866' => 'cp866',
207
+ 'ibm866' => 'cp866',
208
+
209
+ # cp869 codec
210
+ '869' => 'cp869',
211
+ 'cp_gr' => 'cp869',
212
+ 'csibm869' => 'cp869',
213
+ 'ibm869' => 'cp869',
214
+
215
+ # cp932 codec
216
+ '932' => 'cp932',
217
+ 'ms932' => 'cp932',
218
+ 'mskanji' => 'cp932',
219
+ 'ms_kanji' => 'cp932',
220
+
221
+ # cp949 codec
222
+ '949' => 'cp949',
223
+ 'ms949' => 'cp949',
224
+ 'uhc' => 'cp949',
225
+
226
+ # cp950 codec
227
+ '950' => 'cp950',
228
+ 'ms950' => 'cp950',
229
+
230
+ # euc_jp codec
231
+ 'euc_jp' => 'euc-jp',
232
+ 'eucjp' => 'euc-jp',
233
+ 'ujis' => 'euc-jp',
234
+ 'u_jis' => 'euc-jp',
235
+
236
+ # euc_kr codec
237
+ 'euc_kr' => 'euc-kr',
238
+ 'euckr' => 'euc-kr',
239
+ 'korean' => 'euc-kr',
240
+ 'ksc5601' => 'euc-kr',
241
+ 'ks_c_5601' => 'euc-kr',
242
+ 'ks_c_5601_1987' => 'euc-kr',
243
+ 'ksx1001' => 'euc-kr',
244
+ 'ks_x_1001' => 'euc-kr',
245
+
246
+ # gb18030 codec
247
+ 'gb18030_2000' => 'gb18030',
248
+
249
+ # gb2312 codec
250
+ 'chinese' => 'gb2312',
251
+ 'csiso58gb231280' => 'gb2312',
252
+ 'euc_cn' => 'gb2312',
253
+ 'euccn' => 'gb2312',
254
+ 'eucgb2312_cn' => 'gb2312',
255
+ 'gb2312_1980' => 'gb2312',
256
+ 'gb2312_80' => 'gb2312',
257
+ 'iso_ir_58' => 'gb2312',
258
+
259
+ # gbk codec
260
+ '936' => 'gbk',
261
+ 'cp936' => 'gbk',
262
+ 'ms936' => 'gbk',
263
+
264
+ # hp-roman8 codec
265
+ 'hp_roman8' => 'hp-roman8',
266
+ 'roman8' => 'hp-roman8',
267
+ 'r8' => 'hp-roman8',
268
+ 'csHPRoman8' => 'hp-roman8',
269
+
270
+ # iso2022_jp codec
271
+ 'iso2022_jp' => 'iso-2022-jp',
272
+ 'csiso2022jp' => 'iso-2022-jp',
273
+ 'iso2022jp' => 'iso-2022-jp',
274
+ 'iso_2022_jp' => 'iso-2022-jp',
275
+
276
+ # iso2022_jp_1 codec
277
+ 'iso2002_jp_1' => 'iso-2022-jp-1',
278
+ 'iso2022jp_1' => 'iso-2022-jp-1',
279
+ 'iso_2022_jp_1' => 'iso-2022-jp-1',
280
+
281
+ # iso2022_jp_2 codec
282
+ 'iso2022_jp_2' => 'iso-2002-jp-2',
283
+ 'iso2022jp_2' => 'iso-2022-jp-2',
284
+ 'iso_2022_jp_2' => 'iso-2022-jp-2',
285
+
286
+ # iso2022_jp_3 codec
287
+ 'iso2002_jp_3' => 'iso-2022-jp-3',
288
+ 'iso2022jp_3' => 'iso-2022-jp-3',
289
+ 'iso_2022_jp_3' => 'iso-2022-jp-3',
290
+
291
+ # iso2022_kr codec
292
+ 'iso2022_kr' => 'iso-2022-kr',
293
+ 'csiso2022kr' => 'iso-2022-kr',
294
+ 'iso2022kr' => 'iso-2022-kr',
295
+ 'iso_2022_kr' => 'iso-2022-kr',
296
+
297
+ # iso8859_10 codec
298
+ 'iso8859_10' => 'iso-8859-10',
299
+ 'csisolatin6' => 'iso-8859-10',
300
+ 'iso_8859_10' => 'iso-8859-10',
301
+ 'iso_8859_10_1992' => 'iso-8859-10',
302
+ 'iso_ir_157' => 'iso-8859-10',
303
+ 'l6' => 'iso-8859-10',
304
+ 'latin6' => 'iso-8859-10',
305
+
306
+ # iso8859_13 codec
307
+ 'iso8859_13' => 'iso-8859-13',
308
+ 'iso_8859_13' => 'iso-8859-13',
309
+
310
+ # iso8859_14 codec
311
+ 'iso8859_14' => 'iso-8859-14',
312
+ 'iso_8859_14' => 'iso-8859-14',
313
+ 'iso_8859_14_1998' => 'iso-8859-14',
314
+ 'iso_celtic' => 'iso-8859-14',
315
+ 'iso_ir_199' => 'iso-8859-14',
316
+ 'l8' => 'iso-8859-14',
317
+ 'latin8' => 'iso-8859-14',
318
+
319
+ # iso8859_15 codec
320
+ 'iso8859_15' => 'iso-8859-15',
321
+ 'iso_8859_15' => 'iso-8859-15',
322
+
323
+ # iso8859_1 codec
324
+ 'latin_1' => 'iso-8859-1',
325
+ 'cp819' => 'iso-8859-1',
326
+ 'csisolatin1' => 'iso-8859-1',
327
+ 'ibm819' => 'iso-8859-1',
328
+ 'iso8859' => 'iso-8859-1',
329
+ 'iso_8859_1' => 'iso-8859-1',
330
+ 'iso_8859_1_1987' => 'iso-8859-1',
331
+ 'iso_ir_100' => 'iso-8859-1',
332
+ 'l1' => 'iso-8859-1',
333
+ 'latin' => 'iso-8859-1',
334
+ 'latin1' => 'iso-8859-1',
335
+
336
+ # iso8859_2 codec
337
+ 'iso8859_2' => 'iso-8859-2',
338
+ 'csisolatin2' => 'iso-8859-2',
339
+ 'iso_8859_2' => 'iso-8859-2',
340
+ 'iso_8859_2_1987' => 'iso-8859-2',
341
+ 'iso_ir_101' => 'iso-8859-2',
342
+ 'l2' => 'iso-8859-2',
343
+ 'latin2' => 'iso-8859-2',
344
+
345
+ # iso8859_3 codec
346
+ 'iso8859_3' => 'iso-8859-3',
347
+ 'csisolatin3' => 'iso-8859-3',
348
+ 'iso_8859_3' => 'iso-8859-3',
349
+ 'iso_8859_3_1988' => 'iso-8859-3',
350
+ 'iso_ir_109' => 'iso-8859-3',
351
+ 'l3' => 'iso-8859-3',
352
+ 'latin3' => 'iso-8859-3',
353
+
354
+ # iso8859_4 codec
355
+ 'iso8849_4' => 'iso-8859-4',
356
+ 'csisolatin4' => 'iso-8859-4',
357
+ 'iso_8859_4' => 'iso-8859-4',
358
+ 'iso_8859_4_1988' => 'iso-8859-4',
359
+ 'iso_ir_110' => 'iso-8859-4',
360
+ 'l4' => 'iso-8859-4',
361
+ 'latin4' => 'iso-8859-4',
362
+
363
+ # iso8859_5 codec
364
+ 'iso8859_5' => 'iso-8859-5',
365
+ 'csisolatincyrillic' => 'iso-8859-5',
366
+ 'cyrillic' => 'iso-8859-5',
367
+ 'iso_8859_5' => 'iso-8859-5',
368
+ 'iso_8859_5_1988' => 'iso-8859-5',
369
+ 'iso_ir_144' => 'iso-8859-5',
370
+
371
+ # iso8859_6 codec
372
+ 'iso8859_6' => 'iso-8859-6',
373
+ 'arabic' => 'iso-8859-6',
374
+ 'asmo_708' => 'iso-8859-6',
375
+ 'csisolatinarabic' => 'iso-8859-6',
376
+ 'ecma_114' => 'iso-8859-6',
377
+ 'iso_8859_6' => 'iso-8859-6',
378
+ 'iso_8859_6_1987' => 'iso-8859-6',
379
+ 'iso_ir_127' => 'iso-8859-6',
380
+
381
+ # iso8859_7 codec
382
+ 'iso8859_7' => 'iso-8859-7',
383
+ 'csisolatingreek' => 'iso-8859-7',
384
+ 'ecma_118' => 'iso-8859-7',
385
+ 'elot_928' => 'iso-8859-7',
386
+ 'greek' => 'iso-8859-7',
387
+ 'greek8' => 'iso-8859-7',
388
+ 'iso_8859_7' => 'iso-8859-7',
389
+ 'iso_8859_7_1987' => 'iso-8859-7',
390
+ 'iso_ir_126' => 'iso-8859-7',
391
+
392
+ # iso8859_8 codec
393
+ 'iso8859_9' => 'iso8859_8',
394
+ 'csisolatinhebrew' => 'iso-8859-8',
395
+ 'hebrew' => 'iso-8859-8',
396
+ 'iso_8859_8' => 'iso-8859-8',
397
+ 'iso_8859_8_1988' => 'iso-8859-8',
398
+ 'iso_ir_138' => 'iso-8859-8',
399
+
400
+ # iso8859_9 codec
401
+ 'iso8859_9' => 'iso-8859-9',
402
+ 'csisolatin5' => 'iso-8859-9',
403
+ 'iso_8859_9' => 'iso-8859-9',
404
+ 'iso_8859_9_1989' => 'iso-8859-9',
405
+ 'iso_ir_148' => 'iso-8859-9',
406
+ 'l5' => 'iso-8859-9',
407
+ 'latin5' => 'iso-8859-9',
408
+
409
+ # iso8859_11 codec
410
+ 'iso8859_11' => 'iso-8859-11',
411
+ 'thai' => 'iso-8859-11',
412
+ 'iso_8859_11' => 'iso-8859-11',
413
+ 'iso_8859_11_2001' => 'iso-8859-11',
414
+
415
+ # iso8859_16 codec
416
+ 'iso8859_16' => 'iso-8859-16',
417
+ 'iso_8859_16' => 'iso-8859-16',
418
+ 'iso_8859_16_2001' => 'iso-8859-16',
419
+ 'iso_ir_226' => 'iso-8859-16',
420
+ 'l10' => 'iso-8859-16',
421
+ 'latin10' => 'iso-8859-16',
422
+
423
+ # cskoi8r codec
424
+ 'koi8_r' => 'cskoi8r',
425
+
426
+ # mac_cyrillic codec
427
+ 'mac_cyrillic' => 'maccyrillic',
428
+
429
+ # shift_jis codec
430
+ 'csshiftjis' => 'shift_jis',
431
+ 'shiftjis' => 'shift_jis',
432
+ 'sjis' => 'shift_jis',
433
+ 's_jis' => 'shift_jis',
434
+
435
+ # shift_jisx0213 codec
436
+ 'shiftjisx0213' => 'shift_jisx0213',
437
+ 'sjisx0213' => 'shift_jisx0213',
438
+ 's_jisx0213' => 'shift_jisx0213',
439
+
440
+ # utf_16 codec
441
+ 'utf_16' => 'utf-16',
442
+ 'u16' => 'utf-16',
443
+ 'utf16' => 'utf-16',
444
+
445
+ # utf_16_be codec
446
+ 'utf_16_be' => 'utf-16be',
447
+ 'unicodebigunmarked' => 'utf-16be',
448
+ 'utf_16be' => 'utf-16be',
449
+
450
+ # utf_16_le codec
451
+ 'utf_16_le' => 'utf-16le',
452
+ 'unicodelittleunmarked' => 'utf-16le',
453
+ 'utf_16le' => 'utf-16le',
454
+
455
+ # utf_7 codec
456
+ 'utf_7' => 'utf-7',
457
+ 'u7' => 'utf-7',
458
+ 'utf7' => 'utf-7',
459
+
460
+ # utf_8 codec
461
+ 'utf_8' => 'utf-8',
462
+ 'u8' => 'utf-8',
463
+ 'utf' => 'utf-8',
464
+ 'utf8' => 'utf-8',
465
+ 'utf8_ucs2' => 'utf-8',
466
+ 'utf8_ucs4' => 'utf-8',
467
+ }
468
+
469
+ def unicode(data, from_encoding)
470
+ # Takes a single string and converts it from the encoding in
471
+ # from_encoding to unicode.
472
+ uconvert(data, from_encoding, 'unicode')
473
+ end
474
+
475
+ def uconvert(data, from_encoding, to_encoding = 'utf-8')
476
+ from_encoding = Encoding_Aliases[from_encoding] || from_encoding
477
+ to_encoding = Encoding_Aliases[to_encoding] || to_encoding
478
+ Iconv.iconv(to_encoding, from_encoding, data)[0]
479
+ end
480
+
481
+ def unichr(i)
482
+ [i].pack('U*')
483
+ end
484
+
485
+ def index_match(stri,regexp, offset)
486
+ if offset == 241
487
+ end
488
+ i = stri.index(regexp, offset)
489
+
490
+ return nil, nil unless i
491
+
492
+ full = stri[i..-1].match(regexp)
493
+ return i, full
494
+ end
495
+
496
+ def _ebcdic_to_ascii(s)
497
+ return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
498
+ end
499
+
500
+ def urljoin(base, uri)
501
+ urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
502
+ uri = uri.sub(urifixer, '\1\3')
503
+ begin
504
+ return URI.join(base, uri).to_s
505
+ rescue URI::BadURIError => e
506
+ if URI.parse(base).relative?
507
+ return URI::parse(uri).to_s
508
+ end
509
+ end
510
+ end
511
+
512
+ def py2rtime(pytuple)
513
+ Time.utc(pytuple[0..5])
514
+ end
515
+
516
+ # http://intertwingly.net/stories/2005/09/28/xchar.rb
517
+ module XChar
518
+ # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
519
+ CP1252 = {
520
+ 128 => 8364, # euro sign
521
+ 130 => 8218, # single low-9 quotation mark
522
+ 131 => 402, # latin small letter f with hook
523
+ 132 => 8222, # double low-9 quotation mark
524
+ 133 => 8230, # horizontal ellipsis
525
+ 134 => 8224, # dagger
526
+ 135 => 8225, # double dagger
527
+ 136 => 710, # modifier letter circumflex accent
528
+ 137 => 8240, # per mille sign
529
+ 138 => 352, # latin capital letter s with caron
530
+ 139 => 8249, # single left-pointing angle quotation mark
531
+ 140 => 338, # latin capital ligature oe
532
+ 142 => 381, # latin capital letter z with caron
533
+ 145 => 8216, # left single quotation mark
534
+ 146 => 8217, # right single quotation mark
535
+ 147 => 8220, # left double quotation mark
536
+ 148 => 8221, # right double quotation mark
537
+ 149 => 8226, # bullet
538
+ 150 => 8211, # en dash
539
+ 151 => 8212, # em dash
540
+ 152 => 732, # small tilde
541
+ 153 => 8482, # trade mark sign
542
+ 154 => 353, # latin small letter s with caron
543
+ 155 => 8250, # single right-pointing angle quotation mark
544
+ 156 => 339, # latin small ligature oe
545
+ 158 => 382, # latin small letter z with caron
546
+ 159 => 376} # latin capital letter y with diaeresis
547
+
548
+ # http://www.w3.org/TR/REC-xml/#dt-chardata
549
+ PREDEFINED = {
550
+ 38 => '&', # ampersand
551
+ 60 => '<', # left angle bracket
552
+ 62 => '>'} # right angle bracket
553
+
554
+ # http://www.w3.org/TR/REC-xml/#charsets
555
+ VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
556
+ (0xE000..0xFFFD), (0x10000..0x10FFFF)]
557
+ end
558
+
559
+ class Fixnum
560
+ # xml escaped version of chr
561
+ def xchr
562
+ n = XChar::CP1252[self] || self
563
+ n = 42 unless XChar::VALID.find {|range| range.include? n}
564
+ XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
565
+ end
566
+ end
567
+
568
+ class String
569
+ alias :old_index :index
570
+ def to_xs
571
+ unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
572
+ rescue
573
+ unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
574
+ end
575
+ end
576
+
577
+ class BetterSGMLParserError < Exception; end;
578
+ class BetterSGMLParser < HTML::SGMLParser
579
+ # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
580
+ # This makes things work.
581
+ Interesting = /[&<]/u
582
+ Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
583
+ '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
584
+ '![^<>]*)?', 64) # 64 is the unicode flag
585
+
586
+ Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
587
+ Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
588
+
589
+ Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
590
+ Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
591
+ Endtagopen = /<\//u # Matching the Python SGMLParser
592
+ Endbracket = /[<>]/u
593
+ Declopen = /<!/u
594
+ Piopenbegin = /^<\?/u
595
+ Piclose = />/u
596
+
597
+ Commentopen = /<!--/u
598
+ Commentclose = /--\s*>/u
599
+ Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
600
+ Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
601
+ '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
602
+ 64)
603
+ Endtagfind = /\s*\/\s*>/u
604
+ def initialize(verbose=false)
605
+ super(verbose)
606
+ end
607
+ def feed(*args)
608
+ super(*args)
609
+ end
610
+
611
+ def goahead(_end)
612
+ rawdata = @rawdata # woo, utf-8 magic
613
+ i = 0
614
+ n = rawdata.length
615
+ while i < n
616
+ if @nomoretags
617
+ # handle_data_range does nothing more than set a "Range" that is never used. wtf?
618
+ handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
619
+ i = n
620
+ break
621
+ end
622
+ j = rawdata.index(Interesting, i)
623
+ j = n unless j
624
+ handle_data(rawdata[i...j]) if i < j
625
+ i = j
626
+ break if (i == n)
627
+ if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
628
+ if rawdata.index(Starttagopen,i) == i
629
+ if @literal
630
+ handle_data(rawdata[i..i])
631
+ i = i+1
632
+ next
633
+ end
634
+ k = parse_starttag(i)
635
+ break unless k
636
+ i = k
637
+ next
638
+ end
639
+ if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
640
+ k = parse_endtag(i)
641
+ break unless k
642
+ i = k
643
+ @literal = false
644
+ next
645
+ end
646
+ if @literal
647
+ if n > (i+1)
648
+ handle_data("<")
649
+ i = i+1
650
+ else
651
+ #incomplete
652
+ break
653
+ end
654
+ next
655
+ end
656
+ if rawdata.index(Commentopen,i) == i
657
+ k = parse_comment(i)
658
+ break unless k
659
+ i = k
660
+ next
661
+ end
662
+ if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
663
+ k = parse_pi(i)
664
+ break unless k
665
+ i += k
666
+ next
667
+ end
668
+ if rawdata.index(Declopen,i) == i
669
+ # This is some sort of declaration; in "HTML as
670
+ # deployed," this should only be the document type
671
+ # declaration ("<!DOCTYPE html...>").
672
+ k = parse_declaration(i)
673
+ break unless k
674
+ i = k
675
+ next
676
+ end
677
+ elsif rawdata[i..i] == '&'
678
+ if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
679
+ handle_data(rawdata[i..i])
680
+ i += 1
681
+ next
682
+ end
683
+
684
+ # the Char must come first as its #=~ method is the only one that is UTF-8 safe
685
+ ni,match = index_match(rawdata, Charref, i)
686
+ if ni and ni == i # See? Ugly
687
+ handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
688
+ i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
689
+ i -= 1 unless rawdata[i-1..i-1] == ";"
690
+ next
691
+ end
692
+ ni,match = index_match(rawdata, Entityref, i)
693
+ if ni and ni == i
694
+ handle_entityref(match[1])
695
+ i += match[0].length
696
+ i -= 1 unless rawdata[i-1..i-1] == ";"
697
+ next
698
+ end
699
+ else
700
+ error('neither < nor & ??')
701
+ end
702
+ # We get here only if incomplete matches but
703
+ # nothing else
704
+ ni,match = index_match(rawdata,Incomplete,i)
705
+ unless ni and ni == 0
706
+ handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
707
+ i += 1
708
+ next
709
+ end
710
+ j = ni + match[0].length
711
+ break if j == n # Really incomplete
712
+ handle_data(rawdata[i...j])
713
+ i = j
714
+ end # end while
715
+
716
+ if _end and i < n
717
+ handle_data(rawdata[i...n])
718
+ i = n
719
+ end
720
+
721
+ @rawdata = rawdata[i..-1]
722
+ # @offset += i # FIXME BUGME another unused variable in SGMLParser?
723
+ end
724
+
725
+
726
+ # Internal -- parse processing instr, return length or -1 if not terminated
727
+ def parse_pi(i)
728
+ rawdata = @rawdata
729
+ if rawdata[i...i+2] != '<?'
730
+ error("unexpected call to parse_pi()")
731
+ end
732
+ ni,match = index_match(rawdata,Piclose,i+2)
733
+ return nil unless match
734
+ j = ni
735
+ handle_pi(rawdata[i+2...j])
736
+ j = (j + match[0].length)
737
+ return j-i
738
+ end
739
+
740
+ def parse_comment(i)
741
+ rawdata = @rawdata
742
+ if rawdata[i...i+4] != "<!--"
743
+ error("unexpected call to parse_comment()")
744
+ end
745
+ ni,match = index_match(rawdata, Commentclose,i)
746
+ return nil unless match
747
+ handle_comment(rawdata[i+4..(ni-1)])
748
+ return ni+match[0].length # Length from i to just past the closing comment tag
749
+ end
750
+
751
+
752
+ def parse_starttag(i)
753
+ @_starttag_text = nil
754
+ start_pos = i
755
+ rawdata = @rawdata
756
+ ni,match = index_match(rawdata,Shorttagopen,i)
757
+ if ni == i
758
+ # SGML shorthand: <tag/data/ == <tag>data</tag>
759
+ # XXX Can data contain &... (entity or char refs)?
760
+ # XXX Can data contain < or > (tag characters)?
761
+ # XXX Can there be whitespace before the first /?
762
+ k,match = index_match(rawdata,Shorttag,i)
763
+ return nil unless match
764
+ tag, data = match[1], match[2]
765
+ @_starttag_text = "<#{tag}/"
766
+ tag.downcase!
767
+ second_end = rawdata.index(Shorttagopen,k)
768
+ finish_shorttag(tag, data)
769
+ @_starttag_text = rawdata[start_pos...second_end+1]
770
+ return k
771
+ end
772
+
773
+ j = rawdata.index(Endbracket, i+1)
774
+ return nil unless j
775
+ attrsd = []
776
+ if rawdata[i...i+2] == '<>'
777
+ # SGML shorthand: <> == <last open tag seen>
778
+ k = j
779
+ tag = @lasttag
780
+ else
781
+ ni,match = index_match(rawdata,Tagfind,i+1)
782
+ unless match
783
+ error('unexpected call to parse_starttag')
784
+ end
785
+ k = ni+match[0].length+1
786
+ tag = match[0].downcase
787
+ @lasttag = tag
788
+ end
789
+
790
+ while k < j
791
+ break if rawdata.index(Endtagfind, k) == k
792
+ ni,match = index_match(rawdata,Attrfind,k)
793
+ break unless ni
794
+ matched_length = match[0].length
795
+ attrname, rest, attrvalue = match[1],match[2],match[3]
796
+ if rest.nil? or rest.empty?
797
+ attrvalue = '' # was: = attrname # Why the change?
798
+ elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
799
+ attrvalue = attrvalue[1...-1]
800
+ end
801
+ attrsd << [attrname.downcase, attrvalue]
802
+ k += matched_length
803
+ end
804
+ if rawdata[j..j] == ">"
805
+ j += 1
806
+ end
807
+ @_starttag_text = rawdata[start_pos...j]
808
+ finish_starttag(tag, attrsd)
809
+ return j
810
+ end
811
+
812
+ def parse_endtag(i)
813
+ rawdata = @rawdata
814
+ j, match = index_match(rawdata, /[<>]/,i+1)
815
+ return nil unless j
816
+ tag = rawdata[i+2...j].strip.downcase
817
+ if rawdata[j..j] == ">"
818
+ j += 1
819
+ end
820
+ finish_endtag(tag)
821
+ return j
822
+ end
823
+
824
+ def output
825
+ # Return processed HTML as a single string
826
+ return @pieces.map{|p| p.to_s}.join
827
+ end
828
+
829
+ def error(message)
830
+ raise BetterSGMLParserError.new(message)
831
+ end
832
+ def handle_pi(text)
833
+ end
834
+ def handle_decl(text)
835
+ end
836
+ end
837
+
838
+ # Add some helper methods to make AttributeList (all of those damn attrs
839
+ # and attrsD used by StrictFeedParser) act more like a Hash.
840
+ # NOTE AttributeList is still Read-Only (AFAICT).
841
+ # Monkey patching is terrible, and I have an addiction.
842
+ module XML
843
+ module SAX
844
+ module AttributeList # in xml/sax.rb
845
+ def [](key)
846
+ getValue(key)
847
+ end
848
+
849
+ def each(&blk)
850
+ (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
851
+ end
852
+
853
+ def each_key(&blk)
854
+ (0...getLength).each{|pos| yield getName(pos) }
855
+ end
856
+
857
+ def each_value(&blk)
858
+ (0...getLength).each{|pos| yield getValue(pos) }
859
+ end
860
+
861
+ def to_a # Rather use collect? grep for to_a.collect
862
+ l = []
863
+ each{|k,v| l << [k,v]}
864
+ return l
865
+ end
866
+
867
+ def to_s
868
+ l = []
869
+ each{|k,v| l << "#{k} => #{v}"}
870
+ "{ "+l.join(", ")+" }"
871
+ end
872
+ end
873
+ end
874
+ end
875
+ # This adds a nice scrub method to Hpricot, so we don't need a _HTMLSanitizer class
876
+ # http://underpantsgnome.com/2007/01/20/hpricot-scrub
877
+ # I have modified it to check for attributes that are only allowed if they are in a certain tag
878
+ module Hpricot
879
+ Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
880
+ 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
881
+ 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
882
+ 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
883
+ 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
884
+ 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
885
+ 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
886
+ 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
887
+ 'ul', 'var'
888
+ ]
889
+
890
+ Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
891
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
892
+ 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
893
+ 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
894
+ 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
895
+ 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
896
+ 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
897
+ 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
898
+ 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
899
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
900
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
901
+ ]
902
+
903
+ Unacceptable_Elements_With_End_Tag = ['script', 'applet']
904
+
905
+ Acceptable_Css_Properties = ['azimuth', 'background-color',
906
+ 'border-bottom-color', 'border-collapse', 'border-color',
907
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
908
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
909
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
910
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
911
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
912
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
913
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
914
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
915
+ 'white-space', 'width'
916
+ ]
917
+
918
+ # survey of common keywords found in feeds
919
+ Acceptable_Css_Keywords = ['auto', 'aqua', 'black', 'block', 'blue',
920
+ 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
921
+ 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
922
+ 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
923
+ 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
924
+ 'transparent', 'underline', 'white', 'yellow'
925
+ ]
926
+
927
+ Mathml_Elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
928
+ 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
929
+ 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
930
+ 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
931
+ 'munderover', 'none'
932
+ ]
933
+
934
+ Mathml_Attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
935
+ 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
936
+ 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
937
+ 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
938
+ 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
939
+ 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
940
+ 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
941
+ 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
942
+ 'xlink:type', 'xmlns', 'xmlns:xlink'
943
+ ]
944
+
945
+ # svgtiny - foreignObject + linearGradient + radialGradient + stop
946
+ Svg_Elements = ['a', 'animate', 'animateColor', 'animateMotion',
947
+ 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
948
+ 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
949
+ 'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
950
+ 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
951
+ 'switch', 'text', 'title', 'use'
952
+ ]
953
+
954
+ # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
955
+ Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
956
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
957
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
958
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
959
+ 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
960
+ 'font-size', 'font-stretch', 'font-style', 'font-variant',
961
+ 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
962
+ 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
963
+ 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
964
+ 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
965
+ 'origin', 'overline-position', 'overline-thickness', 'panose-1',
966
+ 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
967
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
968
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
969
+ 'stop-color', 'stop-opacity', 'strikethrough-position',
970
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
971
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
972
+ 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
973
+ 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
974
+ 'underline-position', 'underline-thickness', 'unicode',
975
+ 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
976
+ 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
977
+ 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
978
+ 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
979
+ 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
980
+ ]
981
+
982
+ Svg_Attr_Map = nil
983
+ Svg_Elem_Map = nil
984
+
985
+ Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
986
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
987
+ 'stroke-opacity'
988
+ ]
989
+
990
+ unless $compatible
991
+ @@acceptable_tag_specific_attributes = {}
992
+ @@mathml_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@mathml_attributes }
993
+ @@svg_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@svg_attributes }
994
+ end
995
+
996
+ class Elements
997
+ def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
998
+ each { |x| x.strip(allowed_tags) }
999
+ end
1000
+
1001
+ def strip_attributes(safe=[])
1002
+ each { |x| x.strip_attributes(safe) }
1003
+ end
1004
+
1005
+ def strip_style(ok_props=[], ok_keywords=[])
1006
+ each { |x| x.strip_style(ok_props, ok_keywords) }
1007
+ end
1008
+ end
1009
+
1010
# Text nodes carry no markup or attributes, so sanitization is a no-op.
class Text
  def strip(foo)
    # nothing to strip from plain character data
  end

  def strip_attributes(foo)
    # text nodes have no attributes
  end
end
1016
# Comments are dropped elsewhere; sanitization hooks are no-ops here.
class Comment
  def strip(foo)
    # comments contain no sanitizable markup
  end

  def strip_attributes(foo)
    # comments have no attributes
  end
end
1022
# Stray end-tags parsed by Hpricot: nothing to sanitize.
class BogusETag
  def strip(foo)
    # a bogus end tag carries no content
  end

  def strip_attributes(foo)
    # and no attributes either
  end
end
1028
+
1029
# Hpricot::Elem extensions used by the HTML sanitizer.
class Elem
  # Recursively decode entities in every child node.
  def decode_entities
    children.each{ |x| x.decode_entities }
  end

  # Replace this element with its children's markup, i.e. unwrap the tag.
  def cull
    if children
      swap(children.to_s)
    end
  end

  # Unwrap this element only when it is script/css-typed (see strip_removes?).
  # NOTE(review): arity differs from Elements#strip(allowed_tags) — Elements
  # would call this with one argument; Doc#recursive_strip reportedly routes
  # around it, so this mismatch may be latent. Confirm before relying on it.
  def strip
    if strip_removes?
      cull
    end
  end

  # Delete every attribute whose name is not whitelisted in
  # Acceptable_Attributes (a constant defined elsewhere in this file).
  def strip_attributes
    unless attributes.nil?
      attributes.each do |atr|
        unless Acceptable_Attributes.include?atr[0]
          remove_attribute(atr[0])
        end
      end
    end
  end

  # True when the element declares a script/css type attribute; such
  # elements are removed outright rather than unwrapped.
  # I'm sure there are others that should be ripped instead of stripped.
  def strip_removes?
    attributes && attributes['type'] =~ /script|css/
  end
end
1061
+ end
1062
+
1063
+ module FeedParser
1064
# rfeedparser's own version string.
Version = "0.1aleph_naught"

# NOTE: """...""" is not a heredoc in Ruby; it parses as adjacent string
# literals ("" + "..." + "") and still yields the full license text.
License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""

# Authorship credits carried over from the Python original.
Author = "Jeff Hodges <http://somethingsimilar.com>"
Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
Contributors = [ "Jason Diamond <http://injektilo.org/>",
  "John Beimler <http://john.beimler.org/>",
  "Fazal Majid <http://www.majid.info/mylos/weblog/>",
  "Aaron Swartz <http://aaronsw.com/>",
  "Kevin Marks <http://epeus.blogspot.com/>"
]
1097
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
1098
+ # If you are embedding feedparser in a larger application, you should
1099
+ # change this to your application name and URL.
1100
+ USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
1101
+
1102
+ # HTTP "Accept" header to send to servers when downloading feeds. If you don't
1103
+ # want to send an Accept header, set this to None.
1104
+ ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
1105
+
1106
+
1107
+ # If you want feedparser to automatically run HTML markup through HTML Tidy, set
1108
+ # this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
1109
+ # or utidylib <http://utidylib.berlios.de/>.
1110
+ TIDY_MARKUP = false #FIXME untranslated
1111
+
1112
+ # List of Python interfaces for HTML Tidy, in order of preference. Only useful
1113
+ # if TIDY_MARKUP = true
1114
+ PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
1115
+
1116
+ # The original Python import. I'm using it to help translate
1117
+ #import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
1118
+
1119
+
1120
+
1121
# ---------- don't touch these ----------
# Internal, non-fatal parsing conditions.
# NOTE(review): these subclass Exception rather than StandardError, so a bare
# `rescue` will not catch them; left as-is because callers rescue the
# concrete classes.
class ThingsNobodyCaresAboutButMe < Exception
end
# The document was parsed with a different encoding than it declared.
class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe
end
# No usable character encoding could be determined.
class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe
end
# The HTTP Content-Type was not an XML media type.
class NonXMLContentType < ThingsNobodyCaresAboutButMe
end
# A namespace prefix was used without being declared.
class UndeclaredNamespace < Exception
end
1132
+
1133
+
1134
# Human-readable names for every feed format/version this parser can detect,
# keyed by the internal @version tags assigned while parsing.
SUPPORTED_VERSIONS = {'' => 'unknown',
  'rss090' => 'RSS 0.90',
  'rss091n' => 'RSS 0.91 (Netscape)',
  'rss091u' => 'RSS 0.91 (Userland)',
  'rss092' => 'RSS 0.92',
  'rss093' => 'RSS 0.93',
  'rss094' => 'RSS 0.94',
  'rss20' => 'RSS 2.0',
  'rss10' => 'RSS 1.0',
  'rss' => 'RSS (unknown version)',
  'atom01' => 'Atom 0.1',
  'atom02' => 'Atom 0.2',
  'atom03' => 'Atom 0.3',
  'atom10' => 'Atom 1.0',
  'atom' => 'Atom (unknown version)',
  'cdf' => 'CDF',
  'hotrss' => 'Hot RSS'
}
1152
# A Hash that understands feed-format synonyms. The same logical attribute
# ("when was this feed last updated?") has different names depending on feed
# type; @@keymap maps the alternate names onto the canonical stored key, so
# both d['modified'] and d['updated'] work no matter what kind of feed was
# parsed. #[] and #[]= consult @@keymap to find what the caller "really
# means". Attribute-style access (d.title) is provided via method_missing.
class FeedParserDict < Hash
  @@keymap = {'channel' => 'feed',
    'items' => 'entries',
    'guid' => 'id',
    'date' => 'updated',
    'date_parsed' => 'updated_parsed',
    'description' => ['subtitle', 'summary'],
    'url' => ['href'],
    'modified' => 'updated',
    'modified_parsed' => 'updated_parsed',
    'issued' => 'published',
    'issued_parsed' => 'published_parsed',
    'copyright' => 'rights',
    'copyright_detail' => 'rights_detail',
    'tagline' => 'subtitle',
    'tagline_detail' => 'subtitle_detail'}

  # Hash#entries is a built-in alias for #to_a, which would shadow the feed's
  # entry list; redefine it to return the stored 'entries' value.
  def entries
    return self['entries']
  end

  # Seed from an Array of two-element [key, value] pairs (routed through
  # #[]= so keymap translation applies) or from a Hash (merged directly).
  def initialize(pairs=nil)
    if pairs.class == Array and pairs[0].class == Array and pairs[0].length == 2
      pairs.each do |l|
        k,v = l
        self[k] = v
      end
    elsif pairs.class == Hash
      self.merge!(pairs)
    end
  end

  # Read a value, translating aliased keys via @@keymap. 'category' and
  # 'categories' are synthesized from the stored 'tags' list (and raise if
  # no tags were stored, matching the Python original).
  def [](key)
    if key == 'category'
      return self['tags'][0]['term']
    end
    if key == 'categories'
      return self['tags'].collect{|tag| [tag['scheme'],tag['term']]}
    end
    realkey = @@keymap[key] || key
    if realkey.class == Array
      realkey.each{ |k| return self[k] if has_key?(k) }
    end
    # Note that the original key is preferred over the realkey we (might
    # have) found in @@keymap
    if has_key?(key)
      return super(key)
    end
    return super(realkey)
  end

  # Write a value, translating aliased keys via @@keymap (the first entry
  # wins when a key maps to several canonical names).
  def []=(key,value)
    if @@keymap.key?key
      key = @@keymap[key]
      if key.class == Array
        key = key[0]
      end
    end
    super(key,value)
  end

  # Attribute-style access: d.title and d.title = x delegate to #[] / #[]=.
  # FIX: use [n, 1] single-character slices so behavior is identical on
  # Ruby 1.8 (where str[-1] is a Fixnum) and Ruby 1.9+ (where it is a
  # String); the old `methodname[-1] == '='` was never true on 1.8.
  def method_missing(msym, *args)
    methodname = msym.to_s
    if methodname[-1, 1] == '='
      return self[methodname[0..-2]] = args[0]
    elsif methodname[-1, 1] != '!' and methodname[-1, 1] != '?' and methodname[0, 1] != "_" # FIXME implement with private
      return self[methodname]
    else
      raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
    end
  end
end
1240
+
1241
+
1242
+
1243
+
1244
+ module FeedParserMixin
1245
# Parse state shared with the concrete parser drivers (strict/loose).
attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
1246
+
1247
# Initialize all per-parse state; called once before parsing begins.
# [baseuri]  the feed's URL, used to resolve relative references
# [baselang] default language for the feed (stored as feed['language'])
# [encoding] character encoding of the document, defaults to 'utf-8'
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
  $stderr << "initializing FeedParser\n" if $debug

  # Known namespace URI -> internal element-name prefix. '' means the
  # namespace's elements are treated as core feed elements.
  @namespaces = {'' => '',
    'http://backend.userland.com/rss' => '',
    'http://blogs.law.harvard.edu/tech/rss' => '',
    'http://purl.org/rss/1.0/' => '',
    'http://my.netscape.com/rdf/simple/0.9/' => '',
    'http://example.com/newformat#' => '',
    'http://example.com/necho' => '',
    'http://purl.org/echo/' => '',
    'uri/of/echo/namespace#' => '',
    'http://purl.org/pie/' => '',
    'http://purl.org/atom/ns#' => '',
    'http://www.w3.org/2005/Atom' => '',
    'http://purl.org/rss/1.0/modules/rss091#' => '',
    'http://webns.net/mvcb/' => 'admin',
    'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
    'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
    'http://media.tangent.org/rss/1.0/' => 'audio',
    'http://backend.userland.com/blogChannelModule' => 'blogChannel',
    'http://web.resource.org/cc/' => 'cc',
    'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    'http://purl.org/rss/1.0/modules/company' => 'co',
    'http://purl.org/rss/1.0/modules/content/' => 'content',
    'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
    'http://purl.org/dc/elements/1.1/' => 'dc',
    'http://purl.org/dc/terms/' => 'dcterms',
    'http://purl.org/rss/1.0/modules/email/' => 'email',
    'http://purl.org/rss/1.0/modules/event/' => 'ev',
    'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
    'http://freshmeat.net/rss/fm/' => 'fm',
    'http://xmlns.com/foaf/0.1/' => 'foaf',
    'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
    'http://postneo.com/icbm/' => 'icbm',
    'http://purl.org/rss/1.0/modules/image/' => 'image',
    'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
    'http://purl.org/rss/1.0/modules/link/' => 'l',
    'http://search.yahoo.com/mrss' => 'media',
    'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
    'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
    'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
    'http://purl.org/rss/1.0/modules/reference/' => 'ref',
    'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
    'http://purl.org/rss/1.0/modules/search/' => 'search',
    'http://purl.org/rss/1.0/modules/slash/' => 'slash',
    'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
    'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
    'http://hacks.benhammersley.com/rss/streaming/' => 'str',
    'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
    'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
    'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
    'http://purl.org/rss/1.0/modules/threading/' => 'thr',
    'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
    'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
    'http://wellformedweb.org/commentAPI/' => 'wfw',
    'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
    'http://www.w3.org/1999/xhtml' => 'xhtml',
    'http://www.w3.org/XML/1998/namespace' => 'xml',
    'http://www.w3.org/1999/xlink' => 'xlink',
    'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
  }
  # Same table keyed by lowercased URI, for case-insensitive matching in
  # trackNamespace.
  @matchnamespaces = {}
  @namespaces.each do |l|
    @matchnamespaces[l[0].downcase] = l[1]
  end
  # Elements whose text is itself a (possibly relative) URI.
  @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
  # Elements whose embedded markup may contain relative URIs to resolve.
  @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  # Elements whose embedded markup must be run through the HTML sanitizer.
  @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
  @html_types = ['text/html', 'application/xhtml+xml']
  @feeddata = FeedParserDict.new # feed-level data
  @encoding = encoding # character encoding
  @entries = [] # list of entry-level data
  @version = '' # feed type/version, see SUPPORTED_VERSIONS
  @namespacesInUse = {} # hash of namespaces defined by the feed

  # the following are used internally to track state;
  # this is really out of control and should be refactored
  @infeed = false
  @inentry = false
  @incontent = 0 # Yes, this needs to be zero until I work out popContent and pushContent
  @intextinput = false
  @inimage = false
  @inauthor = false
  @incontributor = false
  @inpublisher = false
  @insource = false
  @sourcedata = FeedParserDict.new
  @contentparams = FeedParserDict.new
  @summaryKey = nil
  @namespacemap = {}
  @elementstack = []
  @basestack = []
  @langstack = []
  @baseuri = baseuri || ''
  @lang = baselang || nil
  if baselang
    @feeddata['language'] = baselang.gsub('_','-')
  end
  # Date-parsing strategies tried in order by the date handlers.
  @date_handlers = [:_parse_date_rfc822,
    :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
    :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
  ]
  $stderr << "Leaving startup\n" if $debug # My addition
end
1354
+
1355
# Entry point for every start tag the underlying parser does not handle
# itself: normalizes attributes, tracks xml:base / xml:lang / namespace
# declarations, reconstructs inline XHTML content verbatim, then dispatches
# to a _start_<element> handler (or push as the default).
def unknown_starttag(tag, attrsd)
  $stderr << "start #{tag} with #{attrsd}\n" if $debug
  # normalize attrs
  attrsD = {}
  attrsd = Hash[*attrsd.flatten] if attrsd.class == Array # Magic! Asterisk!
  # LooseFeedParser needs the above because SGMLParser sends attrs as a
  # list of lists (like [['type','text/html'],['mode','escaped']])

  attrsd.each do |old_k,value|
    k = old_k.downcase # Downcase all keys
    attrsD[k] = value
    # FIX: the KEY decides whether the value is case-insensitive; the old
    # code tested ['rel','type'].include?(value) and so never fired.
    if ['rel','type'].include?(k)
      attrsD[k].downcase! # Downcase the value if the key is 'rel' or 'type'
    end
  end

  # track xml:base and xml:lang
  baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
  @baseuri = urljoin(@baseuri, baseuri)
  lang = attrsD['xml:lang'] || attrsD['lang']
  if lang == '' # FIXME This next bit of code is right? Wtf?
    # xml:lang could be explicitly set to '', we need to capture that
    lang = nil
  elsif lang.nil?
    # if no xml:lang is specified, use parent lang
    lang = @lang
  end
  if lang and not lang.empty? # Seriously, this cannot be correct
    if ['feed', 'rss', 'rdf:RDF'].include?tag
      @feeddata['language'] = lang.gsub('_','-')
    end
  end
  @lang = lang
  @basestack << @baseuri
  @langstack << lang

  # track namespaces
  attrsd.each do |prefix, uri|
    if /^xmlns:/ =~ prefix # prefix begins with xmlns:
      trackNamespace(prefix[6..-1], uri)
    elsif prefix == 'xmlns' # FIX: dropped Ruby-1.8-only trailing-colon (then-colon) syntax, a syntax error on 1.9+
      trackNamespace(nil, uri)
    end
  end

  # track inline content
  if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
    # element declared itself as escaped markup, but isn't really

    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
    # Note: probably shouldn't simply recreate localname here, but
    # our namespace handling isn't actually 100% correct in cases where
    # the feed redefines the default namespace (which is actually
    # the usual case for inline content, thanks Sam), so here we
    # cheat and just reconstruct the element based on localname
    # because that compensates for the bugs in our namespace handling.
    # This will horribly munge inline content with non-empty qnames,
    # but nobody actually does that, so I'm not fixing it.
    tag = tag.split(':')[-1]
    attrsA = attrsd.to_a.collect{|l| "#{l[0]}=\"#{l[1]}\""}
    attrsS = ' '+attrsA.join(' ')
    return handle_data("<#{tag}#{attrsS}>", escape=false)
  end

  # match namespaces
  if /:/ =~ tag
    prefix, suffix = tag.split(':', 2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix and not prefix.empty?
    prefix = prefix + '_'
  end

  # special hack for better tracking of empty textinput/image elements in illformed feeds
  # FIX: mirror the @inimage check just below — the old
  # `(not prefix and not prefix.empty?)` was always false, so @intextinput
  # was never reset.
  if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description','name'].include?tag)
    @intextinput = false
  end
  if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?tag)
    @inimage = false
  end

  # call special handler (if defined) or default handler
  begin
    return send('_start_'+prefix+suffix, attrsD)
  rescue NoMethodError
    return push(prefix + suffix, true)
  end
end # End unknown_starttag
1448
+
1449
# Entry point for every end tag: resolves the namespace prefix, dispatches
# to a _end_<element> handler (or pop as the default), emits reconstructed
# inline XHTML end tags, and unwinds the xml:base / xml:lang stacks.
def unknown_endtag(tag)
  $stderr << "end #{tag}\n" if $debug
  # match namespaces
  if tag.index(':')
    prefix, suffix = tag.split(':',2)
  else
    prefix, suffix = '', tag
  end
  prefix = @namespacemap[prefix] || prefix
  if prefix and not prefix.empty?
    prefix = prefix + '_'
  end

  # call special handler (if defined) or default handler
  begin
    send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
  rescue NoMethodError => details
    pop(prefix + suffix)
  end

  # track inline content
  if @incontent != 0 and @contentparams.has_key?'type' and /xml$/ =~ (@contentparams['type'] || 'xml')
    # element declared itself as escaped markup, but it isn't really
    @contentparams['type'] = 'application/xhtml+xml'
  end
  if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
    # re-emit the end tag for inline XHTML content, localname only
    tag = tag.split(':')[-1]
    handle_data("</#{tag}>", escape=false)
  end

  # track xml:base and xml:lang going out of scope
  if @basestack and not @basestack.empty?
    @basestack.pop
    if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
      @baseuri = @basestack[-1]
    end
  end
  if @langstack and not @langstack.empty?
    @langstack.pop
    if @langstack and not @langstack.empty? # and @langstack[-1] and not @langstack.empty?
      @lang = @langstack[-1]
    end
  end
end
1493
+
1494
# LooseParserOnly
# Called for each character reference, e.g. for '&#160;' ref will be '160'.
# Markup-significant characters (" & ' < >) stay escaped so they survive
# later sanitization; everything else is decoded to its character.
# NOTE(review): ref.downcase! mutates the caller's string in place.
def handle_charref(ref)
  $stderr << "entering handle_charref with #{ref}\n" if $debug
  return if @elementstack.nil? or @elementstack.empty?
  ref.downcase!
  # decimal and hex references to ", &, ', < and >
  chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
  if chars.include?ref
    text = "&##{ref};"
  else
    if ref[0..0] == 'x'
      c = (ref[1..-1]).to_i(16) # hexadecimal reference
    else
      c = ref.to_i
    end
    # uconvert/unichr are helpers defined elsewhere in this file
    text = uconvert(unichr(c),'unicode')
  end
  # append to the text pieces of the element currently being built
  @elementstack[-1][2] << text
end
1513
+
1514
# LooseParserOnly
# Called for each entity reference, e.g. for '&copy;' ref will be 'copy'.
# The five XML-core entities stay escaped; anything else is decoded.
def handle_entityref(ref)
  return if @elementstack.nil? or @elementstack.empty?
  $stderr << "entering handle_entityref with #{ref}\n" if $debug
  ents = ['lt', 'gt', 'quot', 'amp', 'apos']
  if ents.include?ref
    text = "&#{ref};"
  else
    # NOTE(review): htmlentities >= 4.0 exposes HTMLEntities.new.decode;
    # HTMLEntities::decode_entities assumes an older module-function API —
    # verify against the pinned gem version.
    text = HTMLEntities::decode_entities("&#{ref};")
  end
  @elementstack[-1][2] << text
end
1528
+
1529
# Called for each block of plain text, i.e. outside of any tag and with no
# character/entity references in it. The text is appended to the element
# currently being built; inside inline XHTML content it is re-escaped
# (unless the caller asks otherwise).
def handle_data(text, escape=true)
  return if @elementstack.nil? or @elementstack.empty?
  text = text.to_xs if escape and @contentparams['type'] == 'application/xhtml+xml'
  @elementstack[-1][2] << text
end
1538
+
1539
# Comments, processing instructions and markup declarations carry no feed
# data; all three callbacks deliberately ignore their input.
def handle_comment(comment)
  # called for each comment, e.g. <!-- insert message here -->
end

def handle_pi(text)
  # processing instruction, e.g. <?xml-stylesheet ... ?>
end

def handle_decl(text)
  # markup declaration, e.g. <!DOCTYPE ...>
end
1548
+
1549
# LooseFeedParser hook for <!...> constructs in @rawdata starting at index
# +i+. CDATA sections are emitted as escaped character data; any other
# declaration is skipped. Returns the index just past the construct.
def parse_declaration(i)
  $stderr << "entering parse_declaration\n" if $debug
  if @rawdata[i...i+9] == '<![CDATA['
    stop = @rawdata.index(/\]\]>/u, i+9) || @rawdata.length
    handle_data(@rawdata[i+9...stop].to_xs, false)
    stop + 3
  else
    # an unterminated declaration yields nil.to_i == 0, i.e. position 1 —
    # behavior kept as-is
    @rawdata.index(/>/, i).to_i + 1
  end
end
1562
+
1563
# Expand feedparser's shorthand content types ('text', 'html', 'xhtml')
# into full MIME types; anything else passes through lowercased.
# FIX: use non-destructive downcase — the old downcase! mutated the
# caller's string in place (and would raise on a frozen literal).
def mapContentType(contentType)
  contentType = contentType.downcase
  case contentType
  when 'text'
    contentType = 'text/plain'
  when 'html'
    contentType = 'text/html'
  when 'xhtml'
    contentType = 'application/xhtml+xml'
  end
  return contentType
end
1575
+
1576
# Register a namespace declaration (+prefix+ may be nil for the default
# namespace). Recognized feed namespaces fix @version on first sight; any
# backend.userland.com URI is collapsed onto the canonical one. Known URIs
# are mapped through @matchnamespaces, everything else is recorded verbatim.
def trackNamespace(prefix, uri)
  loweruri = uri.downcase.strip
  versionless = @version.nil? || @version.empty?
  if versionless and prefix.nil? and loweruri == 'http://my.netscape.com/rdf/simple/0.9/'
    @version = 'rss090'
  elsif versionless and loweruri == 'http://purl.org/rss/1.0/'
    @version = 'rss10'
  elsif versionless and loweruri == 'http://www.w3.org/2005/atom'
    @version = 'atom10'
  elsif loweruri =~ /backend\.userland\.com\/rss/
    # match any backend.userland.com namespace
    uri = 'http://backend.userland.com/rss'
    loweruri = uri
  end
  if @matchnamespaces.has_key?(loweruri)
    @namespacemap[prefix] = @matchnamespaces[loweruri]
    @namespacesInUse[@matchnamespaces[loweruri]] = uri
  else
    @namespacesInUse[prefix || ''] = uri
  end
end
1597
+
1598
# Resolve +uri+ against the current xml:base.
def resolveURI(uri)
  urljoin(@baseuri || '', uri)
end

# Hook for subclasses; the strict parser receives entities pre-decoded,
# so the default is the identity.
def decodeEntities(element, data)
  data
end

# Open a new element on the stack: [name, expectingText, pieces-of-text].
def push(element, expectingText)
  @elementstack << [element, expectingText, []]
end
1609
+
1610
# Close the element on top of @elementstack (if it matches +element+) and
# post-process its accumulated text: base64-decode, resolve relative URIs,
# decode entities, sanitize embedded HTML, recode to UTF-8, then store the
# value on the current entry or feed context. Returns the processed text.
def pop(element, stripWhitespace=true)
  return if @elementstack.nil? or @elementstack.empty?
  return if @elementstack[-1][0] != element
  element, expectingText, pieces = @elementstack.pop
  if pieces.class == Array
    output = pieces.join('')
  else
    output = pieces
  end
  if stripWhitespace
    output.strip!
  end
  # elements that were not expecting text skip all post-processing
  return output if not expectingText

  # decode base64 content
  if @contentparams['base64']
    out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
    if not output.empty? and not out64.empty?
      output = out64
    end
  end

  # resolve relative URIs
  if @can_be_relative_uri.include?element and output and not output.empty?
    output = resolveURI(output)
  end

  # decode entities within embedded markup
  if not @contentparams['base64']
    output = decodeEntities(element, output)
  end

  # remove temporary cruft from contentparams
  @contentparams.delete('mode')
  @contentparams.delete('base64')

  # resolve relative URIs within embedded markup
  if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
    if @can_contain_relative_uris.include?element
      output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
    end
  end
  # sanitize embedded markup
  if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
    if @can_contain_dangerous_markup.include?element
      output = FeedParser.sanitizeHTML(output, @encoding)
    end
  end

  if @encoding and not @encoding.empty? and @encoding != 'utf-8'
    output = uconvert(output, @encoding, 'utf-8')
    # FIXME I turn everything into utf-8, not unicode, originally because REXML was being used but now because I haven't tested it out yet.
  end

  # categories/tags/keywords/whatever are handled in _end_category
  return output if element == 'category'

  # store output in appropriate place(s)
  if @inentry and not @insource
    if element == 'content'
      # entries may have several content blocks; keep each with its params
      @entries[-1][element] ||= []
      contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
      contentparams['value'] = output
      @entries[-1][element] << contentparams
    elsif element == 'link'
      @entries[-1][element] = output
      if output and not output.empty?
        @entries[-1]['links'][-1]['href'] = output
      end
    else
      element = 'summary' if element == 'description'
      @entries[-1][element] = output
      if @incontent != 0
        contentparams = Marshal.load(Marshal.dump(@contentparams))
        contentparams['value'] = output
        @entries[-1][element + '_detail'] = contentparams
      end
    end
  elsif (@infeed or @insource) and not @intextinput and not @inimage
    context = getContext()
    element = 'subtitle' if element == 'description'
    context[element] = output
    if element == 'link'
      context['links'][-1]['href'] = output
    elsif @incontent != 0
      contentparams = Marshal.load(Marshal.dump(@contentparams))
      contentparams['value'] = output
      context[element + '_detail'] = contentparams
    end
  end
  return output
end
1702
+
1703
# Begin an inline-content element: bump the nesting counter, capture the
# (mapped) content type, language and base, and record whether the payload
# is base64-encoded.
def pushContent(tag, attrsD, defaultContentType, expectingText)
  @incontent += 1 # Yes, I hate this.
  type = mapContentType(attrsD['type'] || defaultContentType)
  @contentparams = FeedParserDict.new({'type' => type,
                                       'language' => @lang,
                                       'base' => @baseuri})
  @contentparams['base64'] = isBase64(attrsD, @contentparams)
  push(tag, expectingText)
end

# Close an inline-content element, returning its accumulated value and
# clearing the per-content bookkeeping.
def popContent(tag)
  value = pop(tag)
  @incontent -= 1
  @contentparams.clear
  return value
end
1717
+
1718
# Rewrite a qualified name so its prefix is the canonical one recorded in
# @namespacemap (e.g. 'media:content' -> 'm:content'); unqualified names
# pass through unchanged.
def mapToStandardPrefix(name)
  sep = name.index(':')
  return name unless sep
  qualifier = name[0..sep-1]
  rest = name[sep+1..-1]
  qualifier = @namespacemap[qualifier] || qualifier
  qualifier + ':' + rest
end

# Fetch an attribute by its standard-prefixed name.
def getAttribute(attrsD, name)
  attrsD[mapToStandardPrefix(name)]
end
1732
+
1733
# Content is assumed base64-encoded unless an explicit mode says so or the
# content type marks it as textual/XML-ish (text/*, *+xml, */xml).
def isBase64(attrsD, contentparams)
  return true if attrsD['mode'] == 'base64'
  type = contentparams['type']
  if %r{(^text/)|(\+xml$)|(/xml$)} =~ type
    false
  else
    true
  end
end
1740
+
1741
# Collapse 'url'/'uri'/'href' (checked in that priority order) onto a
# single 'href' key, mutating and returning +attrsD+.
def itsAnHrefDamnIt(attrsD)
  href = attrsD['url'] || attrsD['uri'] || attrsD['href']
  return attrsD unless href
  attrsD.delete('url')
  attrsD.delete('uri')
  attrsD['href'] = href
  attrsD
end
1750
+
1751
+
1752
# Record +value+ under +key+ in the current context unless already present.
def _save(key, value)
  getContext()[key] ||= value
end
1756
+
1757
# Enter RSS parsing: map the declared version attribute onto our internal
# version tag (only if no version has been detected yet).
def _start_rss(attrsD)
  versionmap = {'0.91' => 'rss091u',
                '0.92' => 'rss092',
                '0.93' => 'rss093',
                '0.94' => 'rss094'}

  return unless @version.nil? or @version.empty?
  declared = attrsD['version'] || ''
  known = versionmap[declared]
  @version = if known and not known.empty?
               known
             elsif declared =~ /^2\./
               'rss20'
             else
               'rss'
             end
end
1776
+
1777
# Mark the feed as Hot RSS.
def _start_dlhottitles(attrsD)
  @version = 'hotrss'
end

# Enter feed-level parsing (RSS <channel> / CDF <feedinfo>).
def _start_channel(attrsD)
  @infeed = true
  _cdf_common(attrsD)
end
alias :_start_feedinfo :_start_channel

# CDF channels carry their modification date and link as attributes;
# replay each one through the normal start/end element handlers.
def _cdf_common(attrsD)
  if attrsD.has_key?('lastmod')
    _start_modified({})
    @elementstack[-1][-1] = attrsD['lastmod']
    _end_modified
  end
  if attrsD.has_key?('href')
    _start_link({})
    @elementstack[-1][-1] = attrsD['href']
    _end_link
  end
end
1799
+
1800
# Enter feed-level parsing for an Atom <feed> element, sniffing the Atom
# version from the version attribute when we don't already know it.
def _start_feed(attrsD)
  @infeed = true
  versionmap = {'0.1' => 'atom01',
    '0.2' => 'atom02',
    '0.3' => 'atom03'
  }

  if not @version or @version.empty?
    attr_version = attrsD['version']
    version = versionmap[attr_version]
    # FIX: test the freshly looked-up local +version+; the old code tested
    # @version (known to be empty inside this branch), so the versionmap
    # result was never applied and every Atom feed was tagged plain 'atom'.
    if version and not version.empty?
      @version = version
    else
      @version = 'atom'
    end
  end
end
1817
+
1818
# Leave feed-level parsing.
def _end_channel
  @infeed = false
end
alias :_end_feed :_end_channel
1822
+
1823
# <image>: collect sub-elements into context['image'] instead of the feed proper.
def _start_image(attrsD)
  @inimage = true
  push('image', false)
  getContext()['image'] ||= FeedParserDict.new
end

def _end_image
  pop('image')
  @inimage = false
end

# <textinput>/<textInput>: same idea as image, under context['textinput'].
def _start_textinput(attrsD)
  @intextinput = true
  push('textinput', false)
  getContext()['textinput'] ||= FeedParserDict.new
end
alias :_start_textInput :_start_textinput

def _end_textinput
  pop('textinput')
  @intextinput = false
end
alias :_end_textInput :_end_textinput
1848
+
1849
# Author handling: <author>, <managingEditor>, <dc:author>, <dc:creator>
# and <itunes:author> all funnel through the same pair of handlers.
def _start_author(attrsD)
  push('author', true)
  @inauthor = true
end
alias :_start_managingeditor :_start_author
alias :_start_dc_author :_start_author
alias :_start_dc_creator :_start_author
alias :_start_itunes_author :_start_author

def _end_author
  pop('author')
  @inauthor = false
  _sync_author_detail()
end
alias :_end_managingeditor :_end_author
alias :_end_dc_author :_end_author
alias :_end_dc_creator :_end_author
alias :_end_itunes_author :_end_author

# <itunes:owner> is recorded as the feed's publisher.
def _start_itunes_owner(attrsD)
  push('publisher', false)
  @inpublisher = true
end

def _end_itunes_owner
  pop('publisher')
  @inpublisher = false
  _sync_author_detail('publisher')
end
1878
+
1879
# Contributors accumulate in context['contributors']; each opening tag
# appends a fresh slot that the nested name/url/email handlers fill in.
def _start_contributor(attrsD)
  @incontributor = true
  ctx = getContext()
  (ctx['contributors'] ||= []) << FeedParserDict.new
  push('contributor', false)
end

def _end_contributor
  pop('contributor')
  @incontributor = false
end

# <dc:contributor> carries only a name, so it is tracked via the 'name'
# element instead of 'contributor'.
def _start_dc_contributor(attrsD)
  @incontributor = true
  ctx = getContext()
  (ctx['contributors'] ||= []) << FeedParserDict.new
  push('name', false)
end

def _end_dc_contributor
  _end_name
  @incontributor = false
end
1904
+
1905
# <name>/<itunes:name> appears under publishers, authors, contributors and
# textinputs; route the popped value to whichever construct is open
# (publisher takes precedence, then author, contributor, textinput).
def _start_name(attrsD)
  push('name', false)
end
alias :_start_itunes_name :_start_name

def _end_name
  value = pop('name')
  if @inpublisher
    _save_author('name', value, 'publisher')
  elsif @inauthor
    _save_author('name', value)
  elsif @incontributor
    _save_contributor('name', value)
  elsif @intextinput
    getContext()['textinput']['name'] = value
  end
end
alias :_end_itunes_name :_end_name
1924
+
1925
+ def _start_width(attrsD)
1926
+ push('width', false)
1927
+ end
1928
+
1929
+ def _end_width
1930
+ value = pop('width').to_i
1931
+ if @inimage
1932
+ context = getContext
1933
+ context['image']['width'] = value
1934
+ end
1935
+ end
1936
+
1937
+ def _start_height(attrsD)
1938
+ push('height', false)
1939
+ end
1940
+
1941
+ def _end_height
1942
+ value = pop('height').to_i
1943
+ if @inimage
1944
+ context = getContext()
1945
+ context['image']['height'] = value
1946
+ end
1947
+ end
1948
+
1949
+ def _start_url(attrsD)
1950
+ push('href', true)
1951
+ end
1952
+ alias :_start_homepage :_start_url
1953
+ alias :_start_uri :_start_url
1954
+
1955
+ def _end_url
1956
+ value = pop('href')
1957
+ if @inauthor
1958
+ _save_author('href', value)
1959
+ elsif @incontributor
1960
+ _save_contributor('href', value)
1961
+ elsif @inimage
1962
+ context = getContext()
1963
+ context['image']['href'] = value
1964
+ elsif @intextinput
1965
+ context = getContext()
1966
+ context['textinput']['link'] = value
1967
+ end
1968
+ end
1969
+ alias :_end_homepage :_end_url
1970
+ alias :_end_uri :_end_url
1971
+
1972
+ def _start_email(attrsD)
1973
+ push('email', false)
1974
+ end
1975
+ alias :_start_itunes_email :_start_email
1976
+
1977
+ def _end_email
1978
+ value = pop('email')
1979
+ if @inpublisher
1980
+ _save_author('email', value, 'publisher')
1981
+ elsif @inauthor
1982
+ _save_author('email', value)
1983
+ elsif @incontributor
1984
+ _save_contributor('email', value)
1985
+ end
1986
+ end
1987
+ alias :_end_itunes_email :_end_email
1988
+
1989
+ def getContext
1990
+ if @insource
1991
+ context = @sourcedata
1992
+ elsif @inentry
1993
+ context = @entries[-1]
1994
+ else
1995
+ context = @feeddata
1996
+ end
1997
+ return context
1998
+ end
1999
+
2000
+ def _save_author(key, value, prefix='author')
2001
+ context = getContext()
2002
+ context[prefix + '_detail'] ||= FeedParserDict.new
2003
+ context[prefix + '_detail'][key] = value
2004
+ _sync_author_detail()
2005
+ end
2006
+
2007
+ def _save_contributor(key, value)
2008
+ context = getContext
2009
+ context['contributors'] ||= [FeedParserDict.new]
2010
+ context['contributors'][-1][key] = value
2011
+ end
2012
+
2013
+ def _sync_author_detail(key='author')
2014
+ context = getContext()
2015
+ detail = context["#{key}_detail"]
2016
+ if detail and not detail.empty?
2017
+ name = detail['name']
2018
+ email = detail['email']
2019
+
2020
+ if name and email and not (name.empty? or name.empty?)
2021
+ context[key] = "#{name} (#{email})"
2022
+ elsif name and not name.empty?
2023
+ context[key] = name
2024
+ elsif email and not email.empty?
2025
+ context[key] = email
2026
+ end
2027
+ else
2028
+ author = context[key].dup unless context[key].nil?
2029
+ return if not author or author.empty?
2030
+ emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
2031
+ email = emailmatch[1]
2032
+ author.gsub!(email, '')
2033
+ author.gsub!("\(\)", '')
2034
+ author.strip!
2035
+ author.gsub!(/^\(/,'')
2036
+ author.gsub!(/\)$/,'')
2037
+ author.strip!
2038
+ context["#{key}_detail"] ||= FeedParserDict.new
2039
+ context["#{key}_detail"]['name'] = author
2040
+ context["#{key}_detail"]['email'] = email
2041
+ end
2042
+ end
2043
+
2044
+ def _start_subtitle(attrsD)
2045
+ pushContent('subtitle', attrsD, 'text/plain', true)
2046
+ end
2047
+ alias :_start_tagline :_start_subtitle
2048
+ alias :_start_itunes_subtitle :_start_subtitle
2049
+
2050
+ def _end_subtitle
2051
+ popContent('subtitle')
2052
+ end
2053
+ alias :_end_tagline :_end_subtitle
2054
+ alias :_end_itunes_subtitle :_end_subtitle
2055
+
2056
+ def _start_rights(attrsD)
2057
+ pushContent('rights', attrsD, 'text/plain', true)
2058
+ end
2059
+ alias :_start_dc_rights :_start_rights
2060
+ alias :_start_copyright :_start_rights
2061
+
2062
+ def _end_rights
2063
+ popContent('rights')
2064
+ end
2065
+ alias :_end_dc_rights :_end_rights
2066
+ alias :_end_copyright :_end_rights
2067
+
2068
+ def _start_item(attrsD)
2069
+ @entries << FeedParserDict.new
2070
+ push('item', false)
2071
+ @inentry = true
2072
+ @guidislink = false
2073
+ id = getAttribute(attrsD, 'rdf:about')
2074
+ if id and not id.empty?
2075
+ context = getContext()
2076
+ context['id'] = id
2077
+ end
2078
+ _cdf_common(attrsD)
2079
+ end
2080
+ alias :_start_entry :_start_item
2081
+ alias :_start_product :_start_item
2082
+
2083
+ def _end_item
2084
+ pop('item')
2085
+ @inentry = false
2086
+ end
2087
+ alias :_end_entry :_end_item
2088
+
2089
+ def _start_dc_language(attrsD)
2090
+ push('language', true)
2091
+ end
2092
+ alias :_start_language :_start_dc_language
2093
+
2094
+ def _end_dc_language
2095
+ @lang = pop('language')
2096
+ end
2097
+ alias :_end_language :_end_dc_language
2098
+
2099
+ def _start_dc_publisher(attrsD)
2100
+ push('publisher', true)
2101
+ end
2102
+ alias :_start_webmaster :_start_dc_publisher
2103
+
2104
+ def _end_dc_publisher
2105
+ pop('publisher')
2106
+ _sync_author_detail('publisher')
2107
+ end
2108
+ alias :_end_webmaster :_end_dc_publisher
2109
+
2110
+ def _start_published(attrsD)
2111
+ push('published', true)
2112
+ end
2113
+ alias :_start_dcterms_issued :_start_published
2114
+ alias :_start_issued :_start_published
2115
+
2116
+ def _end_published
2117
+ value = pop('published')
2118
+ _save('published_parsed', parse_date(value))
2119
+ end
2120
+ alias :_end_dcterms_issued :_end_published
2121
+ alias :_end_issued :_end_published
2122
+
2123
+ def _start_updated(attrsD)
2124
+ push('updated', true)
2125
+ end
2126
+ alias :_start_modified :_start_updated
2127
+ alias :_start_dcterms_modified :_start_updated
2128
+ alias :_start_pubdate :_start_updated
2129
+ alias :_start_dc_date :_start_updated
2130
+
2131
+ def _end_updated
2132
+ value = pop('updated')
2133
+ _save('updated_parsed', parse_date(value))
2134
+ end
2135
+ alias :_end_modified :_end_updated
2136
+ alias :_end_dcterms_modified :_end_updated
2137
+ alias :_end_pubdate :_end_updated
2138
+ alias :_end_dc_date :_end_updated
2139
+
2140
+ def _start_created(attrsD)
2141
+ push('created', true)
2142
+ end
2143
+ alias :_start_dcterms_created :_start_created
2144
+
2145
+ def _end_created
2146
+ value = pop('created')
2147
+ _save('created_parsed', parse_date(value))
2148
+ end
2149
+ alias :_end_dcterms_created :_end_created
2150
+
2151
+ def _start_expirationdate(attrsD)
2152
+ push('expired', true)
2153
+ end
2154
+ def _end_expirationdate
2155
+ _save('expired_parsed', parse_date(pop('expired')))
2156
+ end
2157
+
2158
+ def _start_cc_license(attrsD)
2159
+ push('license', true)
2160
+ value = getAttribute(attrsD, 'rdf:resource')
2161
+ if value and not value.empty?
2162
+ elementstack[-1][2] << value
2163
+ pop('license')
2164
+ end
2165
+ end
2166
+
2167
+ def _start_creativecommons_license(attrsD)
2168
+ push('license', true)
2169
+ end
2170
+
2171
+ def _end_creativecommons_license
2172
+ pop('license')
2173
+ end
2174
+
2175
+ def addTag(term, scheme, label)
2176
+ context = getContext()
2177
+ context['tags'] ||= []
2178
+ tags = context['tags']
2179
+ if (term.nil? or term.empty?) and (scheme.nil? or scheme.empty?) and (label.nil? or label.empty?)
2180
+ return
2181
+ end
2182
+ value = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
2183
+ if not tags.include?value
2184
+ context['tags'] << FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
2185
+ end
2186
+ end
2187
+
2188
+ def _start_category(attrsD)
2189
+ $stderr << "entering _start_category with #{attrsD}\n" if $debug
2190
+
2191
+ term = attrsD['term']
2192
+ scheme = attrsD['scheme'] || attrsD['domain']
2193
+ label = attrsD['label']
2194
+ addTag(term, scheme, label)
2195
+ push('category', true)
2196
+ end
2197
+ alias :_start_dc_subject :_start_category
2198
+ alias :_start_keywords :_start_category
2199
+
2200
+ def _end_itunes_keywords
2201
+ pop('itunes_keywords').split.each do |term|
2202
+ addTag(term, 'http://www.itunes.com/', nil)
2203
+ end
2204
+ end
2205
+
2206
+ def _start_itunes_category(attrsD)
2207
+ addTag(attrsD['text'], 'http://www.itunes.com/', nil)
2208
+ push('category', true)
2209
+ end
2210
+
2211
+ def _end_category
2212
+ value = pop('category')
2213
+ return if value.nil? or value.empty?
2214
+ context = getContext()
2215
+ tags = context['tags']
2216
+ if value and not value.empty? and not tags.empty? and not tags[-1]['term']:
2217
+ tags[-1]['term'] = value
2218
+ else
2219
+ addTag(value, nil, nil)
2220
+ end
2221
+ end
2222
+ alias :_end_dc_subject :_end_category
2223
+ alias :_end_keywords :_end_category
2224
+ alias :_end_itunes_category :_end_category
2225
+
2226
+ def _start_cloud(attrsD)
2227
+ getContext()['cloud'] = FeedParserDict.new(attrsD)
2228
+ end
2229
+
2230
+ def _start_link(attrsD)
2231
+ attrsD['rel'] ||= 'alternate'
2232
+ attrsD['type'] ||= 'text/html'
2233
+ attrsD = itsAnHrefDamnIt(attrsD)
2234
+ if attrsD.has_key? 'href'
2235
+ attrsD['href'] = resolveURI(attrsD['href'])
2236
+ end
2237
+ expectingText = @infeed || @inentry || @insource
2238
+ context = getContext()
2239
+ context['links'] ||= []
2240
+ context['links'] << FeedParserDict.new(attrsD)
2241
+ if attrsD['rel'] == 'enclosure'
2242
+ _start_enclosure(attrsD)
2243
+ end
2244
+ if attrsD.has_key? 'href'
2245
+ expectingText = false
2246
+ if (attrsD['rel'] == 'alternate') and @html_types.include?mapContentType(attrsD['type'])
2247
+ context['link'] = attrsD['href']
2248
+ end
2249
+ else
2250
+ push('link', expectingText)
2251
+ end
2252
+ end
2253
+ alias :_start_producturl :_start_link
2254
+
2255
+ def _end_link
2256
+ value = pop('link')
2257
+ context = getContext()
2258
+ if @intextinput
2259
+ context['textinput']['link'] = value
2260
+ end
2261
+ if @inimage
2262
+ context['image']['link'] = value
2263
+ end
2264
+ end
2265
+ alias :_end_producturl :_end_link
2266
+
2267
+ def _start_guid(attrsD)
2268
+ @guidislink = ((attrsD['ispermalink'] || 'true') == 'true')
2269
+ push('id', true)
2270
+ end
2271
+
2272
+ def _end_guid
2273
+ value = pop('id')
2274
+ _save('guidislink', (@guidislink and not getContext().has_key?('link')))
2275
+ if @guidislink:
2276
+ # guid acts as link, but only if 'ispermalink' is not present or is 'true',
2277
+ # and only if the item doesn't already have a link element
2278
+ _save('link', value)
2279
+ end
2280
+ end
2281
+
2282
+
2283
+ def _start_title(attrsD)
2284
+ pushContent('title', attrsD, 'text/plain', @infeed || @inentry || @insource)
2285
+ end
2286
+ alias :_start_dc_title :_start_title
2287
+ alias :_start_media_title :_start_title
2288
+
2289
+ def _end_title
2290
+ value = popContent('title')
2291
+ context = getContext()
2292
+ if @intextinput
2293
+ context['textinput']['title'] = value
2294
+ elsif @inimage
2295
+ context['image']['title'] = value
2296
+ end
2297
+ end
2298
+ alias :_end_dc_title :_end_title
2299
+ alias :_end_media_title :_end_title
2300
+
2301
+ def _start_description(attrsD)
2302
+ context = getContext()
2303
+ if context.has_key?('summary')
2304
+ @summaryKey = 'content'
2305
+ _start_content(attrsD)
2306
+ else
2307
+ pushContent('description', attrsD, 'text/html', @infeed || @inentry || @insource)
2308
+ end
2309
+ end
2310
+
2311
+ def _start_abstract(attrsD)
2312
+ pushContent('description', attrsD, 'text/plain', @infeed || @inentry || @insource)
2313
+ end
2314
+
2315
+ def _end_description
2316
+ if @summaryKey == 'content'
2317
+ _end_content()
2318
+ else
2319
+ value = popContent('description')
2320
+ context = getContext()
2321
+ if @intextinput
2322
+ context['textinput']['description'] = value
2323
+ elsif @inimage:
2324
+ context['image']['description'] = value
2325
+ end
2326
+ end
2327
+ @summaryKey = nil
2328
+ end
2329
+ alias :_end_abstract :_end_description
2330
+
2331
+ def _start_info(attrsD)
2332
+ pushContent('info', attrsD, 'text/plain', true)
2333
+ end
2334
+ alias :_start_feedburner_browserfriendly :_start_info
2335
+
2336
+ def _end_info
2337
+ popContent('info')
2338
+ end
2339
+ alias :_end_feedburner_browserfriendly :_end_info
2340
+
2341
+ def _start_generator(attrsD)
2342
+ if attrsD and not attrsD.empty?
2343
+ attrsD = itsAnHrefDamnIt(attrsD)
2344
+ if attrsD.has_key?('href')
2345
+ attrsD['href'] = resolveURI(attrsD['href'])
2346
+ end
2347
+ end
2348
+ getContext()['generator_detail'] = FeedParserDict.new(attrsD)
2349
+ push('generator', true)
2350
+ end
2351
+
2352
+ def _end_generator
2353
+ value = pop('generator')
2354
+ context = getContext()
2355
+ if context.has_key?('generator_detail')
2356
+ context['generator_detail']['name'] = value
2357
+ end
2358
+ end
2359
+
2360
+ def _start_admin_generatoragent(attrsD)
2361
+ push('generator', true)
2362
+ value = getAttribute(attrsD, 'rdf:resource')
2363
+ if value and not value.empty?
2364
+ elementstack[-1][2] << value
2365
+ end
2366
+ pop('generator')
2367
+ getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
2368
+ end
2369
+
2370
+ def _start_admin_errorreportsto(attrsD)
2371
+ push('errorreportsto', true)
2372
+ value = getAttribute(attrsD, 'rdf:resource')
2373
+ if value and not value.empty?
2374
+ @elementstack[-1][2] << value
2375
+ end
2376
+ pop('errorreportsto')
2377
+ end
2378
+
2379
+ def _start_summary(attrsD)
2380
+ context = getContext()
2381
+ if context.has_key?'summary'
2382
+ @summaryKey = 'content'
2383
+ _start_content(attrsD)
2384
+ else
2385
+ @summaryKey = 'summary'
2386
+ pushContent(@summaryKey, attrsD, 'text/plain', true)
2387
+ end
2388
+ end
2389
+ alias :_start_itunes_summary :_start_summary
2390
+
2391
+ def _end_summary
2392
+ if @summaryKey == 'content':
2393
+ _end_content()
2394
+ else
2395
+ popContent(@summaryKey || 'summary')
2396
+ end
2397
+ @summaryKey = nil
2398
+ end
2399
+ alias :_end_itunes_summary :_end_summary
2400
+
2401
+ def _start_enclosure(attrsD)
2402
+ attrsD = itsAnHrefDamnIt(attrsD)
2403
+ getContext()['enclosures'] ||= []
2404
+ getContext()['enclosures'] << FeedParserDict.new(attrsD)
2405
+ href = attrsD['href']
2406
+ if href and not href.empty?
2407
+ context = getContext()
2408
+ if not context['id']
2409
+ context['id'] = href
2410
+ end
2411
+ end
2412
+ end
2413
+
2414
+ def _start_source(attrsD)
2415
+ @insource = true
2416
+ end
2417
+
2418
+ def _end_source
2419
+ @insource = false
2420
+ getContext()['source'] = Marshal.load(Marshal.dump(@sourcedata))
2421
+ @sourcedata.clear()
2422
+ end
2423
+
2424
+ def _start_content(attrsD)
2425
+ pushContent('content', attrsD, 'text/plain', true)
2426
+ src = attrsD['src']
2427
+ if src and not src.empty?:
2428
+ @contentparams['src'] = src
2429
+ end
2430
+ push('content', true)
2431
+ end
2432
+
2433
+ def _start_prodlink(attrsD)
2434
+ pushContent('content', attrsD, 'text/html', true)
2435
+ end
2436
+
2437
+ def _start_body(attrsD)
2438
+ pushContent('content', attrsD, 'application/xhtml+xml', true)
2439
+ end
2440
+ alias :_start_xhtml_body :_start_body
2441
+
2442
+ def _start_content_encoded(attrsD)
2443
+ pushContent('content', attrsD, 'text/html', true)
2444
+ end
2445
+ alias :_start_fullitem :_start_content_encoded
2446
+
2447
+ def _end_content
2448
+ copyToDescription = (['text/plain'] + @html_types).include? mapContentType(@contentparams['type'])
2449
+ value = popContent('content')
2450
+ if copyToDescription
2451
+ _save('description', value)
2452
+ end
2453
+ alias :_end_body :_end_content
2454
+ alias :_end_xhtml_body :_end_content
2455
+ alias :_end_content_encoded :_end_content
2456
+ alias :_end_fullitem :_end_content
2457
+ alias :_end_prodlink :_end_content
2458
+ end
2459
+
2460
+ def _start_itunes_image(attrsD)
2461
+ push('itunes_image', false)
2462
+ getContext()['image'] = FeedParserDict.new({'href' => attrsD['href']})
2463
+ end
2464
+ alias :_start_itunes_link :_start_itunes_image
2465
+
2466
+ def _end_itunes_block
2467
+ value = pop('itunes_block', false)
2468
+ getContext()['itunes_block'] = (value == 'yes') and true or false
2469
+ end
2470
+
2471
+ def _end_itunes_explicit
2472
+ value = pop('itunes_explicit', false)
2473
+ getContext()['itunes_explicit'] = (value == 'yes') and true or false
2474
+ end
2475
+
2476
+
2477
+ # ISO-8601 date parsing routines written by Fazal Majid.
2478
+ # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2479
+ # parser is beyond the scope of feedparser and the current Time.iso8601
2480
+ # method does not work.
2481
+ # A single regular expression cannot parse ISO 8601 date formats into groups
2482
+ # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2483
+ # 0301-04-01), so we use templates instead.
2484
+ # Please note the order in templates is significant because we need a
2485
+ # greedy match.
2486
+ def _parse_date_iso8601(dateString)
2487
+ # Parse a variety of ISO-8601-compatible formats like 20040105
2488
+
2489
+ # What I'm about to show you may be the ugliest code in all of
2490
+ # rfeedparser.
2491
+ # FIXME The century regexp maybe not work ('\d\d$' says "two numbers at
2492
+ # end of line" but we then attach more of a regexp.
2493
+ iso8601_regexps = [ '^(\d{4})-?([01]\d)-([0123]\d)',
2494
+ '^(\d{4})-([01]\d)',
2495
+ '^(\d{4})-?([0123]\d\d)',
2496
+ '^(\d\d)-?([01]\d)-?([0123]\d)',
2497
+ '^(\d\d)-?([0123]\d\d)',
2498
+ '^(\d{4})',
2499
+ '-(\d\d)-?([01]\d)',
2500
+ '-([0123]\d\d)',
2501
+ '-(\d\d)',
2502
+ '--([01]\d)-?([0123]\d)',
2503
+ '--([01]\d)',
2504
+ '---([0123]\d)',
2505
+ '(\d\d$)',
2506
+ ''
2507
+ ]
2508
+ iso8601_values = { '^(\d{4})-?([01]\d)-([0123]\d)' => ['year', 'month', 'day'],
2509
+ '^(\d{4})-([01]\d)' => ['year','month'],
2510
+ '^(\d{4})-?([0123]\d\d)' => ['year', 'ordinal'],
2511
+ '^(\d\d)-?([01]\d)-?([0123]\d)' => ['year','month','day'],
2512
+ '^(\d\d)-?([0123]\d\d)' => ['year','ordinal'],
2513
+ '^(\d{4})' => ['year'],
2514
+ '-(\d\d)-?([01]\d)' => ['year','month'],
2515
+ '-([0123]\d\d)' => ['ordinal'],
2516
+ '-(\d\d)' => ['year'],
2517
+ '--([01]\d)-?([0123]\d)' => ['month','day'],
2518
+ '--([01]\d)' => ['month'],
2519
+ '---([0123]\d)' => ['day'],
2520
+ '(\d\d$)' => ['century'],
2521
+ '' => []
2522
+ }
2523
+ add_to_all = '(T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
2524
+ add_to_all_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']
2525
+ # NOTE We use '(?:' to prevent grouping of optional matches (ones trailed
2526
+ # by '?'). The second ':' *are* matched.
2527
+ m = nil
2528
+ param_keys = []
2529
+ iso8601_regexps.each do |s|
2530
+ $stderr << "Trying iso8601 regexp: #{s+add_to_all}\n" if $debug
2531
+ param_keys = iso8601_values[s] + add_to_all_fields
2532
+ m = dateString.match(Regexp.new(s+add_to_all))
2533
+ break if m
2534
+ end
2535
+ return if m.nil? or (m.begin(0).zero? and m.end(0).zero?)
2536
+
2537
+ param_values = m.to_a
2538
+ param_values = param_values[1..-1]
2539
+ params = {}
2540
+ param_keys.each_with_index do |key,i|
2541
+ params[key] = param_values[i]
2542
+ end
21
2543
 
22
- gem 'character-encodings', ">=0.2.0"
23
- gem 'htmltools', ">=1.10"
24
- gem 'htmlentities', ">=4.0.0"
25
- gem 'activesupport', ">=1.4.1"
26
- gem 'rchardet', ">=1.0"
27
- require 'xml/saxdriver' # calling expat through the xmlparser gem
2544
+ ordinal = params['ordinal'].to_i unless params['ordinal'].nil?
2545
+ year = params['year'] || '--'
2546
+ if year.nil? or year.empty? or year == '--' # FIXME When could the regexp ever return a year equal to '--'?
2547
+ year = Time.now.utc.year
2548
+ elsif year.length == 2
2549
+ # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
2550
+ year = 100 * (Time.now.utc.year / 100) + year.to_i
2551
+ else
2552
+ year = year.to_i
2553
+ end
28
2554
 
29
- require 'rchardet'
30
- $chardet = true
2555
+ month = params['month'] || '-'
2556
+ if month.nil? or month.empty? or month == '-'
2557
+ # ordinals are NOT normalized by mktime, we simulate them
2558
+ # by setting month=1, day=ordinal
2559
+ if ordinal
2560
+ month = DateTime.ordinal(year,ordinal).month
2561
+ else
2562
+ month = Time.now.utc.month
2563
+ end
2564
+ end
2565
+ month = month.to_i unless month.nil?
2566
+ day = params['day']
2567
+ if day.nil? or day.empty?
2568
+ # see above
2569
+ if ordinal
2570
+ day = DateTime.ordinal(year,ordinal).day
2571
+ elsif params['century'] or params['year'] or params['month']
2572
+ day = 1
2573
+ else
2574
+ day = Time.now.utc.day
2575
+ end
2576
+ else
2577
+ day = day.to_i
2578
+ end
2579
+ # special case of the century - is the first year of the 21st century
2580
+ # 2000 or 2001 ? The debate goes on...
2581
+ if params.has_key? 'century'
2582
+ year = (params['century'].to_i - 1) * 100 + 1
2583
+ end
2584
+ # in ISO 8601 most fields are optional
2585
+ hour = params['hour'].to_i
2586
+ minute = params['minute'].to_i
2587
+ second = params['second'].to_i
2588
+ weekday = nil
2589
+ # daylight savings is complex, but not needed for feedparser's purposes
2590
+ # as time zones, if specified, include mention of whether it is active
2591
+ # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
2592
+ # and most implementations have DST bugs
2593
+ tm = [second, minute, hour, day, month, year, nil, ordinal, false, nil]
2594
+ tz = params['tz']
2595
+ if tz and not tz.empty? and tz != 'Z'
2596
+ # FIXME does this cross over days?
2597
+ if tz[0] == '-'
2598
+ tm[3] += params['tzhour'].to_i
2599
+ tm[4] += params['tzmin'].to_i
2600
+ elsif tz[0] == '+'
2601
+ tm[3] -= params['tzhour'].to_i
2602
+ tm[4] -= params['tzmin'].to_i
2603
+ else
2604
+ return nil
2605
+ end
2606
+ end
2607
+ return Time.utc(*tm) # Magic!
31
2608
 
32
- require 'encoding/character/utf-8'
33
- require 'html/sgml-parser'
34
- require 'htmlentities'
35
- require 'active_support'
36
- require 'open-uri'
37
- include OpenURI
2609
+ end
38
2610
 
39
- $debug = false
40
- $compatible = true
2611
+ def _parse_date_onblog(dateString)
2612
+ # Parse a string according to the OnBlog 8-bit date format
2613
+ # 8-bit date handling routes written by ytrewq1
2614
+ korean_year = u("년") # b3e2 in euc-kr
2615
+ korean_month = u("월") # bff9 in euc-kr
2616
+ korean_day = u("일") # c0cf in euc-kr
41
2617
 
42
- $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
43
- require 'rfeedparser/forgiving_uri'
44
- require 'rfeedparser/aliases'
45
- require 'rfeedparser/encoding_helpers'
46
- require 'rfeedparser/better_sgmlparser'
47
- require 'rfeedparser/better_attributelist'
48
- require 'rfeedparser/scrub'
49
- require 'rfeedparser/time_helpers'
50
- require 'rfeedparser/feedparserdict'
51
- require 'rfeedparser/parser_mixin'
52
- require 'rfeedparser/parsers'
53
- require 'rfeedparser/markup_helpers'
54
2618
 
55
- include FeedParserUtilities
2619
+ korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
56
2620
 
57
2621
 
58
- module FeedParser
59
- Version = "0.9.9"
2622
+ m = korean_onblog_date_re.match(dateString)
2623
+ return unless m
2624
+ w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
60
2625
 
61
- License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
2626
+ $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
2627
+ return _parse_date_w3dtf(w3dtfdate)
2628
+ end
62
2629
 
63
- Redistribution and use in source and binary forms, with or without modification,
64
- are permitted provided that the following conditions are met:
2630
+ def _parse_date_nate(dateString)
2631
+ # Parse a string according to the Nate 8-bit date format
2632
+ # 8-bit date handling routes written by ytrewq1
2633
+ korean_am = u("오전") # bfc0 c0fc in euc-kr
2634
+ korean_pm = u("오후") # bfc0 c8c4 in euc-kr
65
2635
 
66
- * Redistributions of source code must retain the above copyright notice,
67
- this list of conditions and the following disclaimer.
68
- * Redistributions in binary form must reproduce the above copyright notice,
69
- this list of conditions and the following disclaimer in the documentation
70
- and/or other materials provided with the distribution.
2636
+ korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
2637
+ m = korean_nate_date_re.match(dateString)
2638
+ return unless m
2639
+ hour = m[5].to_i
2640
+ ampm = m[4]
2641
+ if ampm == korean_pm
2642
+ hour += 12
2643
+ end
2644
+ hour = hour.to_s.rjust(2,'0')
2645
+ w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
2646
+ $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
2647
+ return _parse_date_w3dtf(w3dtfdate)
2648
+ end
71
2649
 
72
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
73
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
74
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
75
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
76
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
77
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
78
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
79
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
80
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82
- POSSIBILITY OF SUCH DAMAGE."""
2650
+ def _parse_date_mssql(dateString)
2651
+ mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
83
2652
 
84
- Author = "Jeff Hodges <http://somethingsimilar.com>"
85
- Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
86
- Contributors = [ "Jason Diamond <http://injektilo.org/>",
87
- "John Beimler <http://john.beimler.org/>",
88
- "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
- "Aaron Swartz <http://aaronsw.com/>",
90
- "Kevin Marks <http://epeus.blogspot.com/>"
91
- ]
92
- # HTTP "User-Agent" header to send to servers when downloading feeds.
93
- # If you are embedding feedparser in a larger application, you should
94
- # change this to your application name and URL.
95
- USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
2653
+ m = mssql_date_re.match(dateString)
2654
+ return unless m
2655
+ w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
2656
+ $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
2657
+ return _parse_date_w3dtf(w3dtfdate)
2658
+ end
96
2659
 
97
- # HTTP "Accept" header to send to servers when downloading feeds. If you don't
98
- # want to send an Accept header, set this to None.
99
- ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
2660
+ def _parse_date_greek(dateString)
2661
+ # Parse a string according to a Greek 8-bit date format
2662
+ # Unicode strings for Greek date strings
2663
+ greek_months = {
2664
+ u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
2665
+ u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
2666
+ u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
2667
+ u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
2668
+ u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
2669
+ u("Μάι") => u("May"), # ccdce9 in iso-8859-7
2670
+ u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
2671
+ u("Μαι") => u("May"), # cce1e9 in iso-8859-7
2672
+ u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
2673
+ u("Ιον") => u("Jun"), # c9efed in iso-8859-7
2674
+ u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
2675
+ u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
2676
+ u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
2677
+ u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
2678
+ u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
2679
+ u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
2680
+ u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
2681
+ u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
2682
+ u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
2683
+ }
100
2684
 
2685
+ greek_wdays = {
2686
+ u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
2687
+ u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
2688
+ u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
2689
+ u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
2690
+ u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
2691
+ u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
2692
+ u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
2693
+ }
101
2694
 
102
- # If you want feedparser to automatically run HTML markup through HTML Tidy, set
103
- # this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
104
- # or utidylib <http://utidylib.berlios.de/>.
105
- #TIDY_MARKUP = false #FIXME untranslated
2695
+ greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
106
2696
 
107
- # List of Python interfaces for HTML Tidy, in order of preference. Only useful
108
- # if TIDY_MARKUP = true
109
- #PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
2697
+ m = greek_date_format.match(dateString)
2698
+ return unless m
2699
+ begin
2700
+ wday = greek_wdays[m[1]]
2701
+ month = greek_months[m[3]]
2702
+ rescue
2703
+ return nil
2704
+ end
2705
+ rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
2706
+ $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
2707
+ return _parse_date_rfc822(rfc822date)
2708
+ end
110
2709
 
2710
+ def _parse_date_hungarian(dateString)
2711
+ # Parse a string according to a Hungarian 8-bit date format.
2712
+ hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
2713
+ m = hungarian_date_format_re.match(dateString)
2714
+ return unless m
111
2715
 
112
- # ---------- don't touch these ----------
113
- class ThingsNobodyCaresAboutButMe < Exception
2716
+ # Unicode strings for Hungarian date strings
2717
+ hungarian_months = {
2718
+ u("január") => u("01"), # e1 in iso-8859-2
2719
+ u("februári") => u("02"), # e1 in iso-8859-2
2720
+ u("március") => u("03"), # e1 in iso-8859-2
2721
+ u("április") => u("04"), # e1 in iso-8859-2
2722
+ u("máujus") => u("05"), # e1 in iso-8859-2
2723
+ u("június") => u("06"), # fa in iso-8859-2
2724
+ u("július") => u("07"), # fa in iso-8859-2
2725
+ u("augusztus") => u("08"),
2726
+ u("szeptember") => u("09"),
2727
+ u("október") => u("10"), # f3 in iso-8859-2
2728
+ u("november") => u("11"),
2729
+ u("december") => u("12"),
2730
+ }
2731
+ begin
2732
+ month = hungarian_months[m[2]]
2733
+ day = m[3].rjust(2,'0')
2734
+ hour = m[4].rjust(2,'0')
2735
+ rescue
2736
+ return
2737
+ end
2738
+
2739
+ w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
2740
+ $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
2741
+ return _parse_date_w3dtf(w3dtfdate)
2742
+ end
2743
+
2744
+ def rollover(num, modulus)
2745
+ return num % modulus, num / modulus
2746
+ end
2747
+
2748
+ def set_self(num, modulus)
2749
+ r = num / modulus
2750
+ if r == 0
2751
+ return num
2752
+ end
2753
+ return r
2754
+ end
2755
# W3DTF-style date parsing
# FIXME shouldn't it be "W3CDTF"?
#
# Parses a W3C date-time string (e.g. "2003-12-31T10:14:55+08:00") into
# a UTC Time. Unlike Time.xmlschema, it tolerates out-of-range fields
# (61 seconds, 25 hours, month 13, day 32, ...) by rolling the excess
# over into the next larger unit via rollover/set_self.
def _parse_date_w3dtf(dateString)
  # Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias for xmlschema
  # Whatever it is, it doesn't work. This has been fixed in Ruby 1.9 and
  # in Ruby on Rails, but not really. They don't fix the 25 hour or 61 minute or 61 second rollover and fail in other ways.

  m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)

  # w3 accumulates [year, month, day, hour, min, sec, tz-string-or-nil].
  w3 = m[1..3].map{|s| s=s.to_i; s += 1 if s == 0;s} # Map the year, month and day to integers and, if they were nil, set them to 1
  w3 += m[4..6].map{|s| s.to_i} # Map the hour, minute and second to integers
  w3 << m[-1] # Leave the timezone as a String

  # FIXME this next bit needs some serious refactoring
  # Rollover times. 0 minutes and 61 seconds -> 1 minute and 1 second
  w3[5],r = rollover(w3[5], 60) # rollover seconds
  w3[4] += r
  w3[4],r = rollover(w3[4], 60) # rollover minutes
  w3[3] += r
  w3[3],r = rollover(w3[3], 24) # rollover hours

  w3[2] = w3[2] + r
  if w3[1] > 12
    w3[1],r = rollover(w3[1],12)
    w3[1] = 12 if w3[1] == 0
    w3[0] += r
  end

  # Roll excess days forward one month at a time (month lengths vary,
  # so a simple modulus cannot be used here).
  # NOTE(review): Time.days_in_month comes from ActiveSupport, not core
  # Ruby — confirm activesupport stays a dependency.
  num_days = Time.days_in_month(w3[1], w3[0])
  while w3[2] > num_days
    w3[2] -= num_days
    w3[1] += 1
    if w3[1] > 12
      w3[0] += 1
      w3[1] = set_self(w3[1], 12)
    end
    num_days = Time.days_in_month(w3[1], w3[0])
  end


  # Invert the sign of the zone offset in place, so that adding
  # Time.zone_offset(...) below converts local time to UTC.
  # (w3[6] is nil when no zone was present; "UTC" => offset 0.)
  unless w3[6].class != String
    if /^-/ =~ w3[6] # Zone offset goes backwards
      w3[6][0] = '+'
    elsif /^\+/ =~ w3[6]
      w3[6][0] = '-'
    end
  end
  return Time.utc(w3[0], w3[1], w3[2] , w3[3], w3[4], w3[5])+Time.zone_offset(w3[6] || "UTC")
end
2804
+
2805
# Parse an RFC822, RFC1123, RFC2822 or asctime-style date and return a
# Time in UTC. Also repairs a couple of malformed variants seen in the
# wild (notably Disney's proprietary two-letter timezones and
# spelled-out month names).
#
# Raises (typically ArgumentError from Time.rfc2822) on unparseable
# input; parse_date catches that and moves on to the next handler.
def _parse_date_rfc822(dateString)
  # BUGFIX: work on a copy so the caller's string is never mutated by
  # the sub!/[]= fixups below.
  dateString = dateString.dup
  # Map the non-standard two-letter zones (from Disney feeds) to real ones.
  unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
                        'CT' => 'CST', 'MT' => 'MST',
                        'PT' => 'PST'
  }

  mon = dateString.split[2]
  # Truncate a spelled-out month name ("January") to its abbreviation.
  # BUGFIX: guard against mon being nil for very short inputs.
  if mon and mon.length > 3 and Time::RFC2822_MONTH_NAME.include?(mon[0..2])
    dateString.sub!(mon, mon[0..2])
  end
  if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
    dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
  end
  # Okay, the Disney date format should be fixed up now.
  rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
  if rfc.to_a.length > 1 and rfc.to_a.include? nil
    # Partial RFC822 match: fill in the missing time/zone pieces.
    # BUGFIX: group 5 is the whole optional " hh:mm:ss TZ" blob; it was
    # previously destructured into hour, shifting min/sec/tz one group
    # off and garbling dates that omit seconds.
    dow, day, mon, year, _, hour, min, sec, tz = rfc[1..-1]
    hour, min, sec = [hour, min, sec].map{|e| e.to_s.rjust(2,'0') }
    tz ||= "GMT"
  end
  asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
  if asctime_match.to_a.length > 1
    # asctime style: Dow Mon dayofmonth hour:minute:second zone year
    dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
    # BUGFIX: the zero-padded day was computed but discarded before.
    day = day.to_s.rjust(2,'0')
  end
  if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
    ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
  else
    ds = dateString
  end
  # Time.rfc2822 (from the 'time' stdlib) does the actual parsing.
  t = Time.rfc2822(ds).utc
  return t
end
2841
+
2842
# Parse a Perforce-style date, "yyyy/mm/dd hh:mm:ss TTT", optionally
# preceded by a day of the week (e.g. "Fri, 2006/09/15 08:19:53 EDT"),
# and return the result as a UTC Time.
def _parse_date_perforce(aDateString) # FIXME not in 4.1?
  parsed = Time.parse(aDateString)
  parsed.utc
end
2848
+
2849
# Convert a Time into a Python-feedparser-style 9-element time tuple:
# [year, month, mday, hour, min, sec, weekday (Monday=0), yday, isdst]
# where isdst is 1 or 0 rather than true/false.
def extract_tuple(atime)
  # NOTE leave the error handling to parse_date
  t = [atime.year, atime.month, atime.mday, atime.hour,
       atime.min, atime.sec, (atime.wday - 1) % 7, atime.yday,
       atime.isdst
  ]
  # BUGFIX: "t[0..-2].map!{...}" mutated a temporary slice, not t, so
  # the intended integer coercion was silently a no-op. Assign the
  # mapped slice back so the coercion actually takes effect.
  t[0..-2] = t[0..-2].map { |s| s.to_i }
  t[-1] = t[-1] ? 1 : 0
  return t
end
2860
+
2861
# Try each handler registered in @date_handlers in turn, returning the
# first successfully parsed date as a 9-element time tuple (see
# extract_tuple), or nil if no handler can parse dateString.
def parse_date(dateString)
  @date_handlers.each do |handler|
    begin
      $stderr << "Trying date_handler #{handler}\n" if $debug
      datething = extract_tuple(send(handler, dateString))
      return datething
    rescue => e
      # BUGFIX: was "rescue Exception", which also swallowed signals
      # like SystemExit/Interrupt. StandardError covers every parse
      # failure the handlers actually raise.
      $stderr << "#{handler} raised #{e}\n" if $debug
    end
  end
  return nil
end
2873
+
2874
+ end # End FeedParserMixin
2875
+
2876
# SAX (expat) handler for well-formed XML feeds. Feed interpretation is
# delegated to FeedParserMixin's unknown_starttag/unknown_endtag/
# handle_data; on XML errors it records @bozo/@exc so FeedParser.parse
# can fall back to the loose parser.
class StrictFeedParser < XML::SAX::HandlerBase # expat
  include FeedParserMixin

  attr_accessor :bozo, :entries, :feeddata, :exc
  def initialize(baseuri, baselang, encoding)
    $stderr << "trying StrictFeedParser\n" if $debug
    startup(baseuri, baselang, encoding)
    @bozo = false
    @exc = nil
    super()
  end

  # Current parse position as [systemId, lineNumber].
  def getPos
    [@locator.getSystemId, @locator.getLineNumber]
  end

  # Convert a SAX attribute list into an array of [name, value] pairs.
  def getAttrs(attrs)
    ret = []
    # BUGFIX: the range was "0..attrs.getLength", which iterated one
    # index past the last attribute and appended a [nil, nil] pair.
    for i in 0...attrs.getLength
      ret.push([attrs.getName(i), attrs.getValue(i)])
    end
    ret
  end

  def setDocumentLocator(loc)
    @locator = loc
  end

  def startDoctypeDecl(name, pub_sys, long_name, uri)
    #Nothing is done here. What could we do that is neat and useful?
  end

  def startNamespaceDecl(prefix, uri)
    trackNamespace(prefix, uri)
  end

  def endNamespaceDecl(prefix)
  end

  # Expat (with namespace processing) delivers names as "uri;local";
  # split that, map the uri to our canonical prefix, and hand the
  # "prefix:local" (lowercased) name to the mixin.
  def startElement(name, attrs)
    name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
    namespaceuri = ($2 || '').downcase
    name = $3
    if /backend\.userland\.com\/rss/ =~ namespaceuri
      # match any backend.userland.com namespace
      namespaceuri = 'http://backend.userland.com/rss'
    end
    prefix = @matchnamespaces[namespaceuri]
    # No need to raise UndeclaredNamespace, Expat does that for us with
    # "unbound prefix (XMLParserError)".
    # BUGFIX: the second line of the comment above was previously a
    # bare string literal (a no-op expression), not a comment.
    if prefix and not prefix.empty?
      name = prefix + ':' + name
    end
    name.downcase!
    unknown_starttag(name, attrs)
  end

  def character(text, start, length)
    #handle_data(CGI.unescapeHTML(text))
    handle_data(text)
  end
  # expat provides "character" not "characters"!
  alias :characters :character # Just in case.

  def startCdata(content)
    handle_data(content)
  end

  def endElement(name)
    name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
    namespaceuri = ($2 || '').downcase
    # BUGFIX: the local part ($3) was never extracted here, and the
    # prefixed name was assigned to an unused local ("localname")
    # instead of name — so namespaced end tags could never match the
    # names produced by startElement. Mirror startElement exactly.
    name = $3
    prefix = @matchnamespaces[namespaceuri]
    if prefix and not prefix.empty?
      name = prefix + ':' + name
    end
    name.downcase!
    unknown_endtag(name)
  end

  def comment(comment)
    handle_comment(comment)
  end

  def entityDecl(*foo)
  end

  def unparsedEntityDecl(*foo)
  end

  # Record a recoverable parse error; the feed is marked "bozo".
  def error(exc)
    @bozo = true
    @exc = exc
  end

  # Record the error and re-raise so parsing aborts immediately.
  def fatalError(exc)
    error(exc)
    raise exc
  end
end
115
- class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe
2974
+
2975
# SGML-based fallback parser used when the strict XML parse fails.
class LooseFeedParser < BetterSGMLParser
  include FeedParserMixin
  # We write the methods that were in BaseHTMLProcessor in the python code
  # in here directly. We do this because if we inherited from
  # BaseHTMLProcessor but then included from FeedParserMixin, the methods
  # of Mixin would overwrite the methods we inherited from
  # BaseHTMLProcessor. This is exactly the opposite of what we want to
  # happen!

  attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse

  Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
    'img', 'input', 'isindex', 'link', 'meta', 'param']
  New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
  alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.
  def feed
    @feeddata
  end
  def feed=(data)
    @feeddata = data
  end

  def initialize(baseuri, baselang, encoding)
    startup(baseuri, baselang, encoding)
    super() # Keep the parentheses! No touchy.
  end

  def reset
    @pieces = []
    super
  end

  # Normalize the markup — escape stray "<!", expand self-closing tags
  # that need end tags, decode quote entities, convert to utf-8 — then
  # hand it to the SGML parser (via the sgml_feed alias).
  def parse(data)
    data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
    data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
      clean = tag[1..-3].strip
      if Elements_No_End_Tag.include?clean
        tag
      else
        '<'+clean+'></'+clean+'>'
      end
    end

    data.gsub!(/&#39;/, "'")
    # BUGFIX: &#34; is a double quote; it was being replaced with a
    # single quote ("'"), silently corrupting quoted content.
    data.gsub!(/&#34;/, '"')
    if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
      data = uconvert(data,'utf-8',@encoding)
    end
    sgml_feed(data) # see the alias above
  end


  # Canonicalize numeric character references to named entities; when
  # the current content type is not XML, additionally decode the five
  # predefined entities to their literal characters.
  def decodeEntities(element, data)
    data.gsub!('&#60;', '&lt;')
    data.gsub!('&#x3c;', '&lt;')
    data.gsub!('&#62;', '&gt;')
    data.gsub!('&#x3e;', '&gt;')
    data.gsub!('&#38;', '&amp;')
    data.gsub!('&#x26;', '&amp;')
    data.gsub!('&#34;', '&quot;')
    data.gsub!('&#x22;', '&quot;')
    data.gsub!('&#39;', '&apos;')
    data.gsub!('&#x27;', '&apos;')
    if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
      data.gsub!('&lt;', '<')
      data.gsub!('&gt;', '>')
      data.gsub!('&amp;', '&')
      data.gsub!('&quot;', '"')
      data.gsub!('&apos;', "'")
    end
    return data
  end
end
117
- class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe
3048
+
3049
# Rewrite every known relative URI attribute in htmlSource so it is
# absolute with respect to baseURI; returns the transformed HTML.
# (The encoding argument is accepted for interface compatibility.)
def FeedParser.resolveRelativeURIs(htmlSource, baseURI, encoding)
  $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
  # Each pair is [element name, attribute that may hold a URI].
  relative_uris = [ ['a','href'],
    ['applet','codebase'],
    ['area','href'],
    ['blockquote','cite'],
    ['body','background'],
    ['del','cite'],
    ['form','action'],
    ['frame','longdesc'],
    ['frame','src'],
    ['iframe','longdesc'],
    ['iframe','src'],
    ['head','profile'],
    ['img','longdesc'],
    ['img','src'],
    ['img','usemap'],
    ['input','src'],
    ['input','usemap'],
    ['ins','cite'],
    ['link','href'],
    ['object','classid'],
    ['object','codebase'],
    ['object','data'],
    ['object','usemap'],
    ['q','cite'],
    ['script','src'],
  ]
  doc = Hpricot(htmlSource)
  relative_uris.each do |tag_name, attr_name|
    doc.search(tag_name).each do |node|
      value = node.attributes[attr_name]
      next if value.nil? or value.empty?
      if URI.parse(value).relative?
        node.attributes[attr_name] = urljoin(baseURI, value)
      end
    end
  end
  return doc.to_html
end
119
- class NonXMLContentType < ThingsNobodyCaresAboutButMe
3089
+
3090
# Hpricot document subclass that whitelists HTML: only elements listed
# in Acceptable_Elements survive (with their attributes stripped via
# strip_attributes); disallowed elements are unwrapped or, for
# script/style-like tags, emptied first.
class SanitizerDoc < Hpricot::Doc

  # Recursively sanitize the document tree in place; returns self.
  def scrub
    traverse_all_element do |e|
      if e.elem?
        if Acceptable_Elements.include?e.name
          e.strip_attributes
        else
          if Unacceptable_Elements_With_End_Tag.include?e.name
            # Drop the element's content entirely (e.g. script/style).
            e.inner_html = ''
          end
          # Unwrap the disallowed element, keeping its scrubbed children.
          e.swap(SanitizerDoc.new(e.children).scrub.to_html)
          # This works because the children swapped in are brought in "after" the current element.
        end
      elsif e.doctype?
        e.parent.children.delete(e)
      elsif e.text?
        # Normalize quote entities and strip carriage returns in text.
        ets = e.to_s
        ets.gsub!(/&#39;/, "'")
        ets.gsub!(/&#34;/, '"')
        ets.gsub!(/\r/,'')
        e.swap(ets)
      else
      end
    end
    # yes, that '/' should be there. It's a search method. See the Hpricot docs.

    unless $compatible # FIXME not properly recursive, see comment in recursive_strip
      # NOTE(review): "tag" is not defined in this scope — this branch
      # would raise NameError if $compatible were ever false. Confirm
      # the intended search expression.
      (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
    end
    return self
  end
end
121
- class UndeclaredNamespace < Exception
3123
+
3124
# Convenience constructor: wraps raw html in a FeedParser::SanitizerDoc
# (via Hpricot.make) so it can be scrubbed.
def SanitizerDoc(html)
  FeedParser::SanitizerDoc.new(Hpricot.make(html))
end
module_function(:SanitizerDoc)
3128
# Sanitize untrusted HTML: neutralize stray markup declarations, then
# scrub disallowed elements/attributes via SanitizerDoc. Returns the
# cleaned HTML with surrounding whitespace stripped.
# NOTE: the encoding argument is currently unused (Tidy not supported).
def self.sanitizeHTML(html,encoding)
  # FIXME Tidy not yet supported
  escaped = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
  doc = SanitizerDoc(escaped).scrub
  return doc.to_html.strip
end
123
3135
 
124
3136
 
125
- SUPPORTED_VERSIONS = {'' => 'unknown',
126
- 'rss090' => 'RSS 0.90',
127
- 'rss091n' => 'RSS 0.91 (Netscape)',
128
- 'rss091u' => 'RSS 0.91 (Userland)',
129
- 'rss092' => 'RSS 0.92',
130
- 'rss093' => 'RSS 0.93',
131
- 'rss094' => 'RSS 0.94',
132
- 'rss20' => 'RSS 2.0',
133
- 'rss10' => 'RSS 1.0',
134
- 'rss' => 'RSS (unknown version)',
135
- 'atom01' => 'Atom 0.1',
136
- 'atom02' => 'Atom 0.2',
137
- 'atom03' => 'Atom 0.3',
138
- 'atom10' => 'Atom 1.0',
139
- 'atom' => 'Atom (unknown version)',
140
- 'cdf' => 'CDF',
141
- 'hotrss' => 'Hot RSS'
142
- }
143
-
144
- def parse(furi, options = {})
3137
+
3138
# Determine the character encoding of an XML document by combining the
# HTTP Content-Type header (when feed responds to .meta, i.e. came from
# open-uri), BOM sniffing, and the XML declaration.
#
# Returns [true_encoding, http_encoding, xml_encoding,
#          sniffed_xml_encoding, acceptable_content_type].
def self.getCharacterEncoding(feed, xml_data)
  # Get the character encoding of the XML document
  $stderr << "In getCharacterEncoding\n" if $debug
  sniffed_xml_encoding = nil
  xml_encoding = nil
  true_encoding = nil
  begin
    http_headers = feed.meta
    http_content_type = feed.meta['content-type'].split(';')[0]
    encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
    http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
    http_encoding = nil if http_encoding.empty?
    # FIXME Open-Uri returns iso8859-1 if there is no charset header,
    # but that doesn't pass the tests. Open-Uri claims its following
    # the right RFC. Are they wrong or do we need to change the tests?
  rescue NoMethodError
    # feed has no .meta (plain File/StringIO): no HTTP information.
    http_headers = {}
    http_content_type = nil
    http_encoding = nil
  end
  # Must sniff for non-ASCII-compatible character encodings before
  # searching for XML declaration. This heuristic is defined in
  # section F of the XML specification:
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
  # Branch order matters: 4-byte signatures are checked before their
  # 2-byte prefixes. Byte comparisons require double-quoted strings.
  begin
    if xml_data[0..3] == "\x4c\x6f\xa7\x94"
      # EBCDIC
      xml_data = _ebcdic_to_ascii(xml_data)
    elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
      # UTF-16BE
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
    elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
      # UTF-16BE with BOM
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
      # UTF-16LE
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
    elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
      # UTF-16LE with BOM
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\x00\x3c"
      # UTF-32BE
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x00\x00"
      # UTF-32LE
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\xfe\xff"
      # UTF-32BE with BOM
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
    elsif xml_data[0..3] == "\xff\xfe\x00\x00"
      # UTF-32LE with BOM
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
    elsif xml_data[0..2] == "\xef\xbb\xbf"
      # UTF-8 with BOM
      sniffed_xml_encoding = 'utf-8'
      xml_data = xml_data[3..-1]
    else
      # ASCII-compatible
    end
    xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
  rescue
    xml_encoding_match = nil
  end
  if xml_encoding_match
    xml_encoding = xml_encoding_match[1].downcase
    # Prefer the sniffed encoding when the declared one is a generic
    # UCS/UTF-16/UTF-32 family name whose byte order the BOM resolved.
    xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
    if sniffed_xml_encoding and xencodings.include?xml_encoding
      xml_encoding = sniffed_xml_encoding
    end
  end

  acceptable_content_type = false
  application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
  text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

  # RFC 3023-style rules: application/*+xml trusts header then XML
  # declaration; text/*+xml defaults to us-ascii unless the header
  # says otherwise.
  if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || xml_encoding || 'utf-8'
  elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || 'us-ascii'
  elsif /^text\// =~ http_content_type
    true_encoding = http_encoding || 'us-ascii'
  elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
    true_encoding = xml_encoding || 'iso-8859-1'
  else
    true_encoding = xml_encoding || 'utf-8'
  end
  return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
end
3236
+
3237
# Re-encode an XML byte stream to utf-8 and rewrite (or insert) its XML
# declaration to say so.
def self.toUTF8(data, encoding)
=begin
Changes an XML data stream on the fly to specify a new encoding

data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
encoding is a string recognized by encodings.aliases
=end
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
  # NOTE we must use double quotes when dealing with \x encodings!
  # Each branch detects a BOM, overrides the caller's claimed encoding
  # accordingly, and strips the BOM bytes before conversion.
  if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
    if $debug
      $stderr << "stripping BOM\n"
      if encoding != 'utf-16be'
        # NOTE(review): "string" below looks like a typo for "trying"
        # (compare the utf-16le branch) — debug output only.
        $stderr << "string utf-16be instead\n"
      end
    end
    encoding = 'utf-16be'
    data = data[2..-1]
  elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
    end
    encoding = 'utf-16le'
    data = data[2..-1]
  elsif (data[0..2] == "\xef\xbb\xbf")
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
    end
    encoding = 'utf-8'
    data = data[3..-1]
  elsif (data[0..3] == "\x00\x00\xfe\xff")
    if $debug
      $stderr << "stripping BOM\n"
      if encoding != 'utf-32be'
        $stderr << "trying utf-32be instead\n"
      end
    end
    encoding = 'utf-32be'
    data = data[4..-1]
  elsif (data[0..3] == "\xff\xfe\x00\x00")
    if $debug
      $stderr << "stripping BOM\n"
      if encoding != 'utf-32le'
        $stderr << "trying utf-32le instead\n"
      end
    end
    encoding = 'utf-32le'
    data = data[4..-1]
  end
  begin
    newdata = uconvert(data, encoding, 'utf-8')
  rescue => details
    # NOTE(review): a failed conversion is silently swallowed here,
    # leaving newdata nil — the declaration rewrite below would then
    # raise NoMethodError. Confirm whether errors should propagate.
  end
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
  # Rewrite an existing XML declaration, or prepend one, declaring utf-8.
  declmatch = /^<\?xml[^>]*?>/
  newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
  if declmatch =~ newdata
    newdata.sub!(declmatch, newdecl)
  else
    newdata = newdecl + "\n" + newdata
  end
  return newdata
end
3302
+
3303
# Strip the DOCTYPE (and any inline ENTITY declarations) from an XML
# document. Returns [rss_version, stripped_data] where rss_version is
# 'rss091n' for a Netscape RSS 0.91 DOCTYPE and nil otherwise.
def self.stripDoctype(data)
  entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
  doctype_pattern = /<!DOCTYPE(.*?)>/m

  data = data.gsub(entity_pattern, '')

  doctype_match = doctype_pattern.match(data)
  doctype = doctype_match ? doctype_match[1] : ''

  version = /netscape/ =~ doctype.downcase ? 'rss091n' : nil

  return version, data.sub(doctype_pattern, '')
end
3329
+
3330
+ def parse(*args); FeedParser.parse(*args); end
3331
+ def FeedParser.parse(furi, options={})
145
3332
  # Parse a feed from a URL, file, stream or string
146
3333
  $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
147
- strictklass = options[:strict] || StrictFeedParser
148
- looseklass = options[:loose] || LooseFeedParser
149
3334
  result = FeedParserDict.new
150
3335
  result['feed'] = FeedParserDict.new
151
3336
  result['entries'] = []
@@ -155,12 +3340,13 @@ POSSIBILITY OF SUCH DAMAGE."""
155
3340
  end
156
3341
  result['bozo'] = false
157
3342
  handlers = options[:handlers]
3343
+
158
3344
  if handlers.class != Array # FIXME why does this happen?
159
3345
  handlers = [handlers]
160
3346
  end
161
3347
 
162
3348
  begin
163
- if File.exists?furi
3349
+ if URI::parse(furi).class == URI::Generic
164
3350
  f = open(furi) # OpenURI doesn't behave well when passing HTTP options to a file.
165
3351
  else
166
3352
  # And when you do pass them, make sure they aren't just nil (this still true?)
@@ -327,7 +3513,7 @@ POSSIBILITY OF SUCH DAMAGE."""
327
3513
  if use_strict_parser
328
3514
  # initialize the SAX parser
329
3515
  saxparser = XML::SAX::Helpers::ParserFactory.makeParser("XML::Parser::SAXDriver")
330
- feedparser = strictklass.new(baseuri, baselang, 'utf-8')
3516
+ feedparser = StrictFeedParser.new(baseuri, baselang, 'utf-8')
331
3517
  saxparser.setDocumentHandler(feedparser)
332
3518
  saxparser.setDTDHandler(feedparser)
333
3519
  saxparser.setEntityResolver(feedparser)
@@ -348,7 +3534,7 @@ POSSIBILITY OF SUCH DAMAGE."""
348
3534
  end
349
3535
  end
350
3536
  if not use_strict_parser
351
- feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
3537
+ feedparser = LooseFeedParser.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
352
3538
  feedparser.parse(data)
353
3539
  $stderr << "Using LooseFeed\n\n" if $debug
354
3540
  end
@@ -358,7 +3544,6 @@ POSSIBILITY OF SUCH DAMAGE."""
358
3544
  result['namespaces'] = feedparser.namespacesInUse
359
3545
  return result
360
3546
  end
361
- module_function(:parse)
362
3547
  end # End FeedParser module
363
3548
 
364
3549
  class Serializer
@@ -398,7 +3583,7 @@ class TextSerializer < Serializer
398
3583
  end
399
3584
  end
400
3585
 
401
- class PprintSerializer < Serializer # FIXME use pp instead
3586
+ class PprintSerializer < Serializer # FIXME ? use pp instead?
402
3587
  def write(stream = $stdout)
403
3588
  stream << @results['href'].to_s + "\n\n"
404
3589
  pp(@results)
@@ -406,88 +3591,87 @@ class PprintSerializer < Serializer # FIXME use pp instead
406
3591
  end
407
3592
  end
408
3593
 
409
- if $0 == __FILE__
410
- require 'optparse'
411
- require 'ostruct'
412
- options = OpenStruct.new
413
- options.etag = options.modified = options.agent = options.referrer = nil
414
- options.content_language = options.content_location = options.ctype = nil
415
- options.format = 'pprint'
416
- options.compatible = $compatible
417
- options.verbose = false
418
-
419
- opts = OptionParser.new do |opts|
420
- opts.banner
421
- opts.separator ""
422
- opts.on("-A", "--user-agent [AGENT]",
3594
+
3595
+ require 'optparse'
3596
+ require 'ostruct'
3597
+ options = OpenStruct.new
3598
+ options.etag = options.modified = options.agent = options.referrer = nil
3599
+ options.content_language = options.content_location = options.ctype = nil
3600
+ options.format = 'pprint'
3601
+ options.compatible = $compatible
3602
+ options.verbose = false
3603
+
3604
+ opts = OptionParser.new do |opts|
3605
+ opts.banner
3606
+ opts.separator ""
3607
+ opts.on("-A", "--user-agent [AGENT]",
423
3608
  "User-Agent for HTTP URLs") {|agent|
424
- options.agent = agent
425
- }
3609
+ options.agent = agent
3610
+ }
426
3611
 
427
- opts.on("-e", "--referrer [URL]",
3612
+ opts.on("-e", "--referrer [URL]",
428
3613
  "Referrer for HTTP URLs") {|referrer|
429
- options.referrer = referrer
430
- }
3614
+ options.referrer = referrer
3615
+ }
431
3616
 
432
- opts.on("-t", "--etag [TAG]",
3617
+ opts.on("-t", "--etag [TAG]",
433
3618
  "ETag/If-None-Match for HTTP URLs") {|etag|
434
- options.etag = etag
435
- }
3619
+ options.etag = etag
3620
+ }
436
3621
 
437
- opts.on("-m", "--last-modified [DATE]",
3622
+ opts.on("-m", "--last-modified [DATE]",
438
3623
  "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
439
- options.modified = modified
440
- }
3624
+ options.modified = modified
3625
+ }
441
3626
 
442
- opts.on("-f", "--format [FORMAT]", [:text, :pprint],
3627
+ opts.on("-f", "--format [FORMAT]", [:text, :pprint],
443
3628
  "output resutls in FORMAT (text, pprint)") {|format|
444
- options.format = format
445
- }
3629
+ options.format = format
3630
+ }
446
3631
 
447
- opts.on("-v", "--[no-]verbose",
3632
+ opts.on("-v", "--[no-]verbose",
448
3633
  "write debugging information to stderr") {|v|
449
- options.verbose = v
450
- }
3634
+ options.verbose = v
3635
+ }
451
3636
 
452
- opts.on("-c", "--[no-]compatible",
3637
+ opts.on("-c", "--[no-]compatible",
453
3638
  "strip element attributes like feedparser.py 4.1 (default)") {|comp|
454
- options.compatible = comp
455
- }
456
- opts.on("-l", "--content-location [LOCATION]",
3639
+ options.compatible = comp
3640
+ }
3641
+ opts.on("-l", "--content-location [LOCATION]",
457
3642
  "default Content-Location HTTP header") {|loc|
458
- options.content_location = loc
459
- }
460
- opts.on("-a", "--content-language [LANG]",
3643
+ options.content_location = loc
3644
+ }
3645
+ opts.on("-a", "--content-language [LANG]",
461
3646
  "default Content-Language HTTP header") {|lang|
462
- options.content_language = lang
463
- }
464
- opts.on("-t", "--content-type [TYPE]",
3647
+ options.content_language = lang
3648
+ }
3649
+ opts.on("-t", "--content-type [TYPE]",
465
3650
  "default Content-type HTTP header") {|ctype|
466
- options.ctype = ctype
467
- }
468
- end
3651
+ options.ctype = ctype
3652
+ }
3653
+ end
469
3654
 
470
- opts.parse!(ARGV)
471
- $debug = true if options.verbose
472
- $compatible = options.compatible unless options.compatible.nil?
3655
+ opts.parse!(ARGV)
3656
+ $debug = true if options.verbose
3657
+ $compatible = options.compatible unless options.compatible.nil?
473
3658
 
474
- if options.format == :text
475
- serializer = TextSerializer
476
- else
477
- serializer = PprintSerializer
478
- end
479
- args = *ARGV.dup
480
- unless args.nil?
481
- args.each do |url| # opts.parse! removes everything but the urls from the command line
482
- results = FeedParser.parse(url, :etag => options.etag,
483
- :modified => options.modified,
484
- :agent => options.agent,
485
- :referrer => options.referrer,
486
- :content_location => options.content_location,
487
- :content_language => options.content_language,
488
- :content_type => options.ctype
489
- )
490
- serializer.new(results).write($stdout)
491
- end
3659
+ if options.format == :text
3660
+ serializer = TextSerializer
3661
+ else
3662
+ serializer = PprintSerializer
3663
+ end
3664
+ args = *ARGV.dup
3665
+ unless args.nil?
3666
+ args.each do |url| # opts.parse! removes everything but the urls from the command line
3667
+ results = FeedParser.parse(url, :etag => options.etag,
3668
+ :modified => options.modified,
3669
+ :agent => options.agent,
3670
+ :referrer => options.referrer,
3671
+ :content_location => options.content_location,
3672
+ :content_language => options.content_language,
3673
+ :content_type => options.ctype
3674
+ )
3675
+ serializer.new(results).write($stdout)
492
3676
  end
493
3677
  end