rfeedparser 0.9.87 → 0.9.91

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rfeedparser.rb CHANGED
@@ -14,22 +14,21 @@ require 'stringio'
14
14
  require 'uri'
15
15
  require 'cgi' # escaping html
16
16
  require 'time'
17
- require 'xml/saxdriver' # calling expat
18
17
  require 'pp'
19
18
  require 'rubygems'
20
19
  require 'base64'
21
20
  require 'iconv'
22
- gem 'hpricot', ">=0.5"
21
+
23
22
  gem 'character-encodings', ">=0.2.0"
24
23
  gem 'htmltools', ">=1.10"
25
24
  gem 'htmlentities', ">=4.0.0"
26
25
  gem 'activesupport', ">=1.4.1"
27
26
  gem 'rchardet', ">=1.0"
27
+ require 'xml/saxdriver' # calling expat through the xmlparser gem
28
28
 
29
29
  require 'rchardet'
30
30
  $chardet = true
31
31
 
32
- require 'hpricot'
33
32
  require 'encoding/character/utf-8'
34
33
  require 'html/sgml-parser'
35
34
  require 'htmlentities'
@@ -40,998 +39,24 @@ include OpenURI
40
39
  $debug = false
41
40
  $compatible = true
42
41
 
43
- Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
44
- 'unicode' => 'utf-16',
45
-
46
- # MacOSX does not have Unicode as a separate encoding nor even
47
- # aliased. My Ubuntu box has it as a separate encoding but I cannot
48
- # for the life of me figure out where the source code for UNICODE.so
49
- # is (supposedly, in libc6 .deb but that's a damn lie), so I don't
50
- # know what it expects. After some extensive research, I've decided
51
- # to alias it to utf-16 much like Python does when it is built with
52
- # --enable-unicode=ucs2. This could be seriously wrong. I have no idea.
53
-
54
- # ascii codec
55
- '646' => 'ascii',
56
- 'ansi_x3.4_1968' => 'ascii',
57
- 'ansi_x3_4_1968' => 'ascii', # some email headers use this non-standard name
58
- 'ansi_x3.4_1986' => 'ascii',
59
- 'cp367' => 'ascii',
60
- 'csascii' => 'ascii',
61
- 'ibm367' => 'ascii',
62
- 'iso646_us' => 'ascii',
63
- 'iso_646.irv_1991' => 'ascii',
64
- 'iso_ir_6' => 'ascii',
65
- 'us' => 'ascii',
66
- 'us_ascii' => 'ascii',
67
-
68
- # big5 codec
69
- 'big5_tw' => 'big5',
70
- 'csbig5' => 'big5',
71
-
72
- # big5hkscs codec
73
- 'big5_hkscs' => 'big5hkscs',
74
- 'hkscs' => 'big5hkscs',
75
-
76
- # cp037 codec
77
- '037' => 'cp037',
78
- 'csibm037' => 'cp037',
79
- 'ebcdic_cp_ca' => 'cp037',
80
- 'ebcdic_cp_nl' => 'cp037',
81
- 'ebcdic_cp_us' => 'cp037',
82
- 'ebcdic_cp_wt' => 'cp037',
83
- 'ibm037' => 'cp037',
84
- 'ibm039' => 'cp037',
85
-
86
- # cp1026 codec
87
- '1026' => 'cp1026',
88
- 'csibm1026' => 'cp1026',
89
- 'ibm1026' => 'cp1026',
90
-
91
- # cp1140 codec
92
- '1140' => 'cp1140',
93
- 'ibm1140' => 'cp1140',
94
-
95
- # cp1250 codec
96
- '1250' => 'cp1250',
97
- 'windows_1250' => 'cp1250',
98
-
99
- # cp1251 codec
100
- '1251' => 'cp1251',
101
- 'windows_1251' => 'cp1251',
102
-
103
- # cp1252 codec
104
- '1252' => 'cp1252',
105
- 'windows_1252' => 'cp1252',
106
-
107
- # cp1253 codec
108
- '1253' => 'cp1253',
109
- 'windows_1253' => 'cp1253',
110
-
111
- # cp1254 codec
112
- '1254' => 'cp1254',
113
- 'windows_1254' => 'cp1254',
114
-
115
- # cp1255 codec
116
- '1255' => 'cp1255',
117
- 'windows_1255' => 'cp1255',
118
-
119
- # cp1256 codec
120
- '1256' => 'cp1256',
121
- 'windows_1256' => 'cp1256',
122
-
123
- # cp1257 codec
124
- '1257' => 'cp1257',
125
- 'windows_1257' => 'cp1257',
126
-
127
- # cp1258 codec
128
- '1258' => 'cp1258',
129
- 'windows_1258' => 'cp1258',
130
-
131
- # cp424 codec
132
- '424' => 'cp424',
133
- 'csibm424' => 'cp424',
134
- 'ebcdic_cp_he' => 'cp424',
135
- 'ibm424' => 'cp424',
136
-
137
- # cp437 codec
138
- '437' => 'cp437',
139
- 'cspc8codepage437' => 'cp437',
140
- 'ibm437' => 'cp437',
141
-
142
- # cp500 codec
143
- '500' => 'cp500',
144
- 'csibm500' => 'cp500',
145
- 'ebcdic_cp_be' => 'cp500',
146
- 'ebcdic_cp_ch' => 'cp500',
147
- 'ibm500' => 'cp500',
148
-
149
- # cp775 codec
150
- '775' => 'cp775',
151
- 'cspc775baltic' => 'cp775',
152
- 'ibm775' => 'cp775',
153
-
154
- # cp850 codec
155
- '850' => 'cp850',
156
- 'cspc850multilingual' => 'cp850',
157
- 'ibm850' => 'cp850',
158
-
159
- # cp852 codec
160
- '852' => 'cp852',
161
- 'cspcp852' => 'cp852',
162
- 'ibm852' => 'cp852',
163
-
164
- # cp855 codec
165
- '855' => 'cp855',
166
- 'csibm855' => 'cp855',
167
- 'ibm855' => 'cp855',
168
-
169
- # cp857 codec
170
- '857' => 'cp857',
171
- 'csibm857' => 'cp857',
172
- 'ibm857' => 'cp857',
173
-
174
- # cp860 codec
175
- '860' => 'cp860',
176
- 'csibm860' => 'cp860',
177
- 'ibm860' => 'cp860',
178
-
179
- # cp861 codec
180
- '861' => 'cp861',
181
- 'cp_is' => 'cp861',
182
- 'csibm861' => 'cp861',
183
- 'ibm861' => 'cp861',
184
-
185
- # cp862 codec
186
- '862' => 'cp862',
187
- 'cspc862latinhebrew' => 'cp862',
188
- 'ibm862' => 'cp862',
189
-
190
- # cp863 codec
191
- '863' => 'cp863',
192
- 'csibm863' => 'cp863',
193
- 'ibm863' => 'cp863',
194
-
195
- # cp864 codec
196
- '864' => 'cp864',
197
- 'csibm864' => 'cp864',
198
- 'ibm864' => 'cp864',
199
-
200
- # cp865 codec
201
- '865' => 'cp865',
202
- 'csibm865' => 'cp865',
203
- 'ibm865' => 'cp865',
204
-
205
- # cp866 codec
206
- '866' => 'cp866',
207
- 'csibm866' => 'cp866',
208
- 'ibm866' => 'cp866',
209
-
210
- # cp869 codec
211
- '869' => 'cp869',
212
- 'cp_gr' => 'cp869',
213
- 'csibm869' => 'cp869',
214
- 'ibm869' => 'cp869',
215
-
216
- # cp932 codec
217
- '932' => 'cp932',
218
- 'ms932' => 'cp932',
219
- 'mskanji' => 'cp932',
220
- 'ms_kanji' => 'cp932',
221
-
222
- # cp949 codec
223
- '949' => 'cp949',
224
- 'ms949' => 'cp949',
225
- 'uhc' => 'cp949',
226
-
227
- # cp950 codec
228
- '950' => 'cp950',
229
- 'ms950' => 'cp950',
230
-
231
- # euc_jp codec
232
- 'euc_jp' => 'euc-jp',
233
- 'eucjp' => 'euc-jp',
234
- 'ujis' => 'euc-jp',
235
- 'u_jis' => 'euc-jp',
236
-
237
- # euc_kr codec
238
- 'euc_kr' => 'euc-kr',
239
- 'euckr' => 'euc-kr',
240
- 'korean' => 'euc-kr',
241
- 'ksc5601' => 'euc-kr',
242
- 'ks_c_5601' => 'euc-kr',
243
- 'ks_c_5601_1987' => 'euc-kr',
244
- 'ksx1001' => 'euc-kr',
245
- 'ks_x_1001' => 'euc-kr',
246
-
247
- # gb18030 codec
248
- 'gb18030_2000' => 'gb18030',
249
-
250
- # gb2312 codec
251
- 'chinese' => 'gb2312',
252
- 'csiso58gb231280' => 'gb2312',
253
- 'euc_cn' => 'gb2312',
254
- 'euccn' => 'gb2312',
255
- 'eucgb2312_cn' => 'gb2312',
256
- 'gb2312_1980' => 'gb2312',
257
- 'gb2312_80' => 'gb2312',
258
- 'iso_ir_58' => 'gb2312',
259
-
260
- # gbk codec
261
- '936' => 'gbk',
262
- 'cp936' => 'gbk',
263
- 'ms936' => 'gbk',
264
-
265
- # hp-roman8 codec
266
- 'hp_roman8' => 'hp-roman8',
267
- 'roman8' => 'hp-roman8',
268
- 'r8' => 'hp-roman8',
269
- 'csHPRoman8' => 'hp-roman8',
270
-
271
- # iso2022_jp codec
272
- 'iso2022_jp' => 'iso-2022-jp',
273
- 'csiso2022jp' => 'iso-2022-jp',
274
- 'iso2022jp' => 'iso-2022-jp',
275
- 'iso_2022_jp' => 'iso-2022-jp',
276
-
277
- # iso2022_jp_1 codec
278
- 'iso2002_jp_1' => 'iso-2022-jp-1',
279
- 'iso2022jp_1' => 'iso-2022-jp-1',
280
- 'iso_2022_jp_1' => 'iso-2022-jp-1',
281
-
282
- # iso2022_jp_2 codec
283
- 'iso2022_jp_2' => 'iso-2002-jp-2',
284
- 'iso2022jp_2' => 'iso-2022-jp-2',
285
- 'iso_2022_jp_2' => 'iso-2022-jp-2',
286
-
287
- # iso2022_jp_3 codec
288
- 'iso2002_jp_3' => 'iso-2022-jp-3',
289
- 'iso2022jp_3' => 'iso-2022-jp-3',
290
- 'iso_2022_jp_3' => 'iso-2022-jp-3',
291
-
292
- # iso2022_kr codec
293
- 'iso2022_kr' => 'iso-2022-kr',
294
- 'csiso2022kr' => 'iso-2022-kr',
295
- 'iso2022kr' => 'iso-2022-kr',
296
- 'iso_2022_kr' => 'iso-2022-kr',
297
-
298
- # iso8859_10 codec
299
- 'iso8859_10' => 'iso-8859-10',
300
- 'csisolatin6' => 'iso-8859-10',
301
- 'iso_8859_10' => 'iso-8859-10',
302
- 'iso_8859_10_1992' => 'iso-8859-10',
303
- 'iso_ir_157' => 'iso-8859-10',
304
- 'l6' => 'iso-8859-10',
305
- 'latin6' => 'iso-8859-10',
306
-
307
- # iso8859_13 codec
308
- 'iso8859_13' => 'iso-8859-13',
309
- 'iso_8859_13' => 'iso-8859-13',
310
-
311
- # iso8859_14 codec
312
- 'iso8859_14' => 'iso-8859-14',
313
- 'iso_8859_14' => 'iso-8859-14',
314
- 'iso_8859_14_1998' => 'iso-8859-14',
315
- 'iso_celtic' => 'iso-8859-14',
316
- 'iso_ir_199' => 'iso-8859-14',
317
- 'l8' => 'iso-8859-14',
318
- 'latin8' => 'iso-8859-14',
319
-
320
- # iso8859_15 codec
321
- 'iso8859_15' => 'iso-8859-15',
322
- 'iso_8859_15' => 'iso-8859-15',
323
-
324
- # iso8859_1 codec
325
- 'latin_1' => 'iso-8859-1',
326
- 'cp819' => 'iso-8859-1',
327
- 'csisolatin1' => 'iso-8859-1',
328
- 'ibm819' => 'iso-8859-1',
329
- 'iso8859' => 'iso-8859-1',
330
- 'iso_8859_1' => 'iso-8859-1',
331
- 'iso_8859_1_1987' => 'iso-8859-1',
332
- 'iso_ir_100' => 'iso-8859-1',
333
- 'l1' => 'iso-8859-1',
334
- 'latin' => 'iso-8859-1',
335
- 'latin1' => 'iso-8859-1',
336
-
337
- # iso8859_2 codec
338
- 'iso8859_2' => 'iso-8859-2',
339
- 'csisolatin2' => 'iso-8859-2',
340
- 'iso_8859_2' => 'iso-8859-2',
341
- 'iso_8859_2_1987' => 'iso-8859-2',
342
- 'iso_ir_101' => 'iso-8859-2',
343
- 'l2' => 'iso-8859-2',
344
- 'latin2' => 'iso-8859-2',
345
-
346
- # iso8859_3 codec
347
- 'iso8859_3' => 'iso-8859-3',
348
- 'csisolatin3' => 'iso-8859-3',
349
- 'iso_8859_3' => 'iso-8859-3',
350
- 'iso_8859_3_1988' => 'iso-8859-3',
351
- 'iso_ir_109' => 'iso-8859-3',
352
- 'l3' => 'iso-8859-3',
353
- 'latin3' => 'iso-8859-3',
354
-
355
- # iso8859_4 codec
356
- 'iso8849_4' => 'iso-8859-4',
357
- 'csisolatin4' => 'iso-8859-4',
358
- 'iso_8859_4' => 'iso-8859-4',
359
- 'iso_8859_4_1988' => 'iso-8859-4',
360
- 'iso_ir_110' => 'iso-8859-4',
361
- 'l4' => 'iso-8859-4',
362
- 'latin4' => 'iso-8859-4',
363
-
364
- # iso8859_5 codec
365
- 'iso8859_5' => 'iso-8859-5',
366
- 'csisolatincyrillic' => 'iso-8859-5',
367
- 'cyrillic' => 'iso-8859-5',
368
- 'iso_8859_5' => 'iso-8859-5',
369
- 'iso_8859_5_1988' => 'iso-8859-5',
370
- 'iso_ir_144' => 'iso-8859-5',
371
-
372
- # iso8859_6 codec
373
- 'iso8859_6' => 'iso-8859-6',
374
- 'arabic' => 'iso-8859-6',
375
- 'asmo_708' => 'iso-8859-6',
376
- 'csisolatinarabic' => 'iso-8859-6',
377
- 'ecma_114' => 'iso-8859-6',
378
- 'iso_8859_6' => 'iso-8859-6',
379
- 'iso_8859_6_1987' => 'iso-8859-6',
380
- 'iso_ir_127' => 'iso-8859-6',
381
-
382
- # iso8859_7 codec
383
- 'iso8859_7' => 'iso-8859-7',
384
- 'csisolatingreek' => 'iso-8859-7',
385
- 'ecma_118' => 'iso-8859-7',
386
- 'elot_928' => 'iso-8859-7',
387
- 'greek' => 'iso-8859-7',
388
- 'greek8' => 'iso-8859-7',
389
- 'iso_8859_7' => 'iso-8859-7',
390
- 'iso_8859_7_1987' => 'iso-8859-7',
391
- 'iso_ir_126' => 'iso-8859-7',
392
-
393
- # iso8859_8 codec
394
- 'iso8859_9' => 'iso8859_8',
395
- 'csisolatinhebrew' => 'iso-8859-8',
396
- 'hebrew' => 'iso-8859-8',
397
- 'iso_8859_8' => 'iso-8859-8',
398
- 'iso_8859_8_1988' => 'iso-8859-8',
399
- 'iso_ir_138' => 'iso-8859-8',
400
-
401
- # iso8859_9 codec
402
- 'iso8859_9' => 'iso-8859-9',
403
- 'csisolatin5' => 'iso-8859-9',
404
- 'iso_8859_9' => 'iso-8859-9',
405
- 'iso_8859_9_1989' => 'iso-8859-9',
406
- 'iso_ir_148' => 'iso-8859-9',
407
- 'l5' => 'iso-8859-9',
408
- 'latin5' => 'iso-8859-9',
409
-
410
- # iso8859_11 codec
411
- 'iso8859_11' => 'iso-8859-11',
412
- 'thai' => 'iso-8859-11',
413
- 'iso_8859_11' => 'iso-8859-11',
414
- 'iso_8859_11_2001' => 'iso-8859-11',
415
-
416
- # iso8859_16 codec
417
- 'iso8859_16' => 'iso-8859-16',
418
- 'iso_8859_16' => 'iso-8859-16',
419
- 'iso_8859_16_2001' => 'iso-8859-16',
420
- 'iso_ir_226' => 'iso-8859-16',
421
- 'l10' => 'iso-8859-16',
422
- 'latin10' => 'iso-8859-16',
423
-
424
- # cskoi8r codec
425
- 'koi8_r' => 'cskoi8r',
426
-
427
- # mac_cyrillic codec
428
- 'mac_cyrillic' => 'maccyrillic',
429
-
430
- # shift_jis codec
431
- 'csshiftjis' => 'shift_jis',
432
- 'shiftjis' => 'shift_jis',
433
- 'sjis' => 'shift_jis',
434
- 's_jis' => 'shift_jis',
435
-
436
- # shift_jisx0213 codec
437
- 'shiftjisx0213' => 'shift_jisx0213',
438
- 'sjisx0213' => 'shift_jisx0213',
439
- 's_jisx0213' => 'shift_jisx0213',
440
-
441
- # utf_16 codec
442
- 'utf_16' => 'utf-16',
443
- 'u16' => 'utf-16',
444
- 'utf16' => 'utf-16',
445
-
446
- # utf_16_be codec
447
- 'utf_16_be' => 'utf-16be',
448
- 'unicodebigunmarked' => 'utf-16be',
449
- 'utf_16be' => 'utf-16be',
450
-
451
- # utf_16_le codec
452
- 'utf_16_le' => 'utf-16le',
453
- 'unicodelittleunmarked' => 'utf-16le',
454
- 'utf_16le' => 'utf-16le',
455
-
456
- # utf_7 codec
457
- 'utf_7' => 'utf-7',
458
- 'u7' => 'utf-7',
459
- 'utf7' => 'utf-7',
460
-
461
- # utf_8 codec
462
- 'utf_8' => 'utf-8',
463
- 'u8' => 'utf-8',
464
- 'utf' => 'utf-8',
465
- 'utf8' => 'utf-8',
466
- 'utf8_ucs2' => 'utf-8',
467
- 'utf8_ucs4' => 'utf-8',
468
- }
469
-
470
- def unicode(data, from_encoding)
471
- # Takes a single string and converts it from the encoding in
472
- # from_encoding to unicode.
473
- uconvert(data, from_encoding, 'unicode')
474
- end
475
-
476
- def uconvert(data, from_encoding, to_encoding = 'utf-8')
477
- from_encoding = Encoding_Aliases[from_encoding] || from_encoding
478
- to_encoding = Encoding_Aliases[to_encoding] || to_encoding
479
- Iconv.iconv(to_encoding, from_encoding, data)[0]
480
- end
481
-
482
- def unichr(i)
483
- [i].pack('U*')
484
- end
485
-
486
- def index_match(stri,regexp, offset)
487
- i = stri.index(regexp, offset)
488
-
489
- return nil, nil unless i
490
-
491
- full = stri[i..-1].match(regexp)
492
- return i, full
493
- end
494
-
495
- def _ebcdic_to_ascii(s)
496
- return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
497
- end
498
-
499
- def urljoin(base, uri)
500
- urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
501
- uri = uri.sub(urifixer, '\1\3')
502
-
503
- begin
504
- return URI.join(base, uri).to_s
505
- rescue URI::BadURIError => e
506
- if URI.parse(base).relative?
507
- return URI::parse(uri).to_s
508
- end
509
- end
510
- end
511
-
512
- def py2rtime(pytuple)
513
- Time.utc(pytuple[0..5])
514
- end
515
-
516
- # http://intertwingly.net/stories/2005/09/28/xchar.rb
517
- module XChar
518
- # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
519
- CP1252 = {
520
- 128 => 8364, # euro sign
521
- 130 => 8218, # single low-9 quotation mark
522
- 131 => 402, # latin small letter f with hook
523
- 132 => 8222, # double low-9 quotation mark
524
- 133 => 8230, # horizontal ellipsis
525
- 134 => 8224, # dagger
526
- 135 => 8225, # double dagger
527
- 136 => 710, # modifier letter circumflex accent
528
- 137 => 8240, # per mille sign
529
- 138 => 352, # latin capital letter s with caron
530
- 139 => 8249, # single left-pointing angle quotation mark
531
- 140 => 338, # latin capital ligature oe
532
- 142 => 381, # latin capital letter z with caron
533
- 145 => 8216, # left single quotation mark
534
- 146 => 8217, # right single quotation mark
535
- 147 => 8220, # left double quotation mark
536
- 148 => 8221, # right double quotation mark
537
- 149 => 8226, # bullet
538
- 150 => 8211, # en dash
539
- 151 => 8212, # em dash
540
- 152 => 732, # small tilde
541
- 153 => 8482, # trade mark sign
542
- 154 => 353, # latin small letter s with caron
543
- 155 => 8250, # single right-pointing angle quotation mark
544
- 156 => 339, # latin small ligature oe
545
- 158 => 382, # latin small letter z with caron
546
- 159 => 376} # latin capital letter y with diaeresis
547
-
548
- # http://www.w3.org/TR/REC-xml/#dt-chardata
549
- PREDEFINED = {
550
- 38 => '&', # ampersand
551
- 60 => '<', # left angle bracket
552
- 62 => '>'} # right angle bracket
553
-
554
- # http://www.w3.org/TR/REC-xml/#charsets
555
- VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
556
- (0xE000..0xFFFD), (0x10000..0x10FFFF)]
557
- end
558
-
559
- class Fixnum
560
- # xml escaped version of chr
561
- def xchr
562
- n = XChar::CP1252[self] || self
563
- n = 42 unless XChar::VALID.find {|range| range.include? n}
564
- XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
565
- end
566
- end
567
-
568
- class String
569
- alias :old_index :index
570
- def to_xs
571
- unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
572
- rescue
573
- unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
574
- end
575
- end
576
-
577
- class BetterSGMLParserError < Exception; end;
578
- class BetterSGMLParser < HTML::SGMLParser
579
- # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
580
- # This makes things work.
581
- Interesting = /[&<]/u
582
- Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
583
- '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
584
- '![^<>]*)?', 64) # 64 is the unicode flag
585
-
586
- Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
587
- Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
588
-
589
- Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
590
- Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
591
- Endtagopen = /<\//u # Matching the Python SGMLParser
592
- Endbracket = /[<>]/u
593
- Declopen = /<!/u
594
- Piopenbegin = /^<\?/u
595
- Piclose = />/u
596
-
597
- Commentopen = /<!--/u
598
- Commentclose = /--\s*>/u
599
- Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
600
- Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
601
- '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
602
- 64)
603
- Endtagfind = /\s*\/\s*>/u
604
- def initialize(verbose=false)
605
- super(verbose)
606
- end
607
- def feed(*args)
608
- super(*args)
609
- end
610
-
611
- def goahead(_end)
612
- rawdata = @rawdata # woo, utf-8 magic
613
- i = 0
614
- n = rawdata.length
615
- while i < n
616
- if @nomoretags
617
- # handle_data_range does nothing more than set a "Range" that is never used. wtf?
618
- handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
619
- i = n
620
- break
621
- end
622
- j = rawdata.index(Interesting, i)
623
- j = n unless j
624
- handle_data(rawdata[i...j]) if i < j
625
- i = j
626
- break if (i == n)
627
- if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
628
- if rawdata.index(Starttagopen,i) == i
629
- if @literal
630
- handle_data(rawdata[i..i])
631
- i = i+1
632
- next
633
- end
634
- k = parse_starttag(i)
635
- break unless k
636
- i = k
637
- next
638
- end
639
- if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
640
- k = parse_endtag(i)
641
- break unless k
642
- i = k
643
- @literal = false
644
- next
645
- end
646
- if @literal
647
- if n > (i+1)
648
- handle_data("<")
649
- i = i+1
650
- else
651
- #incomplete
652
- break
653
- end
654
- next
655
- end
656
- if rawdata.index(Commentopen,i) == i
657
- k = parse_comment(i)
658
- break unless k
659
- i = k
660
- next
661
- end
662
- if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
663
- k = parse_pi(i)
664
- break unless k
665
- i += k
666
- next
667
- end
668
- if rawdata.index(Declopen,i) == i
669
- # This is some sort of declaration; in "HTML as
670
- # deployed," this should only be the document type
671
- # declaration ("<!DOCTYPE html...>").
672
- k = parse_declaration(i)
673
- break unless k
674
- i = k
675
- next
676
- end
677
- elsif rawdata[i..i] == '&'
678
- if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
679
- handle_data(rawdata[i..i])
680
- i += 1
681
- next
682
- end
683
-
684
- # the Char must come first as its #=~ method is the only one that is UTF-8 safe
685
- ni,match = index_match(rawdata, Charref, i)
686
- if ni and ni == i # See? Ugly
687
- handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
688
- i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
689
- i -= 1 unless rawdata[i-1..i-1] == ";"
690
- next
691
- end
692
- ni,match = index_match(rawdata, Entityref, i)
693
- if ni and ni == i
694
- handle_entityref(match[1])
695
- i += match[0].length
696
- i -= 1 unless rawdata[i-1..i-1] == ";"
697
- next
698
- end
699
- else
700
- error('neither < nor & ??')
701
- end
702
- # We get here only if incomplete matches but
703
- # nothing else
704
- ni,match = index_match(rawdata,Incomplete,i)
705
- unless ni and ni == 0
706
- handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
707
- i += 1
708
- next
709
- end
710
- j = ni + match[0].length
711
- break if j == n # Really incomplete
712
- handle_data(rawdata[i...j])
713
- i = j
714
- end # end while
715
-
716
- if _end and i < n
717
- handle_data(rawdata[i...n])
718
- i = n
719
- end
720
-
721
- @rawdata = rawdata[i..-1]
722
- # @offset += i # FIXME BUGME another unused variable in SGMLParser?
723
- end
724
-
725
-
726
- # Internal -- parse processing instr, return length or -1 if not terminated
727
- def parse_pi(i)
728
- rawdata = @rawdata
729
- if rawdata[i...i+2] != '<?'
730
- error("unexpected call to parse_pi()")
731
- end
732
- ni,match = index_match(rawdata,Piclose,i+2)
733
- return nil unless match
734
- j = ni
735
- handle_pi(rawdata[i+2...j])
736
- j = (j + match[0].length)
737
- return j-i
738
- end
739
-
740
- def parse_comment(i)
741
- rawdata = @rawdata
742
- if rawdata[i...i+4] != "<!--"
743
- error("unexpected call to parse_comment()")
744
- end
745
- ni,match = index_match(rawdata, Commentclose,i)
746
- return nil unless match
747
- handle_comment(rawdata[i+4..(ni-1)])
748
- return ni+match[0].length # Length from i to just past the closing comment tag
749
- end
750
-
751
-
752
- def parse_starttag(i)
753
- @_starttag_text = nil
754
- start_pos = i
755
- rawdata = @rawdata
756
- ni,match = index_match(rawdata,Shorttagopen,i)
757
- if ni == i
758
- # SGML shorthand: <tag/data/ == <tag>data</tag>
759
- # XXX Can data contain &... (entity or char refs)?
760
- # XXX Can data contain < or > (tag characters)?
761
- # XXX Can there be whitespace before the first /?
762
- k,match = index_match(rawdata,Shorttag,i)
763
- return nil unless match
764
- tag, data = match[1], match[2]
765
- @_starttag_text = "<#{tag}/"
766
- tag.downcase!
767
- second_end = rawdata.index(Shorttagopen,k)
768
- finish_shorttag(tag, data)
769
- @_starttag_text = rawdata[start_pos...second_end+1]
770
- return k
771
- end
772
-
773
- j = rawdata.index(Endbracket, i+1)
774
- return nil unless j
775
- attrsd = []
776
- if rawdata[i...i+2] == '<>'
777
- # SGML shorthand: <> == <last open tag seen>
778
- k = j
779
- tag = @lasttag
780
- else
781
- ni,match = index_match(rawdata,Tagfind,i+1)
782
- unless match
783
- error('unexpected call to parse_starttag')
784
- end
785
- k = ni+match[0].length+1
786
- tag = match[0].downcase
787
- @lasttag = tag
788
- end
789
-
790
- while k < j
791
- break if rawdata.index(Endtagfind, k) == k
792
- ni,match = index_match(rawdata,Attrfind,k)
793
- break unless ni
794
- matched_length = match[0].length
795
- attrname, rest, attrvalue = match[1],match[2],match[3]
796
- if rest.nil? or rest.empty?
797
- attrvalue = '' # was: = attrname # Why the change?
798
- elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
799
- attrvalue = attrvalue[1...-1]
800
- end
801
- attrsd << [attrname.downcase, attrvalue]
802
- k += matched_length
803
- end
804
- if rawdata[j..j] == ">"
805
- j += 1
806
- end
807
- @_starttag_text = rawdata[start_pos...j]
808
- finish_starttag(tag, attrsd)
809
- return j
810
- end
811
-
812
- def parse_endtag(i)
813
- rawdata = @rawdata
814
- j, match = index_match(rawdata, /[<>]/,i+1)
815
- return nil unless j
816
- tag = rawdata[i+2...j].strip.downcase
817
- if rawdata[j..j] == ">"
818
- j += 1
819
- end
820
- finish_endtag(tag)
821
- return j
822
- end
823
-
824
- def output
825
- # Return processed HTML as a single string
826
- return @pieces.map{|p| p.to_s}.join
827
- end
828
-
829
- def error(message)
830
- raise BetterSGMLParserError.new(message)
831
- end
832
- def handle_pi(text)
833
- end
834
- def handle_decl(text)
835
- end
836
- end
837
-
838
- # Add some helper methods to make AttributeList (all of those damn attrs
839
- # and attrsD used by StrictFeedParser) act more like a Hash.
840
- # NOTE AttributeList is still Read-Only (AFAICT).
841
- # Monkey patching is terrible, and I have an addiction.
842
- module XML
843
- module SAX
844
- module AttributeList # in xml/sax.rb
845
- def [](key)
846
- getValue(key)
847
- end
848
-
849
- def each(&blk)
850
- (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
851
- end
852
-
853
- def each_key(&blk)
854
- (0...getLength).each{|pos| yield getName(pos) }
855
- end
856
-
857
- def each_value(&blk)
858
- (0...getLength).each{|pos| yield getValue(pos) }
859
- end
860
-
861
- def to_a # Rather use collect? grep for to_a.collect
862
- l = []
863
- each{|k,v| l << [k,v]}
864
- return l
865
- end
866
-
867
- def to_s
868
- l = []
869
- each{|k,v| l << "#{k} => #{v}"}
870
- "{ "+l.join(", ")+" }"
871
- end
872
- end
873
- end
874
- end
875
-
876
- # This used to be based on Michael Moen's Hpricot#scrub, but that seems to
877
- # have only been part of its evolution. Hpricot#scrub is cool code, though.
878
- # http://underpantsgnome.com/2007/01/20/hpricot-scrub
879
- module Hpricot
880
- Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
881
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
882
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
883
- 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
884
- 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
885
- 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
886
- 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
887
- 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
888
- 'ul', 'var'
889
- ]
890
-
891
- Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
892
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
893
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
894
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
895
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
896
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
897
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
898
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
899
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
900
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
901
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
902
- ]
903
-
904
- Unacceptable_Elements_With_End_Tag = ['script', 'applet']
905
-
906
- Acceptable_Css_Properties = ['azimuth', 'background-color',
907
- 'border-bottom-color', 'border-collapse', 'border-color',
908
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
909
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
910
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
911
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
912
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
913
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
914
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
915
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
916
- 'white-space', 'width'
917
- ]
918
-
919
- # survey of common keywords found in feeds
920
- Acceptable_Css_Keywords = ['auto', 'aqua', 'black', 'block', 'blue',
921
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
922
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
923
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
924
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
925
- 'transparent', 'underline', 'white', 'yellow'
926
- ]
927
-
928
- Mathml_Elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
929
- 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
930
- 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
931
- 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
932
- 'munderover', 'none'
933
- ]
934
-
935
- Mathml_Attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
936
- 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
937
- 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
938
- 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
939
- 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
940
- 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
941
- 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
942
- 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
943
- 'xlink:type', 'xmlns', 'xmlns:xlink'
944
- ]
945
-
946
- # svgtiny - foreignObject + linearGradient + radialGradient + stop
947
- Svg_Elements = ['a', 'animate', 'animateColor', 'animateMotion',
948
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
949
- 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
950
- 'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
951
- 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
952
- 'switch', 'text', 'title', 'use'
953
- ]
954
-
955
- # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
956
- Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
957
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
958
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
959
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
960
- 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
961
- 'font-size', 'font-stretch', 'font-style', 'font-variant',
962
- 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
963
- 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
964
- 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
965
- 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
966
- 'origin', 'overline-position', 'overline-thickness', 'panose-1',
967
- 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
968
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
969
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
970
- 'stop-color', 'stop-opacity', 'strikethrough-position',
971
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
972
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
973
- 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
974
- 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
975
- 'underline-position', 'underline-thickness', 'unicode',
976
- 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
977
- 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
978
- 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
979
- 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
980
- 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
981
- ]
982
-
983
- Svg_Attr_Map = nil
984
- Svg_Elem_Map = nil
985
-
986
- Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
987
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
988
- 'stroke-opacity'
989
- ]
990
-
991
- unless $compatible
992
- @@acceptable_tag_specific_attributes = {}
993
- @@mathml_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@mathml_attributes }
994
- @@svg_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@svg_attributes }
995
- end
996
-
997
- class Elements
998
- def strip_attributes(safe=[])
999
- each { |x| x.strip_attributes(safe) }
1000
- end
1001
-
1002
- def strip_style(ok_props=[], ok_keywords=[]) # NOTE unused so far.
1003
- each { |x| x.strip_style(ok_props, ok_keywords) }
1004
- end
1005
- end
42
+ $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
43
+ require 'rfeedparser/forgiving_uri'
44
+ require 'rfeedparser/aliases'
45
+ require 'rfeedparser/encoding_helpers'
46
+ require 'rfeedparser/better_sgmlparser'
47
+ require 'rfeedparser/better_attributelist'
48
+ require 'rfeedparser/scrub'
49
+ require 'rfeedparser/time_helpers'
50
+ require 'rfeedparser/feedparserdict'
51
+ require 'rfeedparser/parser_mixin'
52
+ require 'rfeedparser/parsers'
53
+ require 'rfeedparser/markup_helpers'
1006
54
 
1007
- class Text
1008
- def strip_attributes(foo)
1009
- end
1010
- end
1011
- class Comment
1012
- def strip_attributes(foo)
1013
- end
1014
- end
1015
- class BogusETag
1016
- def strip_attributes(foo)
1017
- end
1018
- end
55
+ include FeedParserUtilities
1019
56
 
1020
- class Elem
1021
- def strip_attributes
1022
- unless attributes.nil?
1023
- attributes.each do |atr|
1024
- unless Acceptable_Attributes.include?atr[0]
1025
- remove_attribute(atr[0])
1026
- end
1027
- end
1028
- end
1029
- end
1030
- end
1031
- end
1032
57
 
1033
58
  module FeedParser
1034
- Version = "0.9.87"
59
+ Version = "0.9.91"
1035
60
 
1036
61
  License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
1037
62
 
@@ -1059,10 +84,10 @@ POSSIBILITY OF SUCH DAMAGE."""
1059
84
  Author = "Jeff Hodges <http://somethingsimilar.com>"
1060
85
  Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
1061
86
  Contributors = [ "Jason Diamond <http://injektilo.org/>",
1062
- "John Beimler <http://john.beimler.org/>",
1063
- "Fazal Majid <http://www.majid.info/mylos/weblog/>",
1064
- "Aaron Swartz <http://aaronsw.com/>",
1065
- "Kevin Marks <http://epeus.blogspot.com/>"
87
+ "John Beimler <http://john.beimler.org/>",
88
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
+ "Aaron Swartz <http://aaronsw.com/>",
90
+ "Kevin Marks <http://epeus.blogspot.com/>"
1066
91
  ]
1067
92
  # HTTP "User-Agent" header to send to servers when downloading feeds.
1068
93
  # If you are embedding feedparser in a larger application, you should
@@ -1115,2207 +140,27 @@ POSSIBILITY OF SUCH DAMAGE."""
1115
140
  'cdf' => 'CDF',
1116
141
  'hotrss' => 'Hot RSS'
1117
142
  }
1118
- class FeedParserDict < Hash
1119
- =begin
1120
- The naming of a certain common attribute (such as, "When was the last
1121
- time this feed was updated?") can have many different names depending
1122
- on the type of feed we are handling. This class allows us to satisfy
1123
- the expectations of both the developer who has prior knowledge of the
1124
- feed type as well as the developer who wants a consistent application
1125
- interface.
1126
-
1127
- @@keymap is a Hash that contains information on what a certain
1128
- attribute names "really are" in each kind of feed. It does this by
1129
- providing a common name that will map to any feed type in the keys,
1130
- with possible "correct" attributes in the its values. the #[] and #[]=
1131
- methods check with keymaps to see what attribute the developer "really
1132
- means" if they've asked for one which happens to be in @@keymap's keys.
1133
- =end
1134
- @@keymap = {'channel' => 'feed',
1135
- 'items' => 'entries',
1136
- 'guid' => 'id',
1137
- 'date' => 'updated',
1138
- 'date_parsed' => 'updated_parsed',
1139
- 'description' => ['subtitle', 'summary'],
1140
- 'url' => ['href'],
1141
- 'modified' => 'updated',
1142
- 'modified_parsed' => 'updated_parsed',
1143
- 'issued' => 'published',
1144
- 'issued_parsed' => 'published_parsed',
1145
- 'copyright' => 'rights',
1146
- 'copyright_detail' => 'rights_detail',
1147
- 'tagline' => 'subtitle',
1148
- 'tagline_detail' => 'subtitle_detail'}
1149
-
1150
- def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
1151
- return self['entries']
1152
- end
1153
-
1154
- # We could include the [] rewrite in new using Hash.new's fancy pants block thing
1155
- # but we'd still have to overwrite []= and such.
1156
- # I'm going to make it easy to turn lists of pairs into FeedParserDicts's though.
1157
- def initialize(pairs=nil)
1158
- if pairs.class == Array and pairs[0].class == Array and pairs[0].length == 2
1159
- pairs.each do |l|
1160
- k,v = l
1161
- self[k] = v
1162
- end
1163
- elsif pairs.class == Hash
1164
- self.merge!(pairs)
1165
- end
1166
- end
1167
-
1168
- def [](key)
1169
- if key == 'category'
1170
- return self['tags'][0]['term']
1171
- end
1172
- if key == 'categories'
1173
- return self['tags'].collect{|tag| [tag['scheme'],tag['term']]}
1174
- end
1175
- realkey = @@keymap[key] || key
1176
- if realkey.class == Array
1177
- realkey.each{ |key| return self[key] if has_key?key }
1178
- end
1179
- # Note that the original key is preferred over the realkey we (might
1180
- # have) found in @@keymap
1181
- if has_key?(key)
1182
- return super(key)
1183
- end
1184
- return super(realkey)
1185
- end
1186
-
1187
- def []=(key,value)
1188
- if @@keymap.key?key
1189
- key = @@keymap[key]
1190
- if key.class == Array
1191
- key = key[0]
1192
- end
1193
- end
1194
- super(key,value)
1195
- end
1196
-
1197
- def method_missing(msym, *args)
1198
- methodname = msym.to_s
1199
- if methodname[-1] == '='
1200
- return self[methodname[0..-2]] = args[0]
1201
- elsif methodname[-1] != '!' and methodname[-1] != '?' and methodname[0] != "_" # FIXME implement with private
1202
- return self[methodname]
1203
- else
1204
- raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
1205
- end
1206
- end
1207
- end
1208
-
1209
-
1210
-
1211
-
1212
- module FeedParserMixin
1213
- attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
1214
-
1215
- def startup(baseuri=nil, baselang=nil, encoding='utf-8')
1216
- $stderr << "initializing FeedParser\n" if $debug
1217
-
1218
- @namespaces = {'' => '',
1219
- 'http://backend.userland.com/rss' => '',
1220
- 'http://blogs.law.harvard.edu/tech/rss' => '',
1221
- 'http://purl.org/rss/1.0/' => '',
1222
- 'http://my.netscape.com/rdf/simple/0.9/' => '',
1223
- 'http://example.com/newformat#' => '',
1224
- 'http://example.com/necho' => '',
1225
- 'http://purl.org/echo/' => '',
1226
- 'uri/of/echo/namespace#' => '',
1227
- 'http://purl.org/pie/' => '',
1228
- 'http://purl.org/atom/ns#' => '',
1229
- 'http://www.w3.org/2005/Atom' => '',
1230
- 'http://purl.org/rss/1.0/modules/rss091#' => '',
1231
- 'http://webns.net/mvcb/' => 'admin',
1232
- 'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
1233
- 'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
1234
- 'http://media.tangent.org/rss/1.0/' => 'audio',
1235
- 'http://backend.userland.com/blogChannelModule' => 'blogChannel',
1236
- 'http://web.resource.org/cc/' => 'cc',
1237
- 'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
1238
- 'http://purl.org/rss/1.0/modules/company' => 'co',
1239
- 'http://purl.org/rss/1.0/modules/content/' => 'content',
1240
- 'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
1241
- 'http://purl.org/dc/elements/1.1/' => 'dc',
1242
- 'http://purl.org/dc/terms/' => 'dcterms',
1243
- 'http://purl.org/rss/1.0/modules/email/' => 'email',
1244
- 'http://purl.org/rss/1.0/modules/event/' => 'ev',
1245
- 'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
1246
- 'http://freshmeat.net/rss/fm/' => 'fm',
1247
- 'http://xmlns.com/foaf/0.1/' => 'foaf',
1248
- 'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
1249
- 'http://postneo.com/icbm/' => 'icbm',
1250
- 'http://purl.org/rss/1.0/modules/image/' => 'image',
1251
- 'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
1252
- 'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
1253
- 'http://purl.org/rss/1.0/modules/link/' => 'l',
1254
- 'http://search.yahoo.com/mrss' => 'media',
1255
- 'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
1256
- 'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
1257
- 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
1258
- 'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
1259
- 'http://purl.org/rss/1.0/modules/reference/' => 'ref',
1260
- 'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
1261
- 'http://purl.org/rss/1.0/modules/search/' => 'search',
1262
- 'http://purl.org/rss/1.0/modules/slash/' => 'slash',
1263
- 'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
1264
- 'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
1265
- 'http://hacks.benhammersley.com/rss/streaming/' => 'str',
1266
- 'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
1267
- 'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
1268
- 'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
1269
- 'http://purl.org/rss/1.0/modules/threading/' => 'thr',
1270
- 'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
1271
- 'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
1272
- 'http://wellformedweb.org/commentAPI/' => 'wfw',
1273
- 'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
1274
- 'http://www.w3.org/1999/xhtml' => 'xhtml',
1275
- 'http://www.w3.org/XML/1998/namespace' => 'xml',
1276
- 'http://www.w3.org/1999/xlink' => 'xlink',
1277
- 'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
1278
- }
1279
- @matchnamespaces = {}
1280
- @namespaces.each do |l|
1281
- @matchnamespaces[l[0].downcase] = l[1]
1282
- end
1283
- @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
1284
- @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
1285
- @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
1286
- @html_types = ['text/html', 'application/xhtml+xml']
1287
- @feeddata = FeedParserDict.new # feed-level data
1288
- @encoding = encoding # character encoding
1289
- @entries = [] # list of entry-level data
1290
- @version = '' # feed type/version see SUPPORTED_VERSIOSN
1291
- @namespacesInUse = {} # hash of namespaces defined by the feed
1292
-
1293
- # the following are used internall to track state;
1294
- # this is really out of control and should be refactored
1295
- @infeed = false
1296
- @inentry = false
1297
- @incontent = 0 # Yes, this needs to be zero until I work out popContent and pushContent
1298
- @intextinput = false
1299
- @inimage = false
1300
- @inauthor = false
1301
- @incontributor = false
1302
- @inpublisher = false
1303
- @insource = false
1304
- @sourcedata = FeedParserDict.new
1305
- @contentparams = FeedParserDict.new
1306
- @summaryKey = nil
1307
- @namespacemap = {}
1308
- @elementstack = []
1309
- @basestack = []
1310
- @langstack = []
1311
- @baseuri = baseuri || ''
1312
- @lang = baselang || nil
1313
- if baselang
1314
- @feeddata['language'] = baselang.gsub('_','-')
1315
- end
1316
- @date_handlers = [:_parse_date_rfc822,
1317
- :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
1318
- :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
1319
- ]
1320
- $stderr << "Leaving startup\n" if $debug # My addition
1321
- end
1322
-
1323
- def unknown_starttag(tag, attrsd)
1324
- $stderr << "start #{tag} with #{attrsd}\n" if $debug
1325
- # normalize attrs
1326
- attrsD = {}
1327
- attrsd = Hash[*attrsd.flatten] if attrsd.class == Array # Magic! Asterisk!
1328
- # LooseFeedParser needs the above because SGMLParser sends attrs as a
1329
- # list of lists (like [['type','text/html'],['mode','escaped']])
1330
-
1331
- attrsd.each do |old_k,value|
1332
- # There has to be a better, non-ugly way of doing this
1333
- k = old_k.downcase # Downcase all keys
1334
- attrsD[k] = value
1335
- if ['rel','type'].include?value
1336
- attrsD[k].downcase! # Downcase the value if the key is 'rel' or 'type'
1337
- end
1338
- end
1339
-
1340
- # track xml:base and xml:lang
1341
- baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
1342
- @baseuri = urljoin(@baseuri, baseuri)
1343
- lang = attrsD['xml:lang'] || attrsD['lang']
1344
- if lang == '' # FIXME This next bit of code is right? Wtf?
1345
- # xml:lang could be explicitly set to '', we need to capture that
1346
- lang = nil
1347
- elsif lang.nil?
1348
- # if no xml:lang is specified, use parent lang
1349
- lang = @lang
1350
- end
1351
- if lang and not lang.empty? # Seriously, this cannot be correct
1352
- if ['feed', 'rss', 'rdf:RDF'].include?tag
1353
- @feeddata['language'] = lang.gsub('_','-')
1354
- end
1355
- end
1356
- @lang = lang
1357
- @basestack << @baseuri
1358
- @langstack << lang
1359
-
1360
- # track namespaces
1361
- attrsd.each do |prefix, uri|
1362
- if /^xmlns:/ =~ prefix # prefix begins with xmlns:
1363
- trackNamespace(prefix[6..-1], uri)
1364
- elsif prefix == 'xmlns':
1365
- trackNamespace(nil, uri)
1366
- end
1367
- end
1368
-
1369
- # track inline content
1370
- if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
1371
- # element declared itself as escaped markup, but isn't really
1372
-
1373
- @contentparams['type'] = 'application/xhtml+xml'
1374
- end
1375
- if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
1376
- # Note: probably shouldn't simply recreate localname here, but
1377
- # our namespace handling isn't actually 100% correct in cases where
1378
- # the feed redefines the default namespace (which is actually
1379
- # the usual case for inline content, thanks Sam), so here we
1380
- # cheat and just reconstruct the element based on localname
1381
- # because that compensates for the bugs in our namespace handling.
1382
- # This will horribly munge inline content with non-empty qnames,
1383
- # but nobody actually does that, so I'm not fixing it.
1384
- tag = tag.split(':')[-1]
1385
- attrsA = attrsd.to_a.collect{|l| "#{l[0]}=\"#{l[1]}\""}
1386
- attrsS = ' '+attrsA.join(' ')
1387
- return handle_data("<#{tag}#{attrsS}>", escape=false)
1388
- end
1389
-
1390
- # match namespaces
1391
- if /:/ =~ tag
1392
- prefix, suffix = tag.split(':', 2)
1393
- else
1394
- prefix, suffix = '', tag
1395
- end
1396
- prefix = @namespacemap[prefix] || prefix
1397
- if prefix and not prefix.empty?
1398
- prefix = prefix + '_'
1399
- end
1400
-
1401
- # special hack for better tracking of empty textinput/image elements in illformed feeds
1402
- if (not prefix and not prefix.empty?) and not (['title', 'link', 'description','name'].include?tag)
1403
- @intextinput = false
1404
- end
1405
- if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?tag)
1406
- @inimage = false
1407
- end
1408
-
1409
- # call special handler (if defined) or default handler
1410
- begin
1411
- return send('_start_'+prefix+suffix, attrsD)
1412
- rescue NoMethodError
1413
- return push(prefix + suffix, true)
1414
- end
1415
- end # End unknown_starttag
1416
-
1417
- def unknown_endtag(tag)
1418
- $stderr << "end #{tag}\n" if $debug
1419
- # match namespaces
1420
- if tag.index(':')
1421
- prefix, suffix = tag.split(':',2)
1422
- else
1423
- prefix, suffix = '', tag
1424
- end
1425
- prefix = @namespacemap[prefix] || prefix
1426
- if prefix and not prefix.empty?
1427
- prefix = prefix + '_'
1428
- end
1429
-
1430
- # call special handler (if defined) or default handler
1431
- begin
1432
- send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
1433
- rescue NoMethodError => details
1434
- pop(prefix + suffix)
1435
- end
1436
-
1437
- # track inline content
1438
- if @incontent != 0 and @contentparams.has_key?'type' and /xml$/ =~ (@contentparams['type'] || 'xml')
1439
- # element declared itself as escaped markup, but it isn't really
1440
- @contentparams['type'] = 'application/xhtml+xml'
1441
- end
1442
- if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
1443
- tag = tag.split(':')[-1]
1444
- handle_data("</#{tag}>", escape=false)
1445
- end
1446
-
1447
- # track xml:base and xml:lang going out of scope
1448
- if @basestack and not @basestack.empty?
1449
- @basestack.pop
1450
- if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
1451
- @baseuri = @basestack[-1]
1452
- end
1453
- end
1454
- if @langstack and not @langstack.empty?
1455
- @langstack.pop
1456
- if @langstack and not @langstack.empty? # and @langstack[-1] and not @langstack.empty?
1457
- @lang = @langstack[-1]
1458
- end
1459
- end
1460
- end
1461
-
1462
- def handle_charref(ref)
1463
- # LooseParserOnly
1464
- # called for each character reference, e.g. for '&#160;', ref will be '160'
1465
- $stderr << "entering handle_charref with #{ref}\n" if $debug
1466
- return if @elementstack.nil? or @elementstack.empty?
1467
- ref.downcase!
1468
- chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
1469
- if chars.include?ref
1470
- text = "&##{ref};"
1471
- else
1472
- if ref[0..0] == 'x'
1473
- c = (ref[1..-1]).to_i(16)
1474
- else
1475
- c = ref.to_i
1476
- end
1477
- text = uconvert(unichr(c),'unicode')
1478
- end
1479
- @elementstack[-1][2] << text
1480
- end
1481
-
1482
- def handle_entityref(ref)
1483
- # LooseParserOnly
1484
- # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
1485
-
1486
- return if @elementstack.nil? or @elementstack.empty?
1487
- $stderr << "entering handle_entityref with #{ref}\n" if $debug
1488
- ents = ['lt', 'gt', 'quot', 'amp', 'apos']
1489
- if ents.include?ref
1490
- text = "&#{ref};"
1491
- else
1492
- text = HTMLEntities::decode_entities("&#{ref};")
1493
- end
1494
- @elementstack[-1][2] << text
1495
- end
1496
-
1497
- def handle_data(text, escape=true)
1498
- # called for each block of plain text, i.e. outside of any tag and
1499
- # not containing any character or entity references
1500
- return if @elementstack.nil? or @elementstack.empty?
1501
- if escape and @contentparams['type'] == 'application/xhtml+xml'
1502
- text = text.to_xs
1503
- end
1504
- @elementstack[-1][2] << text
1505
- end
1506
-
1507
- def handle_comment(comment)
1508
- # called for each comment, e.g. <!-- insert message here -->
1509
- end
1510
-
1511
- def handle_pi(text)
1512
- end
1513
-
1514
- def handle_decl(text)
1515
- end
1516
-
1517
- def parse_declaration(i)
1518
- # for LooseFeedParser
1519
- $stderr << "entering parse_declaration\n" if $debug
1520
- if @rawdata[i...i+9] == '<![CDATA['
1521
- k = @rawdata.index(/\]\]>/u,i+9)
1522
- k = @rawdata.length unless k
1523
- handle_data(@rawdata[i+9...k].to_xs,false)
1524
- return k+3
1525
- else
1526
- k = @rawdata.index(/>/,i).to_i
1527
- return k+1
1528
- end
1529
- end
1530
-
1531
- def mapContentType(contentType)
1532
- contentType.downcase!
1533
- case contentType
1534
- when 'text'
1535
- contentType = 'text/plain'
1536
- when 'html'
1537
- contentType = 'text/html'
1538
- when 'xhtml'
1539
- contentType = 'application/xhtml+xml'
1540
- end
1541
- return contentType
1542
- end
1543
-
1544
- def trackNamespace(prefix, uri)
1545
-
1546
- loweruri = uri.downcase.strip
1547
- if [prefix, loweruri] == [nil, 'http://my.netscape.com/rdf/simple/0.9/'] and (@version.nil? or @version.empty?)
1548
- @version = 'rss090'
1549
- elsif loweruri == 'http://purl.org/rss/1.0/' and (@version.nil? or @version.empty?)
1550
- @version = 'rss10'
1551
- elsif loweruri == 'http://www.w3.org/2005/atom' and (@version.nil? or @version.empty?)
1552
- @version = 'atom10'
1553
- elsif /backend\.userland\.com\/rss/ =~ loweruri
1554
- # match any backend.userland.com namespace
1555
- uri = 'http://backend.userland.com/rss'
1556
- loweruri = uri
1557
- end
1558
- if @matchnamespaces.has_key? loweruri
1559
- @namespacemap[prefix] = @matchnamespaces[loweruri]
1560
- @namespacesInUse[@matchnamespaces[loweruri]] = uri
1561
- else
1562
- @namespacesInUse[prefix || ''] = uri
1563
- end
1564
- end
1565
-
1566
- def resolveURI(uri)
1567
- return urljoin(@baseuri || '', uri)
1568
- end
1569
-
1570
- def decodeEntities(element, data)
1571
- return data
1572
- end
1573
-
1574
- def push(element, expectingText)
1575
- @elementstack << [element, expectingText, []]
1576
- end
1577
-
1578
- def pop(element, stripWhitespace=true)
1579
- return if @elementstack.nil? or @elementstack.empty?
1580
- return if @elementstack[-1][0] != element
1581
- element, expectingText, pieces = @elementstack.pop
1582
- if pieces.class == Array
1583
- output = pieces.join('')
1584
- else
1585
- output = pieces
1586
- end
1587
- if stripWhitespace
1588
- output.strip!
1589
- end
1590
- return output if not expectingText
1591
-
1592
- # decode base64 content
1593
- if @contentparams['base64']
1594
- out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
1595
- if not output.empty? and not out64.empty?
1596
- output = out64
1597
- end
1598
- end
1599
-
1600
- # resolve relative URIs
1601
- if @can_be_relative_uri.include?element and output and not output.empty?
1602
- output = resolveURI(output)
1603
- end
1604
-
1605
- # decode entities within embedded markup
1606
- if not @contentparams['base64']
1607
- output = decodeEntities(element, output)
1608
- end
1609
-
1610
- # remove temporary cruft from contentparams
1611
- @contentparams.delete('mode')
1612
- @contentparams.delete('base64')
1613
-
1614
- # resolve relative URIs within embedded markup
1615
- if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
1616
- if @can_contain_relative_uris.include?element
1617
- output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
1618
- end
1619
- end
1620
- # sanitize embedded markup
1621
- if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
1622
- if @can_contain_dangerous_markup.include?element
1623
- output = FeedParser.sanitizeHTML(output, @encoding)
1624
- end
1625
- end
1626
-
1627
- if @encoding and not @encoding.empty? and @encoding != 'utf-8'
1628
- output = uconvert(output, @encoding, 'utf-8')
1629
- # FIXME I turn everything into utf-8, not unicode, originally because REXML was being used but now beause I haven't tested it out yet.
1630
- end
1631
-
1632
- # categories/tags/keywords/whatever are handled in _end_category
1633
- return output if element == 'category'
1634
-
1635
- # store output in appropriate place(s)
1636
- if @inentry and not @insource
1637
- if element == 'content'
1638
- @entries[-1][element] ||= []
1639
- contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
1640
- contentparams['value'] = output
1641
- @entries[-1][element] << contentparams
1642
- elsif element == 'link'
1643
- @entries[-1][element] = output
1644
- if output and not output.empty?
1645
- @entries[-1]['links'][-1]['href'] = output
1646
- end
1647
- else
1648
- element = 'summary' if element == 'description'
1649
- @entries[-1][element] = output
1650
- if @incontent != 0
1651
- contentparams = Marshal.load(Marshal.dump(@contentparams))
1652
- contentparams['value'] = output
1653
- @entries[-1][element + '_detail'] = contentparams
1654
- end
1655
- end
1656
- elsif (@infeed or @insource) and not @intextinput and not @inimage
1657
- context = getContext()
1658
- element = 'subtitle' if element == 'description'
1659
- context[element] = output
1660
- if element == 'link'
1661
- context['links'][-1]['href'] = output
1662
- elsif @incontent != 0
1663
- contentparams = Marshal.load(Marshal.dump(@contentparams))
1664
- contentparams['value'] = output
1665
- context[element + '_detail'] = contentparams
1666
- end
1667
- end
1668
- return output
1669
- end
1670
-
1671
- def pushContent(tag, attrsD, defaultContentType, expectingText)
1672
- @incontent += 1 # Yes, I hate this.
1673
- type = mapContentType(attrsD['type'] || defaultContentType)
1674
- @contentparams = FeedParserDict.new({'type' => type,'language' => @lang,'base' => @baseuri})
1675
- @contentparams['base64'] = isBase64(attrsD, @contentparams)
1676
- push(tag, expectingText)
1677
- end
1678
-
1679
- def popContent(tag)
1680
- value = pop(tag)
1681
- @incontent -= 1
1682
- @contentparams.clear
1683
- return value
1684
- end
1685
-
1686
- def mapToStandardPrefix(name)
1687
- colonpos = name.index(':')
1688
- if colonpos
1689
- prefix = name[0..colonpos-1]
1690
- suffix = name[colonpos+1..-1]
1691
- prefix = @namespacemap[prefix] || prefix
1692
- name = prefix + ':' + suffix
1693
- end
1694
- return name
1695
- end
1696
-
1697
- def getAttribute(attrsD, name)
1698
- return attrsD[mapToStandardPrefix(name)]
1699
- end
1700
-
1701
- def isBase64(attrsD, contentparams)
1702
- return true if (attrsD['mode'] == 'base64')
1703
- if /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
1704
- return false
1705
- end
1706
- return true
1707
- end
1708
-
1709
- def itsAnHrefDamnIt(attrsD)
1710
- href= attrsD['url'] || attrsD['uri'] || attrsD['href']
1711
- if href
1712
- attrsD.delete('url')
1713
- attrsD.delete('uri')
1714
- attrsD['href'] = href
1715
- end
1716
- return attrsD
1717
- end
1718
-
1719
-
1720
- def _save(key, value)
1721
- context = getContext()
1722
- context[key] ||= value
1723
- end
1724
-
1725
- def _start_rss(attrsD)
1726
- versionmap = {'0.91' => 'rss091u',
1727
- '0.92' => 'rss092',
1728
- '0.93' => 'rss093',
1729
- '0.94' => 'rss094'
1730
- }
1731
-
1732
- if not @version or @version.empty?
1733
- attr_version = attrsD['version'] || ''
1734
- version = versionmap[attr_version]
1735
- if version and not version.empty?
1736
- @version = version
1737
- elsif /^2\./ =~ attr_version
1738
- @version = 'rss20'
1739
- else
1740
- @version = 'rss'
1741
- end
1742
- end
1743
- end
1744
-
1745
- def _start_dlhottitles(attrsD)
1746
- @version = 'hotrss'
1747
- end
1748
-
1749
- def _start_channel(attrsD)
1750
- @infeed = true
1751
- _cdf_common(attrsD)
1752
- end
1753
- alias :_start_feedinfo :_start_channel
1754
-
1755
- def _cdf_common(attrsD)
1756
- if attrsD.has_key?'lastmod'
1757
- _start_modified({})
1758
- @elementstack[-1][-1] = attrsD['lastmod']
1759
- _end_modified
1760
- end
1761
- if attrsD.has_key?'href'
1762
- _start_link({})
1763
- @elementstack[-1][-1] = attrsD['href']
1764
- _end_link
1765
- end
1766
- end
1767
-
1768
- def _start_feed(attrsD)
1769
- @infeed = true
1770
- versionmap = {'0.1' => 'atom01',
1771
- '0.2' => 'atom02',
1772
- '0.3' => 'atom03'
1773
- }
1774
-
1775
- if not @version or @version.empty?
1776
- attr_version = attrsD['version']
1777
- version = versionmap[attr_version]
1778
- if @version and not @version.empty?
1779
- @version = version
1780
- else
1781
- @version = 'atom'
1782
- end
1783
- end
1784
- end
1785
-
1786
- def _end_channel
1787
- @infeed = false
1788
- end
1789
- alias :_end_feed :_end_channel
1790
-
1791
- def _start_image(attrsD)
1792
- @inimage = true
1793
- push('image', false)
1794
- context = getContext()
1795
- context['image'] ||= FeedParserDict.new
1796
- end
1797
-
1798
- def _end_image
1799
- pop('image')
1800
- @inimage = false
1801
- end
1802
-
1803
- def _start_textinput(attrsD)
1804
- @intextinput = true
1805
- push('textinput', false)
1806
- context = getContext()
1807
- context['textinput'] ||= FeedParserDict.new
1808
- end
1809
- alias :_start_textInput :_start_textinput
1810
-
1811
- def _end_textinput
1812
- pop('textinput')
1813
- @intextinput = false
1814
- end
1815
- alias :_end_textInput :_end_textinput
1816
-
1817
- def _start_author(attrsD)
1818
- @inauthor = true
1819
- push('author', true)
1820
- end
1821
- alias :_start_managingeditor :_start_author
1822
- alias :_start_dc_author :_start_author
1823
- alias :_start_dc_creator :_start_author
1824
- alias :_start_itunes_author :_start_author
1825
-
1826
- def _end_author
1827
- pop('author')
1828
- @inauthor = false
1829
- _sync_author_detail()
1830
- end
1831
- alias :_end_managingeditor :_end_author
1832
- alias :_end_dc_author :_end_author
1833
- alias :_end_dc_creator :_end_author
1834
- alias :_end_itunes_author :_end_author
1835
-
1836
- def _start_itunes_owner(attrsD)
1837
- @inpublisher = true
1838
- push('publisher', false)
1839
- end
1840
-
1841
- def _end_itunes_owner
1842
- pop('publisher')
1843
- @inpublisher = false
1844
- _sync_author_detail('publisher')
1845
- end
1846
-
1847
- def _start_contributor(attrsD)
1848
- @incontributor = true
1849
- context = getContext()
1850
- context['contributors'] ||= []
1851
- context['contributors'] << FeedParserDict.new
1852
- push('contributor', false)
1853
- end
1854
-
1855
- def _end_contributor
1856
- pop('contributor')
1857
- @incontributor = false
1858
- end
1859
-
1860
- def _start_dc_contributor(attrsD)
1861
- @incontributor = true
1862
- context = getContext()
1863
- context['contributors'] ||= []
1864
- context['contributors'] << FeedParserDict.new
1865
- push('name', false)
1866
- end
1867
-
1868
- def _end_dc_contributor
1869
- _end_name
1870
- @incontributor = false
1871
- end
1872
-
1873
- def _start_name(attrsD)
1874
- push('name', false)
1875
- end
1876
- alias :_start_itunes_name :_start_name
1877
-
1878
- def _end_name
1879
- value = pop('name')
1880
- if @inpublisher
1881
- _save_author('name', value, 'publisher')
1882
- elsif @inauthor
1883
- _save_author('name', value)
1884
- elsif @incontributor
1885
- _save_contributor('name', value)
1886
- elsif @intextinput
1887
- context = getContext()
1888
- context['textinput']['name'] = value
1889
- end
1890
- end
1891
- alias :_end_itunes_name :_end_name
1892
-
1893
- def _start_width(attrsD)
1894
- push('width', false)
1895
- end
1896
-
1897
- def _end_width
1898
- value = pop('width').to_i
1899
- if @inimage
1900
- context = getContext
1901
- context['image']['width'] = value
1902
- end
1903
- end
1904
-
1905
- def _start_height(attrsD)
1906
- push('height', false)
1907
- end
1908
-
1909
- def _end_height
1910
- value = pop('height').to_i
1911
- if @inimage
1912
- context = getContext()
1913
- context['image']['height'] = value
1914
- end
1915
- end
1916
-
1917
- def _start_url(attrsD)
1918
- push('href', true)
1919
- end
1920
- alias :_start_homepage :_start_url
1921
- alias :_start_uri :_start_url
1922
-
1923
- def _end_url
1924
- value = pop('href')
1925
- if @inauthor
1926
- _save_author('href', value)
1927
- elsif @incontributor
1928
- _save_contributor('href', value)
1929
- elsif @inimage
1930
- context = getContext()
1931
- context['image']['href'] = value
1932
- elsif @intextinput
1933
- context = getContext()
1934
- context['textinput']['link'] = value
1935
- end
1936
- end
1937
- alias :_end_homepage :_end_url
1938
- alias :_end_uri :_end_url
1939
-
1940
- def _start_email(attrsD)
1941
- push('email', false)
1942
- end
1943
- alias :_start_itunes_email :_start_email
1944
-
1945
- def _end_email
1946
- value = pop('email')
1947
- if @inpublisher
1948
- _save_author('email', value, 'publisher')
1949
- elsif @inauthor
1950
- _save_author('email', value)
1951
- elsif @incontributor
1952
- _save_contributor('email', value)
1953
- end
1954
- end
1955
- alias :_end_itunes_email :_end_email
1956
-
1957
- def getContext
1958
- if @insource
1959
- context = @sourcedata
1960
- elsif @inentry
1961
- context = @entries[-1]
1962
- else
1963
- context = @feeddata
1964
- end
1965
- return context
1966
- end
1967
-
1968
- def _save_author(key, value, prefix='author')
1969
- context = getContext()
1970
- context[prefix + '_detail'] ||= FeedParserDict.new
1971
- context[prefix + '_detail'][key] = value
1972
- _sync_author_detail()
1973
- end
1974
-
1975
- def _save_contributor(key, value)
1976
- context = getContext
1977
- context['contributors'] ||= [FeedParserDict.new]
1978
- context['contributors'][-1][key] = value
1979
- end
1980
-
1981
- def _sync_author_detail(key='author')
1982
- context = getContext()
1983
- detail = context["#{key}_detail"]
1984
- if detail and not detail.empty?
1985
- name = detail['name']
1986
- email = detail['email']
1987
-
1988
- if name and email and not (name.empty? or name.empty?)
1989
- context[key] = "#{name} (#{email})"
1990
- elsif name and not name.empty?
1991
- context[key] = name
1992
- elsif email and not email.empty?
1993
- context[key] = email
1994
- end
1995
- else
1996
- author = context[key].dup unless context[key].nil?
1997
- return if not author or author.empty?
1998
- emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
1999
- email = emailmatch[1]
2000
- author.gsub!(email, '')
2001
- author.gsub!("\(\)", '')
2002
- author.strip!
2003
- author.gsub!(/^\(/,'')
2004
- author.gsub!(/\)$/,'')
2005
- author.strip!
2006
- context["#{key}_detail"] ||= FeedParserDict.new
2007
- context["#{key}_detail"]['name'] = author
2008
- context["#{key}_detail"]['email'] = email
2009
- end
2010
- end
2011
-
2012
- def _start_subtitle(attrsD)
2013
- pushContent('subtitle', attrsD, 'text/plain', true)
2014
- end
2015
- alias :_start_tagline :_start_subtitle
2016
- alias :_start_itunes_subtitle :_start_subtitle
2017
-
2018
- def _end_subtitle
2019
- popContent('subtitle')
2020
- end
2021
- alias :_end_tagline :_end_subtitle
2022
- alias :_end_itunes_subtitle :_end_subtitle
2023
-
2024
# Start handler for <rights>, <dc:rights> and <copyright>: begins collecting
# the element as 'text/plain' content stored under 'rights'.
def _start_rights(attrsD)
  pushContent('rights', attrsD, 'text/plain', true)
end
alias _start_dc_rights _start_rights
alias _start_copyright _start_rights
2029
-
2030
# End handler for <rights>, <dc:rights> and <copyright>: finishes the
# 'rights' content.
def _end_rights
  popContent('rights')
end
alias _end_dc_rights _end_rights
alias _end_copyright _end_rights
2035
-
2036
# Start handler for <item>, <entry> and <product>: appends a fresh entry dict,
# pushes 'item' on the element stack, marks the parser as being inside an
# entry and resets the guid-is-link flag. A non-empty rdf:about attribute
# becomes the entry id. CDF attributes are then processed by _cdf_common.
def _start_item(attrsD)
  @entries << FeedParserDict.new
  push('item', false)
  @inentry = true
  @guidislink = false
  about = getAttribute(attrsD, 'rdf:about')
  getContext()['id'] = about if about && !about.empty?
  _cdf_common(attrsD)
end
alias _start_entry _start_item
alias _start_product _start_item
2050
-
2051
# End handler for <item> / <entry>: pops 'item' from the element stack and
# records that the parser has left the entry.
def _end_item
  pop('item')
  @inentry = false
end
alias _end_entry _end_item
2056
-
2057
# Start handler for <dc:language> / <language>: pushes 'language' and expects
# text content.
def _start_dc_language(attrsD)
  push('language', true)
end
alias _start_language _start_dc_language
2061
-
2062
# End handler for <dc:language> / <language>: the popped text becomes the
# parser's current language (@lang).
def _end_dc_language
  @lang = pop('language')
end
alias _end_language _end_dc_language
2066
-
2067
# Start handler for <dc:publisher> / <webMaster>: pushes 'publisher' and
# expects text content.
def _start_dc_publisher(attrsD)
  push('publisher', true)
end
alias _start_webmaster _start_dc_publisher
2071
-
2072
# End handler for <dc:publisher> / <webMaster>: pops the publisher text and
# then splits it into publisher / publisher_detail via _sync_author_detail.
def _end_dc_publisher
  pop('publisher')
  _sync_author_detail('publisher')
end
alias _end_webmaster _end_dc_publisher
2077
-
2078
# Start handler for <published>, <dcterms:issued> and <issued>: pushes
# 'published' and expects text content.
def _start_published(attrsD)
  push('published', true)
end
alias _start_dcterms_issued _start_published
alias _start_issued _start_published
2083
-
2084
# End handler for <published>, <dcterms:issued> and <issued>: pops the raw
# date text and saves its parsed form under 'published_parsed'.
def _end_published
  raw = pop('published')
  _save('published_parsed', parse_date(raw))
end
alias _end_dcterms_issued _end_published
alias _end_issued _end_published
2090
-
2091
# Start handler for <updated>, <modified>, <dcterms:modified>, <pubDate> and
# <dc:date>: pushes 'updated' and expects text content.
def _start_updated(attrsD)
  push('updated', true)
end
alias _start_modified _start_updated
alias _start_dcterms_modified _start_updated
alias _start_pubdate _start_updated
alias _start_dc_date _start_updated
2098
-
2099
# End handler for <updated> and its aliases: pops the raw date text and saves
# its parsed form under 'updated_parsed'.
def _end_updated
  raw = pop('updated')
  _save('updated_parsed', parse_date(raw))
end
alias _end_modified _end_updated
alias _end_dcterms_modified _end_updated
alias _end_pubdate _end_updated
alias _end_dc_date _end_updated
2107
-
2108
# Start handler for <created> / <dcterms:created>: pushes 'created' and
# expects text content.
def _start_created(attrsD)
  push('created', true)
end
alias _start_dcterms_created _start_created
2112
-
2113
# End handler for <created> / <dcterms:created>: pops the raw date text and
# saves its parsed form under 'created_parsed'.
def _end_created
  raw = pop('created')
  _save('created_parsed', parse_date(raw))
end
alias _end_dcterms_created _end_created
2118
-
2119
# Start handler for <expirationDate>: pushes 'expired' and expects text
# content.
def _start_expirationdate(attrsD)
  push('expired', true)
end
2122
# End handler for <expirationDate>: pops the raw date text and saves its
# parsed form under 'expired_parsed'.
def _end_expirationdate
  raw = pop('expired')
  _save('expired_parsed', parse_date(raw))
end
2125
-
2126
# Start handler for <cc:license rdf:resource="...">. The licence URL lives in
# the attribute rather than in element text, so the element is pushed, the
# attribute value (when non-empty) is appended to the pieces of the element
# just pushed, and the element is popped again immediately.
#
# Fixes relative to the earlier version:
# * 'license' is now popped unconditionally; previously it stayed on the
#   element stack whenever rdf:resource was missing or empty.
# * the stack is reached through @elementstack, as in
#   _start_admin_errorreportsto, instead of a bare +elementstack+ call.
def _start_cc_license(attrsD)
  push('license', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    @elementstack[-1][2] << value
  end
  pop('license')
end
2134
-
2135
# Start handler for <creativeCommons:license>: pushes 'license' and expects
# text content (the licence URL is the element text here).
def _start_creativecommons_license(attrsD)
  push('license', true)
end
2138
-
2139
# End handler for <creativeCommons:license>: pops 'license'.
def _end_creativecommons_license
  pop('license')
end
2142
-
2143
# Adds a {term, scheme, label} tag to the current context's 'tags' list.
# The list is created on first use (even when the tag itself is rejected).
# A tag whose three parts are all nil or empty is ignored, and a tag equal to
# one already present is not added again.
def addTag(term, scheme, label)
  tags = (getContext()['tags'] ||= [])
  return if [term, scheme, label].all? { |part| part.nil? || part.empty? }

  tag = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  tags << tag unless tags.include?(tag)
end
2155
-
2156
# Start handler for <category>, <dc:subject> and <keywords>: registers a tag
# built from the 'term', 'scheme' (or, failing that, 'domain') and 'label'
# attributes, then pushes 'category' to collect any element text.
def _start_category(attrsD)
  $stderr << "entering _start_category with #{attrsD}\n" if $debug

  addTag(attrsD['term'], attrsD['scheme'] || attrsD['domain'], attrsD['label'])
  push('category', true)
end
alias _start_dc_subject _start_category
alias _start_keywords _start_category
2167
-
2168
# End handler for <itunes:keywords>: the popped text is split on whitespace
# and every word is added as a tag in the iTunes scheme.
def _end_itunes_keywords
  keywords = pop('itunes_keywords').split
  keywords.each { |term| addTag(term, 'http://www.itunes.com/', nil) }
end
2173
-
2174
# Start handler for <itunes:category text="...">: adds the 'text' attribute as
# a tag in the iTunes scheme and pushes 'category'.
def _start_itunes_category(attrsD)
  addTag(attrsD['text'], 'http://www.itunes.com/', nil)
  push('category', true)
end
2178
-
2179
# End handler for <category>, <dc:subject>, <keywords> and <itunes:category>.
# Element text, when present, becomes the term of the most recent tag if that
# tag has no term yet; otherwise it is added as a new term-only tag.
#
# Fixes relative to the earlier version: the Ruby-1.8-only trailing colon on
# the +if+ line (a syntax error from Ruby 1.9 on) is gone, and a context
# without a 'tags' list no longer raises but falls through to addTag.
def _end_category
  value = pop('category')
  return if value.nil? or value.empty?
  tags = getContext()['tags']
  if tags and not tags.empty? and not tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
alias :_end_dc_subject :_end_category
alias :_end_keywords :_end_category
alias :_end_itunes_category :_end_category
2193
-
2194
# Start handler for <cloud>: stores the element's attributes, wrapped in a
# FeedParserDict, under 'cloud' in the current context.
def _start_cloud(attrsD)
  context = getContext()
  context['cloud'] = FeedParserDict.new(attrsD)
end
2197
-
2198
# Start handler for <link> / <productURL>.
#
# Missing 'rel' and 'type' attributes default to 'alternate' and 'text/html'
# (set on the hash that was passed in), the attributes are normalised by
# itsAnHrefDamnIt, and any 'href' is resolved with resolveURI. The link is
# appended to context['links']; an enclosure link is additionally handed to
# _start_enclosure. A link carrying an href is complete: when it is an
# alternate link of an HTML type it becomes context['link']. A link without
# an href is pushed so that its text can be collected.
def _start_link(attrsD)
  attrsD['rel'] ||= 'alternate'
  attrsD['type'] ||= 'text/html'
  attrsD = itsAnHrefDamnIt(attrsD)
  attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')

  expecting_text = @infeed || @inentry || @insource
  context = getContext()
  (context['links'] ||= []) << FeedParserDict.new(attrsD)
  _start_enclosure(attrsD) if attrsD['rel'] == 'enclosure'

  if attrsD.has_key?('href')
    is_alternate = (attrsD['rel'] == 'alternate')
    if is_alternate && @html_types.include?(mapContentType(attrsD['type']))
      context['link'] = attrsD['href']
    end
  else
    push('link', expecting_text)
  end
end
alias _start_producturl _start_link
2222
-
2223
# End handler for <link> / <productURL>: pops the link text and, when the
# parser is inside a <textInput> and/or an <image>, records it as that
# element's link.
def _end_link
  value = pop('link')
  context = getContext()
  context['textinput']['link'] = value if @intextinput
  context['image']['link'] = value if @inimage
end
alias _end_producturl _end_link
2234
-
2235
# Start handler for <guid>: remembers whether the guid doubles as a permalink
# (the 'ispermalink' attribute is absent or exactly 'true') and pushes 'id'.
def _start_guid(attrsD)
  permalink = attrsD['ispermalink'] || 'true'
  @guidislink = (permalink == 'true')
  push('id', true)
end
2239
-
2240
# End handler for <guid>: pops the id text, saves 'guidislink' (true only
# when the guid is a permalink and the context has no 'link' key yet), and,
# when the guid is a permalink, offers it to _save as the 'link'.
#
# The Ruby-1.8-only trailing colon after the +if+ condition has been removed;
# it is a syntax error from Ruby 1.9 on.
def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink and not getContext().has_key?('link')))
  if @guidislink
    # guid acts as link, but only if 'ispermalink' is not present or is 'true',
    # and only if the item doesn't already have a link element
    _save('link', value)
  end
end
2249
-
2250
-
2251
# Start handler for <title>, <dc:title> and <media:title>: begins collecting
# 'text/plain' content under 'title'. Text is expected only while inside a
# feed, entry or source element.
def _start_title(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('title', attrsD, 'text/plain', expecting_text)
end
alias _start_dc_title _start_title
alias _start_media_title _start_title
2256
-
2257
# End handler for <title> and its aliases: finishes the 'title' content and
# copies it into the enclosing <textInput> or, failing that, the enclosing
# <image>, when the parser is inside one.
def _end_title
  title = popContent('title')
  context = getContext()
  holder = if @intextinput
             'textinput'
           elsif @inimage
             'image'
           end
  context[holder]['title'] = title if holder
end
alias _end_dc_title _end_title
alias _end_media_title _end_title
2268
-
2269
# Start handler for <description>. When the context already has a 'summary',
# the description is treated as full content (and @summaryKey records that so
# _end_description can finish it the same way); otherwise it is collected as
# 'text/html' under 'description'.
def _start_description(attrsD)
  unless getContext().has_key?('summary')
    pushContent('description', attrsD, 'text/html', @infeed || @inentry || @insource)
    return
  end
  @summaryKey = 'content'
  _start_content(attrsD)
end
2278
-
2279
# Start handler for <abstract>: collected as 'text/plain' under
# 'description'; text is expected only inside a feed, entry or source.
def _start_abstract(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('description', attrsD, 'text/plain', expecting_text)
end
2282
-
2283
# End handler for <description> / <abstract>. When the matching start handler
# redirected the element to content (@summaryKey == 'content') it is finished
# through _end_content; otherwise the 'description' content is popped and
# copied into the enclosing <textInput> or <image>, if any. @summaryKey is
# always reset afterwards.
#
# The Ruby-1.8-only trailing colon after +elsif @inimage+ has been removed;
# it is a syntax error from Ruby 1.9 on.
def _end_description
  if @summaryKey == 'content'
    _end_content()
  else
    value = popContent('description')
    context = getContext()
    if @intextinput
      context['textinput']['description'] = value
    elsif @inimage
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
alias :_end_abstract :_end_description
2298
-
2299
# Start handler for <info> / <feedburner:browserFriendly>: begins collecting
# 'text/plain' content under 'info'.
def _start_info(attrsD)
  pushContent('info', attrsD, 'text/plain', true)
end
alias _start_feedburner_browserfriendly _start_info
2303
-
2304
# End handler for <info> / <feedburner:browserFriendly>: finishes the 'info'
# content.
def _end_info
  popContent('info')
end
alias _end_feedburner_browserfriendly _end_info
2308
-
2309
# Start handler for <generator>: non-empty attributes are normalised by
# itsAnHrefDamnIt and any 'href' is resolved; the attributes are stored as
# 'generator_detail' and 'generator' is pushed to collect the name text.
def _start_generator(attrsD)
  if attrsD && !attrsD.empty?
    attrsD = itsAnHrefDamnIt(attrsD)
    attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')
  end
  getContext()['generator_detail'] = FeedParserDict.new(attrsD)
  push('generator', true)
end
2319
-
2320
# End handler for <generator>: the popped text becomes the 'name' of the
# context's 'generator_detail', when that key exists.
def _end_generator
  name = pop('generator')
  context = getContext()
  context['generator_detail']['name'] = name if context.has_key?('generator_detail')
end
2327
-
2328
# Start handler for <admin:generatorAgent rdf:resource="...">. The generator
# URL lives in the attribute, so the element is pushed, the attribute value
# (when non-empty) is appended to the pieces of the element just pushed, and
# the element is popped again at once. 'generator_detail' is then replaced by
# a dict holding the URL as 'href' (nil when the attribute is missing).
#
# The stack is reached through @elementstack, as in
# _start_admin_errorreportsto, rather than through a bare +elementstack+ call.
def _start_admin_generatoragent(attrsD)
  push('generator', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    @elementstack[-1][2] << value
  end
  pop('generator')
  getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
end
2337
-
2338
# Start handler for <admin:errorReportsTo rdf:resource="...">: pushes
# 'errorreportsto', appends a non-empty rdf:resource value to the pieces of
# the element just pushed, and pops it again immediately.
def _start_admin_errorreportsto(attrsD)
  push('errorreportsto', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource if resource && !resource.empty?
  pop('errorreportsto')
end
2346
-
2347
# Start handler for <summary> / <itunes:summary>. A second summary in the same
# context is treated as full content; the first is collected as 'text/plain'
# under 'summary'. @summaryKey records which path was taken.
def _start_summary(attrsD)
  if getContext().has_key?('summary')
    @summaryKey = 'content'
    _start_content(attrsD)
    return
  end
  @summaryKey = 'summary'
  pushContent(@summaryKey, attrsD, 'text/plain', true)
end
alias _start_itunes_summary _start_summary
2358
-
2359
# End handler for <summary> / <itunes:summary>: finishes the element through
# _end_content when the start handler redirected it to content, otherwise
# pops the content stored under @summaryKey (or 'summary' when unset).
# @summaryKey is always reset afterwards.
#
# The Ruby-1.8-only trailing colon after the +if+ condition has been removed;
# it is a syntax error from Ruby 1.9 on.
def _end_summary
  if @summaryKey == 'content'
    _end_content()
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
alias :_end_itunes_summary :_end_summary
2368
-
2369
# Start handler for <enclosure> (also called by _start_link for
# rel="enclosure" links): normalises the attributes, appends them to
# context['enclosures'], and uses a non-empty href as the context id when no
# id has been set yet.
def _start_enclosure(attrsD)
  attrsD = itsAnHrefDamnIt(attrsD)
  getContext()['enclosures'] ||= []
  getContext()['enclosures'] << FeedParserDict.new(attrsD)
  href = attrsD['href']
  if href && !href.empty?
    context = getContext()
    context['id'] = href unless context['id']
  end
end
2381
-
2382
# Start handler for <source>: marks the parser as being inside a source
# element.
def _start_source(attrsD)
  @insource = true
end
2385
-
2386
# End handler for <source>: leaves source mode, stores a deep copy (made by a
# Marshal round-trip) of the accumulated @sourcedata under 'source' in the
# current context, and empties @sourcedata for the next source element.
def _end_source
  @insource = false
  context = getContext()
  context['source'] = Marshal.load(Marshal.dump(@sourcedata))
  @sourcedata.clear
end
2391
-
2392
# Start handler for <content>: begins collecting 'text/plain' content under
# 'content', records a non-empty 'src' attribute in @contentparams, and then
# pushes 'content' once more with text expected.
#
# The Ruby-1.8-only trailing colon after the +if+ condition has been removed;
# it is a syntax error from Ruby 1.9 on.
def _start_content(attrsD)
  pushContent('content', attrsD, 'text/plain', true)
  src = attrsD['src']
  if src and not src.empty?
    @contentparams['src'] = src
  end
  push('content', true)
end
2400
-
2401
# Start handler for <prodlink>: collected as 'text/html' content under
# 'content'.
def _start_prodlink(attrsD)
  pushContent('content', attrsD, 'text/html', true)
end
2404
-
2405
# Start handler for <body> / <xhtml:body>: collected as
# 'application/xhtml+xml' content under 'content'.
def _start_body(attrsD)
  pushContent('content', attrsD, 'application/xhtml+xml', true)
end
alias _start_xhtml_body _start_body
2409
-
2410
# Start handler for <content:encoded> / <fullitem>: collected as 'text/html'
# content under 'content'.
def _start_content_encoded(attrsD)
  pushContent('content', attrsD, 'text/html', true)
end
alias _start_fullitem _start_content_encoded
2414
-
2415
# End handler for <content> and the content-like elements aliased below.
# The content type is inspected *before* the content is popped; when it maps
# to 'text/plain' or one of @html_types the popped value is also offered to
# _save as the 'description'.
#
# The aliases used to sit inside the method body, so _end_body,
# _end_xhtml_body, _end_content_encoded, _end_fullitem and _end_prodlink did
# not exist until _end_content itself had been called once. They are now
# declared right after the method.
def _end_content
  copyToDescription = (['text/plain'] + @html_types).include?(mapContentType(@contentparams['type']))
  value = popContent('content')
  if copyToDescription
    _save('description', value)
  end
end
alias :_end_body :_end_content
alias :_end_xhtml_body :_end_content
alias :_end_content_encoded :_end_content
alias :_end_fullitem :_end_content
alias :_end_prodlink :_end_content
2427
-
2428
# Start handler for <itunes:image> / <itunes:link>: pushes 'itunes_image'
# (no text expected) and stores the 'href' attribute as the context image.
def _start_itunes_image(attrsD)
  push('itunes_image', false)
  image = FeedParserDict.new({'href' => attrsD['href']})
  getContext()['image'] = image
end
alias _start_itunes_link _start_itunes_image
2433
-
2434
# End handler for <itunes:block>: stores true under 'itunes_block' when the
# popped text is exactly 'yes', false otherwise.
# (In the earlier spelling the trailing "and true or false" bound more loosely
# than the assignment, so the value stored was already this plain boolean.)
def _end_itunes_block
  value = pop('itunes_block', false)
  getContext()['itunes_block'] = (value == 'yes')
end
2438
-
2439
# End handler for <itunes:explicit>: stores true under 'itunes_explicit' when
# the popped text is exactly 'yes', false otherwise.
# (In the earlier spelling the trailing "and true or false" bound more loosely
# than the assignment, so the value stored was already this plain boolean.)
def _end_itunes_explicit
  value = pop('itunes_explicit', false)
  getContext()['itunes_explicit'] = (value == 'yes')
end
2443
-
2444
-
2445
- # ISO-8601 date parsing routines written by Fazal Majid.
2446
- # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2447
- # parser is beyond the scope of feedparser and the current Time.iso8601
2448
- # method does not work.
2449
- # A single regular expression cannot parse ISO 8601 date formats into groups
2450
- # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2451
- # 0301-04-01), so we use templates instead.
2452
- # Please note the order in templates is significant because we need a
2453
- # greedy match.
2454
# Parses a variety of ISO-8601-compatible formats such as "20040105",
# "2004-01-05", "2003-335" (ordinal day) or "2003-12-31T10:14:55-08:00".
#
# The string is tried against an ordered list of date templates (order
# matters: the first template that matches wins), each followed by an
# optional time-of-day and zone suffix. Missing year/month/day fields default
# to the current UTC date, except that the day defaults to 1 when a century,
# year or month was given.
#
# Returns a UTC Time, or nil when nothing at the start of the string matched
# or the zone designator is not understood. Time.utc / Date.ordinal may raise
# for out-of-range fields; callers (parse_date) rescue that.
#
# Fixes relative to the earlier version:
# * the time suffix was wrapped in a *capturing* group, so every time field
#   was shifted by one ('hour' received the whole time string, 'tz' received
#   the seconds, ...) and any input carrying a time was rejected;
# * the zone offset was added to the day and month slots instead of being
#   applied to the time; it is now applied as seconds, which also carries
#   across day boundaries;
# * the first template had lost its second optional dash, so the documented
#   "20040105" form was read as year 2004, ordinal day 010;
# * every template is now anchored at the start of the string, so a dash
#   sequence in the middle of an unrelated string is no longer picked up.
def _parse_date_iso8601(dateString)
  # [template, names of its capture groups] -- order is significant.
  templates = [
    ['^(\d{4})-?([01]\d)-?([0123]\d)', ['year', 'month', 'day']],
    ['^(\d{4})-([01]\d)',              ['year', 'month']],
    ['^(\d{4})-?([0123]\d\d)',         ['year', 'ordinal']],
    ['^(\d\d)-?([01]\d)-?([0123]\d)',  ['year', 'month', 'day']],
    ['^(\d\d)-?([0123]\d\d)',          ['year', 'ordinal']],
    ['^(\d{4})',                       ['year']],
    ['-(\d\d)-?([01]\d)',              ['year', 'month']],
    ['-([0123]\d\d)',                  ['ordinal']],
    ['-(\d\d)',                        ['year']],
    ['--([01]\d)-?([0123]\d)',         ['month', 'day']],
    ['--([01]\d)',                     ['month']],
    ['---([0123]\d)',                  ['day']],
    ['(\d\d$)',                        ['century']],
    ['',                               []]
  ]
  # Optional time of day and zone. Only the six named fields capture; every
  # purely structural group is (?: ... ).
  time_suffix = '(?:T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
  time_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']

  m = nil
  param_keys = []
  templates.each do |pattern, fields|
    source = '\A' + pattern + time_suffix
    $stderr << "Trying iso8601 regexp: #{source}\n" if $debug
    m = dateString.match(Regexp.new(source))
    if m
      param_keys = fields + time_fields
      break
    end
  end
  # The empty template always matches; an empty match means "not ISO 8601".
  return if m.nil? or m.end(0).zero?

  params = {}
  param_keys.each_with_index do |key, i|
    params[key] = m[i + 1]
  end

  ordinal = params['ordinal'].to_i unless params['ordinal'].nil?

  year = params['year']
  if year.nil? or year.empty?
    year = Time.now.utc.year
  elsif year.length == 2
    # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
    year = 100 * (Time.now.utc.year / 100) + year.to_i
  else
    year = year.to_i
  end

  month = params['month']
  if month.nil? or month.empty?
    # An ordinal day determines the month; otherwise use the current one.
    month = ordinal ? Date.ordinal(year, ordinal).month : Time.now.utc.month
  else
    month = month.to_i
  end

  day = params['day']
  if day.nil? or day.empty?
    if ordinal
      day = Date.ordinal(year, ordinal).day
    elsif params['century'] or params['year'] or params['month']
      day = 1
    else
      day = Time.now.utc.day
    end
  else
    day = day.to_i
  end

  # special case of the century - is the first year of the 21st century
  # 2000 or 2001 ? The debate goes on...
  if params.has_key? 'century'
    year = (params['century'].to_i - 1) * 100 + 1
  end

  # in ISO 8601 most fields are optional; nil.to_i is 0
  hour = params['hour'].to_i
  minute = params['minute'].to_i
  second = params['second'].to_i
  result = Time.utc(year, month, day, hour, minute, second)

  tz = params['tz']
  if tz and not tz.empty? and tz != 'Z'
    offset = params['tzhour'].to_i * 3600 + params['tzmin'].to_i * 60
    case tz[0, 1]
    when '-' then result += offset # west of Greenwich: UTC is later
    when '+' then result -= offset # east of Greenwich: UTC is earlier
    else return nil
    end
  end
  result
end
2578
-
2579
# Parses a date in the OnBlog 8-bit format (year, month and day each followed
# by its Korean marker, then HH:MM:SS) by rewriting it as a W3DTF string with
# a fixed +09:00 offset and delegating to _parse_date_w3dtf.
# 8-bit date handling routines written by ytrewq1.
# Returns nil when the string does not match.
def _parse_date_onblog(dateString)
  year_mark  = u("년") # b3e2 in euc-kr
  month_mark = u("월") # bff9 in euc-kr
  day_mark   = u("일") # c0cf in euc-kr

  pattern = /(\d{4})#{year_mark}\s+(\d{2})#{month_mark}\s+(\d{2})#{day_mark}\s+(\d{2}):(\d{2}):(\d{2})/
  m = pattern.match(dateString)
  return unless m

  year, month, day, hour, minute, second = m.captures
  w3dtfdate = "#{year}-#{month}-#{day}T#{hour}:#{minute}:#{second}+09:00"
  $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
2597
-
2598
# Parses a date in the Nate 8-bit format ("YYYY-MM-DD <AM|PM marker> H:M:S",
# with the Korean AM/PM markers) by rewriting it as a W3DTF string with a
# fixed +09:00 offset and delegating to _parse_date_w3dtf.
# 8-bit date handling routines written by ytrewq1.
# Returns nil when the string does not match.
#
# Fix relative to the earlier version: the 12-hour clock is now converted
# correctly. Previously 12 was simply added for PM, so 12:xx PM became hour 24
# (rolling over into the next day) and 12:xx AM stayed at hour 12.
def _parse_date_nate(dateString)
  korean_am = u("오전") # bfc0 c0fc in euc-kr
  korean_pm = u("오후") # bfc0 c8c4 in euc-kr

  korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
  m = korean_nate_date_re.match(dateString)
  return unless m
  hour = m[5].to_i
  ampm = m[4]
  hour = 0 if hour == 12           # 12 AM is 00:xx, 12 PM is 12:xx
  hour += 12 if ampm == korean_pm
  hour = hour.to_s.rjust(2,'0')
  w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
  $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
2617
-
2618
# Parses an MS SQL style date ("YYYY-MM-DD HH:MM:SS", optionally followed by
# fractional seconds, which are ignored) by rewriting it as a W3DTF string
# and delegating to _parse_date_w3dtf. Returns nil when the string does not
# match.
# NOTE(review): the +09:00 offset is hard-coded here exactly as in the Korean
# handlers -- confirm that is intended for this format.
def _parse_date_mssql(dateString)
  m = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/.match(dateString)
  return unless m

  year, month, day, hour, minute, second = m.captures
  w3dtfdate = "#{year}-#{month}-#{day}T#{hour}:#{minute}:#{second}+09:00"
  $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
2627
-
2628
# Parses a date in a Greek 8-bit format ("<weekday>, DD <month> YYYY HH:MM:SS
# <zone>") by translating the Greek weekday and month abbreviations to their
# English RFC 822 forms and delegating to _parse_date_rfc822.
# Returns nil when the string does not match the layout or when the weekday
# or month abbreviation is not in the tables below.
#
# Fix relative to the earlier version: the lookups were wrapped in
# begin/rescue, but Hash#[] returns nil instead of raising, so an unknown
# abbreviation produced a malformed date string (", 11  2004 ...") that was
# still handed on. Unknown abbreviations are now rejected explicitly.
def _parse_date_greek(dateString)
  # Unicode strings for Greek date strings
  greek_months = {
    u("Ιαν") => u("Jan"),  # c9e1ed in iso-8859-7
    u("Φεβ") => u("Feb"),  # d6e5e2 in iso-8859-7
    u("Μάώ") => u("Mar"),  # ccdcfe in iso-8859-7
    u("Μαώ") => u("Mar"),  # cce1fe in iso-8859-7
    u("Απρ") => u("Apr"),  # c1f0f1 in iso-8859-7
    u("Μάι") => u("May"),  # ccdce9 in iso-8859-7
    u("Μαϊ") => u("May"),  # cce1fa in iso-8859-7
    u("Μαι") => u("May"),  # cce1e9 in iso-8859-7
    u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
    u("Ιον") => u("Jun"),  # c9efed in iso-8859-7
    u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
    u("Ιολ") => u("Jul"),  # c9f9eb in iso-8859-7
    u("Αύγ") => u("Aug"),  # c1fde3 in iso-8859-7
    u("Αυγ") => u("Aug"),  # c1f5e3 in iso-8859-7
    u("Σεπ") => u("Sep"),  # d3e5f0 in iso-8859-7
    u("Οκτ") => u("Oct"),  # cfeaf4 in iso-8859-7
    u("Νοέ") => u("Nov"),  # cdefdd in iso-8859-7
    u("Νοε") => u("Nov"),  # cdefe5 in iso-8859-7
    u("Δεκ") => u("Dec"),  # c4e5ea in iso-8859-7
  }

  greek_wdays = {
    u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
    u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
    u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
    u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
    u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
    u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
    u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
  }

  greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/

  m = greek_date_format.match(dateString)
  return unless m
  wday = greek_wdays[m[1]]
  month = greek_months[m[3]]
  return nil unless wday and month
  rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
  $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
  return _parse_date_rfc822(rfc822date)
end
2677
-
2678
# Parses a date in a Hungarian 8-bit format ("YYYY-<month name>-DDTHH:MM±HH:MM")
# by translating the Hungarian month name to its number, zero-padding day and
# hour, and delegating to _parse_date_w3dtf (seconds are set to 00).
# Returns nil when the string does not match the layout or the month name is
# not in the table below.
#
# Fix relative to the earlier version: the month lookup was wrapped in
# begin/rescue, but Hash#[] returns nil instead of raising, so an unknown
# month produced "YYYY--DD..." which the W3DTF parser silently read as
# 1 January of that year. Unknown month names are now rejected.
def _parse_date_hungarian(dateString)
  hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
  m = hungarian_date_format_re.match(dateString)
  return unless m

  # Unicode strings for Hungarian date strings
  hungarian_months = {
    u("január") => u("01"),   # e1 in iso-8859-2
    u("februári") => u("02"), # e1 in iso-8859-2
    u("március") => u("03"),  # e1 in iso-8859-2
    u("április") => u("04"),  # e1 in iso-8859-2
    u("máujus") => u("05"),   # e1 in iso-8859-2
    u("június") => u("06"),   # fa in iso-8859-2
    u("július") => u("07"),   # fa in iso-8859-2
    u("augusztus") => u("08"),
    u("szeptember") => u("09"),
    u("október") => u("10"),  # f3 in iso-8859-2
    u("november") => u("11"),
    u("december") => u("12"),
  }
  month = hungarian_months[m[2]]
  return unless month
  day = m[3].rjust(2,'0')
  hour = m[4].rjust(2,'0')

  w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
  $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
2711
-
2712
# Splits +num+ into what stays in the current unit and what carries into the
# next larger one: returns [num % modulus, num / modulus].
# Example: rollover(61, 60) => [1, 1] (61 seconds is 1 second, carry 1 minute).
def rollover(num, modulus)
  remainder = num % modulus
  carry = num / modulus
  [remainder, carry]
end
2715
-
2716
# Returns num / modulus, unless that quotient is zero, in which case +num+
# itself is returned unchanged.
def set_self(num, modulus)
  quotient = num / modulus
  quotient == 0 ? num : quotient
end
2723
- # W3DTF-style date parsing
2724
- # FIXME shouldn't it be "W3CDTF"?
2725
# Parses a W3DTF date ("YYYY", "YYYY-MM", "YYYY-MM-DD" or
# "YYYY-MM-DDThh:mm:ss" followed by "Z" or a "+hh:mm"/"-hh:mm" offset) and
# returns the corresponding UTC Time.
#
# Ruby's own Time.xmlschema / Time.iso8601 are not used because out-of-range
# values must be tolerated: 61 seconds, 60 minutes, 24 hours, a month above
# 12 or a day beyond the end of the month are rolled over into the next
# larger unit. Missing month and day default to 1.
#
# Relies on the sibling helpers rollover and set_self and on
# Time.days_in_month (not part of core Ruby; presumably supplied by the
# activesupport gem the file loads).
#
# Returns nil when the string does not start with a four-digit year (the
# earlier version raised NoMethodError on the nil match in that case).
def _parse_date_w3dtf(dateString)
  m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)
  return nil unless m

  # Year, month and day: absent (nil) or zero values become 1.
  year, month, day = m[1..3].map { |s| n = s.to_i; n == 0 ? 1 : n }
  # Hour, minute and second: absent values become 0.
  hour, minute, second = m[4..6].map { |s| s.to_i }
  zone = m[7] # "Z", "+hh:mm", "-hh:mm" or nil

  # Roll overflowing units upwards: 61 s -> 1 min 1 s, and so on.
  second, carry = rollover(second, 60)
  minute += carry
  minute, carry = rollover(minute, 60)
  hour += carry
  hour, carry = rollover(hour, 24)
  day += carry

  if month > 12
    month, carry = rollover(month, 12)
    month = 12 if month == 0
    year += carry
  end

  num_days = Time.days_in_month(month, year)
  while day > num_days
    day -= num_days
    month += 1
    if month > 12
      year += 1
      month = set_self(month, 12)
    end
    num_days = Time.days_in_month(month, year)
  end

  # The fields are local to the given offset, so converting to UTC means
  # applying the offset with its sign reversed.
  if zone.is_a?(String)
    if /^-/ =~ zone
      zone = '+' + zone[1..-1]
    elsif /^\+/ =~ zone
      zone = '-' + zone[1..-1]
    end
  end
  return Time.utc(year, month, day, hour, minute, second) + Time.zone_offset(zone || "UTC")
end
2772
-
2773
- def _parse_date_rfc822(dateString)
2774
- # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
2775
- # These first few lines are to fix up the stupid proprietary format from Disney
2776
- unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
2777
- 'CT' => 'CST', 'MT' => 'MST',
2778
- 'PT' => 'PST'
2779
- }
2780
-
2781
- mon = dateString.split[2]
2782
- if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
2783
- dateString.sub!(mon,mon[0..2])
2784
- end
2785
- if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
2786
- dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
2787
- end
2788
- # Okay, the Disney date format should be fixed up now.
2789
- rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
2790
- if rfc.to_a.length > 1 and rfc.to_a.include? nil
2791
- dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
2792
- hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
2793
- tz ||= "GMT"
2794
- end
2795
- asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
2796
- if asctime_match.to_a.length > 1
2797
- # Month-abbr dayofmonth hour:minute:second year
2798
- dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
2799
- day.to_s.rjust(2,'0')
2800
- end
2801
- if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
2802
- ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
2803
- else
2804
- ds = dateString
2805
- end
2806
- t = Time.rfc2822(ds).utc
2807
- return t
2808
- end
2809
-
2810
# Parses a Perforce-style date in "yyyy/mm/dd hh:mm:ss TTT" format, which may
# start with a day of the week (e.g. "Fri, 2006/09/15 08:19:53 EDT"), using
# Time.parse, and returns it converted to UTC.
# FIXME not in 4.1?
def _parse_date_perforce(aDateString)
  parsed = Time.parse(aDateString)
  return parsed.utc
end
2816
-
2817
# Flattens a Time-like object into a nine-element array:
#   [year, month, mday, hour, min, sec, weekday, yday, isdst]
# weekday is (wday - 1) % 7, i.e. Monday is 0 and Sunday is 6, and isdst is
# 1 or 0 rather than true/false.
# NOTE leave the error handling to parse_date (a nil argument raises here).
# The earlier version also ran map! over a temporary slice of the array,
# which had no effect on the result; that no-op is omitted.
def extract_tuple(atime)
  weekday = (atime.wday - 1) % 7
  dst_flag = atime.isdst ? 1 : 0
  return [atime.year, atime.month, atime.mday,
          atime.hour, atime.min, atime.sec,
          weekday, atime.yday, dst_flag]
end
2828
-
2829
# Tries every handler named in @date_handlers, in order, on +dateString+ and
# returns the tuple (see extract_tuple) for the first one that succeeds.
# A handler "fails" by raising or by returning something extract_tuple cannot
# unpack (such as nil); failures are logged to $stderr when $debug is set and
# the next handler is tried. Returns nil when no handler succeeds.
#
# Only StandardError is rescued. The earlier +rescue Exception+ also
# swallowed Interrupt, SystemExit and NoMemoryError, so a Ctrl-C or exit
# during date parsing was silently turned into "try the next handler".
def parse_date(dateString)
  @date_handlers.each do |handler|
    begin
      $stderr << "Trying date_handler #{handler}\n" if $debug
      datething = extract_tuple(send(handler, dateString))
      return datething
    rescue StandardError => e
      $stderr << "#{handler} raised #{e}\n" if $debug
    end
  end
  return nil
end
2841
-
2842
- end # End FeedParserMixin
2843
-
2844
# Strict parser: a SAX handler for the expat driver that turns well-formed
# XML events into the generic unknown_starttag / unknown_endtag / handle_data
# calls provided by FeedParserMixin.
#
# Element names are handled as "namespace-uri;localname" (see the regexp in
# _qualified_tag_name); the URI is mapped through @matchnamespaces to the
# prefix the mixin expects.
#
# Fixes relative to the earlier version:
# * getAttrs iterated over 0..getLength (inclusive) and so read one entry
#   past the end of the attribute list;
# * endElement computed the prefixed name but then passed the raw
#   "uri;name" string on, so end tags did not match the names produced by
#   startElement; both now share one helper, and the caller's name string is
#   no longer downcased in place;
# * a wrapped comment line had become a stray string literal in startElement.
class StrictFeedParser < XML::SAX::HandlerBase # expat
  include FeedParserMixin

  attr_accessor :bozo, :entries, :feeddata, :exc

  # Initialises the shared parser state via startup, clears the error flags
  # and then lets the SAX base class initialise itself.
  def initialize(baseuri, baselang, encoding)
    $stderr << "trying StrictFeedParser\n" if $debug
    startup(baseuri, baselang, encoding)
    @bozo = false
    @exc = nil
    super()
  end

  # Returns [system id, line number] from the current document locator.
  def getPos
    [@locator.getSystemId, @locator.getLineNumber]
  end

  # Converts a SAX attribute list into an array of [name, value] pairs.
  def getAttrs(attrs)
    (0...attrs.getLength).map do |i|
      [attrs.getName(i), attrs.getValue(i)]
    end
  end

  def setDocumentLocator(loc)
    @locator = loc
  end

  def startDoctypeDecl(name, pub_sys, long_name, uri)
    #Nothing is done here. What could we do that is neat and useful?
  end

  def startNamespaceDecl(prefix, uri)
    trackNamespace(prefix, uri)
  end

  def endNamespaceDecl(prefix)
  end

  def startElement(name, attrs)
    # No need to raise UndeclaredNamespace, Expat does that for us with
    # "unbound prefix (XMLParserError)"
    unknown_starttag(_qualified_tag_name(name), attrs)
  end

  def character(text, start, length)
    #handle_data(CGI.unescapeHTML(text))
    handle_data(text)
  end
  # expat provides "character" not "characters"!
  alias :characters :character # Just in case.

  def startCdata(content)
    handle_data(content)
  end

  def endElement(name)
    unknown_endtag(_qualified_tag_name(name))
  end

  def comment(comment)
    handle_comment(comment)
  end

  def entityDecl(*foo)
  end

  def unparsedEntityDecl(*foo)
  end

  # Records a recoverable parse error: the feed is flagged as "bozo" and the
  # exception is kept in #exc.
  def error(exc)
    @bozo = true
    @exc = exc
  end

  # Records the error like #error and then re-raises it.
  def fatalError(exc)
    error(exc)
    raise exc
  end

  private

  # Turns a "namespace-uri;localname" element name into the lower-cased
  # "prefix:localname" form used by the mixin. Names without a namespace URI,
  # or whose URI has no known prefix, are simply lower-cased.
  def _qualified_tag_name(name)
    name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
    namespaceuri = ($2 || '').downcase
    localname = $3
    if /backend\.userland\.com\/rss/ =~ namespaceuri
      # match any backend.userland.com namespace
      namespaceuri = 'http://backend.userland.com/rss'
    end
    prefix = @matchnamespaces[namespaceuri]
    if prefix and not prefix.empty?
      localname = prefix + ':' + localname
    end
    localname.downcase
  end
end
2942
-
2943
# Loose parser: an SGML-based fallback for feeds that are not well-formed
# XML.
#
# We write the methods that were in BaseHTMLProcessor in the python code
# in here directly. We do this because if we inherited from
# BaseHTMLProcessor but then included from FeedParserMixin, the methods
# of Mixin would overwrite the methods we inherited from
# BaseHTMLProcessor. This is exactly the opposite of what we want to
# happen!
#
# Fix relative to the earlier version: #parse replaced the numeric entity
# &#34; (a double quote) with an apostrophe; it now yields a double quote.
class LooseFeedParser < BetterSGMLParser
  include FeedParserMixin

  attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse

  # Void HTML elements: a self-closing tag for one of these is left alone by
  # #parse instead of being expanded into an open/close pair.
  Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
                         'img', 'input', 'isindex', 'link', 'meta', 'param']
  New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/

  # Keep the inherited SGML parser's feed method reachable as sgml_feed,
  # because #feed is redefined below to expose the parsed feed data instead.
  alias :sgml_feed :feed
  def feed
    @feeddata
  end
  def feed=(data)
    @feeddata = data
  end

  def initialize(baseuri, baselang, encoding)
    startup(baseuri, baselang, encoding)
    super() # Keep the parentheses! No touchy.
  end

  def reset
    @pieces = []
    super
  end

  # Pre-processes +data+ and runs it through the SGML parser.
  # Note that +data+ is modified in place by the substitutions below.
  #
  # * "<!" that does not start a DOCTYPE, comment or marked section is
  #   escaped;
  # * attribute-less self-closing tags (<foo/>) are expanded to <foo></foo>,
  #   except for the void elements in Elements_No_End_Tag;
  # * the numeric entities for apostrophe and double quote are decoded;
  # * when an encoding is known, the data is converted with uconvert.
  def parse(data)
    data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
    data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
      clean = tag[1..-3].strip
      if Elements_No_End_Tag.include?clean
        tag
      else
        '<'+clean+'></'+clean+'>'
      end
    end

    data.gsub!(/&#39;/, "'")
    data.gsub!(/&#34;/, '"')
    if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
      data = uconvert(data,'utf-8',@encoding)
    end
    sgml_feed(data) # see the alias above
  end

  # Normalises numeric character references for the five XML special
  # characters to their named entities and then, unless the current content
  # type ends in "xml", decodes those named entities to literal characters.
  # +data+ is modified in place and returned. +element+ is not used.
  def decodeEntities(element, data)
    data.gsub!('&#60;', '&lt;')
    data.gsub!('&#x3c;', '&lt;')
    data.gsub!('&#62;', '&gt;')
    data.gsub!('&#x3e;', '&gt;')
    data.gsub!('&#38;', '&amp;')
    data.gsub!('&#x26;', '&amp;')
    data.gsub!('&#34;', '&quot;')
    data.gsub!('&#x22;', '&quot;')
    data.gsub!('&#39;', '&apos;')
    data.gsub!('&#x27;', '&apos;')
    if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
      data.gsub!('&lt;', '<')
      data.gsub!('&gt;', '>')
      data.gsub!('&amp;', '&')
      data.gsub!('&quot;', '"')
      data.gsub!('&apos;', "'")
    end
    return data
  end
end
3016
-
3017
# Rewrites relative URIs found in well-known element/attribute pairs of
# +htmlSource+ so that they are resolved against +baseURI+, and returns the
# resulting HTML.
#
# Attribute values that cannot be parsed as a URI are left untouched, so one
# malformed link does not abort processing of the whole document.
# +encoding+ is accepted but not used here.
def FeedParser.resolveRelativeURIs(htmlSource, baseURI, encoding)
  $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
  # [element name, attribute name] pairs whose value is a URI.
  relative_uris = [ ['a','href'],
                    ['applet','codebase'],
                    ['area','href'],
                    ['blockquote','cite'],
                    ['body','background'],
                    ['del','cite'],
                    ['form','action'],
                    ['frame','longdesc'],
                    ['frame','src'],
                    ['iframe','longdesc'],
                    ['iframe','src'],
                    ['head','profile'],
                    ['img','longdesc'],
                    ['img','src'],
                    ['img','usemap'],
                    ['input','src'],
                    ['input','usemap'],
                    ['ins','cite'],
                    ['link','href'],
                    ['object','classid'],
                    ['object','codebase'],
                    ['object','data'],
                    ['object','usemap'],
                    ['q','cite'],
                    ['script','src'],
                  ]
  h = Hpricot(htmlSource)
  relative_uris.each do |ename, eattr|
    h.search(ename).each do |elem|
      euri = elem.attributes[eattr]
      next if euri.nil? || euri.empty?
      begin
        is_relative = URI.parse(URI.encode(euri)).relative?
      rescue URI::InvalidURIError
        # Unparseable value: keep it exactly as the author wrote it.
        next
      end
      elem.attributes[eattr] = urljoin(baseURI, euri) if is_relative
    end
  end
  h.to_html
end
3057
-
3058
# Hpricot document that can scrub itself down to the acceptable elements.
class SanitizerDoc < Hpricot::Doc

  # Walks every node of the document and cleans it in place:
  # * elements listed in Acceptable_Elements are kept, after strip_attributes;
  # * any other element is replaced by its scrubbed children, and its content
  #   is emptied first when it is in Unacceptable_Elements_With_End_Tag;
  # * doctype nodes are removed;
  # * text nodes get &#39; and &#34; decoded and carriage returns removed.
  # Returns self.
  def scrub
    traverse_all_element do |e|
      case
      when e.elem?
        if Acceptable_Elements.include?(e.name)
          e.strip_attributes
        else
          e.inner_html = '' if Unacceptable_Elements_With_End_Tag.include?(e.name)
          # This works because the children swapped in are brought in "after" the current element.
          e.swap(SanitizerDoc.new(e.children).scrub.to_html)
        end
      when e.doctype?
        e.parent.children.delete(e)
      when e.text?
        ets = e.to_s
        ets.gsub!(/&#39;/, "'")
        ets.gsub!(/&#34;/, '"')
        ets.gsub!(/\r/,'')
        e.swap(ets)
      end
    end

    unless $compatible # FIXME nonworking
      # yes, that '/' should be there. It's a search method. See the Hpricot docs.
      (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
    end
    return self
  end
end
3091
-
3092
# Factory: builds a FeedParser::SanitizerDoc from the nodes Hpricot.make
# produces for +html+.
def SanitizerDoc(html)
  nodes = Hpricot.make(html)
  FeedParser::SanitizerDoc.new(nodes)
end
module_function :SanitizerDoc
3096
-
3097
# Returns +html+ scrubbed by SanitizerDoc#scrub, serialised back to HTML and
# stripped of surrounding whitespace. +encoding+ is accepted but not used.
#
# NOTE(review): unlike LooseFeedParser#parse, the "<!" pattern here is
# case-sensitive, so a lowercase "<!doctype" is escaped — confirm intended.
def self.sanitizeHTML(html,encoding)
  # FIXME Tidy not yet supported
  # Escape "<!" unless it starts a DOCTYPE, a comment or a marked section.
  escaped = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
  SanitizerDoc(escaped).scrub.to_html.strip
end
3104
-
3105
-
3106
-
3107
# Get the character encoding of the XML document.
#
# feed     - the object the document was read from. When it responds to
#            +meta+, that is used as the HTTP header hash; otherwise the
#            document is treated as having no HTTP headers.
# xml_data - the raw document.
#
# Returns the array
#   [true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding,
#    acceptable_content_type]
def self.getCharacterEncoding(feed, xml_data)
  $stderr << "In getCharacterEncoding\n" if $debug
  sniffed_xml_encoding = nil
  xml_encoding = nil
  true_encoding = nil

  # Read the headers without relying on NoMethodError for control flow. The
  # old rescue also threw the headers away when only 'content-type' was
  # missing, which made the iso-8859-1 fallback below unreachable.
  http_headers = feed.respond_to?(:meta) ? (feed.meta || {}) : {}
  http_content_type = nil
  http_encoding = nil
  content_type_header = http_headers['content-type']
  if content_type_header
    content_type_header = content_type_header.to_s
    http_content_type = content_type_header.split(';')[0]
    encoding_scan = content_type_header.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
    http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
    http_encoding = nil if http_encoding.empty?
    # FIXME Open-Uri returns iso8859-1 if there is no charset header,
    # but that doesn't pass the tests. Open-Uri claims its following
    # the right RFC. Are they wrong or do we need to change the tests?
  end

  # Must sniff for non-ASCII-compatible character encodings before
  # searching for XML declaration. This heuristic is defined in
  # section F of the XML specification:
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
  begin
    if xml_data[0..3] == "\x4c\x6f\xa7\x94"
      # EBCDIC
      xml_data = _ebcdic_to_ascii(xml_data)
    elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
      # UTF-16BE
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xfe\xff" && xml_data[2..3] != "\x00\x00"
      # UTF-16BE with BOM
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
      # UTF-16LE
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xff\xfe" && xml_data[2..3] != "\x00\x00"
      # UTF-16LE with BOM
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\x00\x3c"
      # UTF-32BE
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x00\x00"
      # UTF-32LE
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\xfe\xff"
      # UTF-32BE with BOM
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
    elsif xml_data[0..3] == "\xff\xfe\x00\x00"
      # UTF-32LE with BOM
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
    elsif xml_data[0..2] == "\xef\xbb\xbf"
      # UTF-8 with BOM
      sniffed_xml_encoding = 'utf-8'
      xml_data = xml_data[3..-1]
    else
      # ASCII-compatible
    end
    xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
  rescue
    # Best effort: if sniffing or conversion fails, carry on as if the
    # document declared no encoding.
    xml_encoding_match = nil
  end

  if xml_encoding_match
    xml_encoding = xml_encoding_match[1].downcase
    # Declared names that only say "some multi-byte Unicode form"; the
    # sniffed byte order is more precise, so prefer it when available.
    xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
    if sniffed_xml_encoding && xencodings.include?(xml_encoding)
      xml_encoding = sniffed_xml_encoding
    end
  end

  acceptable_content_type = false
  application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
  text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

  if application_content_types.include?(http_content_type) || (/^application\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || xml_encoding || 'utf-8'
  elsif text_content_types.include?(http_content_type) || (/^text\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || 'us-ascii'
  elsif /^text\// =~ http_content_type
    true_encoding = http_encoding || 'us-ascii'
  elsif !http_headers.empty? && !http_headers.has_key?('content-type')
    # Served with headers, but without a Content-Type.
    true_encoding = xml_encoding || 'iso-8859-1'
  else
    true_encoding = xml_encoding || 'utf-8'
  end
  return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
end
3205
-
3206
# Changes an XML data stream on the fly to specify a new encoding.
#
# data     - a raw sequence of bytes (not Unicode) that is presumed to be in
#            +encoding+ already.
# encoding - the name of that encoding. It is overridden when +data+ starts
#            with a byte-order mark, which is also stripped.
#
# Returns the document converted to UTF-8, with its XML declaration replaced
# by (or, if it had none, prefixed with) one that declares utf-8.
#
# Any error raised by the conversion propagates to the caller. It used to be
# swallowed, which only deferred the failure to an unrelated TypeError.
def self.toUTF8(data, encoding)
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
  # NOTE we must use double quotes when dealing with \x encodings!
  # Each entry is [encoding implied by the BOM, BOM length in bytes].
  bom =
    if data.size >= 4 && data[0..1] == "\xfe\xff" && data[2..3] != "\x00\x00"
      ['utf-16be', 2]
    elsif data.size >= 4 && data[0..1] == "\xff\xfe" && data[2..3] != "\x00\x00"
      ['utf-16le', 2]
    elsif data[0..2] == "\xef\xbb\xbf"
      ['utf-8', 3]
    elsif data[0..3] == "\x00\x00\xfe\xff"
      ['utf-32be', 4]
    elsif data[0..3] == "\xff\xfe\x00\x00"
      ['utf-32le', 4]
    end

  if bom
    bom_encoding, bom_size = bom
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying #{bom_encoding} instead\n" if encoding != bom_encoding
    end
    encoding = bom_encoding
    data = data[bom_size..-1]
  end

  newdata = uconvert(data, encoding, 'utf-8')
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug

  declmatch = /^<\?xml[^>]*?>/
  newdecl = "<?xml version='1.0' encoding='utf-8'?>"
  if declmatch =~ newdata
    # Non-destructive: do not modify the string the converter handed back.
    newdata = newdata.sub(declmatch, newdecl)
  else
    newdata = newdecl + "\n" + newdata
  end
  return newdata
end
3271
-
3272
# Strips ENTITY declarations and the DOCTYPE from an XML document.
#
# Returns [rss_version, stripped_data]:
#   rss_version   - 'rss091n' when the DOCTYPE mentions "netscape" (any
#                   letter case), otherwise nil.
#   stripped_data - the same document minus every <!ENTITY ...> declaration
#                   and the first <!DOCTYPE ...> declaration.
def self.stripDoctype(data)
  entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
  doctype_pattern = /<!DOCTYPE(.*?)>/m

  without_entities = data.gsub(entity_pattern, '')

  found = doctype_pattern.match(without_entities)
  doctype = found ? found[1] : ''
  version = doctype.downcase.include?('netscape') ? 'rss091n' : nil

  return version, without_entities.sub(doctype_pattern, '')
end
3298
-
3299
# Convenience wrapper: forwards every argument to FeedParser.parse and
# returns its result.
def parse(*args)
  FeedParser.parse(*args)
end
3300
- def FeedParser.parse(furi, options={})
3301
- # Parse a feed from a URL, file, stream or string
3302
- $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
3303
- result = FeedParserDict.new
3304
- result['feed'] = FeedParserDict.new
3305
- result['entries'] = []
3306
- if options[:modified]
3307
- options[:modified] = Time.parse(options[:modified]).rfc2822
3308
- # FIXME this ignores all of our time parsing work. Does it matter?
143
+
144
+ def parse(furi, options = {})
145
+ # Parse a feed from a URL, file, stream or string
146
+ $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
147
+ strictklass = options[:strict] || StrictFeedParser
148
+ looseklass = options[:loose] || LooseFeedParser
149
+ result = FeedParserDict.new
150
+ result['feed'] = FeedParserDict.new
151
+ result['entries'] = []
152
+ if options[:modified]
153
+ options[:modified] = Time.parse(options[:modified]).rfc2822
154
+ # FIXME this ignores all of our time parsing work. Does it matter?
3309
155
  end
3310
156
  result['bozo'] = false
3311
157
  handlers = options[:handlers]
3312
-
3313
158
  if handlers.class != Array # FIXME why does this happen?
3314
159
  handlers = [handlers]
3315
160
  end
3316
161
 
3317
162
  begin
3318
- if URI::parse(furi).class == URI::Generic
163
+ if File.exists?furi
3319
164
  f = open(furi) # OpenURI doesn't behave well when passing HTTP options to a file.
3320
165
  else
3321
166
  # And when you do pass them, make sure they aren't just nil (this still true?)
@@ -3482,7 +327,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3482
327
  if use_strict_parser
3483
328
  # initialize the SAX parser
3484
329
  saxparser = XML::SAX::Helpers::ParserFactory.makeParser("XML::Parser::SAXDriver")
3485
- feedparser = StrictFeedParser.new(baseuri, baselang, 'utf-8')
330
+ feedparser = strictklass.new(baseuri, baselang, 'utf-8')
3486
331
  saxparser.setDocumentHandler(feedparser)
3487
332
  saxparser.setDTDHandler(feedparser)
3488
333
  saxparser.setEntityResolver(feedparser)
@@ -3503,7 +348,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3503
348
  end
3504
349
  end
3505
350
  if not use_strict_parser
3506
- feedparser = LooseFeedParser.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
351
+ feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
3507
352
  feedparser.parse(data)
3508
353
  $stderr << "Using LooseFeed\n\n" if $debug
3509
354
  end
@@ -3513,6 +358,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3513
358
  result['namespaces'] = feedparser.namespacesInUse
3514
359
  return result
3515
360
  end
361
+ module_function(:parse)
3516
362
  end # End FeedParser module
3517
363
 
3518
364
  class Serializer
@@ -3552,7 +398,7 @@ class TextSerializer < Serializer
3552
398
  end
3553
399
  end
3554
400
 
3555
- class PprintSerializer < Serializer # FIXME ? use pp instead?
401
+ class PprintSerializer < Serializer # FIXME use pp instead
3556
402
  def write(stream = $stdout)
3557
403
  stream << @results['href'].to_s + "\n\n"
3558
404
  pp(@results)