rfeedparser 0.9.87 → 0.9.91

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rfeedparser.rb CHANGED
@@ -14,22 +14,21 @@ require 'stringio'
14
14
  require 'uri'
15
15
  require 'cgi' # escaping html
16
16
  require 'time'
17
- require 'xml/saxdriver' # calling expat
18
17
  require 'pp'
19
18
  require 'rubygems'
20
19
  require 'base64'
21
20
  require 'iconv'
22
- gem 'hpricot', ">=0.5"
21
+
23
22
  gem 'character-encodings', ">=0.2.0"
24
23
  gem 'htmltools', ">=1.10"
25
24
  gem 'htmlentities', ">=4.0.0"
26
25
  gem 'activesupport', ">=1.4.1"
27
26
  gem 'rchardet', ">=1.0"
27
+ require 'xml/saxdriver' # calling expat through the xmlparser gem
28
28
 
29
29
  require 'rchardet'
30
30
  $chardet = true
31
31
 
32
- require 'hpricot'
33
32
  require 'encoding/character/utf-8'
34
33
  require 'html/sgml-parser'
35
34
  require 'htmlentities'
@@ -40,998 +39,24 @@ include OpenURI
40
39
  $debug = false
41
40
  $compatible = true
42
41
 
43
- Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
44
- 'unicode' => 'utf-16',
45
-
46
- # MacOSX does not have Unicode as a separate encoding nor even
47
- # aliased. My Ubuntu box has it as a separate encoding but I cannot
48
- # for the life of me figure out where the source code for UNICODE.so
49
- # is (supposedly, in libc6 .deb but that's a damn lie), so I don't
50
- # know what it expects. After some extensive research, I've decided
51
- # to alias it to utf-16 much like Python does when it is built with
52
- # --enable-unicode=ucs2. This could be seriously wrong. I have no idea.
53
-
54
- # ascii codec
55
- '646' => 'ascii',
56
- 'ansi_x3.4_1968' => 'ascii',
57
- 'ansi_x3_4_1968' => 'ascii', # some email headers use this non-standard name
58
- 'ansi_x3.4_1986' => 'ascii',
59
- 'cp367' => 'ascii',
60
- 'csascii' => 'ascii',
61
- 'ibm367' => 'ascii',
62
- 'iso646_us' => 'ascii',
63
- 'iso_646.irv_1991' => 'ascii',
64
- 'iso_ir_6' => 'ascii',
65
- 'us' => 'ascii',
66
- 'us_ascii' => 'ascii',
67
-
68
- # big5 codec
69
- 'big5_tw' => 'big5',
70
- 'csbig5' => 'big5',
71
-
72
- # big5hkscs codec
73
- 'big5_hkscs' => 'big5hkscs',
74
- 'hkscs' => 'big5hkscs',
75
-
76
- # cp037 codec
77
- '037' => 'cp037',
78
- 'csibm037' => 'cp037',
79
- 'ebcdic_cp_ca' => 'cp037',
80
- 'ebcdic_cp_nl' => 'cp037',
81
- 'ebcdic_cp_us' => 'cp037',
82
- 'ebcdic_cp_wt' => 'cp037',
83
- 'ibm037' => 'cp037',
84
- 'ibm039' => 'cp037',
85
-
86
- # cp1026 codec
87
- '1026' => 'cp1026',
88
- 'csibm1026' => 'cp1026',
89
- 'ibm1026' => 'cp1026',
90
-
91
- # cp1140 codec
92
- '1140' => 'cp1140',
93
- 'ibm1140' => 'cp1140',
94
-
95
- # cp1250 codec
96
- '1250' => 'cp1250',
97
- 'windows_1250' => 'cp1250',
98
-
99
- # cp1251 codec
100
- '1251' => 'cp1251',
101
- 'windows_1251' => 'cp1251',
102
-
103
- # cp1252 codec
104
- '1252' => 'cp1252',
105
- 'windows_1252' => 'cp1252',
106
-
107
- # cp1253 codec
108
- '1253' => 'cp1253',
109
- 'windows_1253' => 'cp1253',
110
-
111
- # cp1254 codec
112
- '1254' => 'cp1254',
113
- 'windows_1254' => 'cp1254',
114
-
115
- # cp1255 codec
116
- '1255' => 'cp1255',
117
- 'windows_1255' => 'cp1255',
118
-
119
- # cp1256 codec
120
- '1256' => 'cp1256',
121
- 'windows_1256' => 'cp1256',
122
-
123
- # cp1257 codec
124
- '1257' => 'cp1257',
125
- 'windows_1257' => 'cp1257',
126
-
127
- # cp1258 codec
128
- '1258' => 'cp1258',
129
- 'windows_1258' => 'cp1258',
130
-
131
- # cp424 codec
132
- '424' => 'cp424',
133
- 'csibm424' => 'cp424',
134
- 'ebcdic_cp_he' => 'cp424',
135
- 'ibm424' => 'cp424',
136
-
137
- # cp437 codec
138
- '437' => 'cp437',
139
- 'cspc8codepage437' => 'cp437',
140
- 'ibm437' => 'cp437',
141
-
142
- # cp500 codec
143
- '500' => 'cp500',
144
- 'csibm500' => 'cp500',
145
- 'ebcdic_cp_be' => 'cp500',
146
- 'ebcdic_cp_ch' => 'cp500',
147
- 'ibm500' => 'cp500',
148
-
149
- # cp775 codec
150
- '775' => 'cp775',
151
- 'cspc775baltic' => 'cp775',
152
- 'ibm775' => 'cp775',
153
-
154
- # cp850 codec
155
- '850' => 'cp850',
156
- 'cspc850multilingual' => 'cp850',
157
- 'ibm850' => 'cp850',
158
-
159
- # cp852 codec
160
- '852' => 'cp852',
161
- 'cspcp852' => 'cp852',
162
- 'ibm852' => 'cp852',
163
-
164
- # cp855 codec
165
- '855' => 'cp855',
166
- 'csibm855' => 'cp855',
167
- 'ibm855' => 'cp855',
168
-
169
- # cp857 codec
170
- '857' => 'cp857',
171
- 'csibm857' => 'cp857',
172
- 'ibm857' => 'cp857',
173
-
174
- # cp860 codec
175
- '860' => 'cp860',
176
- 'csibm860' => 'cp860',
177
- 'ibm860' => 'cp860',
178
-
179
- # cp861 codec
180
- '861' => 'cp861',
181
- 'cp_is' => 'cp861',
182
- 'csibm861' => 'cp861',
183
- 'ibm861' => 'cp861',
184
-
185
- # cp862 codec
186
- '862' => 'cp862',
187
- 'cspc862latinhebrew' => 'cp862',
188
- 'ibm862' => 'cp862',
189
-
190
- # cp863 codec
191
- '863' => 'cp863',
192
- 'csibm863' => 'cp863',
193
- 'ibm863' => 'cp863',
194
-
195
- # cp864 codec
196
- '864' => 'cp864',
197
- 'csibm864' => 'cp864',
198
- 'ibm864' => 'cp864',
199
-
200
- # cp865 codec
201
- '865' => 'cp865',
202
- 'csibm865' => 'cp865',
203
- 'ibm865' => 'cp865',
204
-
205
- # cp866 codec
206
- '866' => 'cp866',
207
- 'csibm866' => 'cp866',
208
- 'ibm866' => 'cp866',
209
-
210
- # cp869 codec
211
- '869' => 'cp869',
212
- 'cp_gr' => 'cp869',
213
- 'csibm869' => 'cp869',
214
- 'ibm869' => 'cp869',
215
-
216
- # cp932 codec
217
- '932' => 'cp932',
218
- 'ms932' => 'cp932',
219
- 'mskanji' => 'cp932',
220
- 'ms_kanji' => 'cp932',
221
-
222
- # cp949 codec
223
- '949' => 'cp949',
224
- 'ms949' => 'cp949',
225
- 'uhc' => 'cp949',
226
-
227
- # cp950 codec
228
- '950' => 'cp950',
229
- 'ms950' => 'cp950',
230
-
231
- # euc_jp codec
232
- 'euc_jp' => 'euc-jp',
233
- 'eucjp' => 'euc-jp',
234
- 'ujis' => 'euc-jp',
235
- 'u_jis' => 'euc-jp',
236
-
237
- # euc_kr codec
238
- 'euc_kr' => 'euc-kr',
239
- 'euckr' => 'euc-kr',
240
- 'korean' => 'euc-kr',
241
- 'ksc5601' => 'euc-kr',
242
- 'ks_c_5601' => 'euc-kr',
243
- 'ks_c_5601_1987' => 'euc-kr',
244
- 'ksx1001' => 'euc-kr',
245
- 'ks_x_1001' => 'euc-kr',
246
-
247
- # gb18030 codec
248
- 'gb18030_2000' => 'gb18030',
249
-
250
- # gb2312 codec
251
- 'chinese' => 'gb2312',
252
- 'csiso58gb231280' => 'gb2312',
253
- 'euc_cn' => 'gb2312',
254
- 'euccn' => 'gb2312',
255
- 'eucgb2312_cn' => 'gb2312',
256
- 'gb2312_1980' => 'gb2312',
257
- 'gb2312_80' => 'gb2312',
258
- 'iso_ir_58' => 'gb2312',
259
-
260
- # gbk codec
261
- '936' => 'gbk',
262
- 'cp936' => 'gbk',
263
- 'ms936' => 'gbk',
264
-
265
- # hp-roman8 codec
266
- 'hp_roman8' => 'hp-roman8',
267
- 'roman8' => 'hp-roman8',
268
- 'r8' => 'hp-roman8',
269
- 'csHPRoman8' => 'hp-roman8',
270
-
271
- # iso2022_jp codec
272
- 'iso2022_jp' => 'iso-2022-jp',
273
- 'csiso2022jp' => 'iso-2022-jp',
274
- 'iso2022jp' => 'iso-2022-jp',
275
- 'iso_2022_jp' => 'iso-2022-jp',
276
-
277
- # iso2022_jp_1 codec
278
- 'iso2002_jp_1' => 'iso-2022-jp-1',
279
- 'iso2022jp_1' => 'iso-2022-jp-1',
280
- 'iso_2022_jp_1' => 'iso-2022-jp-1',
281
-
282
- # iso2022_jp_2 codec
283
- 'iso2022_jp_2' => 'iso-2002-jp-2',
284
- 'iso2022jp_2' => 'iso-2022-jp-2',
285
- 'iso_2022_jp_2' => 'iso-2022-jp-2',
286
-
287
- # iso2022_jp_3 codec
288
- 'iso2002_jp_3' => 'iso-2022-jp-3',
289
- 'iso2022jp_3' => 'iso-2022-jp-3',
290
- 'iso_2022_jp_3' => 'iso-2022-jp-3',
291
-
292
- # iso2022_kr codec
293
- 'iso2022_kr' => 'iso-2022-kr',
294
- 'csiso2022kr' => 'iso-2022-kr',
295
- 'iso2022kr' => 'iso-2022-kr',
296
- 'iso_2022_kr' => 'iso-2022-kr',
297
-
298
- # iso8859_10 codec
299
- 'iso8859_10' => 'iso-8859-10',
300
- 'csisolatin6' => 'iso-8859-10',
301
- 'iso_8859_10' => 'iso-8859-10',
302
- 'iso_8859_10_1992' => 'iso-8859-10',
303
- 'iso_ir_157' => 'iso-8859-10',
304
- 'l6' => 'iso-8859-10',
305
- 'latin6' => 'iso-8859-10',
306
-
307
- # iso8859_13 codec
308
- 'iso8859_13' => 'iso-8859-13',
309
- 'iso_8859_13' => 'iso-8859-13',
310
-
311
- # iso8859_14 codec
312
- 'iso8859_14' => 'iso-8859-14',
313
- 'iso_8859_14' => 'iso-8859-14',
314
- 'iso_8859_14_1998' => 'iso-8859-14',
315
- 'iso_celtic' => 'iso-8859-14',
316
- 'iso_ir_199' => 'iso-8859-14',
317
- 'l8' => 'iso-8859-14',
318
- 'latin8' => 'iso-8859-14',
319
-
320
- # iso8859_15 codec
321
- 'iso8859_15' => 'iso-8859-15',
322
- 'iso_8859_15' => 'iso-8859-15',
323
-
324
- # iso8859_1 codec
325
- 'latin_1' => 'iso-8859-1',
326
- 'cp819' => 'iso-8859-1',
327
- 'csisolatin1' => 'iso-8859-1',
328
- 'ibm819' => 'iso-8859-1',
329
- 'iso8859' => 'iso-8859-1',
330
- 'iso_8859_1' => 'iso-8859-1',
331
- 'iso_8859_1_1987' => 'iso-8859-1',
332
- 'iso_ir_100' => 'iso-8859-1',
333
- 'l1' => 'iso-8859-1',
334
- 'latin' => 'iso-8859-1',
335
- 'latin1' => 'iso-8859-1',
336
-
337
- # iso8859_2 codec
338
- 'iso8859_2' => 'iso-8859-2',
339
- 'csisolatin2' => 'iso-8859-2',
340
- 'iso_8859_2' => 'iso-8859-2',
341
- 'iso_8859_2_1987' => 'iso-8859-2',
342
- 'iso_ir_101' => 'iso-8859-2',
343
- 'l2' => 'iso-8859-2',
344
- 'latin2' => 'iso-8859-2',
345
-
346
- # iso8859_3 codec
347
- 'iso8859_3' => 'iso-8859-3',
348
- 'csisolatin3' => 'iso-8859-3',
349
- 'iso_8859_3' => 'iso-8859-3',
350
- 'iso_8859_3_1988' => 'iso-8859-3',
351
- 'iso_ir_109' => 'iso-8859-3',
352
- 'l3' => 'iso-8859-3',
353
- 'latin3' => 'iso-8859-3',
354
-
355
- # iso8859_4 codec
356
- 'iso8849_4' => 'iso-8859-4',
357
- 'csisolatin4' => 'iso-8859-4',
358
- 'iso_8859_4' => 'iso-8859-4',
359
- 'iso_8859_4_1988' => 'iso-8859-4',
360
- 'iso_ir_110' => 'iso-8859-4',
361
- 'l4' => 'iso-8859-4',
362
- 'latin4' => 'iso-8859-4',
363
-
364
- # iso8859_5 codec
365
- 'iso8859_5' => 'iso-8859-5',
366
- 'csisolatincyrillic' => 'iso-8859-5',
367
- 'cyrillic' => 'iso-8859-5',
368
- 'iso_8859_5' => 'iso-8859-5',
369
- 'iso_8859_5_1988' => 'iso-8859-5',
370
- 'iso_ir_144' => 'iso-8859-5',
371
-
372
- # iso8859_6 codec
373
- 'iso8859_6' => 'iso-8859-6',
374
- 'arabic' => 'iso-8859-6',
375
- 'asmo_708' => 'iso-8859-6',
376
- 'csisolatinarabic' => 'iso-8859-6',
377
- 'ecma_114' => 'iso-8859-6',
378
- 'iso_8859_6' => 'iso-8859-6',
379
- 'iso_8859_6_1987' => 'iso-8859-6',
380
- 'iso_ir_127' => 'iso-8859-6',
381
-
382
- # iso8859_7 codec
383
- 'iso8859_7' => 'iso-8859-7',
384
- 'csisolatingreek' => 'iso-8859-7',
385
- 'ecma_118' => 'iso-8859-7',
386
- 'elot_928' => 'iso-8859-7',
387
- 'greek' => 'iso-8859-7',
388
- 'greek8' => 'iso-8859-7',
389
- 'iso_8859_7' => 'iso-8859-7',
390
- 'iso_8859_7_1987' => 'iso-8859-7',
391
- 'iso_ir_126' => 'iso-8859-7',
392
-
393
- # iso8859_8 codec
394
- 'iso8859_9' => 'iso8859_8',
395
- 'csisolatinhebrew' => 'iso-8859-8',
396
- 'hebrew' => 'iso-8859-8',
397
- 'iso_8859_8' => 'iso-8859-8',
398
- 'iso_8859_8_1988' => 'iso-8859-8',
399
- 'iso_ir_138' => 'iso-8859-8',
400
-
401
- # iso8859_9 codec
402
- 'iso8859_9' => 'iso-8859-9',
403
- 'csisolatin5' => 'iso-8859-9',
404
- 'iso_8859_9' => 'iso-8859-9',
405
- 'iso_8859_9_1989' => 'iso-8859-9',
406
- 'iso_ir_148' => 'iso-8859-9',
407
- 'l5' => 'iso-8859-9',
408
- 'latin5' => 'iso-8859-9',
409
-
410
- # iso8859_11 codec
411
- 'iso8859_11' => 'iso-8859-11',
412
- 'thai' => 'iso-8859-11',
413
- 'iso_8859_11' => 'iso-8859-11',
414
- 'iso_8859_11_2001' => 'iso-8859-11',
415
-
416
- # iso8859_16 codec
417
- 'iso8859_16' => 'iso-8859-16',
418
- 'iso_8859_16' => 'iso-8859-16',
419
- 'iso_8859_16_2001' => 'iso-8859-16',
420
- 'iso_ir_226' => 'iso-8859-16',
421
- 'l10' => 'iso-8859-16',
422
- 'latin10' => 'iso-8859-16',
423
-
424
- # cskoi8r codec
425
- 'koi8_r' => 'cskoi8r',
426
-
427
- # mac_cyrillic codec
428
- 'mac_cyrillic' => 'maccyrillic',
429
-
430
- # shift_jis codec
431
- 'csshiftjis' => 'shift_jis',
432
- 'shiftjis' => 'shift_jis',
433
- 'sjis' => 'shift_jis',
434
- 's_jis' => 'shift_jis',
435
-
436
- # shift_jisx0213 codec
437
- 'shiftjisx0213' => 'shift_jisx0213',
438
- 'sjisx0213' => 'shift_jisx0213',
439
- 's_jisx0213' => 'shift_jisx0213',
440
-
441
- # utf_16 codec
442
- 'utf_16' => 'utf-16',
443
- 'u16' => 'utf-16',
444
- 'utf16' => 'utf-16',
445
-
446
- # utf_16_be codec
447
- 'utf_16_be' => 'utf-16be',
448
- 'unicodebigunmarked' => 'utf-16be',
449
- 'utf_16be' => 'utf-16be',
450
-
451
- # utf_16_le codec
452
- 'utf_16_le' => 'utf-16le',
453
- 'unicodelittleunmarked' => 'utf-16le',
454
- 'utf_16le' => 'utf-16le',
455
-
456
- # utf_7 codec
457
- 'utf_7' => 'utf-7',
458
- 'u7' => 'utf-7',
459
- 'utf7' => 'utf-7',
460
-
461
- # utf_8 codec
462
- 'utf_8' => 'utf-8',
463
- 'u8' => 'utf-8',
464
- 'utf' => 'utf-8',
465
- 'utf8' => 'utf-8',
466
- 'utf8_ucs2' => 'utf-8',
467
- 'utf8_ucs4' => 'utf-8',
468
- }
469
-
470
- def unicode(data, from_encoding)
471
- # Takes a single string and converts it from the encoding in
472
- # from_encoding to unicode.
473
- uconvert(data, from_encoding, 'unicode')
474
- end
475
-
476
- def uconvert(data, from_encoding, to_encoding = 'utf-8')
477
- from_encoding = Encoding_Aliases[from_encoding] || from_encoding
478
- to_encoding = Encoding_Aliases[to_encoding] || to_encoding
479
- Iconv.iconv(to_encoding, from_encoding, data)[0]
480
- end
481
-
482
- def unichr(i)
483
- [i].pack('U*')
484
- end
485
-
486
- def index_match(stri,regexp, offset)
487
- i = stri.index(regexp, offset)
488
-
489
- return nil, nil unless i
490
-
491
- full = stri[i..-1].match(regexp)
492
- return i, full
493
- end
494
-
495
- def _ebcdic_to_ascii(s)
496
- return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
497
- end
498
-
499
- def urljoin(base, uri)
500
- urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
501
- uri = uri.sub(urifixer, '\1\3')
502
-
503
- begin
504
- return URI.join(base, uri).to_s
505
- rescue URI::BadURIError => e
506
- if URI.parse(base).relative?
507
- return URI::parse(uri).to_s
508
- end
509
- end
510
- end
511
-
512
- def py2rtime(pytuple)
513
- Time.utc(pytuple[0..5])
514
- end
515
-
516
- # http://intertwingly.net/stories/2005/09/28/xchar.rb
517
- module XChar
518
- # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
519
- CP1252 = {
520
- 128 => 8364, # euro sign
521
- 130 => 8218, # single low-9 quotation mark
522
- 131 => 402, # latin small letter f with hook
523
- 132 => 8222, # double low-9 quotation mark
524
- 133 => 8230, # horizontal ellipsis
525
- 134 => 8224, # dagger
526
- 135 => 8225, # double dagger
527
- 136 => 710, # modifier letter circumflex accent
528
- 137 => 8240, # per mille sign
529
- 138 => 352, # latin capital letter s with caron
530
- 139 => 8249, # single left-pointing angle quotation mark
531
- 140 => 338, # latin capital ligature oe
532
- 142 => 381, # latin capital letter z with caron
533
- 145 => 8216, # left single quotation mark
534
- 146 => 8217, # right single quotation mark
535
- 147 => 8220, # left double quotation mark
536
- 148 => 8221, # right double quotation mark
537
- 149 => 8226, # bullet
538
- 150 => 8211, # en dash
539
- 151 => 8212, # em dash
540
- 152 => 732, # small tilde
541
- 153 => 8482, # trade mark sign
542
- 154 => 353, # latin small letter s with caron
543
- 155 => 8250, # single right-pointing angle quotation mark
544
- 156 => 339, # latin small ligature oe
545
- 158 => 382, # latin small letter z with caron
546
- 159 => 376} # latin capital letter y with diaeresis
547
-
548
- # http://www.w3.org/TR/REC-xml/#dt-chardata
549
- PREDEFINED = {
550
- 38 => '&', # ampersand
551
- 60 => '<', # left angle bracket
552
- 62 => '>'} # right angle bracket
553
-
554
- # http://www.w3.org/TR/REC-xml/#charsets
555
- VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
556
- (0xE000..0xFFFD), (0x10000..0x10FFFF)]
557
- end
558
-
559
- class Fixnum
560
- # xml escaped version of chr
561
- def xchr
562
- n = XChar::CP1252[self] || self
563
- n = 42 unless XChar::VALID.find {|range| range.include? n}
564
- XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
565
- end
566
- end
567
-
568
- class String
569
- alias :old_index :index
570
- def to_xs
571
- unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
572
- rescue
573
- unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
574
- end
575
- end
576
-
577
- class BetterSGMLParserError < Exception; end;
578
- class BetterSGMLParser < HTML::SGMLParser
579
- # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
580
- # This makes things work.
581
- Interesting = /[&<]/u
582
- Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
583
- '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
584
- '![^<>]*)?', 64) # 64 is the unicode flag
585
-
586
- Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
587
- Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
588
-
589
- Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
590
- Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
591
- Endtagopen = /<\//u # Matching the Python SGMLParser
592
- Endbracket = /[<>]/u
593
- Declopen = /<!/u
594
- Piopenbegin = /^<\?/u
595
- Piclose = />/u
596
-
597
- Commentopen = /<!--/u
598
- Commentclose = /--\s*>/u
599
- Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
600
- Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
601
- '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
602
- 64)
603
- Endtagfind = /\s*\/\s*>/u
604
- def initialize(verbose=false)
605
- super(verbose)
606
- end
607
- def feed(*args)
608
- super(*args)
609
- end
610
-
611
- def goahead(_end)
612
- rawdata = @rawdata # woo, utf-8 magic
613
- i = 0
614
- n = rawdata.length
615
- while i < n
616
- if @nomoretags
617
- # handle_data_range does nothing more than set a "Range" that is never used. wtf?
618
- handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
619
- i = n
620
- break
621
- end
622
- j = rawdata.index(Interesting, i)
623
- j = n unless j
624
- handle_data(rawdata[i...j]) if i < j
625
- i = j
626
- break if (i == n)
627
- if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
628
- if rawdata.index(Starttagopen,i) == i
629
- if @literal
630
- handle_data(rawdata[i..i])
631
- i = i+1
632
- next
633
- end
634
- k = parse_starttag(i)
635
- break unless k
636
- i = k
637
- next
638
- end
639
- if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
640
- k = parse_endtag(i)
641
- break unless k
642
- i = k
643
- @literal = false
644
- next
645
- end
646
- if @literal
647
- if n > (i+1)
648
- handle_data("<")
649
- i = i+1
650
- else
651
- #incomplete
652
- break
653
- end
654
- next
655
- end
656
- if rawdata.index(Commentopen,i) == i
657
- k = parse_comment(i)
658
- break unless k
659
- i = k
660
- next
661
- end
662
- if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
663
- k = parse_pi(i)
664
- break unless k
665
- i += k
666
- next
667
- end
668
- if rawdata.index(Declopen,i) == i
669
- # This is some sort of declaration; in "HTML as
670
- # deployed," this should only be the document type
671
- # declaration ("<!DOCTYPE html...>").
672
- k = parse_declaration(i)
673
- break unless k
674
- i = k
675
- next
676
- end
677
- elsif rawdata[i..i] == '&'
678
- if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
679
- handle_data(rawdata[i..i])
680
- i += 1
681
- next
682
- end
683
-
684
- # the Char must come first as its #=~ method is the only one that is UTF-8 safe
685
- ni,match = index_match(rawdata, Charref, i)
686
- if ni and ni == i # See? Ugly
687
- handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
688
- i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
689
- i -= 1 unless rawdata[i-1..i-1] == ";"
690
- next
691
- end
692
- ni,match = index_match(rawdata, Entityref, i)
693
- if ni and ni == i
694
- handle_entityref(match[1])
695
- i += match[0].length
696
- i -= 1 unless rawdata[i-1..i-1] == ";"
697
- next
698
- end
699
- else
700
- error('neither < nor & ??')
701
- end
702
- # We get here only if incomplete matches but
703
- # nothing else
704
- ni,match = index_match(rawdata,Incomplete,i)
705
- unless ni and ni == 0
706
- handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
707
- i += 1
708
- next
709
- end
710
- j = ni + match[0].length
711
- break if j == n # Really incomplete
712
- handle_data(rawdata[i...j])
713
- i = j
714
- end # end while
715
-
716
- if _end and i < n
717
- handle_data(rawdata[i...n])
718
- i = n
719
- end
720
-
721
- @rawdata = rawdata[i..-1]
722
- # @offset += i # FIXME BUGME another unused variable in SGMLParser?
723
- end
724
-
725
-
726
- # Internal -- parse processing instr, return length or -1 if not terminated
727
- def parse_pi(i)
728
- rawdata = @rawdata
729
- if rawdata[i...i+2] != '<?'
730
- error("unexpected call to parse_pi()")
731
- end
732
- ni,match = index_match(rawdata,Piclose,i+2)
733
- return nil unless match
734
- j = ni
735
- handle_pi(rawdata[i+2...j])
736
- j = (j + match[0].length)
737
- return j-i
738
- end
739
-
740
- def parse_comment(i)
741
- rawdata = @rawdata
742
- if rawdata[i...i+4] != "<!--"
743
- error("unexpected call to parse_comment()")
744
- end
745
- ni,match = index_match(rawdata, Commentclose,i)
746
- return nil unless match
747
- handle_comment(rawdata[i+4..(ni-1)])
748
- return ni+match[0].length # Length from i to just past the closing comment tag
749
- end
750
-
751
-
752
- def parse_starttag(i)
753
- @_starttag_text = nil
754
- start_pos = i
755
- rawdata = @rawdata
756
- ni,match = index_match(rawdata,Shorttagopen,i)
757
- if ni == i
758
- # SGML shorthand: <tag/data/ == <tag>data</tag>
759
- # XXX Can data contain &... (entity or char refs)?
760
- # XXX Can data contain < or > (tag characters)?
761
- # XXX Can there be whitespace before the first /?
762
- k,match = index_match(rawdata,Shorttag,i)
763
- return nil unless match
764
- tag, data = match[1], match[2]
765
- @_starttag_text = "<#{tag}/"
766
- tag.downcase!
767
- second_end = rawdata.index(Shorttagopen,k)
768
- finish_shorttag(tag, data)
769
- @_starttag_text = rawdata[start_pos...second_end+1]
770
- return k
771
- end
772
-
773
- j = rawdata.index(Endbracket, i+1)
774
- return nil unless j
775
- attrsd = []
776
- if rawdata[i...i+2] == '<>'
777
- # SGML shorthand: <> == <last open tag seen>
778
- k = j
779
- tag = @lasttag
780
- else
781
- ni,match = index_match(rawdata,Tagfind,i+1)
782
- unless match
783
- error('unexpected call to parse_starttag')
784
- end
785
- k = ni+match[0].length+1
786
- tag = match[0].downcase
787
- @lasttag = tag
788
- end
789
-
790
- while k < j
791
- break if rawdata.index(Endtagfind, k) == k
792
- ni,match = index_match(rawdata,Attrfind,k)
793
- break unless ni
794
- matched_length = match[0].length
795
- attrname, rest, attrvalue = match[1],match[2],match[3]
796
- if rest.nil? or rest.empty?
797
- attrvalue = '' # was: = attrname # Why the change?
798
- elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
799
- attrvalue = attrvalue[1...-1]
800
- end
801
- attrsd << [attrname.downcase, attrvalue]
802
- k += matched_length
803
- end
804
- if rawdata[j..j] == ">"
805
- j += 1
806
- end
807
- @_starttag_text = rawdata[start_pos...j]
808
- finish_starttag(tag, attrsd)
809
- return j
810
- end
811
-
812
- def parse_endtag(i)
813
- rawdata = @rawdata
814
- j, match = index_match(rawdata, /[<>]/,i+1)
815
- return nil unless j
816
- tag = rawdata[i+2...j].strip.downcase
817
- if rawdata[j..j] == ">"
818
- j += 1
819
- end
820
- finish_endtag(tag)
821
- return j
822
- end
823
-
824
- def output
825
- # Return processed HTML as a single string
826
- return @pieces.map{|p| p.to_s}.join
827
- end
828
-
829
- def error(message)
830
- raise BetterSGMLParserError.new(message)
831
- end
832
- def handle_pi(text)
833
- end
834
- def handle_decl(text)
835
- end
836
- end
837
-
838
- # Add some helper methods to make AttributeList (all of those damn attrs
839
- # and attrsD used by StrictFeedParser) act more like a Hash.
840
- # NOTE AttributeList is still Read-Only (AFAICT).
841
- # Monkey patching is terrible, and I have an addiction.
842
- module XML
843
- module SAX
844
- module AttributeList # in xml/sax.rb
845
- def [](key)
846
- getValue(key)
847
- end
848
-
849
- def each(&blk)
850
- (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
851
- end
852
-
853
- def each_key(&blk)
854
- (0...getLength).each{|pos| yield getName(pos) }
855
- end
856
-
857
- def each_value(&blk)
858
- (0...getLength).each{|pos| yield getValue(pos) }
859
- end
860
-
861
- def to_a # Rather use collect? grep for to_a.collect
862
- l = []
863
- each{|k,v| l << [k,v]}
864
- return l
865
- end
866
-
867
- def to_s
868
- l = []
869
- each{|k,v| l << "#{k} => #{v}"}
870
- "{ "+l.join(", ")+" }"
871
- end
872
- end
873
- end
874
- end
875
-
876
- # This used to be based on Michael Moen's Hpricot#scrub, but that seems to
877
- # have only been part of its evolution. Hpricot#scrub is cool code, though.
878
- # http://underpantsgnome.com/2007/01/20/hpricot-scrub
879
- module Hpricot
880
- Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
881
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
882
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
883
- 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
884
- 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
885
- 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
886
- 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
887
- 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
888
- 'ul', 'var'
889
- ]
890
-
891
- Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
892
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
893
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
894
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
895
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
896
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
897
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
898
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
899
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
900
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
901
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
902
- ]
903
-
904
- Unacceptable_Elements_With_End_Tag = ['script', 'applet']
905
-
906
- Acceptable_Css_Properties = ['azimuth', 'background-color',
907
- 'border-bottom-color', 'border-collapse', 'border-color',
908
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
909
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
910
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
911
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
912
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
913
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
914
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
915
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
916
- 'white-space', 'width'
917
- ]
918
-
919
- # survey of common keywords found in feeds
920
- Acceptable_Css_Keywords = ['auto', 'aqua', 'black', 'block', 'blue',
921
- 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
922
- 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
923
- 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
924
- 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
925
- 'transparent', 'underline', 'white', 'yellow'
926
- ]
927
-
928
- Mathml_Elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
929
- 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
930
- 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
931
- 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
932
- 'munderover', 'none'
933
- ]
934
-
935
- Mathml_Attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
936
- 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
937
- 'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
938
- 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
939
- 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
940
- 'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
941
- 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
942
- 'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
943
- 'xlink:type', 'xmlns', 'xmlns:xlink'
944
- ]
945
-
946
- # svgtiny - foreignObject + linearGradient + radialGradient + stop
947
- Svg_Elements = ['a', 'animate', 'animateColor', 'animateMotion',
948
- 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
949
- 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
950
- 'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
951
- 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
952
- 'switch', 'text', 'title', 'use'
953
- ]
954
-
955
- # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
956
- Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
957
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
958
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
959
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
960
- 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
961
- 'font-size', 'font-stretch', 'font-style', 'font-variant',
962
- 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
963
- 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
964
- 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
965
- 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
966
- 'origin', 'overline-position', 'overline-thickness', 'panose-1',
967
- 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
968
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
969
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
970
- 'stop-color', 'stop-opacity', 'strikethrough-position',
971
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
972
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
973
- 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
974
- 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
975
- 'underline-position', 'underline-thickness', 'unicode',
976
- 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
977
- 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
978
- 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
979
- 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
980
- 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
981
- ]
982
-
983
- Svg_Attr_Map = nil
984
- Svg_Elem_Map = nil
985
-
986
- Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
987
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
988
- 'stroke-opacity'
989
- ]
990
-
991
- unless $compatible
992
- @@acceptable_tag_specific_attributes = {}
993
- @@mathml_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@mathml_attributes }
994
- @@svg_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@svg_attributes }
995
- end
996
-
997
- class Elements
998
- def strip_attributes(safe=[])
999
- each { |x| x.strip_attributes(safe) }
1000
- end
1001
-
1002
- def strip_style(ok_props=[], ok_keywords=[]) # NOTE unused so far.
1003
- each { |x| x.strip_style(ok_props, ok_keywords) }
1004
- end
1005
- end
42
+ $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
43
+ require 'rfeedparser/forgiving_uri'
44
+ require 'rfeedparser/aliases'
45
+ require 'rfeedparser/encoding_helpers'
46
+ require 'rfeedparser/better_sgmlparser'
47
+ require 'rfeedparser/better_attributelist'
48
+ require 'rfeedparser/scrub'
49
+ require 'rfeedparser/time_helpers'
50
+ require 'rfeedparser/feedparserdict'
51
+ require 'rfeedparser/parser_mixin'
52
+ require 'rfeedparser/parsers'
53
+ require 'rfeedparser/markup_helpers'
1006
54
 
1007
- class Text
1008
- def strip_attributes(foo)
1009
- end
1010
- end
1011
- class Comment
1012
- def strip_attributes(foo)
1013
- end
1014
- end
1015
- class BogusETag
1016
- def strip_attributes(foo)
1017
- end
1018
- end
55
+ include FeedParserUtilities
1019
56
 
1020
- class Elem
1021
- def strip_attributes
1022
- unless attributes.nil?
1023
- attributes.each do |atr|
1024
- unless Acceptable_Attributes.include?atr[0]
1025
- remove_attribute(atr[0])
1026
- end
1027
- end
1028
- end
1029
- end
1030
- end
1031
- end
1032
57
 
1033
58
  module FeedParser
1034
- Version = "0.9.87"
59
+ Version = "0.9.91"
1035
60
 
1036
61
  License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
1037
62
 
@@ -1059,10 +84,10 @@ POSSIBILITY OF SUCH DAMAGE."""
1059
84
  Author = "Jeff Hodges <http://somethingsimilar.com>"
1060
85
  Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
1061
86
  Contributors = [ "Jason Diamond <http://injektilo.org/>",
1062
- "John Beimler <http://john.beimler.org/>",
1063
- "Fazal Majid <http://www.majid.info/mylos/weblog/>",
1064
- "Aaron Swartz <http://aaronsw.com/>",
1065
- "Kevin Marks <http://epeus.blogspot.com/>"
87
+ "John Beimler <http://john.beimler.org/>",
88
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
+ "Aaron Swartz <http://aaronsw.com/>",
90
+ "Kevin Marks <http://epeus.blogspot.com/>"
1066
91
  ]
1067
92
  # HTTP "User-Agent" header to send to servers when downloading feeds.
1068
93
  # If you are embedding feedparser in a larger application, you should
@@ -1115,2207 +140,27 @@ POSSIBILITY OF SUCH DAMAGE."""
1115
140
  'cdf' => 'CDF',
1116
141
  'hotrss' => 'Hot RSS'
1117
142
  }
1118
- class FeedParserDict < Hash
1119
- =begin
1120
- The naming of a certain common attribute (such as, "When was the last
1121
- time this feed was updated?") can have many different names depending
1122
- on the type of feed we are handling. This class allows us to satisfy
1123
- the expectations of both the developer who has prior knowledge of the
1124
- feed type as well as the developer who wants a consistent application
1125
- interface.
1126
-
1127
- @@keymap is a Hash that contains information on what a certain
1128
- attribute names "really are" in each kind of feed. It does this by
1129
- providing a common name that will map to any feed type in the keys,
1130
- with possible "correct" attributes in the its values. the #[] and #[]=
1131
- methods check with keymaps to see what attribute the developer "really
1132
- means" if they've asked for one which happens to be in @@keymap's keys.
1133
- =end
1134
- @@keymap = {'channel' => 'feed',
1135
- 'items' => 'entries',
1136
- 'guid' => 'id',
1137
- 'date' => 'updated',
1138
- 'date_parsed' => 'updated_parsed',
1139
- 'description' => ['subtitle', 'summary'],
1140
- 'url' => ['href'],
1141
- 'modified' => 'updated',
1142
- 'modified_parsed' => 'updated_parsed',
1143
- 'issued' => 'published',
1144
- 'issued_parsed' => 'published_parsed',
1145
- 'copyright' => 'rights',
1146
- 'copyright_detail' => 'rights_detail',
1147
- 'tagline' => 'subtitle',
1148
- 'tagline_detail' => 'subtitle_detail'}
1149
-
1150
- def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
1151
- return self['entries']
1152
- end
1153
-
1154
- # We could include the [] rewrite in new using Hash.new's fancy pants block thing
1155
- # but we'd still have to overwrite []= and such.
1156
- # I'm going to make it easy to turn lists of pairs into FeedParserDicts's though.
1157
- def initialize(pairs=nil)
1158
- if pairs.class == Array and pairs[0].class == Array and pairs[0].length == 2
1159
- pairs.each do |l|
1160
- k,v = l
1161
- self[k] = v
1162
- end
1163
- elsif pairs.class == Hash
1164
- self.merge!(pairs)
1165
- end
1166
- end
1167
-
1168
- def [](key)
1169
- if key == 'category'
1170
- return self['tags'][0]['term']
1171
- end
1172
- if key == 'categories'
1173
- return self['tags'].collect{|tag| [tag['scheme'],tag['term']]}
1174
- end
1175
- realkey = @@keymap[key] || key
1176
- if realkey.class == Array
1177
- realkey.each{ |key| return self[key] if has_key?key }
1178
- end
1179
- # Note that the original key is preferred over the realkey we (might
1180
- # have) found in @@keymap
1181
- if has_key?(key)
1182
- return super(key)
1183
- end
1184
- return super(realkey)
1185
- end
1186
-
1187
- def []=(key,value)
1188
- if @@keymap.key?key
1189
- key = @@keymap[key]
1190
- if key.class == Array
1191
- key = key[0]
1192
- end
1193
- end
1194
- super(key,value)
1195
- end
1196
-
1197
- def method_missing(msym, *args)
1198
- methodname = msym.to_s
1199
- if methodname[-1] == '='
1200
- return self[methodname[0..-2]] = args[0]
1201
- elsif methodname[-1] != '!' and methodname[-1] != '?' and methodname[0] != "_" # FIXME implement with private
1202
- return self[methodname]
1203
- else
1204
- raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
1205
- end
1206
- end
1207
- end
1208
-
1209
-
1210
-
1211
-
1212
- module FeedParserMixin
1213
- attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
1214
-
1215
- def startup(baseuri=nil, baselang=nil, encoding='utf-8')
1216
- $stderr << "initializing FeedParser\n" if $debug
1217
-
1218
- @namespaces = {'' => '',
1219
- 'http://backend.userland.com/rss' => '',
1220
- 'http://blogs.law.harvard.edu/tech/rss' => '',
1221
- 'http://purl.org/rss/1.0/' => '',
1222
- 'http://my.netscape.com/rdf/simple/0.9/' => '',
1223
- 'http://example.com/newformat#' => '',
1224
- 'http://example.com/necho' => '',
1225
- 'http://purl.org/echo/' => '',
1226
- 'uri/of/echo/namespace#' => '',
1227
- 'http://purl.org/pie/' => '',
1228
- 'http://purl.org/atom/ns#' => '',
1229
- 'http://www.w3.org/2005/Atom' => '',
1230
- 'http://purl.org/rss/1.0/modules/rss091#' => '',
1231
- 'http://webns.net/mvcb/' => 'admin',
1232
- 'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
1233
- 'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
1234
- 'http://media.tangent.org/rss/1.0/' => 'audio',
1235
- 'http://backend.userland.com/blogChannelModule' => 'blogChannel',
1236
- 'http://web.resource.org/cc/' => 'cc',
1237
- 'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
1238
- 'http://purl.org/rss/1.0/modules/company' => 'co',
1239
- 'http://purl.org/rss/1.0/modules/content/' => 'content',
1240
- 'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
1241
- 'http://purl.org/dc/elements/1.1/' => 'dc',
1242
- 'http://purl.org/dc/terms/' => 'dcterms',
1243
- 'http://purl.org/rss/1.0/modules/email/' => 'email',
1244
- 'http://purl.org/rss/1.0/modules/event/' => 'ev',
1245
- 'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
1246
- 'http://freshmeat.net/rss/fm/' => 'fm',
1247
- 'http://xmlns.com/foaf/0.1/' => 'foaf',
1248
- 'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
1249
- 'http://postneo.com/icbm/' => 'icbm',
1250
- 'http://purl.org/rss/1.0/modules/image/' => 'image',
1251
- 'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
1252
- 'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
1253
- 'http://purl.org/rss/1.0/modules/link/' => 'l',
1254
- 'http://search.yahoo.com/mrss' => 'media',
1255
- 'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
1256
- 'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
1257
- 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
1258
- 'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
1259
- 'http://purl.org/rss/1.0/modules/reference/' => 'ref',
1260
- 'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
1261
- 'http://purl.org/rss/1.0/modules/search/' => 'search',
1262
- 'http://purl.org/rss/1.0/modules/slash/' => 'slash',
1263
- 'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
1264
- 'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
1265
- 'http://hacks.benhammersley.com/rss/streaming/' => 'str',
1266
- 'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
1267
- 'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
1268
- 'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
1269
- 'http://purl.org/rss/1.0/modules/threading/' => 'thr',
1270
- 'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
1271
- 'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
1272
- 'http://wellformedweb.org/commentAPI/' => 'wfw',
1273
- 'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
1274
- 'http://www.w3.org/1999/xhtml' => 'xhtml',
1275
- 'http://www.w3.org/XML/1998/namespace' => 'xml',
1276
- 'http://www.w3.org/1999/xlink' => 'xlink',
1277
- 'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
1278
- }
1279
- @matchnamespaces = {}
1280
- @namespaces.each do |l|
1281
- @matchnamespaces[l[0].downcase] = l[1]
1282
- end
1283
- @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
1284
- @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
1285
- @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
1286
- @html_types = ['text/html', 'application/xhtml+xml']
1287
- @feeddata = FeedParserDict.new # feed-level data
1288
- @encoding = encoding # character encoding
1289
- @entries = [] # list of entry-level data
1290
- @version = '' # feed type/version see SUPPORTED_VERSIOSN
1291
- @namespacesInUse = {} # hash of namespaces defined by the feed
1292
-
1293
- # the following are used internall to track state;
1294
- # this is really out of control and should be refactored
1295
- @infeed = false
1296
- @inentry = false
1297
- @incontent = 0 # Yes, this needs to be zero until I work out popContent and pushContent
1298
- @intextinput = false
1299
- @inimage = false
1300
- @inauthor = false
1301
- @incontributor = false
1302
- @inpublisher = false
1303
- @insource = false
1304
- @sourcedata = FeedParserDict.new
1305
- @contentparams = FeedParserDict.new
1306
- @summaryKey = nil
1307
- @namespacemap = {}
1308
- @elementstack = []
1309
- @basestack = []
1310
- @langstack = []
1311
- @baseuri = baseuri || ''
1312
- @lang = baselang || nil
1313
- if baselang
1314
- @feeddata['language'] = baselang.gsub('_','-')
1315
- end
1316
- @date_handlers = [:_parse_date_rfc822,
1317
- :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
1318
- :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
1319
- ]
1320
- $stderr << "Leaving startup\n" if $debug # My addition
1321
- end
1322
-
1323
- def unknown_starttag(tag, attrsd)
1324
- $stderr << "start #{tag} with #{attrsd}\n" if $debug
1325
- # normalize attrs
1326
- attrsD = {}
1327
- attrsd = Hash[*attrsd.flatten] if attrsd.class == Array # Magic! Asterisk!
1328
- # LooseFeedParser needs the above because SGMLParser sends attrs as a
1329
- # list of lists (like [['type','text/html'],['mode','escaped']])
1330
-
1331
- attrsd.each do |old_k,value|
1332
- # There has to be a better, non-ugly way of doing this
1333
- k = old_k.downcase # Downcase all keys
1334
- attrsD[k] = value
1335
- if ['rel','type'].include?value
1336
- attrsD[k].downcase! # Downcase the value if the key is 'rel' or 'type'
1337
- end
1338
- end
1339
-
1340
- # track xml:base and xml:lang
1341
- baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
1342
- @baseuri = urljoin(@baseuri, baseuri)
1343
- lang = attrsD['xml:lang'] || attrsD['lang']
1344
- if lang == '' # FIXME This next bit of code is right? Wtf?
1345
- # xml:lang could be explicitly set to '', we need to capture that
1346
- lang = nil
1347
- elsif lang.nil?
1348
- # if no xml:lang is specified, use parent lang
1349
- lang = @lang
1350
- end
1351
- if lang and not lang.empty? # Seriously, this cannot be correct
1352
- if ['feed', 'rss', 'rdf:RDF'].include?tag
1353
- @feeddata['language'] = lang.gsub('_','-')
1354
- end
1355
- end
1356
- @lang = lang
1357
- @basestack << @baseuri
1358
- @langstack << lang
1359
-
1360
- # track namespaces
1361
- attrsd.each do |prefix, uri|
1362
- if /^xmlns:/ =~ prefix # prefix begins with xmlns:
1363
- trackNamespace(prefix[6..-1], uri)
1364
- elsif prefix == 'xmlns':
1365
- trackNamespace(nil, uri)
1366
- end
1367
- end
1368
-
1369
- # track inline content
1370
- if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
1371
- # element declared itself as escaped markup, but isn't really
1372
-
1373
- @contentparams['type'] = 'application/xhtml+xml'
1374
- end
1375
- if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
1376
- # Note: probably shouldn't simply recreate localname here, but
1377
- # our namespace handling isn't actually 100% correct in cases where
1378
- # the feed redefines the default namespace (which is actually
1379
- # the usual case for inline content, thanks Sam), so here we
1380
- # cheat and just reconstruct the element based on localname
1381
- # because that compensates for the bugs in our namespace handling.
1382
- # This will horribly munge inline content with non-empty qnames,
1383
- # but nobody actually does that, so I'm not fixing it.
1384
- tag = tag.split(':')[-1]
1385
- attrsA = attrsd.to_a.collect{|l| "#{l[0]}=\"#{l[1]}\""}
1386
- attrsS = ' '+attrsA.join(' ')
1387
- return handle_data("<#{tag}#{attrsS}>", escape=false)
1388
- end
1389
-
1390
- # match namespaces
1391
- if /:/ =~ tag
1392
- prefix, suffix = tag.split(':', 2)
1393
- else
1394
- prefix, suffix = '', tag
1395
- end
1396
- prefix = @namespacemap[prefix] || prefix
1397
- if prefix and not prefix.empty?
1398
- prefix = prefix + '_'
1399
- end
1400
-
1401
- # special hack for better tracking of empty textinput/image elements in illformed feeds
1402
- if (not prefix and not prefix.empty?) and not (['title', 'link', 'description','name'].include?tag)
1403
- @intextinput = false
1404
- end
1405
- if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?tag)
1406
- @inimage = false
1407
- end
1408
-
1409
- # call special handler (if defined) or default handler
1410
- begin
1411
- return send('_start_'+prefix+suffix, attrsD)
1412
- rescue NoMethodError
1413
- return push(prefix + suffix, true)
1414
- end
1415
- end # End unknown_starttag
1416
-
1417
- def unknown_endtag(tag)
1418
- $stderr << "end #{tag}\n" if $debug
1419
- # match namespaces
1420
- if tag.index(':')
1421
- prefix, suffix = tag.split(':',2)
1422
- else
1423
- prefix, suffix = '', tag
1424
- end
1425
- prefix = @namespacemap[prefix] || prefix
1426
- if prefix and not prefix.empty?
1427
- prefix = prefix + '_'
1428
- end
1429
-
1430
- # call special handler (if defined) or default handler
1431
- begin
1432
- send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
1433
- rescue NoMethodError => details
1434
- pop(prefix + suffix)
1435
- end
1436
-
1437
- # track inline content
1438
- if @incontent != 0 and @contentparams.has_key?'type' and /xml$/ =~ (@contentparams['type'] || 'xml')
1439
- # element declared itself as escaped markup, but it isn't really
1440
- @contentparams['type'] = 'application/xhtml+xml'
1441
- end
1442
- if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
1443
- tag = tag.split(':')[-1]
1444
- handle_data("</#{tag}>", escape=false)
1445
- end
1446
-
1447
- # track xml:base and xml:lang going out of scope
1448
- if @basestack and not @basestack.empty?
1449
- @basestack.pop
1450
- if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
1451
- @baseuri = @basestack[-1]
1452
- end
1453
- end
1454
- if @langstack and not @langstack.empty?
1455
- @langstack.pop
1456
- if @langstack and not @langstack.empty? # and @langstack[-1] and not @langstack.empty?
1457
- @lang = @langstack[-1]
1458
- end
1459
- end
1460
- end
1461
-
1462
- def handle_charref(ref)
1463
- # LooseParserOnly
1464
- # called for each character reference, e.g. for '&#160;', ref will be '160'
1465
- $stderr << "entering handle_charref with #{ref}\n" if $debug
1466
- return if @elementstack.nil? or @elementstack.empty?
1467
- ref.downcase!
1468
- chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
1469
- if chars.include?ref
1470
- text = "&##{ref};"
1471
- else
1472
- if ref[0..0] == 'x'
1473
- c = (ref[1..-1]).to_i(16)
1474
- else
1475
- c = ref.to_i
1476
- end
1477
- text = uconvert(unichr(c),'unicode')
1478
- end
1479
- @elementstack[-1][2] << text
1480
- end
1481
-
1482
- def handle_entityref(ref)
1483
- # LooseParserOnly
1484
- # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
1485
-
1486
- return if @elementstack.nil? or @elementstack.empty?
1487
- $stderr << "entering handle_entityref with #{ref}\n" if $debug
1488
- ents = ['lt', 'gt', 'quot', 'amp', 'apos']
1489
- if ents.include?ref
1490
- text = "&#{ref};"
1491
- else
1492
- text = HTMLEntities::decode_entities("&#{ref};")
1493
- end
1494
- @elementstack[-1][2] << text
1495
- end
1496
-
1497
- def handle_data(text, escape=true)
1498
- # called for each block of plain text, i.e. outside of any tag and
1499
- # not containing any character or entity references
1500
- return if @elementstack.nil? or @elementstack.empty?
1501
- if escape and @contentparams['type'] == 'application/xhtml+xml'
1502
- text = text.to_xs
1503
- end
1504
- @elementstack[-1][2] << text
1505
- end
1506
-
1507
- def handle_comment(comment)
1508
- # called for each comment, e.g. <!-- insert message here -->
1509
- end
1510
-
1511
- def handle_pi(text)
1512
- end
1513
-
1514
- def handle_decl(text)
1515
- end
1516
-
1517
- def parse_declaration(i)
1518
- # for LooseFeedParser
1519
- $stderr << "entering parse_declaration\n" if $debug
1520
- if @rawdata[i...i+9] == '<![CDATA['
1521
- k = @rawdata.index(/\]\]>/u,i+9)
1522
- k = @rawdata.length unless k
1523
- handle_data(@rawdata[i+9...k].to_xs,false)
1524
- return k+3
1525
- else
1526
- k = @rawdata.index(/>/,i).to_i
1527
- return k+1
1528
- end
1529
- end
1530
-
1531
- def mapContentType(contentType)
1532
- contentType.downcase!
1533
- case contentType
1534
- when 'text'
1535
- contentType = 'text/plain'
1536
- when 'html'
1537
- contentType = 'text/html'
1538
- when 'xhtml'
1539
- contentType = 'application/xhtml+xml'
1540
- end
1541
- return contentType
1542
- end
1543
-
1544
- def trackNamespace(prefix, uri)
1545
-
1546
- loweruri = uri.downcase.strip
1547
- if [prefix, loweruri] == [nil, 'http://my.netscape.com/rdf/simple/0.9/'] and (@version.nil? or @version.empty?)
1548
- @version = 'rss090'
1549
- elsif loweruri == 'http://purl.org/rss/1.0/' and (@version.nil? or @version.empty?)
1550
- @version = 'rss10'
1551
- elsif loweruri == 'http://www.w3.org/2005/atom' and (@version.nil? or @version.empty?)
1552
- @version = 'atom10'
1553
- elsif /backend\.userland\.com\/rss/ =~ loweruri
1554
- # match any backend.userland.com namespace
1555
- uri = 'http://backend.userland.com/rss'
1556
- loweruri = uri
1557
- end
1558
- if @matchnamespaces.has_key? loweruri
1559
- @namespacemap[prefix] = @matchnamespaces[loweruri]
1560
- @namespacesInUse[@matchnamespaces[loweruri]] = uri
1561
- else
1562
- @namespacesInUse[prefix || ''] = uri
1563
- end
1564
- end
1565
-
1566
- def resolveURI(uri)
1567
- return urljoin(@baseuri || '', uri)
1568
- end
1569
-
1570
- def decodeEntities(element, data)
1571
- return data
1572
- end
1573
-
1574
- def push(element, expectingText)
1575
- @elementstack << [element, expectingText, []]
1576
- end
1577
-
1578
- def pop(element, stripWhitespace=true)
1579
- return if @elementstack.nil? or @elementstack.empty?
1580
- return if @elementstack[-1][0] != element
1581
- element, expectingText, pieces = @elementstack.pop
1582
- if pieces.class == Array
1583
- output = pieces.join('')
1584
- else
1585
- output = pieces
1586
- end
1587
- if stripWhitespace
1588
- output.strip!
1589
- end
1590
- return output if not expectingText
1591
-
1592
- # decode base64 content
1593
- if @contentparams['base64']
1594
- out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
1595
- if not output.empty? and not out64.empty?
1596
- output = out64
1597
- end
1598
- end
1599
-
1600
- # resolve relative URIs
1601
- if @can_be_relative_uri.include?element and output and not output.empty?
1602
- output = resolveURI(output)
1603
- end
1604
-
1605
- # decode entities within embedded markup
1606
- if not @contentparams['base64']
1607
- output = decodeEntities(element, output)
1608
- end
1609
-
1610
- # remove temporary cruft from contentparams
1611
- @contentparams.delete('mode')
1612
- @contentparams.delete('base64')
1613
-
1614
- # resolve relative URIs within embedded markup
1615
- if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
1616
- if @can_contain_relative_uris.include?element
1617
- output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
1618
- end
1619
- end
1620
- # sanitize embedded markup
1621
- if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
1622
- if @can_contain_dangerous_markup.include?element
1623
- output = FeedParser.sanitizeHTML(output, @encoding)
1624
- end
1625
- end
1626
-
1627
- if @encoding and not @encoding.empty? and @encoding != 'utf-8'
1628
- output = uconvert(output, @encoding, 'utf-8')
1629
- # FIXME I turn everything into utf-8, not unicode, originally because REXML was being used but now beause I haven't tested it out yet.
1630
- end
1631
-
1632
- # categories/tags/keywords/whatever are handled in _end_category
1633
- return output if element == 'category'
1634
-
1635
- # store output in appropriate place(s)
1636
- if @inentry and not @insource
1637
- if element == 'content'
1638
- @entries[-1][element] ||= []
1639
- contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
1640
- contentparams['value'] = output
1641
- @entries[-1][element] << contentparams
1642
- elsif element == 'link'
1643
- @entries[-1][element] = output
1644
- if output and not output.empty?
1645
- @entries[-1]['links'][-1]['href'] = output
1646
- end
1647
- else
1648
- element = 'summary' if element == 'description'
1649
- @entries[-1][element] = output
1650
- if @incontent != 0
1651
- contentparams = Marshal.load(Marshal.dump(@contentparams))
1652
- contentparams['value'] = output
1653
- @entries[-1][element + '_detail'] = contentparams
1654
- end
1655
- end
1656
- elsif (@infeed or @insource) and not @intextinput and not @inimage
1657
- context = getContext()
1658
- element = 'subtitle' if element == 'description'
1659
- context[element] = output
1660
- if element == 'link'
1661
- context['links'][-1]['href'] = output
1662
- elsif @incontent != 0
1663
- contentparams = Marshal.load(Marshal.dump(@contentparams))
1664
- contentparams['value'] = output
1665
- context[element + '_detail'] = contentparams
1666
- end
1667
- end
1668
- return output
1669
- end
1670
-
1671
- def pushContent(tag, attrsD, defaultContentType, expectingText)
1672
- @incontent += 1 # Yes, I hate this.
1673
- type = mapContentType(attrsD['type'] || defaultContentType)
1674
- @contentparams = FeedParserDict.new({'type' => type,'language' => @lang,'base' => @baseuri})
1675
- @contentparams['base64'] = isBase64(attrsD, @contentparams)
1676
- push(tag, expectingText)
1677
- end
1678
-
1679
- def popContent(tag)
1680
- value = pop(tag)
1681
- @incontent -= 1
1682
- @contentparams.clear
1683
- return value
1684
- end
1685
-
1686
- def mapToStandardPrefix(name)
1687
- colonpos = name.index(':')
1688
- if colonpos
1689
- prefix = name[0..colonpos-1]
1690
- suffix = name[colonpos+1..-1]
1691
- prefix = @namespacemap[prefix] || prefix
1692
- name = prefix + ':' + suffix
1693
- end
1694
- return name
1695
- end
1696
-
1697
- def getAttribute(attrsD, name)
1698
- return attrsD[mapToStandardPrefix(name)]
1699
- end
1700
-
1701
- def isBase64(attrsD, contentparams)
1702
- return true if (attrsD['mode'] == 'base64')
1703
- if /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
1704
- return false
1705
- end
1706
- return true
1707
- end
1708
-
1709
- def itsAnHrefDamnIt(attrsD)
1710
- href= attrsD['url'] || attrsD['uri'] || attrsD['href']
1711
- if href
1712
- attrsD.delete('url')
1713
- attrsD.delete('uri')
1714
- attrsD['href'] = href
1715
- end
1716
- return attrsD
1717
- end
1718
-
1719
-
1720
- def _save(key, value)
1721
- context = getContext()
1722
- context[key] ||= value
1723
- end
1724
-
1725
- def _start_rss(attrsD)
1726
- versionmap = {'0.91' => 'rss091u',
1727
- '0.92' => 'rss092',
1728
- '0.93' => 'rss093',
1729
- '0.94' => 'rss094'
1730
- }
1731
-
1732
- if not @version or @version.empty?
1733
- attr_version = attrsD['version'] || ''
1734
- version = versionmap[attr_version]
1735
- if version and not version.empty?
1736
- @version = version
1737
- elsif /^2\./ =~ attr_version
1738
- @version = 'rss20'
1739
- else
1740
- @version = 'rss'
1741
- end
1742
- end
1743
- end
1744
-
1745
- def _start_dlhottitles(attrsD)
1746
- @version = 'hotrss'
1747
- end
1748
-
1749
- def _start_channel(attrsD)
1750
- @infeed = true
1751
- _cdf_common(attrsD)
1752
- end
1753
- alias :_start_feedinfo :_start_channel
1754
-
1755
- def _cdf_common(attrsD)
1756
- if attrsD.has_key?'lastmod'
1757
- _start_modified({})
1758
- @elementstack[-1][-1] = attrsD['lastmod']
1759
- _end_modified
1760
- end
1761
- if attrsD.has_key?'href'
1762
- _start_link({})
1763
- @elementstack[-1][-1] = attrsD['href']
1764
- _end_link
1765
- end
1766
- end
1767
-
1768
- def _start_feed(attrsD)
1769
- @infeed = true
1770
- versionmap = {'0.1' => 'atom01',
1771
- '0.2' => 'atom02',
1772
- '0.3' => 'atom03'
1773
- }
1774
-
1775
- if not @version or @version.empty?
1776
- attr_version = attrsD['version']
1777
- version = versionmap[attr_version]
1778
- if @version and not @version.empty?
1779
- @version = version
1780
- else
1781
- @version = 'atom'
1782
- end
1783
- end
1784
- end
1785
-
1786
- def _end_channel
1787
- @infeed = false
1788
- end
1789
- alias :_end_feed :_end_channel
1790
-
1791
- def _start_image(attrsD)
1792
- @inimage = true
1793
- push('image', false)
1794
- context = getContext()
1795
- context['image'] ||= FeedParserDict.new
1796
- end
1797
-
1798
- def _end_image
1799
- pop('image')
1800
- @inimage = false
1801
- end
1802
-
1803
- def _start_textinput(attrsD)
1804
- @intextinput = true
1805
- push('textinput', false)
1806
- context = getContext()
1807
- context['textinput'] ||= FeedParserDict.new
1808
- end
1809
- alias :_start_textInput :_start_textinput
1810
-
1811
- def _end_textinput
1812
- pop('textinput')
1813
- @intextinput = false
1814
- end
1815
- alias :_end_textInput :_end_textinput
1816
-
1817
- def _start_author(attrsD)
1818
- @inauthor = true
1819
- push('author', true)
1820
- end
1821
- alias :_start_managingeditor :_start_author
1822
- alias :_start_dc_author :_start_author
1823
- alias :_start_dc_creator :_start_author
1824
- alias :_start_itunes_author :_start_author
1825
-
1826
- def _end_author
1827
- pop('author')
1828
- @inauthor = false
1829
- _sync_author_detail()
1830
- end
1831
- alias :_end_managingeditor :_end_author
1832
- alias :_end_dc_author :_end_author
1833
- alias :_end_dc_creator :_end_author
1834
- alias :_end_itunes_author :_end_author
1835
-
1836
- def _start_itunes_owner(attrsD)
1837
- @inpublisher = true
1838
- push('publisher', false)
1839
- end
1840
-
1841
- def _end_itunes_owner
1842
- pop('publisher')
1843
- @inpublisher = false
1844
- _sync_author_detail('publisher')
1845
- end
1846
-
1847
- def _start_contributor(attrsD)
1848
- @incontributor = true
1849
- context = getContext()
1850
- context['contributors'] ||= []
1851
- context['contributors'] << FeedParserDict.new
1852
- push('contributor', false)
1853
- end
1854
-
1855
- def _end_contributor
1856
- pop('contributor')
1857
- @incontributor = false
1858
- end
1859
-
1860
- def _start_dc_contributor(attrsD)
1861
- @incontributor = true
1862
- context = getContext()
1863
- context['contributors'] ||= []
1864
- context['contributors'] << FeedParserDict.new
1865
- push('name', false)
1866
- end
1867
-
1868
- def _end_dc_contributor
1869
- _end_name
1870
- @incontributor = false
1871
- end
1872
-
1873
- def _start_name(attrsD)
1874
- push('name', false)
1875
- end
1876
- alias :_start_itunes_name :_start_name
1877
-
1878
- def _end_name
1879
- value = pop('name')
1880
- if @inpublisher
1881
- _save_author('name', value, 'publisher')
1882
- elsif @inauthor
1883
- _save_author('name', value)
1884
- elsif @incontributor
1885
- _save_contributor('name', value)
1886
- elsif @intextinput
1887
- context = getContext()
1888
- context['textinput']['name'] = value
1889
- end
1890
- end
1891
- alias :_end_itunes_name :_end_name
1892
-
1893
- def _start_width(attrsD)
1894
- push('width', false)
1895
- end
1896
-
1897
- def _end_width
1898
- value = pop('width').to_i
1899
- if @inimage
1900
- context = getContext
1901
- context['image']['width'] = value
1902
- end
1903
- end
1904
-
1905
- def _start_height(attrsD)
1906
- push('height', false)
1907
- end
1908
-
1909
- def _end_height
1910
- value = pop('height').to_i
1911
- if @inimage
1912
- context = getContext()
1913
- context['image']['height'] = value
1914
- end
1915
- end
1916
-
1917
- def _start_url(attrsD)
1918
- push('href', true)
1919
- end
1920
- alias :_start_homepage :_start_url
1921
- alias :_start_uri :_start_url
1922
-
1923
- def _end_url
1924
- value = pop('href')
1925
- if @inauthor
1926
- _save_author('href', value)
1927
- elsif @incontributor
1928
- _save_contributor('href', value)
1929
- elsif @inimage
1930
- context = getContext()
1931
- context['image']['href'] = value
1932
- elsif @intextinput
1933
- context = getContext()
1934
- context['textinput']['link'] = value
1935
- end
1936
- end
1937
- alias :_end_homepage :_end_url
1938
- alias :_end_uri :_end_url
1939
-
1940
- def _start_email(attrsD)
1941
- push('email', false)
1942
- end
1943
- alias :_start_itunes_email :_start_email
1944
-
1945
- def _end_email
1946
- value = pop('email')
1947
- if @inpublisher
1948
- _save_author('email', value, 'publisher')
1949
- elsif @inauthor
1950
- _save_author('email', value)
1951
- elsif @incontributor
1952
- _save_contributor('email', value)
1953
- end
1954
- end
1955
- alias :_end_itunes_email :_end_email
1956
-
1957
- def getContext
1958
- if @insource
1959
- context = @sourcedata
1960
- elsif @inentry
1961
- context = @entries[-1]
1962
- else
1963
- context = @feeddata
1964
- end
1965
- return context
1966
- end
1967
-
1968
- def _save_author(key, value, prefix='author')
1969
- context = getContext()
1970
- context[prefix + '_detail'] ||= FeedParserDict.new
1971
- context[prefix + '_detail'][key] = value
1972
- _sync_author_detail()
1973
- end
1974
-
1975
- def _save_contributor(key, value)
1976
- context = getContext
1977
- context['contributors'] ||= [FeedParserDict.new]
1978
- context['contributors'][-1][key] = value
1979
- end
1980
-
1981
- def _sync_author_detail(key='author')
1982
- context = getContext()
1983
- detail = context["#{key}_detail"]
1984
- if detail and not detail.empty?
1985
- name = detail['name']
1986
- email = detail['email']
1987
-
1988
- if name and email and not (name.empty? or name.empty?)
1989
- context[key] = "#{name} (#{email})"
1990
- elsif name and not name.empty?
1991
- context[key] = name
1992
- elsif email and not email.empty?
1993
- context[key] = email
1994
- end
1995
- else
1996
- author = context[key].dup unless context[key].nil?
1997
- return if not author or author.empty?
1998
- emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
1999
- email = emailmatch[1]
2000
- author.gsub!(email, '')
2001
- author.gsub!("\(\)", '')
2002
- author.strip!
2003
- author.gsub!(/^\(/,'')
2004
- author.gsub!(/\)$/,'')
2005
- author.strip!
2006
- context["#{key}_detail"] ||= FeedParserDict.new
2007
- context["#{key}_detail"]['name'] = author
2008
- context["#{key}_detail"]['email'] = email
2009
- end
2010
- end
2011
-
2012
- def _start_subtitle(attrsD)
2013
- pushContent('subtitle', attrsD, 'text/plain', true)
2014
- end
2015
- alias :_start_tagline :_start_subtitle
2016
- alias :_start_itunes_subtitle :_start_subtitle
2017
-
2018
- def _end_subtitle
2019
- popContent('subtitle')
2020
- end
2021
- alias :_end_tagline :_end_subtitle
2022
- alias :_end_itunes_subtitle :_end_subtitle
2023
-
2024
# Opens a 'rights' content element via pushContent, defaulting its type to
# text/plain. Also reachable as _start_dc_rights and _start_copyright.
def _start_rights(attrsD)
  pushContent('rights', attrsD, 'text/plain', true)
end
alias _start_dc_rights _start_rights
alias _start_copyright _start_rights
2029
-
2030
# Closes the 'rights' content element via popContent.
# Also reachable as _end_dc_rights and _end_copyright.
def _end_rights
  popContent('rights')
end
alias _end_dc_rights _end_rights
alias _end_copyright _end_rights
2035
-
2036
# Begins a new entry: appends an empty FeedParserDict to @entries, pushes
# 'item', flags that we are inside an entry and resets @guidislink. A
# non-empty rdf:about attribute becomes the entry id. Finishes by handing the
# attributes to _cdf_common.
# Also reachable as _start_entry and _start_product.
def _start_item(attrsD)
  @entries << FeedParserDict.new
  push('item', false)
  @inentry = true
  @guidislink = false
  about = getAttribute(attrsD, 'rdf:about')
  unless about.nil? || about.empty?
    getContext()['id'] = about
  end
  _cdf_common(attrsD)
end
alias _start_entry _start_item
alias _start_product _start_item
2050
-
2051
# Pops 'item' and clears the inside-an-entry flag.
# Also reachable as _end_entry.
def _end_item
  pop('item')
  @inentry = false
end
alias _end_entry _end_item
2056
-
2057
# Pushes a 'language' element that expects text.
# Also reachable as _start_language.
def _start_dc_language(attrsD)
  push('language', true)
end
alias _start_language _start_dc_language
2061
-
2062
# Pops 'language' and remembers the popped value in @lang.
# Also reachable as _end_language.
def _end_dc_language
  @lang = pop('language')
end
alias _end_language _end_dc_language
2066
-
2067
# Pushes a 'publisher' element that expects text.
# Also reachable as _start_webmaster.
def _start_dc_publisher(attrsD)
  push('publisher', true)
end
alias _start_webmaster _start_dc_publisher
2071
-
2072
# Pops 'publisher', then synchronises the 'publisher' /
# 'publisher_detail' pair through _sync_author_detail.
# Also reachable as _end_webmaster.
def _end_dc_publisher
  pop('publisher')
  _sync_author_detail('publisher')
end
alias _end_webmaster _end_dc_publisher
2077
-
2078
# Pushes a 'published' element that expects text.
# Also reachable as _start_dcterms_issued and _start_issued.
def _start_published(attrsD)
  push('published', true)
end
alias _start_dcterms_issued _start_published
alias _start_issued _start_published
2083
-
2084
# Pops 'published', runs the text through parse_date and saves the result
# as 'published_parsed'.
# Also reachable as _end_dcterms_issued and _end_issued.
def _end_published
  raw_date = pop('published')
  parsed = parse_date(raw_date)
  _save('published_parsed', parsed)
end
alias _end_dcterms_issued _end_published
alias _end_issued _end_published
2090
-
2091
# Pushes an 'updated' element that expects text.
# Also reachable as _start_modified, _start_dcterms_modified,
# _start_pubdate and _start_dc_date.
def _start_updated(attrsD)
  push('updated', true)
end
alias _start_modified _start_updated
alias _start_dcterms_modified _start_updated
alias _start_pubdate _start_updated
alias _start_dc_date _start_updated
2098
-
2099
# Pops 'updated', runs the text through parse_date and saves the result as
# 'updated_parsed'.
# Also reachable as _end_modified, _end_dcterms_modified, _end_pubdate and
# _end_dc_date.
def _end_updated
  raw_date = pop('updated')
  parsed = parse_date(raw_date)
  _save('updated_parsed', parsed)
end
alias _end_modified _end_updated
alias _end_dcterms_modified _end_updated
alias _end_pubdate _end_updated
alias _end_dc_date _end_updated
2107
-
2108
# Pushes a 'created' element that expects text.
# Also reachable as _start_dcterms_created.
def _start_created(attrsD)
  push('created', true)
end
alias _start_dcterms_created _start_created
2112
-
2113
# Pops 'created', runs the text through parse_date and saves the result as
# 'created_parsed'.
# Also reachable as _end_dcterms_created.
def _end_created
  raw_date = pop('created')
  parsed = parse_date(raw_date)
  _save('created_parsed', parsed)
end
alias _end_dcterms_created _end_created
2118
-
2119
# Pushes an 'expired' element that expects text.
def _start_expirationdate(attrsD)
  push('expired', true)
end
2122
# Pops 'expired', runs the text through parse_date and saves the result as
# 'expired_parsed'.
def _end_expirationdate
  raw_date = pop('expired')
  _save('expired_parsed', parse_date(raw_date))
end
2125
-
2126
# Handles a license element whose value is carried in its rdf:resource
# attribute rather than in text: pushes 'license', appends the attribute
# value (when present) to the pieces of the element just pushed, and pops it
# again immediately.
#
# The pop is unconditional, matching _start_admin_generatoragent: previously
# an element without rdf:resource was pushed here and never popped. The stack
# is accessed as @elementstack, as in _start_admin_errorreportsto.
def _start_cc_license(attrsD)
  push('license', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    @elementstack[-1][2] << value
  end
  pop('license')
end
2134
-
2135
# Pushes a 'license' element that expects text.
def _start_creativecommons_license(attrsD)
  push('license', true)
end
2138
-
2139
# Pops the 'license' element opened by _start_creativecommons_license.
def _end_creativecommons_license
  pop('license')
end
2142
-
2143
# Adds a {term, scheme, label} tag to the current context's 'tags' list.
# The list is created on demand. Nothing is added when all three parts are
# nil or empty, or when an equal tag is already in the list.
def addTag(term, scheme, label)
  context = getContext()
  context['tags'] ||= []
  existing = context['tags']
  all_blank = [term, scheme, label].all? { |part| part.nil? || part.empty? }
  return if all_blank
  candidate = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  unless existing.include?(candidate)
    context['tags'] << FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  end
end
2155
-
2156
# Registers a tag built from the element's 'term', 'scheme' (falling back to
# 'domain') and 'label' attributes, then pushes 'category' so that element
# text can be collected.
# Also reachable as _start_dc_subject and _start_keywords.
def _start_category(attrsD)
  $stderr << "entering _start_category with #{attrsD}\n" if $debug

  scheme = attrsD['scheme'] || attrsD['domain']
  addTag(attrsD['term'], scheme, attrsD['label'])
  push('category', true)
end
alias _start_dc_subject _start_category
alias _start_keywords _start_category
2167
-
2168
# Pops 'itunes_keywords', splits the text on whitespace and adds every word
# as a tag under the 'http://www.itunes.com/' scheme.
def _end_itunes_keywords
  keywords = pop('itunes_keywords').split
  keywords.each { |term| addTag(term, 'http://www.itunes.com/', nil) }
end
2173
-
2174
# Adds the element's 'text' attribute as a tag under the
# 'http://www.itunes.com/' scheme, then pushes 'category'.
def _start_itunes_category(attrsD)
  category_text = attrsD['text']
  addTag(category_text, 'http://www.itunes.com/', nil)
  push('category', true)
end
2178
-
2179
# Pops 'category'. Non-empty element text either completes the most recent
# tag (when that tag has no 'term' yet) or is added as a new tag of its own.
# Also reachable as _end_dc_subject, _end_keywords and _end_itunes_category.
#
# The Ruby 1.8-only trailing ':' after the condition has been removed; it is
# a syntax error from Ruby 1.9 onwards.
def _end_category
  value = pop('category')
  return if value.nil? or value.empty?
  tags = getContext()['tags']
  if not tags.empty? and not tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
alias :_end_dc_subject :_end_category
alias :_end_keywords :_end_category
alias :_end_itunes_category :_end_category
2193
-
2194
# Stores the element's attributes, wrapped in a FeedParserDict, as the
# current context's 'cloud' entry.
def _start_cloud(attrsD)
  cloud = FeedParserDict.new(attrsD)
  getContext()['cloud'] = cloud
end
2197
-
2198
# Records a link element. 'rel' defaults to 'alternate' and 'type' to
# 'text/html'; the attributes are normalised through itsAnHrefDamnIt and any
# href is resolved with resolveURI before the link is appended to the
# context's 'links' list. An enclosure link is additionally passed to
# _start_enclosure. A link with an href sets context['link'] when it is an
# alternate link of an HTML type; a link without one is pushed so its text
# can be collected.
# Also reachable as _start_producturl.
def _start_link(attrsD)
  attrsD['rel'] ||= 'alternate'
  attrsD['type'] ||= 'text/html'
  attrsD = itsAnHrefDamnIt(attrsD)
  attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')
  expectingText = @infeed || @inentry || @insource
  context = getContext()
  context['links'] ||= []
  context['links'] << FeedParserDict.new(attrsD)
  _start_enclosure(attrsD) if attrsD['rel'] == 'enclosure'
  if attrsD.has_key?('href')
    is_html_alternate = (attrsD['rel'] == 'alternate') &&
                        @html_types.include?(mapContentType(attrsD['type']))
    context['link'] = attrsD['href'] if is_html_alternate
  else
    push('link', expectingText)
  end
end
alias _start_producturl _start_link
2222
-
2223
# Pops 'link' and copies the popped value into the context's 'textinput'
# and/or 'image' entry, depending on which of @intextinput / @inimage is set.
# Also reachable as _end_producturl.
def _end_link
  link = pop('link')
  context = getContext()
  context['textinput']['link'] = link if @intextinput
  context['image']['link'] = link if @inimage
end
alias _end_producturl _end_link
2234
-
2235
# Notes whether the guid doubles as a permalink (the 'ispermalink' attribute
# is absent or equal to 'true') and pushes 'id' to collect the guid text.
def _start_guid(attrsD)
  permalink_flag = attrsD['ispermalink'] || 'true'
  @guidislink = (permalink_flag == 'true')
  push('id', true)
end
2239
-
2240
# Pops 'id' and saves 'guidislink', which is true only when the guid was
# declared a permalink and the context has no 'link' yet. When the guid is a
# permalink its value is also handed to _save as 'link'.
#
# The Ruby 1.8-only trailing ':' after the condition has been removed; it is
# a syntax error from Ruby 1.9 onwards.
def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink and not getContext().has_key?('link')))
  if @guidislink
    # guid acts as link, but only if 'ispermalink' is not present or is 'true',
    # and only if the item doesn't already have a link element
    # NOTE(review): that last condition is not checked here; presumably _save
    # refuses to overwrite an existing 'link' - confirm.
    _save('link', value)
  end
end
2249
-
2250
-
2251
# Opens a 'title' content element (default type text/plain). Text is expected
# only while inside a feed, an entry or a source.
# Also reachable as _start_dc_title and _start_media_title.
def _start_title(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('title', attrsD, 'text/plain', expecting_text)
end
alias _start_dc_title _start_title
alias _start_media_title _start_title
2256
-
2257
# Closes the 'title' content element and, when inside a textinput (checked
# first) or an image, copies the title into that entry of the context.
# Also reachable as _end_dc_title and _end_media_title.
def _end_title
  title = popContent('title')
  context = getContext()
  target = if @intextinput
             'textinput'
           elsif @inimage
             'image'
           end
  context[target]['title'] = title if target
end
alias _end_dc_title _end_title
alias _end_media_title _end_title
2268
-
2269
# Starts a description. If the context already holds a 'summary', the
# description is treated as content instead (@summaryKey is set to 'content'
# and _start_content takes over); otherwise a 'description' content element
# of default type text/html is opened.
def _start_description(attrsD)
  unless getContext().has_key?('summary')
    expecting_text = @infeed || @inentry || @insource
    return pushContent('description', attrsD, 'text/html', expecting_text)
  end
  @summaryKey = 'content'
  _start_content(attrsD)
end
2278
-
2279
# Opens a 'description' content element of default type text/plain. Text is
# expected only while inside a feed, an entry or a source.
def _start_abstract(attrsD)
  expecting_text = @infeed || @inentry || @insource
  pushContent('description', attrsD, 'text/plain', expecting_text)
end
2282
-
2283
# Ends a description. When the description was redirected to content
# (@summaryKey == 'content') this delegates to _end_content; otherwise the
# 'description' content element is closed and, inside a textinput or image,
# its value is copied into that entry of the context. @summaryKey is always
# reset afterwards.
# Also reachable as _end_abstract.
#
# The Ruby 1.8-only trailing ':' after the elsif condition has been removed;
# it is a syntax error from Ruby 1.9 onwards.
def _end_description
  if @summaryKey == 'content'
    _end_content()
  else
    value = popContent('description')
    context = getContext()
    if @intextinput
      context['textinput']['description'] = value
    elsif @inimage
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
alias :_end_abstract :_end_description
2298
-
2299
# Opens an 'info' content element via pushContent, defaulting its type to
# text/plain. Also reachable as _start_feedburner_browserfriendly.
def _start_info(attrsD)
  pushContent('info', attrsD, 'text/plain', true)
end
alias _start_feedburner_browserfriendly _start_info
2303
-
2304
# Closes the 'info' content element via popContent.
# Also reachable as _end_feedburner_browserfriendly.
def _end_info
  popContent('info')
end
alias _end_feedburner_browserfriendly _end_info
2308
-
2309
# Records the generator element's attributes as 'generator_detail' and pushes
# 'generator' to collect its text. Non-empty attributes are first normalised
# through itsAnHrefDamnIt and any href is resolved with resolveURI.
def _start_generator(attrsD)
  unless attrsD.nil? || attrsD.empty?
    attrsD = itsAnHrefDamnIt(attrsD)
    attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')
  end
  getContext()['generator_detail'] = FeedParserDict.new(attrsD)
  push('generator', true)
end
2319
-
2320
# Pops 'generator' and, when the context has a 'generator_detail' entry,
# stores the popped text as its 'name'.
def _end_generator
  generator_name = pop('generator')
  context = getContext()
  return unless context.has_key?('generator_detail')
  context['generator_detail']['name'] = generator_name
end
2327
-
2328
# Handles a generator element whose value is carried in its rdf:resource
# attribute: pushes 'generator', appends the attribute value (when present)
# to the pieces of the element just pushed, pops it again, and records the
# value as the 'href' of 'generator_detail'.
#
# The stack is accessed as @elementstack, consistent with
# _start_admin_errorreportsto; the bare name `elementstack` relied on an
# accessor that is not visible anywhere in this part of the file.
def _start_admin_generatoragent(attrsD)
  push('generator', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    @elementstack[-1][2] << value
  end
  pop('generator')
  getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
end
2337
-
2338
# Handles an element whose value is carried in its rdf:resource attribute:
# pushes 'errorreportsto', appends the attribute value (when present and
# non-empty) to the pieces of the element just pushed, then pops it again.
def _start_admin_errorreportsto(attrsD)
  push('errorreportsto', true)
  resource = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << resource unless resource.nil? || resource.empty?
  pop('errorreportsto')
end
2346
-
2347
# Starts a summary. If the context already holds a 'summary', this one is
# treated as content (@summaryKey becomes 'content' and _start_content takes
# over); otherwise @summaryKey becomes 'summary' and a content element of
# that name, default type text/plain, is opened.
# Also reachable as _start_itunes_summary.
def _start_summary(attrsD)
  already_summarised = getContext().has_key?('summary')
  @summaryKey = already_summarised ? 'content' : 'summary'
  if already_summarised
    _start_content(attrsD)
  else
    pushContent(@summaryKey, attrsD, 'text/plain', true)
  end
end
alias _start_itunes_summary _start_summary
2358
-
2359
# Ends a summary: delegates to _end_content when the summary was redirected
# to content, otherwise closes the content element named by @summaryKey
# (falling back to 'summary'). @summaryKey is always reset afterwards.
# Also reachable as _end_itunes_summary.
#
# The Ruby 1.8-only trailing ':' after the condition has been removed; it is
# a syntax error from Ruby 1.9 onwards.
def _end_summary
  if @summaryKey == 'content'
    _end_content()
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
alias :_end_itunes_summary :_end_summary
2368
-
2369
# Appends the enclosure's attributes (normalised through itsAnHrefDamnIt) to
# the context's 'enclosures' list, creating the list on demand. A non-empty
# href also becomes the context's 'id' when no id has been set yet.
def _start_enclosure(attrsD)
  attrsD = itsAnHrefDamnIt(attrsD)
  getContext()['enclosures'] ||= []
  getContext()['enclosures'] << FeedParserDict.new(attrsD)
  href = attrsD['href']
  return if href.nil? || href.empty?
  context = getContext()
  context['id'] = href unless context['id']
end
2381
-
2382
# Flags that a source element is being processed.
def _start_source(attrsD)
  @insource = true
end
2385
-
2386
# Leaves the source element: clears the flag, stores a deep copy (a Marshal
# round-trip) of @sourcedata as the context's 'source', then empties
# @sourcedata.
def _end_source
  @insource = false
  snapshot = Marshal.load(Marshal.dump(@sourcedata))
  getContext()['source'] = snapshot
  @sourcedata.clear
end
2391
-
2392
# Opens a 'content' content element (default type text/plain), records a
# non-empty 'src' attribute in @contentparams, and pushes 'content'.
#
# The Ruby 1.8-only trailing ':' after the condition has been removed; it is
# a syntax error from Ruby 1.9 onwards.
def _start_content(attrsD)
  pushContent('content', attrsD, 'text/plain', true)
  src = attrsD['src']
  if src and not src.empty?
    @contentparams['src'] = src
  end
  push('content', true)
end
2400
-
2401
# Opens a 'content' content element with default type text/html.
def _start_prodlink(attrsD)
  pushContent('content', attrsD, 'text/html', true)
end
2404
-
2405
# Opens a 'content' content element with default type application/xhtml+xml.
# Also reachable as _start_xhtml_body.
def _start_body(attrsD)
  pushContent('content', attrsD, 'application/xhtml+xml', true)
end
alias _start_xhtml_body _start_body
2409
-
2410
# Opens a 'content' content element with default type text/html.
# Also reachable as _start_fullitem.
def _start_content_encoded(attrsD)
  pushContent('content', attrsD, 'text/html', true)
end
alias _start_fullitem _start_content_encoded
2414
-
2415
# Closes the 'content' content element. When the content type (checked
# before the element is popped) maps to text/plain or one of @html_types,
# the content is also saved as 'description'.
# Also reachable as _end_body, _end_xhtml_body, _end_content_encoded,
# _end_fullitem and _end_prodlink.
#
# The aliases used to sit inside the method body, so they were only created
# the first time _end_content ran; they are now declared after the method and
# exist as soon as this code is loaded.
def _end_content
  copyToDescription = (['text/plain'] + @html_types).include? mapContentType(@contentparams['type'])
  value = popContent('content')
  if copyToDescription
    _save('description', value)
  end
end
alias :_end_body :_end_content
alias :_end_xhtml_body :_end_content
alias :_end_content_encoded :_end_content
alias :_end_fullitem :_end_content
alias :_end_prodlink :_end_content
2427
-
2428
# Pushes 'itunes_image' (no text expected) and stores the element's href as
# the context's 'image'.
# Also reachable as _start_itunes_link.
def _start_itunes_image(attrsD)
  push('itunes_image', false)
  image = FeedParserDict.new({'href' => attrsD['href']})
  getContext()['image'] = image
end
alias _start_itunes_link _start_itunes_image
2433
-
2434
# Pops 'itunes_block' and stores in the context whether its text was exactly
# 'yes' (true) or anything else (false).
def _end_itunes_block
  text = pop('itunes_block', false)
  getContext()['itunes_block'] = (text == 'yes')
end
2438
-
2439
# Pops 'itunes_explicit' and stores in the context whether its text was
# exactly 'yes' (true) or anything else (false).
def _end_itunes_explicit
  text = pop('itunes_explicit', false)
  getContext()['itunes_explicit'] = (text == 'yes')
end
2443
-
2444
-
2445
- # ISO-8601 date parsing routines written by Fazal Majid.
2446
- # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2447
- # parser is beyond the scope of feedparser and the current Time.iso8601
2448
- # method does not work.
2449
- # A single regular expression cannot parse ISO 8601 date formats into groups
2450
- # as the standard is highly irregular (for instance is 030104 2003-01-04 or
2451
- # 0301-04-01), so we use templates instead.
2452
- # Please note the order in templates is significant because we need a
2453
- # greedy match.
2454
# Parses a variety of ISO-8601-compatible formats such as 20040105,
# 2004-01-05, 2004032 (ordinal) or 2004-01-05T10:14:55+09:00.
#
# dateString:: the text to parse.
# Returns a UTC Time, or nil when nothing in the string matches. Invalid
# field values (e.g. month 00) raise from Date/Time, as before.
#
# Fixes relative to the previous implementation:
# * the optional time part is a non-capturing group, so hour, minute,
#   second and timezone no longer land one field off;
# * a timezone offset is applied to the time of day instead of being
#   added to the day and month;
# * the dash between month and day is optional, so the compact form
#   20040105 is recognised;
# * every template is anchored at the start of the string, so a fragment
#   in the middle of unrelated text is no longer taken for a date.
def _parse_date_iso8601(dateString)
  # Date templates paired with the fields their groups capture. Order is
  # significant: the first template that matches wins.
  date_templates = [
    ['(\d{4})-?([01]\d)-?([0123]\d)', ['year', 'month', 'day']],
    ['(\d{4})-([01]\d)',              ['year', 'month']],
    ['(\d{4})-?([0123]\d\d)',         ['year', 'ordinal']],
    ['(\d\d)-?([01]\d)-?([0123]\d)',  ['year', 'month', 'day']],
    ['(\d\d)-?([0123]\d\d)',          ['year', 'ordinal']],
    ['(\d{4})',                       ['year']],
    ['-(\d\d)-?([01]\d)',             ['year', 'month']],
    ['-([0123]\d\d)',                 ['ordinal']],
    ['-(\d\d)',                       ['year']],
    ['--([01]\d)-?([0123]\d)',        ['month', 'day']],
    ['--([01]\d)',                    ['month']],
    ['---([0123]\d)',                 ['day']],
    ['(\d\d$)',                       ['century']],
    ['',                              []]
  ]
  # Optional time of day; the outer and the ':ss' / ':mm' wrappers are
  # non-capturing so the groups line up with time_fields.
  time_part = '(?:T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
  time_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']

  match = nil
  fields = []
  date_templates.each do |template, names|
    source = '\A' + template + time_part
    $stderr << "Trying iso8601 regexp: #{source}\n" if $debug
    match = Regexp.new(source).match(dateString)
    if match
      fields = names + time_fields
      break
    end
  end
  # The empty template always matches; an empty match means "not a date".
  return nil if match.nil? || match[0].empty?

  params = {}
  fields.each_with_index { |field, i| params[field] = match[i + 1] }

  now = Time.now.utc
  ordinal = params['ordinal'] ? params['ordinal'].to_i : nil

  year = params['year']
  if year.nil? || year.empty?
    year = now.year
  elsif year.length == 2
    # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
    year = 100 * (now.year / 100) + year.to_i
  else
    year = year.to_i
  end

  month = params['month']
  if month.nil? || month.empty?
    # An ordinal (day of year) determines the month; otherwise default to
    # the current month.
    month = ordinal ? Date.ordinal(year, ordinal).month : now.month
  else
    month = month.to_i
  end

  day = params['day']
  if day.nil? || day.empty?
    if ordinal
      day = Date.ordinal(year, ordinal).day
    elsif params['century'] || params['year'] || params['month']
      day = 1
    else
      day = now.day
    end
  else
    day = day.to_i
  end

  # special case of the century - is the first year of the 21st century
  # 2000 or 2001 ? The debate goes on...
  year = (params['century'].to_i - 1) * 100 + 1 if params.has_key?('century')

  # in ISO 8601 most fields are optional; nil.to_i is 0
  result = Time.utc(year, month, day,
                    params['hour'].to_i, params['minute'].to_i, params['second'].to_i)

  tz = params['tz']
  if tz && !tz.empty? && tz != 'Z'
    offset = params['tzhour'].to_i * 3600 + params['tzmin'].to_i * 60
    offset = -offset if tz[0, 1] == '-'
    # local time = UTC + offset, hence UTC = local time - offset
    result -= offset
  end
  result
end
2578
-
2579
# Parse a string according to the OnBlog 8-bit date format
# (8-bit date handling routines written by ytrewq1).
#
# The year/month/day markers are Korean; a matching string is rewritten as a
# W3DTF date with a +09:00 offset and handed to _parse_date_w3dtf. Returns
# nil when the string does not match.
def _parse_date_onblog(dateString)
  year_mark = u("년") # b3e2 in euc-kr
  month_mark = u("월") # bff9 in euc-kr
  day_mark = u("일") # c0cf in euc-kr

  pattern = /(\d{4})#{year_mark}\s+(\d{2})#{month_mark}\s+(\d{2})#{day_mark}\s+(\d{2}):(\d{2}):(\d{2})/
  found = pattern.match(dateString)
  return unless found

  year, month, day, hour, minute, second = found[1], found[2], found[3], found[4], found[5], found[6]
  w3dtfdate = "#{year}-#{month}-#{day}T#{hour}:#{minute}:#{second}+09:00"
  $stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
  _parse_date_w3dtf(w3dtfdate)
end
2597
-
2598
# Parse a string according to the Nate 8-bit date format
# (8-bit date handling routines written by ytrewq1).
#
# The string carries a Korean AM/PM marker and a 12-hour clock; a matching
# string is rewritten as a W3DTF date with a +09:00 offset and handed to
# _parse_date_w3dtf. Returns nil when the string does not match.
#
# The 12-hour conversion now treats 12 as the start of the half-day:
# 12 AM -> 00 and 12 PM -> 12. Previously 12 PM became hour 24 (which rolled
# over into the next day) and 12 AM stayed at 12.
def _parse_date_nate(dateString)
  korean_am = u("오전") # bfc0 c0fc in euc-kr
  korean_pm = u("오후") # bfc0 c8c4 in euc-kr

  korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
  m = korean_nate_date_re.match(dateString)
  return unless m
  hour = m[5].to_i % 12
  hour += 12 if m[4] == korean_pm
  hour = hour.to_s.rjust(2, '0')
  w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
  $stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
2617
-
2618
# Parses a date in the "YYYY-MM-DD HH:MM:SS[.fraction]" layout by rewriting
# it as a W3DTF date and handing that to _parse_date_w3dtf. Any fractional
# seconds are dropped. Returns nil when the string does not match.
# NOTE(review): the offset is fixed at +09:00, the same value the Korean
# handlers use - confirm that this is intended for this format.
def _parse_date_mssql(dateString)
  found = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/.match(dateString)
  return unless found

  date_part = [found[1], found[2], found[3]].join('-')
  time_part = [found[4], found[5], found[6]].join(':')
  w3dtfdate = "#{date_part}T#{time_part}+09:00"
  $stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
  _parse_date_w3dtf(w3dtfdate)
end
2627
-
2628
# Parse a string according to a Greek 8-bit date format.
#
# The Greek weekday and month abbreviations are translated to their English
# RFC 822 equivalents and the rebuilt string is handed to
# _parse_date_rfc822. Returns nil when the string does not have the expected
# layout or uses a weekday/month name that is not in the tables.
#
# Hash lookups return nil rather than raising, so the former begin/rescue
# never fired and an unknown name produced a malformed date string; the
# lookups are now checked explicitly.
def _parse_date_greek(dateString)
  # Unicode strings for Greek date strings
  greek_months = {
    u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
    u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
    u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
    u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
    u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
    u("Μάι") => u("May"), # ccdce9 in iso-8859-7
    u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
    u("Μαι") => u("May"), # cce1e9 in iso-8859-7
    u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
    u("Ιον") => u("Jun"), # c9efed in iso-8859-7
    u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
    u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
    u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
    u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
    u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
    u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
    u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
    u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
    u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
  }

  greek_wdays = {
    u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
    u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
    u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
    u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
    u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
    u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
    u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
  }

  greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/

  m = greek_date_format.match(dateString)
  return unless m
  wday = greek_wdays[m[1]]
  month = greek_months[m[3]]
  # Unknown weekday or month abbreviation: not a date this handler understands.
  return nil if wday.nil? || month.nil?
  rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
  $stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
  return _parse_date_rfc822(rfc822date)
end
2677
-
2678
# Parse a string according to a Hungarian 8-bit date format.
#
# The Hungarian month name is translated to its two-digit number, day and
# hour are zero-padded, and the rebuilt W3DTF string (seconds fixed at 00)
# is handed to _parse_date_w3dtf. Returns nil when the string does not have
# the expected layout or uses a month name that is not in the table.
#
# Hash lookups return nil rather than raising, so the former begin/rescue
# never fired and an unknown month produced a malformed date string; the
# lookup is now checked explicitly.
def _parse_date_hungarian(dateString)
  hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
  m = hungarian_date_format_re.match(dateString)
  return unless m

  # Unicode strings for Hungarian date strings
  hungarian_months = {
    u("január") => u("01"), # e1 in iso-8859-2
    u("februári") => u("02"), # e1 in iso-8859-2
    u("március") => u("03"), # e1 in iso-8859-2
    u("április") => u("04"), # e1 in iso-8859-2
    u("máujus") => u("05"), # e1 in iso-8859-2
    u("június") => u("06"), # fa in iso-8859-2
    u("július") => u("07"), # fa in iso-8859-2
    u("augusztus") => u("08"),
    u("szeptember") => u("09"),
    u("október") => u("10"), # f3 in iso-8859-2
    u("november") => u("11"),
    u("december") => u("12"),
  }
  month = hungarian_months[m[2]]
  # Unknown month name: not a date this handler understands.
  return if month.nil?
  day = m[3].rjust(2, '0')
  hour = m[4].rjust(2, '0')

  w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
  $stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
  return _parse_date_w3dtf(w3dtfdate)
end
2711
-
2712
# Splits +num+ into what stays in the current unit and what carries into the
# next one. Returns [num % modulus, num / modulus].
def rollover(num, modulus)
  remainder = num % modulus
  carry = num / modulus
  [remainder, carry]
end
2715
-
2716
# Returns num / modulus, unless that quotient is zero, in which case +num+
# itself is returned.
def set_self(num, modulus)
  quotient = num / modulus
  quotient == 0 ? num : quotient
end
2723
- # W3DTF-style date parsing
2724
- # FIXME shouldn't it be "W3CDTF"?
2725
# Parses a W3C date-time (W3DTF) string: a four-digit year optionally
# followed by month, day and a full "Thh:mm:ss" time with a mandatory zone
# ("Z" or "+hh:mm" / "-hh:mm").
#
# Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias for
# xmlschema. This hand-written parser exists because it also rolls over
# out-of-range values (24 hours, 60 minutes, 60 seconds, a day past the end
# of the month) into the next larger unit.
#
# dateString:: the text to parse.
# Returns a UTC Time, or nil when the string does not start with a year
# (previously that case failed with NoMethodError on nil).
def _parse_date_w3dtf(dateString)
  m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)
  return nil unless m

  # Missing (or zero) year, month and day default to 1; missing time
  # fields default to 0.
  year, month, day = (1..3).map { |i| v = m[i].to_i; v.zero? ? 1 : v }
  hour, minute, second = (4..6).map { |i| m[i].to_i }
  zone = m[7] # left as a String; nil when no time was given

  # Rollover times. 0 minutes and 61 seconds -> 1 minute and 1 second
  second, carry = rollover(second, 60)
  minute, carry = rollover(minute + carry, 60)
  hour, carry = rollover(hour + carry, 24)
  day += carry

  if month > 12
    month, carry = rollover(month, 12)
    month = 12 if month == 0
    year += carry
  end

  # Push days beyond the end of the month into the following month(s).
  num_days = Time.days_in_month(month, year)
  while day > num_days
    day -= num_days
    month += 1
    if month > 12
      year += 1
      month = set_self(month, 12)
    end
    num_days = Time.days_in_month(month, year)
  end

  # The fields are local to the given zone: UTC = local time - zone offset.
  # (The old code flipped the sign inside the zone string and added.)
  return Time.utc(year, month, day, hour, minute, second) - Time.zone_offset(zone || "UTC")
end
2772
-
2773
- def _parse_date_rfc822(dateString)
2774
- # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
2775
- # These first few lines are to fix up the stupid proprietary format from Disney
2776
- unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
2777
- 'CT' => 'CST', 'MT' => 'MST',
2778
- 'PT' => 'PST'
2779
- }
2780
-
2781
- mon = dateString.split[2]
2782
- if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
2783
- dateString.sub!(mon,mon[0..2])
2784
- end
2785
- if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
2786
- dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
2787
- end
2788
- # Okay, the Disney date format should be fixed up now.
2789
- rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
2790
- if rfc.to_a.length > 1 and rfc.to_a.include? nil
2791
- dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
2792
- hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
2793
- tz ||= "GMT"
2794
- end
2795
- asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
2796
- if asctime_match.to_a.length > 1
2797
- # Month-abbr dayofmonth hour:minute:second year
2798
- dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
2799
- day.to_s.rjust(2,'0')
2800
- end
2801
- if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
2802
- ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
2803
- else
2804
- ds = dateString
2805
- end
2806
- t = Time.rfc2822(ds).utc
2807
- return t
2808
- end
2809
-
2810
# Parse a date in yyyy/mm/dd hh:mm:ss TTT format.
# Note that there is a day of the week at the beginning,
# e.g. "Fri, 2006/09/15 08:19:53 EDT".
# Parsing is left entirely to Time.parse; the result is converted to UTC.
def _parse_date_perforce(aDateString) # FIXME not in 4.1?
  parsed = Time.parse(aDateString)
  parsed.utc
end
2816
-
2817
# Flattens a Time into a nine-element array:
# [year, month, day of month, hour, minute, second, weekday, day of year,
# dst flag]. The weekday is shifted so that Monday is 0 and Sunday is 6; the
# dst flag is 1 or 0.
#
# A nil (or otherwise unsuitable) argument raises NoMethodError; error
# handling is left to parse_date.
#
# The old `t[0..-2].map!{...}` only modified a temporary copy of the slice
# and has been dropped; the Time accessors already return integers.
def extract_tuple(atime)
  [atime.year, atime.month, atime.mday,
   atime.hour, atime.min, atime.sec,
   (atime.wday - 1) % 7, atime.yday,
   atime.isdst ? 1 : 0]
end
2828
-
2829
# Tries each handler named in @date_handlers in order and returns the
# extract_tuple array for the first one that produces a usable Time.
# A handler that raises, or whose result extract_tuple cannot process, is
# skipped. Returns nil when no handler succeeds.
#
# Only StandardError is rescued now: the former `rescue Exception` also
# swallowed interrupts, exit requests and out-of-memory conditions.
def parse_date(dateString)
  @date_handlers.each do |handler|
    begin
      $stderr << "Trying date_handler #{handler}\n" if $debug
      return extract_tuple(send(handler, dateString))
    rescue StandardError => e
      $stderr << "#{handler} raised #{e}\n" if $debug
    end
  end
  return nil
end
2841
-
2842
- end # End FeedParserMixin
2843
-
2844
# SAX handler that feeds the events of the expat-based XML parser into
# FeedParserMixin. Parse errors are recorded in #bozo / #exc.
#
# Changes: getAttrs no longer reads one index past the last attribute;
# endElement now derives the tag name the same way startElement does (it
# used to compute a qualified name and then discard it); a stray string
# literal in startElement is a comment again.
class StrictFeedParser < XML::SAX::HandlerBase # expat
  include FeedParserMixin

  attr_accessor :bozo, :entries, :feeddata, :exc

  # baseuri, baselang and encoding are handed straight to the mixin's startup.
  def initialize(baseuri, baselang, encoding)
    $stderr << "trying StrictFeedParser\n" if $debug
    startup(baseuri, baselang, encoding)
    @bozo = false
    @exc = nil
    super()
  end

  # Returns [system id, line number] as reported by the document locator.
  def getPos
    [@locator.getSystemId, @locator.getLineNumber]
  end

  # Converts a SAX attribute list into an array of [name, value] pairs.
  def getAttrs(attrs)
    (0...attrs.getLength).map { |i| [attrs.getName(i), attrs.getValue(i)] }
  end

  def setDocumentLocator(loc)
    @locator = loc
  end

  def startDoctypeDecl(name, pub_sys, long_name, uri)
    #Nothing is done here. What could we do that is neat and useful?
  end

  def startNamespaceDecl(prefix, uri)
    trackNamespace(prefix, uri)
  end

  def endNamespaceDecl(prefix)
  end

  def startElement(name, attrs)
    unknown_starttag(qualified_tag_name(name), attrs)
  end

  def character(text, start, length)
    #handle_data(CGI.unescapeHTML(text))
    handle_data(text)
  end
  # expat provides "character" not "characters"!
  alias :characters :character # Just in case.

  def startCdata(content)
    handle_data(content)
  end

  def endElement(name)
    unknown_endtag(qualified_tag_name(name))
  end

  def comment(comment)
    handle_comment(comment)
  end

  def entityDecl(*foo)
  end

  def unparsedEntityDecl(*foo)
  end

  # Records a recoverable parse error.
  def error(exc)
    @bozo = true
    @exc = exc
  end

  # Records the error, then re-raises it.
  def fatalError(exc)
    error(exc)
    raise exc
  end

  private

  # Turns a parser-supplied element name, optionally of the form
  # "namespaceuri;localname", into the lower-cased tag name used by the
  # mixin: the namespace URI is replaced by the prefix registered for it in
  # @matchnamespaces (if any). The argument itself is not modified.
  def qualified_tag_name(name)
    name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
    namespaceuri = ($2 || '').downcase
    localname = $3
    if /backend\.userland\.com\/rss/ =~ namespaceuri
      # match any backend.userland.com namespace
      namespaceuri = 'http://backend.userland.com/rss'
    end
    prefix = @matchnamespaces[namespaceuri]
    # No need to raise UndeclaredNamespace, Expat does that for us with
    # "unbound prefix (XMLParserError)"
    if prefix and not prefix.empty?
      localname = prefix + ':' + localname
    end
    localname.downcase
  end
end
2942
-
2943
# SGML-based fallback parser that drives FeedParserMixin.
#
# Change: parse now replaces the numeric entity &#34; with a double quote;
# it used to be replaced with a single quote.
class LooseFeedParser < BetterSGMLParser
  include FeedParserMixin
  # We write the methods that were in BaseHTMLProcessor in the python code
  # in here directly. We do this because if we inherited from
  # BaseHTMLProcessor but then included from FeedParserMixin, the methods
  # of Mixin would overwrite the methods we inherited from
  # BaseHTMLProcessor. This is exactly the opposite of what we want to
  # happen!

  attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse

  Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
    'img', 'input', 'isindex', 'link', 'meta', 'param']
  New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
  alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.

  # Reader for the parsed feed data (the inherited feed is kept as sgml_feed).
  def feed
    @feeddata
  end

  def feed=(data)
    @feeddata = data
  end

  # baseuri, baselang and encoding are handed straight to the mixin's startup.
  def initialize(baseuri, baselang, encoding)
    startup(baseuri, baselang, encoding)
    super() # Keep the parentheses! No touchy.
  end

  def reset
    @pieces = []
    super
  end

  # Cleans up +data+ and hands it to the inherited SGML feed method.
  # "<!" that does not start a DOCTYPE, comment or marked section is
  # escaped; self-closing tags are expanded to an open/close pair unless the
  # element is listed in Elements_No_End_Tag; numeric quote entities are
  # decoded; the text is converted with uconvert when @encoding is set.
  # Note that +data+ is modified in place.
  def parse(data)
    data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '&lt;!\1')
    data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
      clean = tag[1..-3].strip
      if Elements_No_End_Tag.include?(clean)
        tag
      else
        '<'+clean+'></'+clean+'>'
      end
    end

    data.gsub!(/&#39;/, "'")
    data.gsub!(/&#34;/, '"') # &#34; is the double quote
    if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
      data = uconvert(data,'utf-8',@encoding)
    end
    sgml_feed(data) # see the alias above
  end

  # Normalises numeric references for < > & " ' to their named entities and,
  # when the current content has a 'type' that does not end in "xml",
  # decodes those named entities to literal characters.
  # +data+ is modified in place and returned.
  def decodeEntities(element, data)
    data.gsub!('&#60;', '&lt;')
    data.gsub!('&#x3c;', '&lt;')
    data.gsub!('&#62;', '&gt;')
    data.gsub!('&#x3e;', '&gt;')
    data.gsub!('&#38;', '&amp;')
    data.gsub!('&#x26;', '&amp;')
    data.gsub!('&#34;', '&quot;')
    data.gsub!('&#x22;', '&quot;')
    data.gsub!('&#39;', '&apos;')
    data.gsub!('&#x27;', '&apos;')
    if @contentparams.has_key?('type') and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
      data.gsub!('&lt;', '<')
      data.gsub!('&gt;', '>')
      data.gsub!('&amp;', '&')
      data.gsub!('&quot;', '"')
      data.gsub!('&apos;', "'")
    end
    return data
  end
end
3016
-
3017
- def FeedParser.resolveRelativeURIs(htmlSource, baseURI, encoding)
3018
- $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
3019
- relative_uris = [ ['a','href'],
3020
- ['applet','codebase'],
3021
- ['area','href'],
3022
- ['blockquote','cite'],
3023
- ['body','background'],
3024
- ['del','cite'],
3025
- ['form','action'],
3026
- ['frame','longdesc'],
3027
- ['frame','src'],
3028
- ['iframe','longdesc'],
3029
- ['iframe','src'],
3030
- ['head','profile'],
3031
- ['img','longdesc'],
3032
- ['img','src'],
3033
- ['img','usemap'],
3034
- ['input','src'],
3035
- ['input','usemap'],
3036
- ['ins','cite'],
3037
- ['link','href'],
3038
- ['object','classid'],
3039
- ['object','codebase'],
3040
- ['object','data'],
3041
- ['object','usemap'],
3042
- ['q','cite'],
3043
- ['script','src'],
3044
- ]
3045
- h = Hpricot(htmlSource)
3046
- relative_uris.each do |l|
3047
- ename, eattr = l
3048
- h.search(ename).each do |elem|
3049
- euri = elem.attributes[eattr]
3050
- if euri and not euri.empty? and URI.parse(URI.encode(euri)).relative?
3051
- elem.attributes[eattr] = urljoin(baseURI, euri)
3052
- end
3053
- end
3054
- end
3055
- return h.to_html
3056
- end
3057
-
3058
- class SanitizerDoc < Hpricot::Doc
3059
-
3060
- def scrub
3061
- traverse_all_element do |e|
3062
- if e.elem?
3063
- if Acceptable_Elements.include?e.name
3064
- e.strip_attributes
3065
- else
3066
- if Unacceptable_Elements_With_End_Tag.include?e.name
3067
- e.inner_html = ''
3068
- end
3069
- e.swap(SanitizerDoc.new(e.children).scrub.to_html)
3070
- # This works because the children swapped in are brought in "after" the current element.
3071
- end
3072
- elsif e.doctype?
3073
- e.parent.children.delete(e)
3074
- elsif e.text?
3075
- ets = e.to_s
3076
- ets.gsub!(/&#39;/, "'")
3077
- ets.gsub!(/&#34;/, '"')
3078
- ets.gsub!(/\r/,'')
3079
- e.swap(ets)
3080
- else
3081
- end
3082
- end
3083
- # yes, that '/' should be there. It's a search method. See the Hpricot docs.
3084
-
3085
- unless $compatible # FIXME nonworking
3086
- (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
3087
- end
3088
- return self
3089
- end
3090
- end
3091
-
3092
- def SanitizerDoc(html)
3093
- FeedParser::SanitizerDoc.new(Hpricot.make(html))
3094
- end
3095
- module_function(:SanitizerDoc)
3096
-
3097
- def self.sanitizeHTML(html,encoding)
3098
- # FIXME Tidy not yet supported
3099
- html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
3100
- h = SanitizerDoc(html)
3101
- h = h.scrub
3102
- return h.to_html.strip
3103
- end
3104
-
3105
-
3106
-
3107
- def self.getCharacterEncoding(feed, xml_data)
3108
- # Get the character encoding of the XML document
3109
- $stderr << "In getCharacterEncoding\n" if $debug
3110
- sniffed_xml_encoding = nil
3111
- xml_encoding = nil
3112
- true_encoding = nil
3113
- begin
3114
- http_headers = feed.meta
3115
- http_content_type = feed.meta['content-type'].split(';')[0]
3116
- encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
3117
- http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
3118
- http_encoding = nil if http_encoding.empty?
3119
- # FIXME Open-Uri returns iso8859-1 if there is no charset header,
3120
- # but that doesn't pass the tests. Open-Uri claims its following
3121
- # the right RFC. Are they wrong or do we need to change the tests?
3122
- rescue NoMethodError
3123
- http_headers = {}
3124
- http_content_type = nil
3125
- http_encoding = nil
3126
- end
3127
- # Must sniff for non-ASCII-compatible character encodings before
3128
- # searching for XML declaration. This heuristic is defined in
3129
- # section F of the XML specification:
3130
- # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3131
- begin
3132
- if xml_data[0..3] == "\x4c\x6f\xa7\x94"
3133
- # EBCDIC
3134
- xml_data = _ebcdic_to_ascii(xml_data)
3135
- elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
3136
- # UTF-16BE
3137
- sniffed_xml_encoding = 'utf-16be'
3138
- xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
3139
- elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
3140
- # UTF-16BE with BOM
3141
- sniffed_xml_encoding = 'utf-16be'
3142
- xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
3143
- elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
3144
- # UTF-16LE
3145
- sniffed_xml_encoding = 'utf-16le'
3146
- xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
3147
- elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
3148
- # UTF-16LE with BOM
3149
- sniffed_xml_encoding = 'utf-16le'
3150
- xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
3151
- elsif xml_data[0..3] == "\x00\x00\x00\x3c"
3152
- # UTF-32BE
3153
- sniffed_xml_encoding = 'utf-32be'
3154
- xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
3155
- elsif xml_data[0..3] == "\x3c\x00\x00\x00"
3156
- # UTF-32LE
3157
- sniffed_xml_encoding = 'utf-32le'
3158
- xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
3159
- elsif xml_data[0..3] == "\x00\x00\xfe\xff"
3160
- # UTF-32BE with BOM
3161
- sniffed_xml_encoding = 'utf-32be'
3162
- xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
3163
- elsif xml_data[0..3] == "\xff\xfe\x00\x00"
3164
- # UTF-32LE with BOM
3165
- sniffed_xml_encoding = 'utf-32le'
3166
- xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
3167
- elsif xml_data[0..2] == "\xef\xbb\xbf"
3168
- # UTF-8 with BOM
3169
- sniffed_xml_encoding = 'utf-8'
3170
- xml_data = xml_data[3..-1]
3171
- else
3172
- # ASCII-compatible
3173
- end
3174
- xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
3175
- rescue
3176
- xml_encoding_match = nil
3177
- end
3178
- if xml_encoding_match
3179
- xml_encoding = xml_encoding_match[1].downcase
3180
- xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
3181
- if sniffed_xml_encoding and xencodings.include?xml_encoding
3182
- xml_encoding = sniffed_xml_encoding
3183
- end
3184
- end
3185
-
3186
- acceptable_content_type = false
3187
- application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
3188
- text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
3189
-
3190
- if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
3191
- acceptable_content_type = true
3192
- true_encoding = http_encoding || xml_encoding || 'utf-8'
3193
- elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
3194
- acceptable_content_type = true
3195
- true_encoding = http_encoding || 'us-ascii'
3196
- elsif /^text\// =~ http_content_type
3197
- true_encoding = http_encoding || 'us-ascii'
3198
- elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
3199
- true_encoding = xml_encoding || 'iso-8859-1'
3200
- else
3201
- true_encoding = xml_encoding || 'utf-8'
3202
- end
3203
- return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
3204
- end
3205
-
3206
- def self.toUTF8(data, encoding)
3207
- =begin
3208
- Changes an XML data stream on the fly to specify a new encoding
3209
-
3210
- data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
3211
- encoding is a string recognized by encodings.aliases
3212
- =end
3213
- $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
3214
- # NOTE we must use double quotes when dealing with \x encodings!
3215
- if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
3216
- if $debug
3217
- $stderr << "stripping BOM\n"
3218
- if encoding != 'utf-16be'
3219
- $stderr << "string utf-16be instead\n"
3220
- end
3221
- end
3222
- encoding = 'utf-16be'
3223
- data = data[2..-1]
3224
- elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
3225
- if $debug
3226
- $stderr << "stripping BOM\n"
3227
- $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
3228
- end
3229
- encoding = 'utf-16le'
3230
- data = data[2..-1]
3231
- elsif (data[0..2] == "\xef\xbb\xbf")
3232
- if $debug
3233
- $stderr << "stripping BOM\n"
3234
- $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
3235
- end
3236
- encoding = 'utf-8'
3237
- data = data[3..-1]
3238
- elsif (data[0..3] == "\x00\x00\xfe\xff")
3239
- if $debug
3240
- $stderr << "stripping BOM\n"
3241
- if encoding != 'utf-32be'
3242
- $stderr << "trying utf-32be instead\n"
3243
- end
3244
- end
3245
- encoding = 'utf-32be'
3246
- data = data[4..-1]
3247
- elsif (data[0..3] == "\xff\xfe\x00\x00")
3248
- if $debug
3249
- $stderr << "stripping BOM\n"
3250
- if encoding != 'utf-32le'
3251
- $stderr << "trying utf-32le instead\n"
3252
- end
3253
- end
3254
- encoding = 'utf-32le'
3255
- data = data[4..-1]
3256
- end
3257
- begin
3258
- newdata = uconvert(data, encoding, 'utf-8')
3259
- rescue => details
3260
- end
3261
- $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
3262
- declmatch = /^<\?xml[^>]*?>/
3263
- newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
3264
- if declmatch =~ newdata
3265
- newdata.sub!(declmatch, newdecl)
3266
- else
3267
- newdata = newdecl + "\n" + newdata
3268
- end
3269
- return newdata
3270
- end
3271
-
3272
- def self.stripDoctype(data)
3273
- =begin
3274
- Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3275
-
3276
- rss_version may be 'rss091n' or None
3277
- stripped_data is the same XML document, minus the DOCTYPE
3278
- =end
3279
- entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
3280
- data = data.gsub(entity_pattern,'')
3281
-
3282
- doctype_pattern = /<!DOCTYPE(.*?)>/m
3283
- doctype_results = data.scan(doctype_pattern)
3284
- if doctype_results and doctype_results[0]
3285
- doctype = doctype_results[0][0]
3286
- else
3287
- doctype = ''
3288
- end
3289
-
3290
- if /netscape/ =~ doctype.downcase
3291
- version = 'rss091n'
3292
- else
3293
- version = nil
3294
- end
3295
- data = data.sub(doctype_pattern, '')
3296
- return version, data
3297
- end
3298
-
3299
- def parse(*args); FeedParser.parse(*args); end
3300
- def FeedParser.parse(furi, options={})
3301
- # Parse a feed from a URL, file, stream or string
3302
- $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
3303
- result = FeedParserDict.new
3304
- result['feed'] = FeedParserDict.new
3305
- result['entries'] = []
3306
- if options[:modified]
3307
- options[:modified] = Time.parse(options[:modified]).rfc2822
3308
- # FIXME this ignores all of our time parsing work. Does it matter?
143
+
144
+ def parse(furi, options = {})
145
+ # Parse a feed from a URL, file, stream or string
146
+ $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
147
+ strictklass = options[:strict] || StrictFeedParser
148
+ looseklass = options[:loose] || LooseFeedParser
149
+ result = FeedParserDict.new
150
+ result['feed'] = FeedParserDict.new
151
+ result['entries'] = []
152
+ if options[:modified]
153
+ options[:modified] = Time.parse(options[:modified]).rfc2822
154
+ # FIXME this ignores all of our time parsing work. Does it matter?
3309
155
  end
3310
156
  result['bozo'] = false
3311
157
  handlers = options[:handlers]
3312
-
3313
158
  if handlers.class != Array # FIXME why does this happen?
3314
159
  handlers = [handlers]
3315
160
  end
3316
161
 
3317
162
  begin
3318
- if URI::parse(furi).class == URI::Generic
163
+ if File.exists?furi
3319
164
  f = open(furi) # OpenURI doesn't behave well when passing HTTP options to a file.
3320
165
  else
3321
166
  # And when you do pass them, make sure they aren't just nil (this still true?)
@@ -3482,7 +327,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3482
327
  if use_strict_parser
3483
328
  # initialize the SAX parser
3484
329
  saxparser = XML::SAX::Helpers::ParserFactory.makeParser("XML::Parser::SAXDriver")
3485
- feedparser = StrictFeedParser.new(baseuri, baselang, 'utf-8')
330
+ feedparser = strictklass.new(baseuri, baselang, 'utf-8')
3486
331
  saxparser.setDocumentHandler(feedparser)
3487
332
  saxparser.setDTDHandler(feedparser)
3488
333
  saxparser.setEntityResolver(feedparser)
@@ -3503,7 +348,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3503
348
  end
3504
349
  end
3505
350
  if not use_strict_parser
3506
- feedparser = LooseFeedParser.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
351
+ feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
3507
352
  feedparser.parse(data)
3508
353
  $stderr << "Using LooseFeed\n\n" if $debug
3509
354
  end
@@ -3513,6 +358,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3513
358
  result['namespaces'] = feedparser.namespacesInUse
3514
359
  return result
3515
360
  end
361
+ module_function(:parse)
3516
362
  end # End FeedParser module
3517
363
 
3518
364
  class Serializer
@@ -3552,7 +398,7 @@ class TextSerializer < Serializer
3552
398
  end
3553
399
  end
3554
400
 
3555
- class PprintSerializer < Serializer # FIXME ? use pp instead?
401
+ class PprintSerializer < Serializer # FIXME use pp instead
3556
402
  def write(stream = $stdout)
3557
403
  stream << @results['href'].to_s + "\n\n"
3558
404
  pp(@results)