rfeedparser 0.9.8 → 0.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +170 -3345
- data/lib/rfeedparser/aliases.rb +432 -0
- data/lib/rfeedparser/better_attributelist.rb +41 -0
- data/lib/rfeedparser/better_sgmlparser.rb +264 -0
- data/lib/rfeedparser/encoding_helpers.rb +257 -0
- data/lib/rfeedparser/feedparserdict.rb +93 -0
- data/lib/rfeedparser/forgiving_uri.rb +93 -0
- data/lib/rfeedparser/markup_helpers.rb +73 -0
- data/lib/rfeedparser/parser_mixin.rb +1235 -0
- data/lib/rfeedparser/parsers.rb +177 -0
- data/lib/rfeedparser/scrub.rb +207 -0
- data/lib/rfeedparser/time_helpers.rb +408 -0
- data/tests/rfeedparsertest.rb +3 -1
- metadata +3271 -3250
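
Taken together, the file list above reads as a straight extraction refactor: the monolithic data/lib/rfeedparser.rb shrinks by roughly 3,175 lines while eleven new files under data/lib/rfeedparser/ absorb about the same amount of code. The following is a minimal sketch of what the slimmed-down entry file presumably does; the exact require paths and the comments are assumptions based only on the new file names, not on the released 0.9.9 source.

    # Hypothetical sketch -- the actual 0.9.9 require list is not shown in this diff.
    require 'rfeedparser/aliases'              # Encoding_Aliases table
    require 'rfeedparser/encoding_helpers'     # unicode/uconvert/unichr helpers
    require 'rfeedparser/better_sgmlparser'    # BetterSGMLParser
    require 'rfeedparser/better_attributelist' # Hash-like XML::SAX::AttributeList
    require 'rfeedparser/forgiving_uri'
    require 'rfeedparser/feedparserdict'       # FeedParserDict
    require 'rfeedparser/parser_mixin'         # FeedParserMixin
    require 'rfeedparser/parsers'
    require 'rfeedparser/markup_helpers'
    require 'rfeedparser/scrub'                # Hpricot scrub whitelists
    require 'rfeedparser/time_helpers'         # date parsing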
data/lib/rfeedparser.rb
CHANGED
@@ -14,3314 +14,138 @@ require 'stringio'
 require 'uri'
 require 'cgi' # escaping html
 require 'time'
-require 'xml/saxdriver' # calling expat
 require 'pp'
 require 'rubygems'
 require 'base64'
 require 'iconv'
-gem 'hpricot', ">=0.5"
-gem 'character-encodings', ">=0.2.0"
-gem 'htmltools', ">=1.10"
-gem 'htmlentities', ">=4.0.0"
-gem 'activesupport', ">=1.4.2"
-gem 'rchardet', ">=1.0"
-
-require 'rchardet'
-$chardet = true
-
-require 'hpricot'
-require 'encoding/character/utf-8'
-require 'html/sgml-parser'
-require 'htmlentities'
-require 'active_support'
-require 'open-uri'
-include OpenURI
-
-$debug = false
-$compatible = true
-
The rest of the hunk deletes the code that now appears to live in the new files under data/lib/rfeedparser/:

- Encoding_Aliases (old lines 43-458): the table of Python-style encoding names mapped onto their iconv equivalents (ascii, big5, the cp* code pages, euc-jp/euc-kr, gb2312/gbk/gb18030, the iso-2022 and iso-8859 families, koi8-r, shift_jis and the utf variants), adapted from Python 2.4's encodings/aliases.py -- presumably the bulk of the new aliases.rb.
- The encoding and URI helpers (old lines 460-505): unicode, uconvert, unichr, index_match, _ebcdic_to_ascii, urljoin and py2rtime -- presumably split between encoding_helpers.rb and forgiving_uri.rb.
- The XChar module (the CP1252 repair map, the predefined XML entities and the valid character ranges) plus the Fixnum#xchr and String#to_xs monkey patches used for XML-safe escaping (old lines 507-566).
class BetterSGMLParser < HTML::SGMLParser
|
570
|
-
# Replaced Tagfind and Charref Regexps with the ones in feedparser.py
|
571
|
-
# This makes things work.
|
572
|
-
Interesting = /[&<]/u
|
573
|
-
Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
|
574
|
-
'<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
|
575
|
-
'![^<>]*)?', 64) # 64 is the unicode flag
|
576
|
-
|
577
|
-
Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
|
578
|
-
Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
|
579
|
-
|
580
|
-
Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
|
581
|
-
Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
|
582
|
-
Endtagopen = /<\//u # Matching the Python SGMLParser
|
583
|
-
Endbracket = /[<>]/u
|
584
|
-
Declopen = /<!/u
|
585
|
-
Piopenbegin = /^<\?/u
|
586
|
-
Piclose = />/u
|
587
|
-
|
588
|
-
Commentopen = /<!--/u
|
589
|
-
Commentclose = /--\s*>/u
|
590
|
-
Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
|
591
|
-
Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
|
592
|
-
'(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
|
593
|
-
64)
|
594
|
-
Endtagfind = /\s*\/\s*>/u
|
595
|
-
def initialize(verbose=false)
|
596
|
-
super(verbose)
|
597
|
-
end
|
598
|
-
def feed(*args)
|
599
|
-
super(*args)
|
600
|
-
end
|
601
|
-
|
602
|
-
def goahead(_end)
|
603
|
-
rawdata = @rawdata # woo, utf-8 magic
|
604
|
-
i = 0
|
605
|
-
n = rawdata.length
|
606
|
-
while i < n
|
607
|
-
if @nomoretags
|
608
|
-
# handle_data_range does nothing more than set a "Range" that is never used. wtf?
|
609
|
-
handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
|
610
|
-
i = n
|
611
|
-
break
|
612
|
-
end
|
613
|
-
j = rawdata.index(Interesting, i)
|
614
|
-
j = n unless j
|
615
|
-
handle_data(rawdata[i...j]) if i < j
|
616
|
-
i = j
|
617
|
-
break if (i == n)
|
618
|
-
if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
|
619
|
-
if rawdata.index(Starttagopen,i) == i
|
620
|
-
if @literal
|
621
|
-
handle_data(rawdata[i..i])
|
622
|
-
i = i+1
|
623
|
-
next
|
624
|
-
end
|
625
|
-
k = parse_starttag(i)
|
626
|
-
break unless k
|
627
|
-
i = k
|
628
|
-
next
|
629
|
-
end
|
630
|
-
if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
|
631
|
-
k = parse_endtag(i)
|
632
|
-
break unless k
|
633
|
-
i = k
|
634
|
-
@literal = false
|
635
|
-
next
|
636
|
-
end
|
637
|
-
if @literal
|
638
|
-
if n > (i+1)
|
639
|
-
handle_data("<")
|
640
|
-
i = i+1
|
641
|
-
else
|
642
|
-
#incomplete
|
643
|
-
break
|
644
|
-
end
|
645
|
-
next
|
646
|
-
end
|
647
|
-
if rawdata.index(Commentopen,i) == i
|
648
|
-
k = parse_comment(i)
|
649
|
-
break unless k
|
650
|
-
i = k
|
651
|
-
next
|
652
|
-
end
|
653
|
-
if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
|
654
|
-
k = parse_pi(i)
|
655
|
-
break unless k
|
656
|
-
i += k
|
657
|
-
next
|
658
|
-
end
|
659
|
-
if rawdata.index(Declopen,i) == i
|
660
|
-
# This is some sort of declaration; in "HTML as
|
661
|
-
# deployed," this should only be the document type
|
662
|
-
# declaration ("<!DOCTYPE html...>").
|
663
|
-
k = parse_declaration(i)
|
664
|
-
break unless k
|
665
|
-
i = k
|
666
|
-
next
|
667
|
-
end
|
668
|
-
elsif rawdata[i..i] == '&'
|
669
|
-
if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
|
670
|
-
handle_data(rawdata[i..i])
|
671
|
-
i += 1
|
672
|
-
next
|
673
|
-
end
|
674
|
-
|
675
|
-
# the Char must come first as its #=~ method is the only one that is UTF-8 safe
|
676
|
-
ni,match = index_match(rawdata, Charref, i)
|
677
|
-
if ni and ni == i # See? Ugly
|
678
|
-
handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
|
679
|
-
i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
|
680
|
-
i -= 1 unless rawdata[i-1..i-1] == ";"
|
681
|
-
next
|
682
|
-
end
|
683
|
-
ni,match = index_match(rawdata, Entityref, i)
|
684
|
-
if ni and ni == i
|
685
|
-
handle_entityref(match[1])
|
686
|
-
i += match[0].length
|
687
|
-
i -= 1 unless rawdata[i-1..i-1] == ";"
|
688
|
-
next
|
689
|
-
end
|
690
|
-
else
|
691
|
-
error('neither < nor & ??')
|
692
|
-
end
|
693
|
-
# We get here only if incomplete matches but
|
694
|
-
# nothing else
|
695
|
-
ni,match = index_match(rawdata,Incomplete,i)
|
696
|
-
unless ni and ni == 0
|
697
|
-
handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
|
698
|
-
i += 1
|
699
|
-
next
|
700
|
-
end
|
701
|
-
j = ni + match[0].length
|
702
|
-
break if j == n # Really incomplete
|
703
|
-
handle_data(rawdata[i...j])
|
704
|
-
i = j
|
705
|
-
end # end while
|
706
|
-
|
707
|
-
if _end and i < n
|
708
|
-
handle_data(rawdata[i...n])
|
709
|
-
i = n
|
710
|
-
end
|
711
|
-
|
712
|
-
@rawdata = rawdata[i..-1]
|
713
|
-
# @offset += i # FIXME BUGME another unused variable in SGMLParser?
|
714
|
-
end
|
715
|
-
|
716
|
-
|
717
|
-
# Internal -- parse processing instr, return length or -1 if not terminated
|
718
|
-
def parse_pi(i)
|
719
|
-
rawdata = @rawdata
|
720
|
-
if rawdata[i...i+2] != '<?'
|
721
|
-
error("unexpected call to parse_pi()")
|
722
|
-
end
|
723
|
-
ni,match = index_match(rawdata,Piclose,i+2)
|
724
|
-
return nil unless match
|
725
|
-
j = ni
|
726
|
-
handle_pi(rawdata[i+2...j])
|
727
|
-
j = (j + match[0].length)
|
728
|
-
return j-i
|
729
|
-
end
|
730
|
-
|
731
|
-
def parse_comment(i)
|
732
|
-
rawdata = @rawdata
|
733
|
-
if rawdata[i...i+4] != "<!--"
|
734
|
-
error("unexpected call to parse_comment()")
|
735
|
-
end
|
736
|
-
ni,match = index_match(rawdata, Commentclose,i)
|
737
|
-
return nil unless match
|
738
|
-
handle_comment(rawdata[i+4..(ni-1)])
|
739
|
-
return ni+match[0].length # Length from i to just past the closing comment tag
|
740
|
-
end
|
741
|
-
|
742
|
-
|
743
|
-
def parse_starttag(i)
|
744
|
-
@_starttag_text = nil
|
745
|
-
start_pos = i
|
746
|
-
rawdata = @rawdata
|
747
|
-
ni,match = index_match(rawdata,Shorttagopen,i)
|
748
|
-
if ni == i
|
749
|
-
# SGML shorthand: <tag/data/ == <tag>data</tag>
|
750
|
-
# XXX Can data contain &... (entity or char refs)?
|
751
|
-
# XXX Can data contain < or > (tag characters)?
|
752
|
-
# XXX Can there be whitespace before the first /?
|
753
|
-
k,match = index_match(rawdata,Shorttag,i)
|
754
|
-
return nil unless match
|
755
|
-
tag, data = match[1], match[2]
|
756
|
-
@_starttag_text = "<#{tag}/"
|
757
|
-
tag.downcase!
|
758
|
-
second_end = rawdata.index(Shorttagopen,k)
|
759
|
-
finish_shorttag(tag, data)
|
760
|
-
@_starttag_text = rawdata[start_pos...second_end+1]
|
761
|
-
return k
|
762
|
-
end
|
763
|
-
|
764
|
-
j = rawdata.index(Endbracket, i+1)
|
765
|
-
return nil unless j
|
766
|
-
attrsd = []
|
767
|
-
if rawdata[i...i+2] == '<>'
|
768
|
-
# SGML shorthand: <> == <last open tag seen>
|
769
|
-
k = j
|
770
|
-
tag = @lasttag
|
771
|
-
else
|
772
|
-
ni,match = index_match(rawdata,Tagfind,i+1)
|
773
|
-
unless match
|
774
|
-
error('unexpected call to parse_starttag')
|
775
|
-
end
|
776
|
-
k = ni+match[0].length+1
|
777
|
-
tag = match[0].downcase
|
778
|
-
@lasttag = tag
|
779
|
-
end
|
780
|
-
|
781
|
-
while k < j
|
782
|
-
break if rawdata.index(Endtagfind, k) == k
|
783
|
-
ni,match = index_match(rawdata,Attrfind,k)
|
784
|
-
break unless ni
|
785
|
-
matched_length = match[0].length
|
786
|
-
attrname, rest, attrvalue = match[1],match[2],match[3]
|
787
|
-
if rest.nil? or rest.empty?
|
788
|
-
attrvalue = '' # was: = attrname # Why the change?
|
789
|
-
elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
|
790
|
-
attrvalue = attrvalue[1...-1]
|
791
|
-
end
|
792
|
-
attrsd << [attrname.downcase, attrvalue]
|
793
|
-
k += matched_length
|
794
|
-
end
|
795
|
-
if rawdata[j..j] == ">"
|
796
|
-
j += 1
|
797
|
-
end
|
798
|
-
@_starttag_text = rawdata[start_pos...j]
|
799
|
-
finish_starttag(tag, attrsd)
|
800
|
-
return j
|
801
|
-
end
|
802
|
-
|
803
|
-
def parse_endtag(i)
|
804
|
-
rawdata = @rawdata
|
805
|
-
j, match = index_match(rawdata, /[<>]/,i+1)
|
806
|
-
return nil unless j
|
807
|
-
tag = rawdata[i+2...j].strip.downcase
|
808
|
-
if rawdata[j..j] == ">"
|
809
|
-
j += 1
|
810
|
-
end
|
811
|
-
finish_endtag(tag)
|
812
|
-
return j
|
813
|
-
end
|
814
|
-
|
815
|
-
def output
|
816
|
-
# Return processed HTML as a single string
|
817
|
-
return @pieces.map{|p| p.to_s}.join
|
818
|
-
end
|
819
|
-
|
820
|
-
def error(message)
|
821
|
-
raise BetterSGMLParserError.new(message)
|
822
|
-
end
|
823
|
-
def handle_pi(text)
|
824
|
-
end
|
825
|
-
def handle_decl(text)
|
826
|
-
end
|
827
|
-
end
|
828
|
-
|
829
|
-
- The XML::SAX::AttributeList monkey patch (old lines 829-865) that adds Hash-like [], each, each_key, each_value, to_a and to_s accessors for the read-only attribute lists used by the strict parser (better_attributelist.rb).
- The Hpricot scrub support (old lines 866-1052): the whitelists of acceptable elements, attributes, CSS properties and keywords, the MathML and SVG element/attribute lists, and the strip/strip_attributes/strip_style helpers on Elements, Text, Comment, BogusETag and Elem (scrub.rb).
- The FeedParser module preamble (old lines 1054-1142): Version, the BSD license text, Author/Copyright_Holder/Contributors, USER_AGENT, ACCEPT_HEADER, the untranslated TIDY_MARKUP and PREFERRED_TIDY_INTERFACES placeholders, the ThingsNobodyCaresAboutButMe/CharacterEncodingOverride/CharacterEncodingUnknown/NonXMLContentType/UndeclaredNamespace exception classes and the SUPPORTED_VERSIONS map.
- FeedParserDict (old lines 1143-1230): the Hash subclass whose @@keymap, [], []= and method_missing let RSS- and Atom-style key names be used interchangeably (feedparserdict.rb).
module FeedParserMixin
|
1236
|
-
attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
|
1237
|
-
|
1238
|
-
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
|
1239
|
-
$stderr << "initializing FeedParser\n" if $debug
|
1240
|
-
|
1241
|
-
@namespaces = {'' => '',
|
1242
|
-
'http://backend.userland.com/rss' => '',
|
1243
|
-
'http://blogs.law.harvard.edu/tech/rss' => '',
|
1244
|
-
'http://purl.org/rss/1.0/' => '',
|
1245
|
-
'http://my.netscape.com/rdf/simple/0.9/' => '',
|
1246
|
-
'http://example.com/newformat#' => '',
|
1247
|
-
'http://example.com/necho' => '',
|
1248
|
-
'http://purl.org/echo/' => '',
|
1249
|
-
'uri/of/echo/namespace#' => '',
|
1250
|
-
'http://purl.org/pie/' => '',
|
1251
|
-
'http://purl.org/atom/ns#' => '',
|
1252
|
-
'http://www.w3.org/2005/Atom' => '',
|
1253
|
-
'http://purl.org/rss/1.0/modules/rss091#' => '',
|
1254
|
-
'http://webns.net/mvcb/' => 'admin',
|
1255
|
-
'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
|
1256
|
-
'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
|
1257
|
-
'http://media.tangent.org/rss/1.0/' => 'audio',
|
1258
|
-
'http://backend.userland.com/blogChannelModule' => 'blogChannel',
|
1259
|
-
'http://web.resource.org/cc/' => 'cc',
|
1260
|
-
'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
|
1261
|
-
'http://purl.org/rss/1.0/modules/company' => 'co',
|
1262
|
-
'http://purl.org/rss/1.0/modules/content/' => 'content',
|
1263
|
-
'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
|
1264
|
-
'http://purl.org/dc/elements/1.1/' => 'dc',
|
1265
|
-
'http://purl.org/dc/terms/' => 'dcterms',
|
1266
|
-
'http://purl.org/rss/1.0/modules/email/' => 'email',
|
1267
|
-
'http://purl.org/rss/1.0/modules/event/' => 'ev',
|
1268
|
-
'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
|
1269
|
-
'http://freshmeat.net/rss/fm/' => 'fm',
|
1270
|
-
'http://xmlns.com/foaf/0.1/' => 'foaf',
|
1271
|
-
'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
|
1272
|
-
'http://postneo.com/icbm/' => 'icbm',
|
1273
|
-
'http://purl.org/rss/1.0/modules/image/' => 'image',
|
1274
|
-
'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
|
1275
|
-
'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
|
1276
|
-
'http://purl.org/rss/1.0/modules/link/' => 'l',
|
1277
|
-
'http://search.yahoo.com/mrss' => 'media',
|
1278
|
-
'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
|
1279
|
-
'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
|
1280
|
-
'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
|
1281
|
-
'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
|
1282
|
-
'http://purl.org/rss/1.0/modules/reference/' => 'ref',
|
1283
|
-
'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
|
1284
|
-
'http://purl.org/rss/1.0/modules/search/' => 'search',
|
1285
|
-
'http://purl.org/rss/1.0/modules/slash/' => 'slash',
|
1286
|
-
'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
|
1287
|
-
'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
|
1288
|
-
'http://hacks.benhammersley.com/rss/streaming/' => 'str',
|
1289
|
-
'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
|
1290
|
-
'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
|
1291
|
-
'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
|
1292
|
-
'http://purl.org/rss/1.0/modules/threading/' => 'thr',
|
1293
|
-
'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
|
1294
|
-
'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
|
1295
|
-
'http://wellformedweb.org/commentAPI/' => 'wfw',
|
1296
|
-
'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
|
1297
|
-
'http://www.w3.org/1999/xhtml' => 'xhtml',
|
1298
|
-
'http://www.w3.org/XML/1998/namespace' => 'xml',
|
1299
|
-
'http://www.w3.org/1999/xlink' => 'xlink',
|
1300
|
-
'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
|
1301
|
-
}
|
1302
|
-
@matchnamespaces = {}
|
1303
|
-
@namespaces.each do |l|
|
1304
|
-
@matchnamespaces[l[0].downcase] = l[1]
|
1305
|
-
end
|
1306
|
-
@can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
|
1307
|
-
@can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
|
1308
|
-
@can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
|
1309
|
-
@html_types = ['text/html', 'application/xhtml+xml']
|
1310
|
-
@feeddata = FeedParserDict.new # feed-level data
|
1311
|
-
@encoding = encoding # character encoding
|
1312
|
-
@entries = [] # list of entry-level data
|
1313
|
-
@version = '' # feed type/version see SUPPORTED_VERSIOSN
|
1314
|
-
@namespacesInUse = {} # hash of namespaces defined by the feed
|
1315
|
-
|
1316
|
-
# the following are used internall to track state;
|
1317
|
-
# this is really out of control and should be refactored
|
1318
|
-
@infeed = false
|
1319
|
-
@inentry = false
|
1320
|
-
@incontent = 0 # Yes, this needs to be zero until I work out popContent and pushContent
|
1321
|
-
@intextinput = false
|
1322
|
-
@inimage = false
|
1323
|
-
@inauthor = false
|
1324
|
-
@incontributor = false
|
1325
|
-
@inpublisher = false
|
1326
|
-
@insource = false
|
1327
|
-
@sourcedata = FeedParserDict.new
|
1328
|
-
@contentparams = FeedParserDict.new
|
1329
|
-
@summaryKey = nil
|
1330
|
-
@namespacemap = {}
|
1331
|
-
@elementstack = []
|
1332
|
-
@basestack = []
|
1333
|
-
@langstack = []
|
1334
|
-
@baseuri = baseuri || ''
|
1335
|
-
@lang = baselang || nil
|
1336
|
-
if baselang
|
1337
|
-
@feeddata['language'] = baselang.gsub('_','-')
|
1338
|
-
end
|
1339
|
-
@date_handlers = [:_parse_date_rfc822,
|
1340
|
-
:_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
|
1341
|
-
:_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
|
1342
|
-
]
|
1343
|
-
$stderr << "Leaving startup\n" if $debug # My addition
|
1344
|
-
end
|
1345
|
-
|
1346
|
-
def unknown_starttag(tag, attrsd)
|
1347
|
-
$stderr << "start #{tag} with #{attrsd}\n" if $debug
|
1348
|
-
# normalize attrs
|
1349
|
-
attrsD = {}
|
1350
|
-
attrsd = Hash[*attrsd.flatten] if attrsd.class == Array # Magic! Asterisk!
|
1351
|
-
# LooseFeedParser needs the above because SGMLParser sends attrs as a
|
1352
|
-
# list of lists (like [['type','text/html'],['mode','escaped']])
|
1353
|
-
|
1354
|
-
attrsd.each do |old_k,value|
|
1355
|
-
# There has to be a better, non-ugly way of doing this
|
1356
|
-
k = old_k.downcase # Downcase all keys
|
1357
|
-
attrsD[k] = value
|
1358
|
-
if ['rel','type'].include?value
|
1359
|
-
attrsD[k].downcase! # Downcase the value if the key is 'rel' or 'type'
|
1360
|
-
end
|
1361
|
-
end
|
1362
|
-
|
1363
|
-
# track xml:base and xml:lang
|
1364
|
-
baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
|
1365
|
-
@baseuri = urljoin(@baseuri, baseuri)
|
1366
|
-
lang = attrsD['xml:lang'] || attrsD['lang']
|
1367
|
-
if lang == '' # FIXME This next bit of code is right? Wtf?
|
1368
|
-
# xml:lang could be explicitly set to '', we need to capture that
|
1369
|
-
lang = nil
|
1370
|
-
elsif lang.nil?
|
1371
|
-
# if no xml:lang is specified, use parent lang
|
1372
|
-
lang = @lang
|
1373
|
-
end
|
1374
|
-
if lang and not lang.empty? # Seriously, this cannot be correct
|
1375
|
-
if ['feed', 'rss', 'rdf:RDF'].include?tag
|
1376
|
-
@feeddata['language'] = lang.gsub('_','-')
|
1377
|
-
end
|
1378
|
-
end
|
1379
|
-
@lang = lang
|
1380
|
-
@basestack << @baseuri
|
1381
|
-
@langstack << lang
|
1382
|
-
|
1383
|
-
# track namespaces
|
1384
|
-
attrsd.each do |prefix, uri|
|
1385
|
-
-       if /^xmlns:/ =~ prefix # prefix begins with xmlns:
-         trackNamespace(prefix[6..-1], uri)
-       elsif prefix == 'xmlns':
-         trackNamespace(nil, uri)
-       end
-     end
-
-     # track inline content
-     if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
-       # element declared itself as escaped markup, but isn't really
-
-       @contentparams['type'] = 'application/xhtml+xml'
-     end
-     if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
-       # Note: probably shouldn't simply recreate localname here, but
-       # our namespace handling isn't actually 100% correct in cases where
-       # the feed redefines the default namespace (which is actually
-       # the usual case for inline content, thanks Sam), so here we
-       # cheat and just reconstruct the element based on localname
-       # because that compensates for the bugs in our namespace handling.
-       # This will horribly munge inline content with non-empty qnames,
-       # but nobody actually does that, so I'm not fixing it.
-       tag = tag.split(':')[-1]
-       attrsA = attrsd.to_a.collect{|l| "#{l[0]}=\"#{l[1]}\""}
-       attrsS = ' '+attrsA.join(' ')
-       return handle_data("<#{tag}#{attrsS}>", escape=false)
-     end
-
-     # match namespaces
-     if /:/ =~ tag
-       prefix, suffix = tag.split(':', 2)
-     else
-       prefix, suffix = '', tag
-     end
-     prefix = @namespacemap[prefix] || prefix
-     if prefix and not prefix.empty?
-       prefix = prefix + '_'
-     end
-
-     # special hack for better tracking of empty textinput/image elements in illformed feeds
-     if (not prefix and not prefix.empty?) and not (['title', 'link', 'description','name'].include?tag)
-       @intextinput = false
-     end
-     if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?tag)
-       @inimage = false
-     end
-
-     # call special handler (if defined) or default handler
-     begin
-       return send('_start_'+prefix+suffix, attrsD)
-     rescue NoMethodError
-       return push(prefix + suffix, true)
-     end
-   end # End unknown_starttag
-
-   def unknown_endtag(tag)
-     $stderr << "end #{tag}\n" if $debug
-     # match namespaces
-     if tag.index(':')
-       prefix, suffix = tag.split(':',2)
-     else
-       prefix, suffix = '', tag
-     end
-     prefix = @namespacemap[prefix] || prefix
-     if prefix and not prefix.empty?
-       prefix = prefix + '_'
-     end
-
-     # call special handler (if defined) or default handler
-     begin
-       send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
-     rescue NoMethodError => details
-       pop(prefix + suffix)
-     end
-
-     # track inline content
-     if @incontent != 0 and @contentparams.has_key?'type' and /xml$/ =~ (@contentparams['type'] || 'xml')
-       # element declared itself as escaped markup, but it isn't really
-       @contentparams['type'] = 'application/xhtml+xml'
-     end
-     if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
-       tag = tag.split(':')[-1]
-       handle_data("</#{tag}>", escape=false)
-     end
-
-     # track xml:base and xml:lang going out of scope
-     if @basestack and not @basestack.empty?
-       @basestack.pop
-       if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
-         @baseuri = @basestack[-1]
-       end
-     end
-     if @langstack and not @langstack.empty?
-       @langstack.pop
-       if @langstack and not @langstack.empty? # and @langstack[-1] and not @langstack.empty?
-         @lang = @langstack[-1]
-       end
-     end
-   end
-
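Both removed handlers rely on the same dynamic-dispatch trick: the namespace prefix and tag are glued into a method name (_start_... or _end_...), send is tried, and a NoMethodError falls through to the generic push/pop path. A minimal, self-contained sketch of that pattern (illustrative only; the class and handler names below are made up and are not part of the gem or of this diff):

    class TinyDispatcher
      # Try a tag-specific handler first; fall back to a generic one.
      def start_tag(tag, attrs = {})
        send("_start_#{tag}", attrs)
      rescue NoMethodError
        push(tag)
      end

      def _start_title(attrs)
        puts "specific handler for <title>"
      end

      def push(tag)
        puts "no handler for <#{tag}>, using the default path"
      end
    end

    TinyDispatcher.new.start_tag('title')   # specific handler for <title>
    TinyDispatcher.new.start_tag('cloud')   # no handler for <cloud>, using the default path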
def handle_charref(ref)
|
1486
|
-
# LooseParserOnly
|
1487
|
-
# called for each character reference, e.g. for ' ', ref will be '160'
|
1488
|
-
$stderr << "entering handle_charref with #{ref}\n" if $debug
|
1489
|
-
return if @elementstack.nil? or @elementstack.empty?
|
1490
|
-
ref.downcase!
|
1491
|
-
chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
|
1492
|
-
if chars.include?ref
|
1493
|
-
text = "&##{ref};"
|
1494
|
-
else
|
1495
|
-
if ref[0..0] == 'x'
|
1496
|
-
c = (ref[1..-1]).to_i(16)
|
1497
|
-
else
|
1498
|
-
c = ref.to_i
|
1499
|
-
end
|
1500
|
-
text = uconvert(unichr(c),'unicode')
|
1501
|
-
end
|
1502
|
-
@elementstack[-1][2] << text
|
1503
|
-
end
|
1504
|
-
|
1505
|
-
def handle_entityref(ref)
|
1506
|
-
# LooseParserOnly
|
1507
|
-
# called for each entity reference, e.g. for '©', ref will be 'copy'
|
1508
|
-
|
1509
|
-
return if @elementstack.nil? or @elementstack.empty?
|
1510
|
-
$stderr << "entering handle_entityref with #{ref}\n" if $debug
|
1511
|
-
ents = ['lt', 'gt', 'quot', 'amp', 'apos']
|
1512
|
-
if ents.include?ref
|
1513
|
-
text = "&#{ref};"
|
1514
|
-
else
|
1515
|
-
text = HTMLEntities::decode_entities("&#{ref};")
|
1516
|
-
end
|
1517
|
-
@elementstack[-1][2] << text
|
1518
|
-
end
|
1519
|
-
|
1520
|
-
def handle_data(text, escape=true)
|
1521
|
-
# called for each block of plain text, i.e. outside of any tag and
|
1522
|
-
# not containing any character or entity references
|
1523
|
-
return if @elementstack.nil? or @elementstack.empty?
|
1524
|
-
if escape and @contentparams['type'] == 'application/xhtml+xml'
|
1525
|
-
text = text.to_xs
|
1526
|
-
end
|
1527
|
-
@elementstack[-1][2] << text
|
1528
|
-
end
|
1529
|
-
|
1530
|
-
def handle_comment(comment)
|
1531
|
-
# called for each comment, e.g. <!-- insert message here -->
|
1532
|
-
end
|
1533
|
-
|
1534
|
-
def handle_pi(text)
|
1535
|
-
end
|
1536
|
-
|
1537
|
-
def handle_decl(text)
|
1538
|
-
end
|
1539
|
-
|
1540
|
-
def parse_declaration(i)
|
1541
|
-
# for LooseFeedParser
|
1542
|
-
$stderr << "entering parse_declaration\n" if $debug
|
1543
|
-
if @rawdata[i...i+9] == '<![CDATA['
|
1544
|
-
k = @rawdata.index(/\]\]>/u,i+9)
|
1545
|
-
k = @rawdata.length unless k
|
1546
|
-
handle_data(@rawdata[i+9...k].to_xs,false)
|
1547
|
-
return k+3
|
1548
|
-
else
|
1549
|
-
k = @rawdata.index(/>/,i).to_i
|
1550
|
-
return k+1
|
1551
|
-
end
|
1552
|
-
end
|
1553
|
-
|
1554
|
-
def mapContentType(contentType)
|
1555
|
-
contentType.downcase!
|
1556
|
-
case contentType
|
1557
|
-
when 'text'
|
1558
|
-
contentType = 'text/plain'
|
1559
|
-
when 'html'
|
1560
|
-
contentType = 'text/html'
|
1561
|
-
when 'xhtml'
|
1562
|
-
contentType = 'application/xhtml+xml'
|
1563
|
-
end
|
1564
|
-
return contentType
|
1565
|
-
end
|
1566
|
-
|
1567
|
-
def trackNamespace(prefix, uri)
|
1568
|
-
|
1569
|
-
loweruri = uri.downcase.strip
|
1570
|
-
if [prefix, loweruri] == [nil, 'http://my.netscape.com/rdf/simple/0.9/'] and (@version.nil? or @version.empty?)
|
1571
|
-
@version = 'rss090'
|
1572
|
-
elsif loweruri == 'http://purl.org/rss/1.0/' and (@version.nil? or @version.empty?)
|
1573
|
-
@version = 'rss10'
|
1574
|
-
elsif loweruri == 'http://www.w3.org/2005/atom' and (@version.nil? or @version.empty?)
|
1575
|
-
@version = 'atom10'
|
1576
|
-
elsif /backend\.userland\.com\/rss/ =~ loweruri
|
1577
|
-
# match any backend.userland.com namespace
|
1578
|
-
uri = 'http://backend.userland.com/rss'
|
1579
|
-
loweruri = uri
|
1580
|
-
end
|
1581
|
-
if @matchnamespaces.has_key? loweruri
|
1582
|
-
@namespacemap[prefix] = @matchnamespaces[loweruri]
|
1583
|
-
@namespacesInUse[@matchnamespaces[loweruri]] = uri
|
1584
|
-
else
|
1585
|
-
@namespacesInUse[prefix || ''] = uri
|
1586
|
-
end
|
1587
|
-
end
|
1588
|
-
|
1589
|
-
def resolveURI(uri)
|
1590
|
-
return urljoin(@baseuri || '', uri)
|
1591
|
-
end
|
1592
|
-
|
1593
|
-
def decodeEntities(element, data)
|
1594
|
-
return data
|
1595
|
-
end
|
1596
|
-
|
1597
|
-
def push(element, expectingText)
|
1598
|
-
@elementstack << [element, expectingText, []]
|
1599
|
-
end
|
1600
|
-
|
1601
|
-
def pop(element, stripWhitespace=true)
|
1602
|
-
return if @elementstack.nil? or @elementstack.empty?
|
1603
|
-
return if @elementstack[-1][0] != element
|
1604
|
-
element, expectingText, pieces = @elementstack.pop
|
1605
|
-
if pieces.class == Array
|
1606
|
-
output = pieces.join('')
|
1607
|
-
else
|
1608
|
-
output = pieces
|
1609
|
-
end
|
1610
|
-
if stripWhitespace
|
1611
|
-
output.strip!
|
1612
|
-
end
|
1613
|
-
return output if not expectingText
|
1614
|
-
|
1615
|
-
# decode base64 content
|
1616
|
-
if @contentparams['base64']
|
1617
|
-
out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
|
1618
|
-
if not output.empty? and not out64.empty?
|
1619
|
-
output = out64
|
1620
|
-
end
|
1621
|
-
end
|
1622
|
-
|
1623
|
-
# resolve relative URIs
|
1624
|
-
if @can_be_relative_uri.include?element and output and not output.empty?
|
1625
|
-
output = resolveURI(output)
|
1626
|
-
end
|
1627
|
-
|
1628
|
-
# decode entities within embedded markup
|
1629
|
-
if not @contentparams['base64']
|
1630
|
-
output = decodeEntities(element, output)
|
1631
|
-
end
|
1632
|
-
|
1633
|
-
# remove temporary cruft from contentparams
|
1634
|
-
@contentparams.delete('mode')
|
1635
|
-
@contentparams.delete('base64')
|
1636
|
-
|
1637
|
-
# resolve relative URIs within embedded markup
|
1638
|
-
if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
|
1639
|
-
if @can_contain_relative_uris.include?element
|
1640
|
-
output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
|
1641
|
-
end
|
1642
|
-
end
|
1643
|
-
# sanitize embedded markup
|
1644
|
-
if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
|
1645
|
-
if @can_contain_dangerous_markup.include?element
|
1646
|
-
output = FeedParser.sanitizeHTML(output, @encoding)
|
1647
|
-
end
|
1648
|
-
end
|
1649
|
-
|
1650
|
-
if @encoding and not @encoding.empty? and @encoding != 'utf-8'
|
1651
|
-
output = uconvert(output, @encoding, 'utf-8')
|
1652
|
-
# FIXME I turn everything into utf-8, not unicode, originally because REXML was being used but now beause I haven't tested it out yet.
|
1653
|
-
end
|
1654
|
-
|
1655
|
-
# categories/tags/keywords/whatever are handled in _end_category
|
1656
|
-
return output if element == 'category'
|
1657
|
-
|
1658
|
-
# store output in appropriate place(s)
|
1659
|
-
if @inentry and not @insource
|
1660
|
-
if element == 'content'
|
1661
|
-
@entries[-1][element] ||= []
|
1662
|
-
contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
|
1663
|
-
contentparams['value'] = output
|
1664
|
-
@entries[-1][element] << contentparams
|
1665
|
-
elsif element == 'link'
|
1666
|
-
@entries[-1][element] = output
|
1667
|
-
if output and not output.empty?
|
1668
|
-
@entries[-1]['links'][-1]['href'] = output
|
1669
|
-
end
|
1670
|
-
else
|
1671
|
-
element = 'summary' if element == 'description'
|
1672
|
-
@entries[-1][element] = output
|
1673
|
-
if @incontent != 0
|
1674
|
-
contentparams = Marshal.load(Marshal.dump(@contentparams))
|
1675
|
-
contentparams['value'] = output
|
1676
|
-
@entries[-1][element + '_detail'] = contentparams
|
1677
|
-
end
|
1678
|
-
end
|
1679
|
-
elsif (@infeed or @insource) and not @intextinput and not @inimage
|
1680
|
-
context = getContext()
|
1681
|
-
element = 'subtitle' if element == 'description'
|
1682
|
-
context[element] = output
|
1683
|
-
if element == 'link'
|
1684
|
-
context['links'][-1]['href'] = output
|
1685
|
-
elsif @incontent != 0
|
1686
|
-
contentparams = Marshal.load(Marshal.dump(@contentparams))
|
1687
|
-
contentparams['value'] = output
|
1688
|
-
context[element + '_detail'] = contentparams
|
1689
|
-
end
|
1690
|
-
end
|
1691
|
-
return output
|
1692
|
-
end
|
1693
|
-
|
1694
|
-
def pushContent(tag, attrsD, defaultContentType, expectingText)
|
1695
|
-
@incontent += 1 # Yes, I hate this.
|
1696
|
-
type = mapContentType(attrsD['type'] || defaultContentType)
|
1697
|
-
@contentparams = FeedParserDict.new({'type' => type,'language' => @lang,'base' => @baseuri})
|
1698
|
-
@contentparams['base64'] = isBase64(attrsD, @contentparams)
|
1699
|
-
push(tag, expectingText)
|
1700
|
-
end
|
1701
|
-
|
1702
|
-
def popContent(tag)
|
1703
|
-
value = pop(tag)
|
1704
|
-
@incontent -= 1
|
1705
|
-
@contentparams.clear
|
1706
|
-
return value
|
1707
|
-
end
|
1708
|
-
|
1709
|
-
def mapToStandardPrefix(name)
|
1710
|
-
colonpos = name.index(':')
|
1711
|
-
if colonpos
|
1712
|
-
prefix = name[0..colonpos-1]
|
1713
|
-
suffix = name[colonpos+1..-1]
|
1714
|
-
prefix = @namespacemap[prefix] || prefix
|
1715
|
-
name = prefix + ':' + suffix
|
1716
|
-
end
|
1717
|
-
return name
|
1718
|
-
end
|
1719
|
-
|
1720
|
-
def getAttribute(attrsD, name)
|
1721
|
-
return attrsD[mapToStandardPrefix(name)]
|
1722
|
-
end
|
1723
|
-
|
1724
|
-
def isBase64(attrsD, contentparams)
|
1725
|
-
return true if (attrsD['mode'] == 'base64')
|
1726
|
-
if /(^text\/)|(\+xml$)|(\/xml$)/ =~ contentparams['type']
|
1727
|
-
return false
|
1728
|
-
end
|
1729
|
-
return true
|
1730
|
-
end
|
1731
|
-
|
1732
|
-
def itsAnHrefDamnIt(attrsD)
|
1733
|
-
href= attrsD['url'] || attrsD['uri'] || attrsD['href']
|
1734
|
-
if href
|
1735
|
-
attrsD.delete('url')
|
1736
|
-
attrsD.delete('uri')
|
1737
|
-
attrsD['href'] = href
|
1738
|
-
end
|
1739
|
-
return attrsD
|
1740
|
-
end
|
1741
|
-
|
1742
|
-
|
1743
|
-
def _save(key, value)
|
1744
|
-
context = getContext()
|
1745
|
-
context[key] ||= value
|
1746
|
-
end
|
1747
|
-
|
1748
|
-
def _start_rss(attrsD)
|
1749
|
-
versionmap = {'0.91' => 'rss091u',
|
1750
|
-
'0.92' => 'rss092',
|
1751
|
-
'0.93' => 'rss093',
|
1752
|
-
'0.94' => 'rss094'
|
1753
|
-
}
|
1754
|
-
|
1755
|
-
if not @version or @version.empty?
|
1756
|
-
attr_version = attrsD['version'] || ''
|
1757
|
-
version = versionmap[attr_version]
|
1758
|
-
if version and not version.empty?
|
1759
|
-
@version = version
|
1760
|
-
elsif /^2\./ =~ attr_version
|
1761
|
-
@version = 'rss20'
|
1762
|
-
else
|
1763
|
-
@version = 'rss'
|
1764
|
-
end
|
1765
|
-
end
|
1766
|
-
end
|
1767
|
-
|
1768
|
-
def _start_dlhottitles(attrsD)
|
1769
|
-
@version = 'hotrss'
|
1770
|
-
end
|
1771
|
-
|
1772
|
-
def _start_channel(attrsD)
|
1773
|
-
@infeed = true
|
1774
|
-
_cdf_common(attrsD)
|
1775
|
-
end
|
1776
|
-
alias :_start_feedinfo :_start_channel
|
1777
|
-
|
1778
|
-
def _cdf_common(attrsD)
|
1779
|
-
if attrsD.has_key?'lastmod'
|
1780
|
-
_start_modified({})
|
1781
|
-
@elementstack[-1][-1] = attrsD['lastmod']
|
1782
|
-
_end_modified
|
1783
|
-
end
|
1784
|
-
if attrsD.has_key?'href'
|
1785
|
-
_start_link({})
|
1786
|
-
@elementstack[-1][-1] = attrsD['href']
|
1787
|
-
_end_link
|
1788
|
-
end
|
1789
|
-
end
|
1790
|
-
|
1791
|
-
def _start_feed(attrsD)
|
1792
|
-
@infeed = true
|
1793
|
-
versionmap = {'0.1' => 'atom01',
|
1794
|
-
'0.2' => 'atom02',
|
1795
|
-
'0.3' => 'atom03'
|
1796
|
-
}
|
1797
|
-
|
1798
|
-
if not @version or @version.empty?
|
1799
|
-
attr_version = attrsD['version']
|
1800
|
-
version = versionmap[attr_version]
|
1801
|
-
if @version and not @version.empty?
|
1802
|
-
@version = version
|
1803
|
-
else
|
1804
|
-
@version = 'atom'
|
1805
|
-
end
|
1806
|
-
end
|
1807
|
-
end
|
1808
|
-
|
1809
|
-
def _end_channel
|
1810
|
-
@infeed = false
|
1811
|
-
end
|
1812
|
-
alias :_end_feed :_end_channel
|
1813
|
-
|
1814
|
-
def _start_image(attrsD)
|
1815
|
-
@inimage = true
|
1816
|
-
push('image', false)
|
1817
|
-
context = getContext()
|
1818
|
-
context['image'] ||= FeedParserDict.new
|
1819
|
-
end
|
1820
|
-
|
1821
|
-
def _end_image
|
1822
|
-
pop('image')
|
1823
|
-
@inimage = false
|
1824
|
-
end
|
1825
|
-
|
1826
|
-
def _start_textinput(attrsD)
|
1827
|
-
@intextinput = true
|
1828
|
-
push('textinput', false)
|
1829
|
-
context = getContext()
|
1830
|
-
context['textinput'] ||= FeedParserDict.new
|
1831
|
-
end
|
1832
|
-
alias :_start_textInput :_start_textinput
|
1833
|
-
|
1834
|
-
def _end_textinput
|
1835
|
-
pop('textinput')
|
1836
|
-
@intextinput = false
|
1837
|
-
end
|
1838
|
-
alias :_end_textInput :_end_textinput
|
1839
|
-
|
1840
|
-
def _start_author(attrsD)
|
1841
|
-
@inauthor = true
|
1842
|
-
push('author', true)
|
1843
|
-
end
|
1844
|
-
alias :_start_managingeditor :_start_author
|
1845
|
-
alias :_start_dc_author :_start_author
|
1846
|
-
alias :_start_dc_creator :_start_author
|
1847
|
-
alias :_start_itunes_author :_start_author
|
1848
|
-
|
1849
|
-
def _end_author
|
1850
|
-
pop('author')
|
1851
|
-
@inauthor = false
|
1852
|
-
_sync_author_detail()
|
1853
|
-
end
|
1854
|
-
alias :_end_managingeditor :_end_author
|
1855
|
-
alias :_end_dc_author :_end_author
|
1856
|
-
alias :_end_dc_creator :_end_author
|
1857
|
-
alias :_end_itunes_author :_end_author
|
1858
|
-
|
1859
|
-
def _start_itunes_owner(attrsD)
|
1860
|
-
@inpublisher = true
|
1861
|
-
push('publisher', false)
|
1862
|
-
end
|
1863
|
-
|
1864
|
-
def _end_itunes_owner
|
1865
|
-
pop('publisher')
|
1866
|
-
@inpublisher = false
|
1867
|
-
_sync_author_detail('publisher')
|
1868
|
-
end
|
1869
|
-
|
1870
|
-
def _start_contributor(attrsD)
|
1871
|
-
@incontributor = true
|
1872
|
-
context = getContext()
|
1873
|
-
context['contributors'] ||= []
|
1874
|
-
context['contributors'] << FeedParserDict.new
|
1875
|
-
push('contributor', false)
|
1876
|
-
end
|
1877
|
-
|
1878
|
-
def _end_contributor
|
1879
|
-
pop('contributor')
|
1880
|
-
@incontributor = false
|
1881
|
-
end
|
1882
|
-
|
1883
|
-
def _start_dc_contributor(attrsD)
|
1884
|
-
@incontributor = true
|
1885
|
-
context = getContext()
|
1886
|
-
context['contributors'] ||= []
|
1887
|
-
context['contributors'] << FeedParserDict.new
|
1888
|
-
push('name', false)
|
1889
|
-
end
|
1890
|
-
|
1891
|
-
def _end_dc_contributor
|
1892
|
-
_end_name
|
1893
|
-
@incontributor = false
|
1894
|
-
end
|
1895
|
-
|
1896
|
-
def _start_name(attrsD)
|
1897
|
-
push('name', false)
|
1898
|
-
end
|
1899
|
-
alias :_start_itunes_name :_start_name
|
1900
|
-
|
1901
|
-
def _end_name
|
1902
|
-
value = pop('name')
|
1903
|
-
if @inpublisher
|
1904
|
-
_save_author('name', value, 'publisher')
|
1905
|
-
elsif @inauthor
|
1906
|
-
_save_author('name', value)
|
1907
|
-
elsif @incontributor
|
1908
|
-
_save_contributor('name', value)
|
1909
|
-
elsif @intextinput
|
1910
|
-
context = getContext()
|
1911
|
-
context['textinput']['name'] = value
|
1912
|
-
end
|
1913
|
-
end
|
1914
|
-
alias :_end_itunes_name :_end_name
|
1915
|
-
|
1916
|
-
def _start_width(attrsD)
|
1917
|
-
push('width', false)
|
1918
|
-
end
|
1919
|
-
|
1920
|
-
def _end_width
|
1921
|
-
value = pop('width').to_i
|
1922
|
-
if @inimage
|
1923
|
-
context = getContext
|
1924
|
-
context['image']['width'] = value
|
1925
|
-
end
|
1926
|
-
end
|
1927
|
-
|
1928
|
-
def _start_height(attrsD)
|
1929
|
-
push('height', false)
|
1930
|
-
end
|
1931
|
-
|
1932
|
-
def _end_height
|
1933
|
-
value = pop('height').to_i
|
1934
|
-
if @inimage
|
1935
|
-
context = getContext()
|
1936
|
-
context['image']['height'] = value
|
1937
|
-
end
|
1938
|
-
end
|
1939
|
-
|
1940
|
-
def _start_url(attrsD)
|
1941
|
-
push('href', true)
|
1942
|
-
end
|
1943
|
-
alias :_start_homepage :_start_url
|
1944
|
-
alias :_start_uri :_start_url
|
1945
|
-
|
1946
|
-
def _end_url
|
1947
|
-
value = pop('href')
|
1948
|
-
if @inauthor
|
1949
|
-
_save_author('href', value)
|
1950
|
-
elsif @incontributor
|
1951
|
-
_save_contributor('href', value)
|
1952
|
-
elsif @inimage
|
1953
|
-
context = getContext()
|
1954
|
-
context['image']['href'] = value
|
1955
|
-
elsif @intextinput
|
1956
|
-
context = getContext()
|
1957
|
-
context['textinput']['link'] = value
|
1958
|
-
end
|
1959
|
-
end
|
1960
|
-
alias :_end_homepage :_end_url
|
1961
|
-
alias :_end_uri :_end_url
|
1962
|
-
|
1963
|
-
def _start_email(attrsD)
|
1964
|
-
push('email', false)
|
1965
|
-
end
|
1966
|
-
alias :_start_itunes_email :_start_email
|
1967
|
-
|
1968
|
-
def _end_email
|
1969
|
-
value = pop('email')
|
1970
|
-
if @inpublisher
|
1971
|
-
_save_author('email', value, 'publisher')
|
1972
|
-
elsif @inauthor
|
1973
|
-
_save_author('email', value)
|
1974
|
-
elsif @incontributor
|
1975
|
-
_save_contributor('email', value)
|
1976
|
-
end
|
1977
|
-
end
|
1978
|
-
alias :_end_itunes_email :_end_email
|
1979
|
-
|
1980
|
-
def getContext
|
1981
|
-
if @insource
|
1982
|
-
context = @sourcedata
|
1983
|
-
elsif @inentry
|
1984
|
-
context = @entries[-1]
|
1985
|
-
else
|
1986
|
-
context = @feeddata
|
1987
|
-
end
|
1988
|
-
return context
|
1989
|
-
end
|
1990
|
-
|
1991
|
-
def _save_author(key, value, prefix='author')
|
1992
|
-
context = getContext()
|
1993
|
-
context[prefix + '_detail'] ||= FeedParserDict.new
|
1994
|
-
context[prefix + '_detail'][key] = value
|
1995
|
-
_sync_author_detail()
|
1996
|
-
end
|
1997
|
-
|
1998
|
-
def _save_contributor(key, value)
|
1999
|
-
context = getContext
|
2000
|
-
context['contributors'] ||= [FeedParserDict.new]
|
2001
|
-
context['contributors'][-1][key] = value
|
2002
|
-
end
|
2003
|
-
|
2004
|
-
def _sync_author_detail(key='author')
|
2005
|
-
context = getContext()
|
2006
|
-
detail = context["#{key}_detail"]
|
2007
|
-
if detail and not detail.empty?
|
2008
|
-
name = detail['name']
|
2009
|
-
email = detail['email']
|
2010
|
-
|
2011
|
-
if name and email and not (name.empty? or name.empty?)
|
2012
|
-
context[key] = "#{name} (#{email})"
|
2013
|
-
elsif name and not name.empty?
|
2014
|
-
context[key] = name
|
2015
|
-
elsif email and not email.empty?
|
2016
|
-
context[key] = email
|
2017
|
-
end
|
2018
|
-
else
|
2019
|
-
author = context[key].dup unless context[key].nil?
|
2020
|
-
return if not author or author.empty?
|
2021
|
-
emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
|
2022
|
-
email = emailmatch[1]
|
2023
|
-
author.gsub!(email, '')
|
2024
|
-
author.gsub!("\(\)", '')
|
2025
|
-
author.strip!
|
2026
|
-
author.gsub!(/^\(/,'')
|
2027
|
-
author.gsub!(/\)$/,'')
|
2028
|
-
author.strip!
|
2029
|
-
context["#{key}_detail"] ||= FeedParserDict.new
|
2030
|
-
context["#{key}_detail"]['name'] = author
|
2031
|
-
context["#{key}_detail"]['email'] = email
|
2032
|
-
end
|
2033
|
-
end
|
2034
|
-
|
2035
|
-
def _start_subtitle(attrsD)
|
2036
|
-
pushContent('subtitle', attrsD, 'text/plain', true)
|
2037
|
-
end
|
2038
|
-
alias :_start_tagline :_start_subtitle
|
2039
|
-
alias :_start_itunes_subtitle :_start_subtitle
|
2040
|
-
|
2041
|
-
def _end_subtitle
|
2042
|
-
popContent('subtitle')
|
2043
|
-
end
|
2044
|
-
alias :_end_tagline :_end_subtitle
|
2045
|
-
alias :_end_itunes_subtitle :_end_subtitle
|
2046
|
-
|
2047
|
-
def _start_rights(attrsD)
|
2048
|
-
pushContent('rights', attrsD, 'text/plain', true)
|
2049
|
-
end
|
2050
|
-
alias :_start_dc_rights :_start_rights
|
2051
|
-
alias :_start_copyright :_start_rights
|
2052
|
-
|
2053
|
-
def _end_rights
|
2054
|
-
popContent('rights')
|
2055
|
-
end
|
2056
|
-
alias :_end_dc_rights :_end_rights
|
2057
|
-
alias :_end_copyright :_end_rights
|
2058
|
-
|
2059
|
-
def _start_item(attrsD)
|
2060
|
-
@entries << FeedParserDict.new
|
2061
|
-
push('item', false)
|
2062
|
-
@inentry = true
|
2063
|
-
@guidislink = false
|
2064
|
-
id = getAttribute(attrsD, 'rdf:about')
|
2065
|
-
if id and not id.empty?
|
2066
|
-
context = getContext()
|
2067
|
-
context['id'] = id
|
2068
|
-
end
|
2069
|
-
_cdf_common(attrsD)
|
2070
|
-
end
|
2071
|
-
alias :_start_entry :_start_item
|
2072
|
-
alias :_start_product :_start_item
|
2073
|
-
|
2074
|
-
def _end_item
|
2075
|
-
pop('item')
|
2076
|
-
@inentry = false
|
2077
|
-
end
|
2078
|
-
alias :_end_entry :_end_item
|
2079
|
-
|
2080
|
-
def _start_dc_language(attrsD)
|
2081
|
-
push('language', true)
|
2082
|
-
end
|
2083
|
-
alias :_start_language :_start_dc_language
|
2084
|
-
|
2085
|
-
def _end_dc_language
|
2086
|
-
@lang = pop('language')
|
2087
|
-
end
|
2088
|
-
alias :_end_language :_end_dc_language
|
2089
|
-
|
2090
|
-
def _start_dc_publisher(attrsD)
|
2091
|
-
push('publisher', true)
|
2092
|
-
end
|
2093
|
-
alias :_start_webmaster :_start_dc_publisher
|
2094
|
-
|
2095
|
-
def _end_dc_publisher
|
2096
|
-
pop('publisher')
|
2097
|
-
_sync_author_detail('publisher')
|
2098
|
-
end
|
2099
|
-
alias :_end_webmaster :_end_dc_publisher
|
2100
|
-
|
2101
|
-
def _start_published(attrsD)
|
2102
|
-
push('published', true)
|
2103
|
-
end
|
2104
|
-
alias :_start_dcterms_issued :_start_published
|
2105
|
-
alias :_start_issued :_start_published
|
2106
|
-
|
2107
|
-
def _end_published
|
2108
|
-
value = pop('published')
|
2109
|
-
_save('published_parsed', parse_date(value))
|
2110
|
-
end
|
2111
|
-
alias :_end_dcterms_issued :_end_published
|
2112
|
-
alias :_end_issued :_end_published
|
2113
|
-
|
2114
|
-
def _start_updated(attrsD)
|
2115
|
-
push('updated', true)
|
2116
|
-
end
|
2117
|
-
alias :_start_modified :_start_updated
|
2118
|
-
alias :_start_dcterms_modified :_start_updated
|
2119
|
-
alias :_start_pubdate :_start_updated
|
2120
|
-
alias :_start_dc_date :_start_updated
|
2121
|
-
|
2122
|
-
def _end_updated
|
2123
|
-
value = pop('updated')
|
2124
|
-
_save('updated_parsed', parse_date(value))
|
2125
|
-
end
|
2126
|
-
alias :_end_modified :_end_updated
|
2127
|
-
alias :_end_dcterms_modified :_end_updated
|
2128
|
-
alias :_end_pubdate :_end_updated
|
2129
|
-
alias :_end_dc_date :_end_updated
|
2130
|
-
|
2131
|
-
def _start_created(attrsD)
|
2132
|
-
push('created', true)
|
2133
|
-
end
|
2134
|
-
alias :_start_dcterms_created :_start_created
|
2135
|
-
|
2136
|
-
def _end_created
|
2137
|
-
value = pop('created')
|
2138
|
-
_save('created_parsed', parse_date(value))
|
2139
|
-
end
|
2140
|
-
alias :_end_dcterms_created :_end_created
|
2141
|
-
|
2142
|
-
def _start_expirationdate(attrsD)
|
2143
|
-
push('expired', true)
|
2144
|
-
end
|
2145
|
-
def _end_expirationdate
|
2146
|
-
_save('expired_parsed', parse_date(pop('expired')))
|
2147
|
-
end
|
2148
|
-
|
2149
|
-
def _start_cc_license(attrsD)
|
2150
|
-
push('license', true)
|
2151
|
-
value = getAttribute(attrsD, 'rdf:resource')
|
2152
|
-
if value and not value.empty?
|
2153
|
-
elementstack[-1][2] << value
|
2154
|
-
pop('license')
|
2155
|
-
end
|
2156
|
-
end
|
2157
|
-
|
2158
|
-
def _start_creativecommons_license(attrsD)
|
2159
|
-
push('license', true)
|
2160
|
-
end
|
2161
|
-
|
2162
|
-
def _end_creativecommons_license
|
2163
|
-
pop('license')
|
2164
|
-
end
|
2165
|
-
|
2166
|
-
def addTag(term, scheme, label)
|
2167
|
-
context = getContext()
|
2168
|
-
context['tags'] ||= []
|
2169
|
-
tags = context['tags']
|
2170
|
-
if (term.nil? or term.empty?) and (scheme.nil? or scheme.empty?) and (label.nil? or label.empty?)
|
2171
|
-
return
|
2172
|
-
end
|
2173
|
-
value = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
|
2174
|
-
if not tags.include?value
|
2175
|
-
context['tags'] << FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
|
2176
|
-
end
|
2177
|
-
end
|
2178
|
-
|
2179
|
-
def _start_category(attrsD)
|
2180
|
-
$stderr << "entering _start_category with #{attrsD}\n" if $debug
|
2181
|
-
|
2182
|
-
term = attrsD['term']
|
2183
|
-
scheme = attrsD['scheme'] || attrsD['domain']
|
2184
|
-
label = attrsD['label']
|
2185
|
-
addTag(term, scheme, label)
|
2186
|
-
push('category', true)
|
2187
|
-
end
|
2188
|
-
alias :_start_dc_subject :_start_category
|
2189
|
-
alias :_start_keywords :_start_category
|
2190
|
-
|
2191
|
-
def _end_itunes_keywords
|
2192
|
-
pop('itunes_keywords').split.each do |term|
|
2193
|
-
addTag(term, 'http://www.itunes.com/', nil)
|
2194
|
-
end
|
2195
|
-
end
|
2196
|
-
|
2197
|
-
def _start_itunes_category(attrsD)
|
2198
|
-
addTag(attrsD['text'], 'http://www.itunes.com/', nil)
|
2199
|
-
push('category', true)
|
2200
|
-
end
|
2201
|
-
|
2202
|
-
def _end_category
|
2203
|
-
value = pop('category')
|
2204
|
-
return if value.nil? or value.empty?
|
2205
|
-
context = getContext()
|
2206
|
-
tags = context['tags']
|
2207
|
-
if value and not value.empty? and not tags.empty? and not tags[-1]['term']:
|
2208
|
-
tags[-1]['term'] = value
|
2209
|
-
else
|
2210
|
-
addTag(value, nil, nil)
|
2211
|
-
end
|
2212
|
-
end
|
2213
|
-
alias :_end_dc_subject :_end_category
|
2214
|
-
alias :_end_keywords :_end_category
|
2215
|
-
alias :_end_itunes_category :_end_category
|
2216
|
-
|
2217
|
-
def _start_cloud(attrsD)
|
2218
|
-
getContext()['cloud'] = FeedParserDict.new(attrsD)
|
2219
|
-
end
|
2220
|
-
|
2221
|
-
def _start_link(attrsD)
|
2222
|
-
attrsD['rel'] ||= 'alternate'
|
2223
|
-
attrsD['type'] ||= 'text/html'
|
2224
|
-
attrsD = itsAnHrefDamnIt(attrsD)
|
2225
|
-
if attrsD.has_key? 'href'
|
2226
|
-
attrsD['href'] = resolveURI(attrsD['href'])
|
2227
|
-
end
|
2228
|
-
expectingText = @infeed || @inentry || @insource
|
2229
|
-
context = getContext()
|
2230
|
-
context['links'] ||= []
|
2231
|
-
context['links'] << FeedParserDict.new(attrsD)
|
2232
|
-
if attrsD['rel'] == 'enclosure'
|
2233
|
-
_start_enclosure(attrsD)
|
2234
|
-
end
|
2235
|
-
if attrsD.has_key? 'href'
|
2236
|
-
expectingText = false
|
2237
|
-
if (attrsD['rel'] == 'alternate') and @html_types.include?mapContentType(attrsD['type'])
|
2238
|
-
context['link'] = attrsD['href']
|
2239
|
-
end
|
2240
|
-
else
|
2241
|
-
push('link', expectingText)
|
2242
|
-
end
|
2243
|
-
end
|
2244
|
-
alias :_start_producturl :_start_link
|
2245
|
-
|
2246
|
-
def _end_link
|
2247
|
-
value = pop('link')
|
2248
|
-
context = getContext()
|
2249
|
-
if @intextinput
|
2250
|
-
context['textinput']['link'] = value
|
2251
|
-
end
|
2252
|
-
if @inimage
|
2253
|
-
context['image']['link'] = value
|
2254
|
-
end
|
2255
|
-
end
|
2256
|
-
alias :_end_producturl :_end_link
|
2257
|
-
|
2258
|
-
def _start_guid(attrsD)
|
2259
|
-
@guidislink = ((attrsD['ispermalink'] || 'true') == 'true')
|
2260
|
-
push('id', true)
|
2261
|
-
end
|
2262
|
-
|
2263
|
-
def _end_guid
|
2264
|
-
value = pop('id')
|
2265
|
-
_save('guidislink', (@guidislink and not getContext().has_key?('link')))
|
2266
|
-
if @guidislink:
|
2267
|
-
# guid acts as link, but only if 'ispermalink' is not present or is 'true',
|
2268
|
-
# and only if the item doesn't already have a link element
|
2269
|
-
_save('link', value)
|
2270
|
-
end
|
2271
|
-
end
|
2272
|
-
|
2273
|
-
|
2274
|
-
def _start_title(attrsD)
|
2275
|
-
pushContent('title', attrsD, 'text/plain', @infeed || @inentry || @insource)
|
2276
|
-
end
|
2277
|
-
alias :_start_dc_title :_start_title
|
2278
|
-
alias :_start_media_title :_start_title
|
2279
|
-
|
2280
|
-
def _end_title
|
2281
|
-
value = popContent('title')
|
2282
|
-
context = getContext()
|
2283
|
-
if @intextinput
|
2284
|
-
context['textinput']['title'] = value
|
2285
|
-
elsif @inimage
|
2286
|
-
context['image']['title'] = value
|
2287
|
-
end
|
2288
|
-
end
|
2289
|
-
alias :_end_dc_title :_end_title
|
2290
|
-
alias :_end_media_title :_end_title
|
2291
|
-
|
2292
|
-
def _start_description(attrsD)
|
2293
|
-
context = getContext()
|
2294
|
-
if context.has_key?('summary')
|
2295
|
-
@summaryKey = 'content'
|
2296
|
-
_start_content(attrsD)
|
2297
|
-
else
|
2298
|
-
pushContent('description', attrsD, 'text/html', @infeed || @inentry || @insource)
|
2299
|
-
end
|
2300
|
-
end
|
2301
|
-
|
2302
|
-
def _start_abstract(attrsD)
|
2303
|
-
pushContent('description', attrsD, 'text/plain', @infeed || @inentry || @insource)
|
2304
|
-
end
|
2305
|
-
|
2306
|
-
def _end_description
|
2307
|
-
if @summaryKey == 'content'
|
2308
|
-
_end_content()
|
2309
|
-
else
|
2310
|
-
value = popContent('description')
|
2311
|
-
context = getContext()
|
2312
|
-
if @intextinput
|
2313
|
-
context['textinput']['description'] = value
|
2314
|
-
elsif @inimage:
|
2315
|
-
context['image']['description'] = value
|
2316
|
-
end
|
2317
|
-
end
|
2318
|
-
@summaryKey = nil
|
2319
|
-
end
|
2320
|
-
alias :_end_abstract :_end_description
|
2321
|
-
|
2322
|
-
def _start_info(attrsD)
|
2323
|
-
pushContent('info', attrsD, 'text/plain', true)
|
2324
|
-
end
|
2325
|
-
alias :_start_feedburner_browserfriendly :_start_info
|
2326
|
-
|
2327
|
-
def _end_info
|
2328
|
-
popContent('info')
|
2329
|
-
end
|
2330
|
-
alias :_end_feedburner_browserfriendly :_end_info
|
2331
|
-
|
2332
|
-
def _start_generator(attrsD)
|
2333
|
-
if attrsD and not attrsD.empty?
|
2334
|
-
attrsD = itsAnHrefDamnIt(attrsD)
|
2335
|
-
if attrsD.has_key?('href')
|
2336
|
-
attrsD['href'] = resolveURI(attrsD['href'])
|
2337
|
-
end
|
2338
|
-
end
|
2339
|
-
getContext()['generator_detail'] = FeedParserDict.new(attrsD)
|
2340
|
-
push('generator', true)
|
2341
|
-
end
|
2342
|
-
|
2343
|
-
def _end_generator
|
2344
|
-
value = pop('generator')
|
2345
|
-
context = getContext()
|
2346
|
-
if context.has_key?('generator_detail')
|
2347
|
-
context['generator_detail']['name'] = value
|
2348
|
-
end
|
2349
|
-
end
|
2350
|
-
|
2351
|
-
def _start_admin_generatoragent(attrsD)
|
2352
|
-
push('generator', true)
|
2353
|
-
value = getAttribute(attrsD, 'rdf:resource')
|
2354
|
-
if value and not value.empty?
|
2355
|
-
elementstack[-1][2] << value
|
2356
|
-
end
|
2357
|
-
pop('generator')
|
2358
|
-
getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
|
2359
|
-
end
|
2360
|
-
|
2361
|
-
def _start_admin_errorreportsto(attrsD)
|
2362
|
-
push('errorreportsto', true)
|
2363
|
-
value = getAttribute(attrsD, 'rdf:resource')
|
2364
|
-
if value and not value.empty?
|
2365
|
-
@elementstack[-1][2] << value
|
2366
|
-
end
|
2367
|
-
pop('errorreportsto')
|
2368
|
-
end
|
2369
|
-
|
2370
|
-
def _start_summary(attrsD)
|
2371
|
-
context = getContext()
|
2372
|
-
if context.has_key?'summary'
|
2373
|
-
@summaryKey = 'content'
|
2374
|
-
_start_content(attrsD)
|
2375
|
-
else
|
2376
|
-
@summaryKey = 'summary'
|
2377
|
-
pushContent(@summaryKey, attrsD, 'text/plain', true)
|
2378
|
-
end
|
2379
|
-
end
|
2380
|
-
alias :_start_itunes_summary :_start_summary
|
2381
|
-
|
2382
|
-
def _end_summary
|
2383
|
-
if @summaryKey == 'content':
|
2384
|
-
_end_content()
|
2385
|
-
else
|
2386
|
-
popContent(@summaryKey || 'summary')
|
2387
|
-
end
|
2388
|
-
@summaryKey = nil
|
2389
|
-
end
|
2390
|
-
alias :_end_itunes_summary :_end_summary
|
2391
|
-
|
2392
|
-
def _start_enclosure(attrsD)
|
2393
|
-
attrsD = itsAnHrefDamnIt(attrsD)
|
2394
|
-
getContext()['enclosures'] ||= []
|
2395
|
-
getContext()['enclosures'] << FeedParserDict.new(attrsD)
|
2396
|
-
href = attrsD['href']
|
2397
|
-
if href and not href.empty?
|
2398
|
-
context = getContext()
|
2399
|
-
if not context['id']
|
2400
|
-
context['id'] = href
|
2401
|
-
end
|
2402
|
-
end
|
2403
|
-
end
|
2404
|
-
|
2405
|
-
def _start_source(attrsD)
|
2406
|
-
@insource = true
|
2407
|
-
end
|
2408
|
-
|
2409
|
-
def _end_source
|
2410
|
-
@insource = false
|
2411
|
-
getContext()['source'] = Marshal.load(Marshal.dump(@sourcedata))
|
2412
|
-
@sourcedata.clear()
|
2413
|
-
end
|
2414
|
-
|
2415
|
-
def _start_content(attrsD)
|
2416
|
-
pushContent('content', attrsD, 'text/plain', true)
|
2417
|
-
src = attrsD['src']
|
2418
|
-
if src and not src.empty?:
|
2419
|
-
@contentparams['src'] = src
|
2420
|
-
end
|
2421
|
-
push('content', true)
|
2422
|
-
end
|
2423
|
-
|
2424
|
-
def _start_prodlink(attrsD)
|
2425
|
-
pushContent('content', attrsD, 'text/html', true)
|
2426
|
-
end
|
2427
|
-
|
2428
|
-
def _start_body(attrsD)
|
2429
|
-
pushContent('content', attrsD, 'application/xhtml+xml', true)
|
2430
|
-
end
|
2431
|
-
alias :_start_xhtml_body :_start_body
|
2432
|
-
|
2433
|
-
def _start_content_encoded(attrsD)
|
2434
|
-
pushContent('content', attrsD, 'text/html', true)
|
2435
|
-
end
|
2436
|
-
alias :_start_fullitem :_start_content_encoded
|
2437
|
-
|
2438
|
-
def _end_content
|
2439
|
-
copyToDescription = (['text/plain'] + @html_types).include? mapContentType(@contentparams['type'])
|
2440
|
-
value = popContent('content')
|
2441
|
-
if copyToDescription
|
2442
|
-
_save('description', value)
|
2443
|
-
end
|
2444
|
-
alias :_end_body :_end_content
|
2445
|
-
alias :_end_xhtml_body :_end_content
|
2446
|
-
alias :_end_content_encoded :_end_content
|
2447
|
-
alias :_end_fullitem :_end_content
|
2448
|
-
alias :_end_prodlink :_end_content
|
2449
|
-
end
|
2450
|
-
|
2451
|
-
def _start_itunes_image(attrsD)
|
2452
|
-
push('itunes_image', false)
|
2453
|
-
getContext()['image'] = FeedParserDict.new({'href' => attrsD['href']})
|
2454
|
-
end
|
2455
|
-
alias :_start_itunes_link :_start_itunes_image
|
2456
|
-
|
2457
|
-
def _end_itunes_block
|
2458
|
-
value = pop('itunes_block', false)
|
2459
|
-
getContext()['itunes_block'] = (value == 'yes') and true or false
|
2460
|
-
end
|
2461
|
-
|
2462
|
-
def _end_itunes_explicit
|
2463
|
-
value = pop('itunes_explicit', false)
|
2464
|
-
getContext()['itunes_explicit'] = (value == 'yes') and true or false
|
2465
|
-
end
|
2466
|
-
|
2467
|
-
|
2468
|
-
# ISO-8601 date parsing routines written by Fazal Majid.
|
2469
|
-
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
|
2470
|
-
# parser is beyond the scope of feedparser and the current Time.iso8601
|
2471
|
-
# method does not work.
|
2472
|
-
# A single regular expression cannot parse ISO 8601 date formats into groups
|
2473
|
-
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
|
2474
|
-
# 0301-04-01), so we use templates instead.
|
2475
|
-
# Please note the order in templates is significant because we need a
|
2476
|
-
# greedy match.
|
2477
|
-
def _parse_date_iso8601(dateString)
|
2478
|
-
# Parse a variety of ISO-8601-compatible formats like 20040105
|
2479
|
-
|
2480
|
-
# What I'm about to show you may be the ugliest code in all of
|
2481
|
-
# rfeedparser.
|
2482
|
-
# FIXME The century regexp maybe not work ('\d\d$' says "two numbers at
|
2483
|
-
# end of line" but we then attach more of a regexp.
|
2484
|
-
iso8601_regexps = [ '^(\d{4})-?([01]\d)-([0123]\d)',
|
2485
|
-
'^(\d{4})-([01]\d)',
|
2486
|
-
'^(\d{4})-?([0123]\d\d)',
|
2487
|
-
'^(\d\d)-?([01]\d)-?([0123]\d)',
|
2488
|
-
'^(\d\d)-?([0123]\d\d)',
|
2489
|
-
'^(\d{4})',
|
2490
|
-
'-(\d\d)-?([01]\d)',
|
2491
|
-
'-([0123]\d\d)',
|
2492
|
-
'-(\d\d)',
|
2493
|
-
'--([01]\d)-?([0123]\d)',
|
2494
|
-
'--([01]\d)',
|
2495
|
-
'---([0123]\d)',
|
2496
|
-
'(\d\d$)',
|
2497
|
-
''
|
2498
|
-
]
|
2499
|
-
iso8601_values = { '^(\d{4})-?([01]\d)-([0123]\d)' => ['year', 'month', 'day'],
|
2500
|
-
'^(\d{4})-([01]\d)' => ['year','month'],
|
2501
|
-
'^(\d{4})-?([0123]\d\d)' => ['year', 'ordinal'],
|
2502
|
-
'^(\d\d)-?([01]\d)-?([0123]\d)' => ['year','month','day'],
|
2503
|
-
'^(\d\d)-?([0123]\d\d)' => ['year','ordinal'],
|
2504
|
-
'^(\d{4})' => ['year'],
|
2505
|
-
'-(\d\d)-?([01]\d)' => ['year','month'],
|
2506
|
-
'-([0123]\d\d)' => ['ordinal'],
|
2507
|
-
'-(\d\d)' => ['year'],
|
2508
|
-
'--([01]\d)-?([0123]\d)' => ['month','day'],
|
2509
|
-
'--([01]\d)' => ['month'],
|
2510
|
-
'---([0123]\d)' => ['day'],
|
2511
|
-
'(\d\d$)' => ['century'],
|
2512
|
-
'' => []
|
2513
|
-
}
|
2514
|
-
add_to_all = '(T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
|
2515
|
-
add_to_all_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']
|
2516
|
-
# NOTE We use '(?:' to prevent grouping of optional matches (ones trailed
|
2517
|
-
# by '?'). The second ':' *are* matched.
|
2518
|
-
m = nil
|
2519
|
-
param_keys = []
|
2520
|
-
iso8601_regexps.each do |s|
|
2521
|
-
$stderr << "Trying iso8601 regexp: #{s+add_to_all}\n" if $debug
|
2522
|
-
param_keys = iso8601_values[s] + add_to_all_fields
|
2523
|
-
m = dateString.match(Regexp.new(s+add_to_all))
|
2524
|
-
break if m
|
2525
|
-
end
|
2526
|
-
return if m.nil? or (m.begin(0).zero? and m.end(0).zero?)
|
2527
|
-
|
2528
|
-
param_values = m.to_a
|
2529
|
-
param_values = param_values[1..-1]
|
2530
|
-
params = {}
|
2531
|
-
param_keys.each_with_index do |key,i|
|
2532
|
-
params[key] = param_values[i]
|
2533
|
-
end
|
2534
21
|
|
2535
|
-
|
2536
|
-
|
2537
|
-
|
2538
|
-
|
2539
|
-
|
2540
|
-
|
2541
|
-
year = 100 * (Time.now.utc.year / 100) + year.to_i
|
2542
|
-
else
|
2543
|
-
year = year.to_i
|
2544
|
-
end
|
2545
|
-
|
2546
|
-
month = params['month'] || '-'
|
2547
|
-
if month.nil? or month.empty? or month == '-'
|
2548
|
-
# ordinals are NOT normalized by mktime, we simulate them
|
2549
|
-
# by setting month=1, day=ordinal
|
2550
|
-
if ordinal
|
2551
|
-
month = DateTime.ordinal(year,ordinal).month
|
2552
|
-
else
|
2553
|
-
month = Time.now.utc.month
|
2554
|
-
end
|
2555
|
-
end
|
2556
|
-
month = month.to_i unless month.nil?
|
2557
|
-
day = params['day']
|
2558
|
-
if day.nil? or day.empty?
|
2559
|
-
# see above
|
2560
|
-
if ordinal
|
2561
|
-
day = DateTime.ordinal(year,ordinal).day
|
2562
|
-
elsif params['century'] or params['year'] or params['month']
|
2563
|
-
day = 1
|
2564
|
-
else
|
2565
|
-
day = Time.now.utc.day
|
2566
|
-
end
|
2567
|
-
else
|
2568
|
-
day = day.to_i
|
2569
|
-
end
|
2570
|
-
# special case of the century - is the first year of the 21st century
|
2571
|
-
# 2000 or 2001 ? The debate goes on...
|
2572
|
-
if params.has_key? 'century'
|
2573
|
-
year = (params['century'].to_i - 1) * 100 + 1
|
2574
|
-
end
|
2575
|
-
# in ISO 8601 most fields are optional
|
2576
|
-
hour = params['hour'].to_i
|
2577
|
-
minute = params['minute'].to_i
|
2578
|
-
second = params['second'].to_i
|
2579
|
-
weekday = nil
|
2580
|
-
# daylight savings is complex, but not needed for feedparser's purposes
|
2581
|
-
# as time zones, if specified, include mention of whether it is active
|
2582
|
-
# (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
|
2583
|
-
# and most implementations have DST bugs
|
2584
|
-
tm = [second, minute, hour, day, month, year, nil, ordinal, false, nil]
|
2585
|
-
tz = params['tz']
|
2586
|
-
if tz and not tz.empty? and tz != 'Z'
|
2587
|
-
# FIXME does this cross over days?
|
2588
|
-
if tz[0] == '-'
|
2589
|
-
tm[3] += params['tzhour'].to_i
|
2590
|
-
tm[4] += params['tzmin'].to_i
|
2591
|
-
elsif tz[0] == '+'
|
2592
|
-
tm[3] -= params['tzhour'].to_i
|
2593
|
-
tm[4] -= params['tzmin'].to_i
|
2594
|
-
else
|
2595
|
-
return nil
|
2596
|
-
end
|
2597
|
-
end
|
2598
|
-
return Time.utc(*tm) # Magic!
|
2599
|
-
|
2600
|
-
end
|
2601
|
-
|
2602
|
-
def _parse_date_onblog(dateString)
|
2603
|
-
# Parse a string according to the OnBlog 8-bit date format
|
2604
|
-
# 8-bit date handling routes written by ytrewq1
|
2605
|
-
korean_year = u("년") # b3e2 in euc-kr
|
2606
|
-
korean_month = u("월") # bff9 in euc-kr
|
2607
|
-
korean_day = u("일") # c0cf in euc-kr
|
2608
|
-
|
2609
|
-
|
2610
|
-
korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
|
2611
|
-
|
2612
|
-
|
2613
|
-
m = korean_onblog_date_re.match(dateString)
|
2614
|
-
return unless m
|
2615
|
-
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
|
2616
|
-
|
2617
|
-
$stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
|
2618
|
-
return _parse_date_w3dtf(w3dtfdate)
|
2619
|
-
end
|
2620
|
-
|
2621
|
-
def _parse_date_nate(dateString)
|
2622
|
-
# Parse a string according to the Nate 8-bit date format
|
2623
|
-
# 8-bit date handling routes written by ytrewq1
|
2624
|
-
korean_am = u("오전") # bfc0 c0fc in euc-kr
|
2625
|
-
korean_pm = u("오후") # bfc0 c8c4 in euc-kr
|
2626
|
-
|
2627
|
-
korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
|
2628
|
-
m = korean_nate_date_re.match(dateString)
|
2629
|
-
return unless m
|
2630
|
-
hour = m[5].to_i
|
2631
|
-
ampm = m[4]
|
2632
|
-
if ampm == korean_pm
|
2633
|
-
hour += 12
|
2634
|
-
end
|
2635
|
-
hour = hour.to_s.rjust(2,'0')
|
2636
|
-
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
|
2637
|
-
$stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
|
2638
|
-
return _parse_date_w3dtf(w3dtfdate)
|
2639
|
-
end
|
2640
|
-
|
2641
|
-
def _parse_date_mssql(dateString)
|
2642
|
-
mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
|
2643
|
-
|
2644
|
-
m = mssql_date_re.match(dateString)
|
2645
|
-
return unless m
|
2646
|
-
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
|
2647
|
-
$stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
|
2648
|
-
return _parse_date_w3dtf(w3dtfdate)
|
2649
|
-
end
|
2650
|
-
|
2651
|
-
def _parse_date_greek(dateString)
|
2652
|
-
# Parse a string according to a Greek 8-bit date format
|
2653
|
-
# Unicode strings for Greek date strings
|
2654
|
-
greek_months = {
|
2655
|
-
u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
|
2656
|
-
u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
|
2657
|
-
u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
|
2658
|
-
u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
|
2659
|
-
u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
|
2660
|
-
u("Μάι") => u("May"), # ccdce9 in iso-8859-7
|
2661
|
-
u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
|
2662
|
-
u("Μαι") => u("May"), # cce1e9 in iso-8859-7
|
2663
|
-
u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
|
2664
|
-
u("Ιον") => u("Jun"), # c9efed in iso-8859-7
|
2665
|
-
u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
|
2666
|
-
u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
|
2667
|
-
u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
|
2668
|
-
u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
|
2669
|
-
u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
|
2670
|
-
u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
|
2671
|
-
u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
|
2672
|
-
u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
|
2673
|
-
u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
|
2674
|
-
}
|
2675
|
-
|
2676
|
-
greek_wdays = {
|
2677
|
-
u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
|
2678
|
-
u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
|
2679
|
-
u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
|
2680
|
-
u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
|
2681
|
-
u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
|
2682
|
-
u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
|
2683
|
-
u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
|
2684
|
-
}
|
2685
|
-
|
2686
|
-
greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
|
2687
|
-
|
2688
|
-
m = greek_date_format.match(dateString)
|
2689
|
-
return unless m
|
2690
|
-
begin
|
2691
|
-
wday = greek_wdays[m[1]]
|
2692
|
-
month = greek_months[m[3]]
|
2693
|
-
rescue
|
2694
|
-
return nil
|
2695
|
-
end
|
2696
|
-
rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
|
2697
|
-
$stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
|
2698
|
-
return _parse_date_rfc822(rfc822date)
|
2699
|
-
end
|
2700
|
-
|
2701
|
-
def _parse_date_hungarian(dateString)
|
2702
|
-
# Parse a string according to a Hungarian 8-bit date format.
|
2703
|
-
hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
|
2704
|
-
m = hungarian_date_format_re.match(dateString)
|
2705
|
-
return unless m
|
2706
|
-
|
2707
|
-
# Unicode strings for Hungarian date strings
|
2708
|
-
hungarian_months = {
|
2709
|
-
u("január") => u("01"), # e1 in iso-8859-2
|
2710
|
-
u("februári") => u("02"), # e1 in iso-8859-2
|
2711
|
-
u("március") => u("03"), # e1 in iso-8859-2
|
2712
|
-
u("április") => u("04"), # e1 in iso-8859-2
|
2713
|
-
u("máujus") => u("05"), # e1 in iso-8859-2
|
2714
|
-
u("június") => u("06"), # fa in iso-8859-2
|
2715
|
-
u("július") => u("07"), # fa in iso-8859-2
|
2716
|
-
u("augusztus") => u("08"),
|
2717
|
-
u("szeptember") => u("09"),
|
2718
|
-
u("október") => u("10"), # f3 in iso-8859-2
|
2719
|
-
u("november") => u("11"),
|
2720
|
-
u("december") => u("12"),
|
2721
|
-
}
|
2722
|
-
begin
|
2723
|
-
month = hungarian_months[m[2]]
|
2724
|
-
day = m[3].rjust(2,'0')
|
2725
|
-
hour = m[4].rjust(2,'0')
|
2726
|
-
rescue
|
2727
|
-
return
|
2728
|
-
end
|
2729
|
-
|
2730
|
-
w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
|
2731
|
-
$stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
|
2732
|
-
return _parse_date_w3dtf(w3dtfdate)
|
2733
|
-
end
|
2734
|
-
|
2735
|
-
-   def rollover(num, modulus)
-     return num % modulus, num / modulus
-   end
-
-   def set_self(num, modulus)
-     r = num / modulus
-     if r == 0
-       return num
-     end
-     return r
-   end
-   # W3DTF-style date parsing
-   # FIXME shouldn't it be "W3CDTF"?
-   def _parse_date_w3dtf(dateString)
-     # Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias for xmlschema
-     # Whatever it is, it doesn't work. This has been fixed in Ruby 1.9 and
-     # in Ruby on Rails, but not really. They don't fix the 25 hour or 61 minute or 61 second rollover and fail in other ways.
-
-     m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)
-
-     w3 = m[1..3].map{|s| s=s.to_i; s += 1 if s == 0;s} # Map the year, month and day to integers and, if they were nil, set them to 1
-     w3 += m[4..6].map{|s| s.to_i} # Map the hour, minute and second to integers
-     w3 << m[-1] # Leave the timezone as a String
-
-     # FIXME this next bit needs some serious refactoring
-     # Rollover times. 0 minutes and 61 seconds -> 1 minute and 1 second
-     w3[5],r = rollover(w3[5], 60) # rollover seconds
-     w3[4] += r
-     w3[4],r = rollover(w3[4], 60) # rollover minutes
-     w3[3] += r
-     w3[3],r = rollover(w3[3], 24) # rollover hours
-
-     w3[2] = w3[2] + r
-     if w3[1] > 12
-       w3[1],r = rollover(w3[1],12)
-       w3[1] = 12 if w3[1] == 0
-       w3[0] += r
-     end
-
-     num_days = Time.days_in_month(w3[1], w3[0])
-     while w3[2] > num_days
-       w3[2] -= num_days
-       w3[1] += 1
-       if w3[1] > 12
-         w3[0] += 1
-         w3[1] = set_self(w3[1], 12)
-       end
-       num_days = Time.days_in_month(w3[1], w3[0])
-     end
-
-
-     unless w3[6].class != String
-       if /^-/ =~ w3[6] # Zone offset goes backwards
-         w3[6][0] = '+'
-       elsif /^\+/ =~ w3[6]
-         w3[6][0] = '-'
-       end
-     end
-     return Time.utc(w3[0], w3[1], w3[2] , w3[3], w3[4], w3[5])+Time.zone_offset(w3[6] || "UTC")
-   end
-
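The rollover normalization in the removed _parse_date_w3dtf is easier to follow with concrete numbers. A small illustrative check, assuming the same rollover definition as in the removed code (not part of the diff):

    def rollover(num, modulus)
      return num % modulus, num / modulus
    end

    secs, carry = rollover(61, 60)          # => [1, 1]  61 seconds become 1 second plus a carried minute
    mins, carry = rollover(59 + carry, 60)  # => [0, 1]  the carried minute overflows into the hour as well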
-   def _parse_date_rfc822(dateString)
-     # Parse an RFC822, RFC1123, RFC2822 or asctime-style date
-     # These first few lines are to fix up the stupid proprietary format from Disney
-     unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
-                           'CT' => 'CST', 'MT' => 'MST',
-                           'PT' => 'PST'
-                         }
-
-     mon = dateString.split[2]
-     if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
-       dateString.sub!(mon,mon[0..2])
-     end
-     if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
-       dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
-     end
-     # Okay, the Disney date format should be fixed up now.
-     rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
-     if rfc.to_a.length > 1 and rfc.to_a.include? nil
-       dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
-       hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
-       tz ||= "GMT"
-     end
-     asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
-     if asctime_match.to_a.length > 1
-       # Month-abbr dayofmonth hour:minute:second year
-       dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
-       day.to_s.rjust(2,'0')
-     end
-     if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
-       ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
-     else
-       ds = dateString
-     end
-     t = Time.rfc2822(ds).utc
-     return t
-   end
-
-   def _parse_date_perforce(aDateString) # FIXME not in 4.1?
-     # Parse a date in yyyy/mm/dd hh:mm:ss TTT format
-     # Note that there is a day of the week at the beginning
-     # Ex. Fri, 2006/09/15 08:19:53 EDT
-     return Time.parse(aDateString).utc
-   end
-
-   def extract_tuple(atime)
-     # NOTE leave the error handling to parse_date
-     t = [atime.year, atime.month, atime.mday, atime.hour,
-          atime.min, atime.sec, (atime.wday-1) % 7, atime.yday,
-          atime.isdst
-         ]
-     # yay for modulus! yaaaaaay! its 530 am and i should be sleeping! yaay!
-     t[0..-2].map!{|s| s.to_i}
-     t[-1] = t[-1] ? 1 : 0
-     return t
-   end
-
-   def parse_date(dateString)
-     @date_handlers.each do |handler|
-       begin
-         $stderr << "Trying date_handler #{handler}\n" if $debug
-         datething = extract_tuple(send(handler,dateString))
-         return datething
-       rescue Exception => e
-         $stderr << "#{handler} raised #{e}\n" if $debug
-       end
-     end
-     return nil
-   end
-
- end # End FeedParserMixin
-
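The removed parse_date dispatcher is a chain of responsibility: each handler in @date_handlers is tried in turn, and the first one that returns without raising wins. A condensed, standalone sketch of that pattern (the two handlers below are illustrative stand-ins, not the full list the mixin registers):

    require 'time'

    # Try each parser in order; the first that succeeds wins, otherwise nil.
    DATE_HANDLERS = [
      lambda { |s| Time.rfc2822(s).utc },    # RFC 822 / RFC 2822 style dates
      lambda { |s| Time.xmlschema(s).utc },  # W3CDTF / ISO 8601 style dates
    ]

    def parse_date(string)
      DATE_HANDLERS.each do |handler|
        begin
          return handler.call(string)
        rescue StandardError
          next  # this handler could not parse it; fall through to the next
        end
      end
      nil
    end

    parse_date("Sat, 07 Sep 2002 00:00:01 GMT")  # => 2002-09-07 00:00:01 UTC
    parse_date("2002-09-07T00:00:01Z")           # => 2002-09-07 00:00:01 UTC
    parse_date("not a date")                     # => nil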
- class StrictFeedParser < XML::SAX::HandlerBase # expat
-   include FeedParserMixin
-
-   attr_accessor :bozo, :entries, :feeddata, :exc
-   def initialize(baseuri, baselang, encoding)
-     $stderr << "trying StrictFeedParser\n" if $debug
-     startup(baseuri, baselang, encoding)
-     @bozo = false
-     @exc = nil
-     super()
-   end
-
-   def getPos
-     [@locator.getSystemId, @locator.getLineNumber]
-   end
-
-   def getAttrs(attrs)
-     ret = []
-     for i in 0..attrs.getLength
-       ret.push([attrs.getName(i), attrs.getValue(i)])
-     end
-     ret
-   end
-
-   def setDocumentLocator(loc)
-     @locator = loc
-   end
-
-   def startDoctypeDecl(name, pub_sys, long_name, uri)
-     #Nothing is done here. What could we do that is neat and useful?
-   end
-
-   def startNamespaceDecl(prefix, uri)
-     trackNamespace(prefix, uri)
-   end
-
-   def endNamespaceDecl(prefix)
-   end
-
-   def startElement(name, attrs)
-     name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
-     namespaceuri = ($2 || '').downcase
-     name = $3
-     if /backend\.userland\.com\/rss/ =~ namespaceuri
-       # match any backend.userland.com namespace
-       namespaceuri = 'http://backend.userland.com/rss'
-     end
-     prefix = @matchnamespaces[namespaceuri]
-     # No need to raise UndeclaredNamespace, Expat does that for us with
-     "unbound prefix (XMLParserError)"
-     if prefix and not prefix.empty?
-       name = prefix + ':' + name
-     end
-     name.downcase!
-     unknown_starttag(name, attrs)
-   end
-
-   def character(text, start, length)
-     #handle_data(CGI.unescapeHTML(text))
-     handle_data(text)
-   end
-   # expat provides "character" not "characters"!
-   alias :characters :character # Just in case.
|
22
|
+
gem 'character-encodings', ">=0.2.0"
|
23
|
+
gem 'htmltools', ">=1.10"
|
24
|
+
gem 'htmlentities', ">=4.0.0"
|
25
|
+
gem 'activesupport', ">=1.4.1"
|
26
|
+
gem 'rchardet', ">=1.0"
|
27
|
+
require 'xml/saxdriver' # calling expat through the xmlparser gem
|
2930
28
|
|
2931
|
-
|
2932
|
-
|
2933
|
-
end
|
29
|
+
require 'rchardet'
|
30
|
+
$chardet = true
|
2934
31
|
|
2935
|
-
|
2936
|
-
|
2937
|
-
|
2938
|
-
|
2939
|
-
|
2940
|
-
|
2941
|
-
end
|
2942
|
-
name.downcase!
|
2943
|
-
unknown_endtag(name)
|
2944
|
-
end
|
32
|
+
require 'encoding/character/utf-8'
|
33
|
+
require 'html/sgml-parser'
|
34
|
+
require 'htmlentities'
|
35
|
+
require 'active_support'
|
36
|
+
require 'open-uri'
|
37
|
+
include OpenURI
|
2945
38
|
|
2946
|
-
|
2947
|
-
|
2948
|
-
end
|
39
|
+
$debug = false
|
40
|
+
$compatible = true
|
2949
41
|
|
2950
|
-
|
2951
|
-
|
42
|
+
$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
|
43
|
+
require 'rfeedparser/forgiving_uri'
|
44
|
+
require 'rfeedparser/aliases'
|
45
|
+
require 'rfeedparser/encoding_helpers'
|
46
|
+
require 'rfeedparser/better_sgmlparser'
|
47
|
+
require 'rfeedparser/better_attributelist'
|
48
|
+
require 'rfeedparser/scrub'
|
49
|
+
require 'rfeedparser/time_helpers'
|
50
|
+
require 'rfeedparser/feedparserdict'
|
51
|
+
require 'rfeedparser/parser_mixin'
|
52
|
+
require 'rfeedparser/parsers'
|
53
|
+
require 'rfeedparser/markup_helpers'
|
2952 54 |
2953 | -
2954 | - end
2955 | - def error(exc)
2956 | - @bozo = true
2957 | - @exc = exc
2958 | - end
55 | + include FeedParserUtilities
2959 56 |
2960 | - def fatalError(exc)
2961 | - error(exc)
2962 | - raise exc
2963 | - end
2964 | - end
2965 57 |
2966 | -
2967 | -
2968 | - # We write the methods that were in BaseHTMLProcessor in the python code
2969 | - # in here directly. We do this because if we inherited from
2970 | - # BaseHTMLProcessor but then included from FeedParserMixin, the methods
2971 | - # of Mixin would overwrite the methods we inherited from
2972 | - # BaseHTMLProcessor. This is exactly the opposite of what we want to
2973 | - # happen!
58 | + module FeedParser
59 | + Version = "0.9.9"
2974 60 |
2975 | -
61 | + License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
2976 62 |
2977 | -
2978 | -
2979 | - New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
2980 | - alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.
2981 | - def feed
2982 | - @feeddata
2983 | - end
2984 | - def feed=(data)
2985 | - @feeddata = data
2986 | - end
63 | + Redistribution and use in source and binary forms, with or without modification,
64 | + are permitted provided that the following conditions are met:
2987 65 |
2988 | -
2989 | -
2990 | -
2991 | -
66 | + * Redistributions of source code must retain the above copyright notice,
67 | + this list of conditions and the following disclaimer.
68 | + * Redistributions in binary form must reproduce the above copyright notice,
69 | + this list of conditions and the following disclaimer in the documentation
70 | + and/or other materials provided with the distribution.
2992 71 |
2993 | -
2994 | -
2995 | -
2996 | -
72 | + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
73 | + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
74 | + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
75 | + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
76 | + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
77 | + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
78 | + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
79 | + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
80 | + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
81 | + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82 | + POSSIBILITY OF SUCH DAMAGE."""
2997 83 |
2998 | -
2999 | -
3000 | -
3001 | -
3002 | -
3003 | -
3004 | -
3005 | -
3006 | -
3007 | -
84 | + Author = "Jeff Hodges <http://somethingsimilar.com>"
85 | + Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
86 | + Contributors = [ "Jason Diamond <http://injektilo.org/>",
87 | + "John Beimler <http://john.beimler.org/>",
88 | + "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89 | + "Aaron Swartz <http://aaronsw.com/>",
90 | + "Kevin Marks <http://epeus.blogspot.com/>"
91 | + ]
92 | + # HTTP "User-Agent" header to send to servers when downloading feeds.
93 | + # If you are embedding feedparser in a larger application, you should
94 | + # change this to your application name and URL.
95 | + USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
3008 96 |
3009 | -
3010 | -
3011 | -
3012 | - data = uconvert(data,'utf-8',@encoding)
3013 | - end
3014 | - sgml_feed(data) # see the alias above
3015 | - end
97 | + # HTTP "Accept" header to send to servers when downloading feeds. If you don't
98 | + # want to send an Accept header, set this to None.
99 | + ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
3016 100 |
3017 101 |
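
New lines 92-95 define the default User-Agent and advise embedders to replace it, and new line 99 sets the Accept header sent with feed requests. A hedged sketch of supplying your own agent string per call, using the :agent option that the command-line driver at the end of this file passes through to FeedParser.parse (the agent string and URL are placeholders):

  require 'rfeedparser'

  result = FeedParser.parse('http://example.com/feed.xml',
                            :agent => 'MyAggregator/1.0 +http://example.com/')
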
3018 | -
3019 | -
3020 | -
3021 | -
3022 | - data.gsub!('>', '>')
3023 | - data.gsub!('&', '&')
3024 | - data.gsub!('&', '&')
3025 | - data.gsub!('"', '"')
3026 | - data.gsub!('"', '"')
3027 | - data.gsub!(''', ''')
3028 | - data.gsub!(''', ''')
3029 | - if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
3030 | - data.gsub!('<', '<')
3031 | - data.gsub!('>', '>')
3032 | - data.gsub!('&', '&')
3033 | - data.gsub!('"', '"')
3034 | - data.gsub!(''', "'")
3035 | - end
3036 | - return data
3037 | - end
3038 | - end
102 | + # If you want feedparser to automatically run HTML markup through HTML Tidy, set
103 | + # this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
104 | + # or utidylib <http://utidylib.berlios.de/>.
105 | + #TIDY_MARKUP = false #FIXME untranslated
3039 106 |
3040 | -
3041 | -
3042 | -
3043 | - ['applet','codebase'],
3044 | - ['area','href'],
3045 | - ['blockquote','cite'],
3046 | - ['body','background'],
3047 | - ['del','cite'],
3048 | - ['form','action'],
3049 | - ['frame','longdesc'],
3050 | - ['frame','src'],
3051 | - ['iframe','longdesc'],
3052 | - ['iframe','src'],
3053 | - ['head','profile'],
3054 | - ['img','longdesc'],
3055 | - ['img','src'],
3056 | - ['img','usemap'],
3057 | - ['input','src'],
3058 | - ['input','usemap'],
3059 | - ['ins','cite'],
3060 | - ['link','href'],
3061 | - ['object','classid'],
3062 | - ['object','codebase'],
3063 | - ['object','data'],
3064 | - ['object','usemap'],
3065 | - ['q','cite'],
3066 | - ['script','src'],
3067 | - ]
3068 | - h = Hpricot(htmlSource)
3069 | - relative_uris.each do |l|
3070 | - ename, eattr = l
3071 | - h.search(ename).each do |elem|
3072 | - euri = elem.attributes[eattr]
3073 | - if euri and not euri.empty? and URI.parse(euri).relative?
3074 | - elem.attributes[eattr] = urljoin(baseURI, euri)
3075 | - end
3076 | - end
3077 | - end
3078 | - return h.to_html
3079 | - end
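
The block removed at old lines 3043-3079 (its job now belongs to rfeedparser/markup_helpers.rb) walks a fixed table of element/attribute pairs and rewrites any relative URI against the feed's base URI. A hedged sketch of the same idea for a single attribute; resolve_img_srcs is a hypothetical helper and URI.join stands in for the library's urljoin:

  require 'rubygems'
  require 'hpricot'
  require 'uri'

  def resolve_img_srcs(html, base_uri)
    doc = Hpricot(html)
    doc.search('img').each do |img|
      src = img.attributes['src']
      if src and not src.empty? and URI.parse(src).relative?
        img.attributes['src'] = URI.join(base_uri, src).to_s
      end
    end
    doc.to_html
  end

  resolve_img_srcs('<img src="/icon.png">', 'http://example.com/blog/')
  # => roughly '<img src="http://example.com/icon.png">'
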
107 | + # List of Python interfaces for HTML Tidy, in order of preference. Only useful
108 | + # if TIDY_MARKUP = true
109 | + #PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
3080 110 |
3081 | - class SanitizerDoc < Hpricot::Doc
3082 | -
3083 | - def scrub
3084 | - traverse_all_element do |e|
3085 | - if e.elem?
3086 | - if Acceptable_Elements.include?e.name
3087 | - e.strip_attributes
3088 | - else
3089 | - if Unacceptable_Elements_With_End_Tag.include?e.name
3090 | - e.inner_html = ''
3091 | - end
3092 | - e.swap(SanitizerDoc.new(e.children).scrub.to_html)
3093 | - # This works because the children swapped in are brought in "after" the current element.
3094 | - end
3095 | - elsif e.doctype?
3096 | - e.parent.children.delete(e)
3097 | - elsif e.text?
3098 | - ets = e.to_s
3099 | - ets.gsub!(/'/, "'")
3100 | - ets.gsub!(/"/, '"')
3101 | - ets.gsub!(/\r/,'')
3102 | - e.swap(ets)
3103 | - else
3104 | - end
3105 | - end
3106 | - # yes, that '/' should be there. It's a search method. See the Hpricot docs.
3107 111 |
3108 | -
3109 | -
3110 | - end
3111 | - return self
3112 | - end
112 | + # ---------- don't touch these ----------
113 | + class ThingsNobodyCaresAboutButMe < Exception
3113 114 | end
3114 | -
3115 | - def SanitizerDoc(html)
3116 | - FeedParser::SanitizerDoc.new(Hpricot.make(html))
115 | + class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe
3117 116 | end
3118 | -
3119 | - def self.sanitizeHTML(html,encoding)
3120 | - # FIXME Tidy not yet supported
3121 | - html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '<!\1')
3122 | - h = SanitizerDoc(html)
3123 | - h = h.scrub
3124 | - return h.to_html.strip
117 | + class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe
3125 118 | end
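
The SanitizerDoc#scrub and sanitizeHTML code removed at old lines 3081-3124 (now living in rfeedparser/scrub.rb) drops elements that are not whitelisted in Acceptable_Elements, empties blacklisted containers such as script before discarding them, and strips disallowed attributes. A hedged usage sketch, assuming 0.9.9 still exposes the module-level entry point with the signature shown above:

  require 'rfeedparser'

  dirty = '<p onclick="steal()">hello<script>alert(1)</script></p>'
  FeedParser.sanitizeHTML(dirty, 'utf-8')
  # => expected to be roughly '<p>hello</p>': the script element is emptied and
  #    dropped, and onclick is not an acceptable attribute
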
3126 | -
3127 | -
3128 | -
3129 | - def self.getCharacterEncoding(feed, xml_data)
3130 | - # Get the character encoding of the XML document
3131 | - $stderr << "In getCharacterEncoding\n" if $debug
3132 | - sniffed_xml_encoding = nil
3133 | - xml_encoding = nil
3134 | - true_encoding = nil
3135 | - begin
3136 | - http_headers = feed.meta
3137 | - http_content_type = feed.meta['content-type'].split(';')[0]
3138 | - encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
3139 | - http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
3140 | - http_encoding = nil if http_encoding.empty?
3141 | - # FIXME Open-Uri returns iso8859-1 if there is no charset header,
3142 | - # but that doesn't pass the tests. Open-Uri claims its following
3143 | - # the right RFC. Are they wrong or do we need to change the tests?
3144 | - rescue NoMethodError
3145 | - http_headers = {}
3146 | - http_content_type = nil
3147 | - http_encoding = nil
3148 | - end
3149 | - # Must sniff for non-ASCII-compatible character encodings before
3150 | - # searching for XML declaration. This heuristic is defined in
3151 | - # section F of the XML specification:
3152 | - # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3153 | - begin
3154 | - if xml_data[0..3] == "\x4c\x6f\xa7\x94"
3155 | - # EBCDIC
3156 | - xml_data = _ebcdic_to_ascii(xml_data)
3157 | - elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
3158 | - # UTF-16BE
3159 | - sniffed_xml_encoding = 'utf-16be'
3160 | - xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
3161 | - elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
3162 | - # UTF-16BE with BOM
3163 | - sniffed_xml_encoding = 'utf-16be'
3164 | - xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
3165 | - elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
3166 | - # UTF-16LE
3167 | - sniffed_xml_encoding = 'utf-16le'
3168 | - xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
3169 | - elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
3170 | - # UTF-16LE with BOM
3171 | - sniffed_xml_encoding = 'utf-16le'
3172 | - xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
3173 | - elsif xml_data[0..3] == "\x00\x00\x00\x3c"
3174 | - # UTF-32BE
3175 | - sniffed_xml_encoding = 'utf-32be'
3176 | - xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
3177 | - elsif xml_data[0..3] == "\x3c\x00\x00\x00"
3178 | - # UTF-32LE
3179 | - sniffed_xml_encoding = 'utf-32le'
3180 | - xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
3181 | - elsif xml_data[0..3] == "\x00\x00\xfe\xff"
3182 | - # UTF-32BE with BOM
3183 | - sniffed_xml_encoding = 'utf-32be'
3184 | - xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
3185 | - elsif xml_data[0..3] == "\xff\xfe\x00\x00"
3186 | - # UTF-32LE with BOM
3187 | - sniffed_xml_encoding = 'utf-32le'
3188 | - xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
3189 | - elsif xml_data[0..2] == "\xef\xbb\xbf"
3190 | - # UTF-8 with BOM
3191 | - sniffed_xml_encoding = 'utf-8'
3192 | - xml_data = xml_data[3..-1]
3193 | - else
3194 | - # ASCII-compatible
3195 | - end
3196 | - xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
3197 | - rescue
3198 | - xml_encoding_match = nil
3199 | - end
3200 | - if xml_encoding_match
3201 | - xml_encoding = xml_encoding_match[1].downcase
3202 | - xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
3203 | - if sniffed_xml_encoding and xencodings.include?xml_encoding
3204 | - xml_encoding = sniffed_xml_encoding
3205 | - end
3206 | - end
3207 | -
3208 | - acceptable_content_type = false
3209 | - application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
3210 | - text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
3211 | -
3212 | - if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
3213 | - acceptable_content_type = true
3214 | - true_encoding = http_encoding || xml_encoding || 'utf-8'
3215 | - elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
3216 | - acceptable_content_type = true
3217 | - true_encoding = http_encoding || 'us-ascii'
3218 | - elsif /^text\// =~ http_content_type
3219 | - true_encoding = http_encoding || 'us-ascii'
3220 | - elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
3221 | - true_encoding = xml_encoding || 'iso-8859-1'
3222 | - else
3223 | - true_encoding = xml_encoding || 'utf-8'
3224 | - end
3225 | - return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
119 | + class NonXMLContentType < ThingsNobodyCaresAboutButMe
3226 120 | end
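
The getCharacterEncoding method removed at old lines 3129-3225 (superseded by rfeedparser/encoding_helpers.rb) layers three signals: the charset parameter of the HTTP Content-Type header, a first-bytes/BOM sniff per Appendix F of the XML specification, and the encoding attribute of the XML declaration, with the HTTP value winning for XML media types and plain text/* types defaulting to us-ascii. A hedged sketch of the BOM sniff alone; sniff_xml_encoding is a hypothetical helper, not part of the gem's API:

  def sniff_xml_encoding(xml_data)
    return 'utf-8'    if xml_data[0..2] == "\xef\xbb\xbf"
    return 'utf-16be' if xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
    return 'utf-16le' if xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
    return 'utf-16be' if xml_data[0..3] == "\x00\x3c\x00\x3f"   # '<?' with no BOM
    return 'utf-16le' if xml_data[0..3] == "\x3c\x00\x3f\x00"   # '<?' with no BOM
    nil   # ASCII-compatible; fall back to the XML declaration or HTTP charset
  end

  sniff_xml_encoding("\xef\xbb\xbf<?xml version='1.0'?>")   # => 'utf-8'
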
3227 | -
3228 | - def self.toUTF8(data, encoding)
3229 | - =begin
3230 | - Changes an XML data stream on the fly to specify a new encoding
3231 | -
3232 | - data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
3233 | - encoding is a string recognized by encodings.aliases
3234 | - =end
3235 | - $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
3236 | - # NOTE we must use double quotes when dealing with \x encodings!
3237 | - if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
3238 | - if $debug
3239 | - $stderr << "stripping BOM\n"
3240 | - if encoding != 'utf-16be'
3241 | - $stderr << "string utf-16be instead\n"
3242 | - end
3243 | - end
3244 | - encoding = 'utf-16be'
3245 | - data = data[2..-1]
3246 | - elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
3247 | - if $debug
3248 | - $stderr << "stripping BOM\n"
3249 | - $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
3250 | - end
3251 | - encoding = 'utf-16le'
3252 | - data = data[2..-1]
3253 | - elsif (data[0..2] == "\xef\xbb\xbf")
3254 | - if $debug
3255 | - $stderr << "stripping BOM\n"
3256 | - $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
3257 | - end
3258 | - encoding = 'utf-8'
3259 | - data = data[3..-1]
3260 | - elsif (data[0..3] == "\x00\x00\xfe\xff")
3261 | - if $debug
3262 | - $stderr << "stripping BOM\n"
3263 | - if encoding != 'utf-32be'
3264 | - $stderr << "trying utf-32be instead\n"
3265 | - end
3266 | - end
3267 | - encoding = 'utf-32be'
3268 | - data = data[4..-1]
3269 | - elsif (data[0..3] == "\xff\xfe\x00\x00")
3270 | - if $debug
3271 | - $stderr << "stripping BOM\n"
3272 | - if encoding != 'utf-32le'
3273 | - $stderr << "trying utf-32le instead\n"
3274 | - end
3275 | - end
3276 | - encoding = 'utf-32le'
3277 | - data = data[4..-1]
3278 | - end
3279 | - begin
3280 | - newdata = uconvert(data, encoding, 'utf-8')
3281 | - rescue => details
3282 | - end
3283 | - $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
3284 | - declmatch = /^<\?xml[^>]*?>/
3285 | - newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
3286 | - if declmatch =~ newdata
3287 | - newdata.sub!(declmatch, newdecl)
3288 | - else
3289 | - newdata = newdecl + "\n" + newdata
3290 | - end
3291 | - return newdata
121 | + class UndeclaredNamespace < Exception
3292 122 | end
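
The toUTF8 method removed at old lines 3228-3291 (also folded into rfeedparser/encoding_helpers.rb) strips any byte-order mark, converts the raw bytes to UTF-8, and rewrites or prepends the XML declaration so downstream parsers see encoding='utf-8'. A hedged sketch of the conversion and declaration rewrite; to_utf8_decl is a hypothetical helper and Iconv from the Ruby standard library stands in for the library's uconvert:

  require 'iconv'

  def to_utf8_decl(data, encoding)
    newdata = Iconv.conv('utf-8', encoding, data)
    newdecl = "<?xml version='1.0' encoding='utf-8'?>"
    declmatch = /^<\?xml[^>]*?>/
    declmatch =~ newdata ? newdata.sub(declmatch, newdecl) : newdecl + "\n" + newdata
  end

  to_utf8_decl("<?xml version='1.0' encoding='iso-8859-1'?><feed/>", 'iso-8859-1')
  # => "<?xml version='1.0' encoding='utf-8'?><feed/>"
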
3293 123 |
3294 | - def self.stripDoctype(data)
3295 | - =begin
3296 | - Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3297 | -
3298 | - rss_version may be 'rss091n' or None
3299 | - stripped_data is the same XML document, minus the DOCTYPE
3300 | - =end
3301 | - entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
3302 | - data = data.gsub(entity_pattern,'')
3303 | -
3304 | - doctype_pattern = /<!DOCTYPE(.*?)>/m
3305 | - doctype_results = data.scan(doctype_pattern)
3306 | - if doctype_results and doctype_results[0]
3307 | - doctype = doctype_results[0][0]
3308 | - else
3309 | - doctype = ''
3310 | - end
3311 | -
3312 | - if /netscape/ =~ doctype.downcase
3313 | - version = 'rss091n'
3314 | - else
3315 | - version = nil
3316 | - end
3317 | - data = data.sub(doctype_pattern, '')
3318 | - return version, data
3319 | - end
3320 124 |
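
The stripDoctype method removed at old lines 3294-3319 deletes inline ENTITY declarations and the DOCTYPE itself, and reports 'rss091n' whenever the DOCTYPE mentions Netscape, which is how the 'RSS 0.91 (Netscape)' entry in SUPPORTED_VERSIONS below gets selected. A hedged sketch with the same (version, data) return shape; strip_doctype is a hypothetical stand-in, not the gem's method:

  def strip_doctype(data)
    data = data.gsub(/<!ENTITY(.*?)>/m, '')
    doctype = data.scan(/<!DOCTYPE(.*?)>/m).flatten.first.to_s
    version = doctype.downcase =~ /netscape/ ? 'rss091n' : nil
    [version, data.sub(/<!DOCTYPE(.*?)>/m, '')]
  end

  xml = '<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"><rss version="0.91"/>'
  strip_doctype(xml).first   # => 'rss091n'
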
3321 | -
3322 | -
125 | + SUPPORTED_VERSIONS = {'' => 'unknown',
126 | + 'rss090' => 'RSS 0.90',
127 | + 'rss091n' => 'RSS 0.91 (Netscape)',
128 | + 'rss091u' => 'RSS 0.91 (Userland)',
129 | + 'rss092' => 'RSS 0.92',
130 | + 'rss093' => 'RSS 0.93',
131 | + 'rss094' => 'RSS 0.94',
132 | + 'rss20' => 'RSS 2.0',
133 | + 'rss10' => 'RSS 1.0',
134 | + 'rss' => 'RSS (unknown version)',
135 | + 'atom01' => 'Atom 0.1',
136 | + 'atom02' => 'Atom 0.2',
137 | + 'atom03' => 'Atom 0.3',
138 | + 'atom10' => 'Atom 1.0',
139 | + 'atom' => 'Atom (unknown version)',
140 | + 'cdf' => 'CDF',
141 | + 'hotrss' => 'Hot RSS'
142 | + }
143 | +
144 | + def parse(furi, options = {})
3323 145 | # Parse a feed from a URL, file, stream or string
3324 146 | $compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
147 | + strictklass = options[:strict] || StrictFeedParser
148 | + looseklass = options[:loose] || LooseFeedParser
3325 149 | result = FeedParserDict.new
3326 150 | result['feed'] = FeedParserDict.new
3327 151 | result['entries'] = []
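
New lines 125-151 introduce the SUPPORTED_VERSIONS table and the module-level parse entry point, which accepts a URL, local file path, stream or string plus an options hash and fills a FeedParserDict with 'feed', 'entries' and 'bozo' keys. A hedged usage sketch; the URL is a placeholder, and the 'version' and 'title' keys are assumed to follow the Python feedparser conventions rather than shown in this diff:

  require 'rfeedparser'

  result = FeedParser.parse('http://example.com/feed.xml')
  puts FeedParser::SUPPORTED_VERSIONS[result['version'] || '']   # assumed key
  result['entries'].each { |entry| puts entry['title'] }         # assumed key
  warn 'feed was not well-formed' if result['bozo']
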
@@ -3331,13 +155,12 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3331 155 | end
3332 156 | result['bozo'] = false
3333 157 | handlers = options[:handlers]
3334 | -
3335 158 | if handlers.class != Array # FIXME why does this happen?
3336 159 | handlers = [handlers]
3337 160 | end
3338 161 |
3339 162 | begin
3340 | - if
163 | + if File.exists?furi
3341 164 | f = open(furi) # OpenURI doesn't behave well when passing HTTP options to a file.
3342 165 | else
3343 166 | # And when you do pass them, make sure they aren't just nil (this still true?)
@@ -3504,7 +327,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3504 327 | if use_strict_parser
3505 328 | # initialize the SAX parser
3506 329 | saxparser = XML::SAX::Helpers::ParserFactory.makeParser("XML::Parser::SAXDriver")
3507 | - feedparser =
330 | + feedparser = strictklass.new(baseuri, baselang, 'utf-8')
3508 331 | saxparser.setDocumentHandler(feedparser)
3509 332 | saxparser.setDTDHandler(feedparser)
3510 333 | saxparser.setEntityResolver(feedparser)
@@ -3525,7 +348,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3525 348 | end
3526 349 | end
3527 350 | if not use_strict_parser
3528 | - feedparser =
351 | + feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
3529 352 | feedparser.parse(data)
3530 353 | $stderr << "Using LooseFeed\n\n" if $debug
3531 354 | end
@@ -3535,6 +358,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3535 358 | result['namespaces'] = feedparser.namespacesInUse
3536 359 | return result
3537 360 | end
361 | + module_function(:parse)
3538 362 | end # End FeedParser module
3539 363 |
3540 364 | class Serializer
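
The remainder of the diff, below, keeps the Serializer classes and wraps the command-line driver in an if $0 == __FILE__ guard, so it only runs when rfeedparser.rb is executed directly (for example, running ruby rfeedparser.rb -f text http://example.com/feed.xml). A hedged sketch of what that driver does for each URL, using the calls shown in the block below; the URL is a placeholder:

  results = FeedParser.parse('http://example.com/feed.xml')
  TextSerializer.new(results).write($stdout)   # what -f text selects;
                                               # -f pprint picks PprintSerializer
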
@@ -3574,7 +398,7 @@ class TextSerializer < Serializer
3574 398 | end
3575 399 | end
3576 400 |
3577 | - class PprintSerializer < Serializer # FIXME
401 | + class PprintSerializer < Serializer # FIXME use pp instead
3578 402 | def write(stream = $stdout)
3579 403 | stream << @results['href'].to_s + "\n\n"
3580 404 | pp(@results)
@@ -3582,87 +406,88 @@ class PprintSerializer < Serializer # FIXME ? use pp instead?
3582 406 | end
3583 407 | end
3584 408 |
3585 | -
3586 | - require 'optparse'
3587 | - require 'ostruct'
3588 | - options = OpenStruct.new
3589 | - options.etag = options.modified = options.agent = options.referrer = nil
3590 | - options.content_language = options.content_location = options.ctype = nil
3591 | - options.format = 'pprint'
3592 | - options.compatible = $compatible
3593 | - options.verbose = false
3594 | -
3595 | - opts = OptionParser.new do |opts|
3596 | -
3597 | -
3598 | -
409 | + if $0 == __FILE__
410 | + require 'optparse'
411 | + require 'ostruct'
412 | + options = OpenStruct.new
413 | + options.etag = options.modified = options.agent = options.referrer = nil
414 | + options.content_language = options.content_location = options.ctype = nil
415 | + options.format = 'pprint'
416 | + options.compatible = $compatible
417 | + options.verbose = false
418 | +
419 | + opts = OptionParser.new do |opts|
420 | + opts.banner
421 | + opts.separator ""
422 | + opts.on("-A", "--user-agent [AGENT]",
3599 423 | "User-Agent for HTTP URLs") {|agent|
3600 | -
3601 | -
424 | + options.agent = agent
425 | + }
3602 426 |
3603 | -
427 | + opts.on("-e", "--referrer [URL]",
3604 428 | "Referrer for HTTP URLs") {|referrer|
3605 | -
3606 | -
429 | + options.referrer = referrer
430 | + }
3607 431 |
3608 | -
432 | + opts.on("-t", "--etag [TAG]",
3609 433 | "ETag/If-None-Match for HTTP URLs") {|etag|
3610 | -
3611 | -
434 | + options.etag = etag
435 | + }
3612 436 |
3613 | -
437 | + opts.on("-m", "--last-modified [DATE]",
3614 438 | "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") {|modified|
3615 | -
3616 | -
439 | + options.modified = modified
440 | + }
3617 441 |
3618 | -
442 | + opts.on("-f", "--format [FORMAT]", [:text, :pprint],
3619 443 | "output resutls in FORMAT (text, pprint)") {|format|
3620 | -
3621 | -
444 | + options.format = format
445 | + }
3622 446 |
3623 | -
447 | + opts.on("-v", "--[no-]verbose",
3624 448 | "write debugging information to stderr") {|v|
3625 | -
3626 | -
449 | + options.verbose = v
450 | + }
3627 451 |
3628 | -
452 | + opts.on("-c", "--[no-]compatible",
3629 453 | "strip element attributes like feedparser.py 4.1 (default)") {|comp|
3630 | -
3631 | -
3632 | -
454 | + options.compatible = comp
455 | + }
456 | + opts.on("-l", "--content-location [LOCATION]",
3633 457 | "default Content-Location HTTP header") {|loc|
3634 | -
3635 | -
3636 | -
458 | + options.content_location = loc
459 | + }
460 | + opts.on("-a", "--content-language [LANG]",
3637 461 | "default Content-Language HTTP header") {|lang|
3638 | -
3639 | -
3640 | -
462 | + options.content_language = lang
463 | + }
464 | + opts.on("-t", "--content-type [TYPE]",
3641 465 | "default Content-type HTTP header") {|ctype|
3642 | -
3643 | -
3644 | - end
466 | + options.ctype = ctype
467 | + }
468 | + end
3645 469 |
3646 | - opts.parse!(ARGV)
3647 | - $debug = true if options.verbose
3648 | - $compatible = options.compatible unless options.compatible.nil?
470 | + opts.parse!(ARGV)
471 | + $debug = true if options.verbose
472 | + $compatible = options.compatible unless options.compatible.nil?
3649 473 |
3650 | - if options.format == :text
3651 | -
3652 | - else
3653 | -
3654 | - end
3655 | - args = *ARGV.dup
3656 | - unless args.nil?
3657 | -
3658 | -
3659 | -
3660 | -
3661 | -
3662 | -
3663 | -
3664 | -
3665 | -
3666 | -
474 | + if options.format == :text
475 | + serializer = TextSerializer
476 | + else
477 | + serializer = PprintSerializer
478 | + end
479 | + args = *ARGV.dup
480 | + unless args.nil?
481 | + args.each do |url| # opts.parse! removes everything but the urls from the command line
482 | + results = FeedParser.parse(url, :etag => options.etag,
483 | + :modified => options.modified,
484 | + :agent => options.agent,
485 | + :referrer => options.referrer,
486 | + :content_location => options.content_location,
487 | + :content_language => options.content_language,
488 | + :content_type => options.ctype
489 | + )
490 | + serializer.new(results).write($stdout)
491 | + end
3667 492 | end
3668 493 | end