rfeedparser 0.9.9 → 0.9.85
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +3354 -170
- data/tests/rfeedparsertest.rb +1 -3
- metadata +3280 -3301
- data/lib/rfeedparser/aliases.rb +0 -432
- data/lib/rfeedparser/better_attributelist.rb +0 -41
- data/lib/rfeedparser/better_sgmlparser.rb +0 -264
- data/lib/rfeedparser/encoding_helpers.rb +0 -257
- data/lib/rfeedparser/feedparserdict.rb +0 -93
- data/lib/rfeedparser/forgiving_uri.rb +0 -93
- data/lib/rfeedparser/markup_helpers.rb +0 -73
- data/lib/rfeedparser/parser_mixin.rb +0 -1235
- data/lib/rfeedparser/parsers.rb +0 -177
- data/lib/rfeedparser/scrub.rb +0 -207
- data/lib/rfeedparser/time_helpers.rb +0 -408
data/lib/rfeedparser.rb
CHANGED
@@ -14,138 +14,3323 @@ require 'stringio'
|
|
14
14
|
require 'uri'
|
15
15
|
require 'cgi' # escaping html
|
16
16
|
require 'time'
|
17
|
+
require 'xml/saxdriver' # calling expat
|
17
18
|
require 'pp'
|
18
19
|
require 'rubygems'
|
19
20
|
require 'base64'
|
20
21
|
require 'iconv'
|
22
|
+
gem 'hpricot', ">=0.5"
|
23
|
+
gem 'character-encodings', ">=0.2.0"
|
24
|
+
gem 'htmltools', ">=1.10"
|
25
|
+
gem 'htmlentities', ">=4.0.0"
|
26
|
+
gem 'activesupport', ">=1.4.2"
|
27
|
+
gem 'rchardet', ">=1.0"
|
28
|
+
|
29
|
+
require 'rchardet'
|
30
|
+
$chardet = true
|
31
|
+
|
32
|
+
require 'hpricot'
|
33
|
+
require 'encoding/character/utf-8'
|
34
|
+
require 'html/sgml-parser'
|
35
|
+
require 'htmlentities'
|
36
|
+
require 'active_support'
|
37
|
+
require 'open-uri'
|
38
|
+
include OpenURI
|
39
|
+
|
40
|
+
$debug = false
|
41
|
+
$compatible = true
|
42
|
+
|
43
|
+
Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
|
44
|
+
'unicode' => 'utf-16',
|
45
|
+
# MacOSX does not have Unicode as a separate encoding nor even
|
46
|
+
# aliased. My Ubuntu box has it as a separate encoding but I cannot
|
47
|
+
# for the life of me figure out where the source code for UNICODE.so
|
48
|
+
# is (supposedly, in libc6 .deb but that's a damn lie), so I don't
|
49
|
+
# know what it expects. After some extensive research, I've decided
|
50
|
+
# to alias it to utf-16 much like Python does when it is built with
|
51
|
+
# --enable-unicode=ucs2. This could be seriously wrong. I have no idea.
|
52
|
+
|
53
|
+
# ascii codec
|
54
|
+
'646' => 'ascii',
|
55
|
+
'ansi_x3.4_1968' => 'ascii',
|
56
|
+
'ansi_x3_4_1968' => 'ascii', # some email headers use this non-standard name
|
57
|
+
'ansi_x3.4_1986' => 'ascii',
|
58
|
+
'cp367' => 'ascii',
|
59
|
+
'csascii' => 'ascii',
|
60
|
+
'ibm367' => 'ascii',
|
61
|
+
'iso646_us' => 'ascii',
|
62
|
+
'iso_646.irv_1991' => 'ascii',
|
63
|
+
'iso_ir_6' => 'ascii',
|
64
|
+
'us' => 'ascii',
|
65
|
+
'us_ascii' => 'ascii',
|
66
|
+
|
67
|
+
# big5 codec
|
68
|
+
'big5_tw' => 'big5',
|
69
|
+
'csbig5' => 'big5',
|
70
|
+
|
71
|
+
# big5hkscs codec
|
72
|
+
'big5_hkscs' => 'big5hkscs',
|
73
|
+
'hkscs' => 'big5hkscs',
|
74
|
+
|
75
|
+
# cp037 codec
|
76
|
+
'037' => 'cp037',
|
77
|
+
'csibm037' => 'cp037',
|
78
|
+
'ebcdic_cp_ca' => 'cp037',
|
79
|
+
'ebcdic_cp_nl' => 'cp037',
|
80
|
+
'ebcdic_cp_us' => 'cp037',
|
81
|
+
'ebcdic_cp_wt' => 'cp037',
|
82
|
+
'ibm037' => 'cp037',
|
83
|
+
'ibm039' => 'cp037',
|
84
|
+
|
85
|
+
# cp1026 codec
|
86
|
+
'1026' => 'cp1026',
|
87
|
+
'csibm1026' => 'cp1026',
|
88
|
+
'ibm1026' => 'cp1026',
|
89
|
+
|
90
|
+
# cp1140 codec
|
91
|
+
'1140' => 'cp1140',
|
92
|
+
'ibm1140' => 'cp1140',
|
93
|
+
|
94
|
+
# cp1250 codec
|
95
|
+
'1250' => 'cp1250',
|
96
|
+
'windows_1250' => 'cp1250',
|
97
|
+
|
98
|
+
# cp1251 codec
|
99
|
+
'1251' => 'cp1251',
|
100
|
+
'windows_1251' => 'cp1251',
|
101
|
+
|
102
|
+
# cp1252 codec
|
103
|
+
'1252' => 'cp1252',
|
104
|
+
'windows_1252' => 'cp1252',
|
105
|
+
|
106
|
+
# cp1253 codec
|
107
|
+
'1253' => 'cp1253',
|
108
|
+
'windows_1253' => 'cp1253',
|
109
|
+
|
110
|
+
# cp1254 codec
|
111
|
+
'1254' => 'cp1254',
|
112
|
+
'windows_1254' => 'cp1254',
|
113
|
+
|
114
|
+
# cp1255 codec
|
115
|
+
'1255' => 'cp1255',
|
116
|
+
'windows_1255' => 'cp1255',
|
117
|
+
|
118
|
+
# cp1256 codec
|
119
|
+
'1256' => 'cp1256',
|
120
|
+
'windows_1256' => 'cp1256',
|
121
|
+
|
122
|
+
# cp1257 codec
|
123
|
+
'1257' => 'cp1257',
|
124
|
+
'windows_1257' => 'cp1257',
|
125
|
+
|
126
|
+
# cp1258 codec
|
127
|
+
'1258' => 'cp1258',
|
128
|
+
'windows_1258' => 'cp1258',
|
129
|
+
|
130
|
+
# cp424 codec
|
131
|
+
'424' => 'cp424',
|
132
|
+
'csibm424' => 'cp424',
|
133
|
+
'ebcdic_cp_he' => 'cp424',
|
134
|
+
'ibm424' => 'cp424',
|
135
|
+
|
136
|
+
# cp437 codec
|
137
|
+
'437' => 'cp437',
|
138
|
+
'cspc8codepage437' => 'cp437',
|
139
|
+
'ibm437' => 'cp437',
|
140
|
+
|
141
|
+
# cp500 codec
|
142
|
+
'500' => 'cp500',
|
143
|
+
'csibm500' => 'cp500',
|
144
|
+
'ebcdic_cp_be' => 'cp500',
|
145
|
+
'ebcdic_cp_ch' => 'cp500',
|
146
|
+
'ibm500' => 'cp500',
|
147
|
+
|
148
|
+
# cp775 codec
|
149
|
+
'775' => 'cp775',
|
150
|
+
'cspc775baltic' => 'cp775',
|
151
|
+
'ibm775' => 'cp775',
|
152
|
+
|
153
|
+
# cp850 codec
|
154
|
+
'850' => 'cp850',
|
155
|
+
'cspc850multilingual' => 'cp850',
|
156
|
+
'ibm850' => 'cp850',
|
157
|
+
|
158
|
+
# cp852 codec
|
159
|
+
'852' => 'cp852',
|
160
|
+
'cspcp852' => 'cp852',
|
161
|
+
'ibm852' => 'cp852',
|
162
|
+
|
163
|
+
# cp855 codec
|
164
|
+
'855' => 'cp855',
|
165
|
+
'csibm855' => 'cp855',
|
166
|
+
'ibm855' => 'cp855',
|
167
|
+
|
168
|
+
# cp857 codec
|
169
|
+
'857' => 'cp857',
|
170
|
+
'csibm857' => 'cp857',
|
171
|
+
'ibm857' => 'cp857',
|
172
|
+
|
173
|
+
# cp860 codec
|
174
|
+
'860' => 'cp860',
|
175
|
+
'csibm860' => 'cp860',
|
176
|
+
'ibm860' => 'cp860',
|
177
|
+
|
178
|
+
# cp861 codec
|
179
|
+
'861' => 'cp861',
|
180
|
+
'cp_is' => 'cp861',
|
181
|
+
'csibm861' => 'cp861',
|
182
|
+
'ibm861' => 'cp861',
|
183
|
+
|
184
|
+
# cp862 codec
|
185
|
+
'862' => 'cp862',
|
186
|
+
'cspc862latinhebrew' => 'cp862',
|
187
|
+
'ibm862' => 'cp862',
|
188
|
+
|
189
|
+
# cp863 codec
|
190
|
+
'863' => 'cp863',
|
191
|
+
'csibm863' => 'cp863',
|
192
|
+
'ibm863' => 'cp863',
|
193
|
+
|
194
|
+
# cp864 codec
|
195
|
+
'864' => 'cp864',
|
196
|
+
'csibm864' => 'cp864',
|
197
|
+
'ibm864' => 'cp864',
|
198
|
+
|
199
|
+
# cp865 codec
|
200
|
+
'865' => 'cp865',
|
201
|
+
'csibm865' => 'cp865',
|
202
|
+
'ibm865' => 'cp865',
|
203
|
+
|
204
|
+
# cp866 codec
|
205
|
+
'866' => 'cp866',
|
206
|
+
'csibm866' => 'cp866',
|
207
|
+
'ibm866' => 'cp866',
|
208
|
+
|
209
|
+
# cp869 codec
|
210
|
+
'869' => 'cp869',
|
211
|
+
'cp_gr' => 'cp869',
|
212
|
+
'csibm869' => 'cp869',
|
213
|
+
'ibm869' => 'cp869',
|
214
|
+
|
215
|
+
# cp932 codec
|
216
|
+
'932' => 'cp932',
|
217
|
+
'ms932' => 'cp932',
|
218
|
+
'mskanji' => 'cp932',
|
219
|
+
'ms_kanji' => 'cp932',
|
220
|
+
|
221
|
+
# cp949 codec
|
222
|
+
'949' => 'cp949',
|
223
|
+
'ms949' => 'cp949',
|
224
|
+
'uhc' => 'cp949',
|
225
|
+
|
226
|
+
# cp950 codec
|
227
|
+
'950' => 'cp950',
|
228
|
+
'ms950' => 'cp950',
|
229
|
+
|
230
|
+
# euc_jp codec
|
231
|
+
'euc_jp' => 'euc-jp',
|
232
|
+
'eucjp' => 'euc-jp',
|
233
|
+
'ujis' => 'euc-jp',
|
234
|
+
'u_jis' => 'euc-jp',
|
235
|
+
|
236
|
+
# euc_kr codec
|
237
|
+
'euc_kr' => 'euc-kr',
|
238
|
+
'euckr' => 'euc-kr',
|
239
|
+
'korean' => 'euc-kr',
|
240
|
+
'ksc5601' => 'euc-kr',
|
241
|
+
'ks_c_5601' => 'euc-kr',
|
242
|
+
'ks_c_5601_1987' => 'euc-kr',
|
243
|
+
'ksx1001' => 'euc-kr',
|
244
|
+
'ks_x_1001' => 'euc-kr',
|
245
|
+
|
246
|
+
# gb18030 codec
|
247
|
+
'gb18030_2000' => 'gb18030',
|
248
|
+
|
249
|
+
# gb2312 codec
|
250
|
+
'chinese' => 'gb2312',
|
251
|
+
'csiso58gb231280' => 'gb2312',
|
252
|
+
'euc_cn' => 'gb2312',
|
253
|
+
'euccn' => 'gb2312',
|
254
|
+
'eucgb2312_cn' => 'gb2312',
|
255
|
+
'gb2312_1980' => 'gb2312',
|
256
|
+
'gb2312_80' => 'gb2312',
|
257
|
+
'iso_ir_58' => 'gb2312',
|
258
|
+
|
259
|
+
# gbk codec
|
260
|
+
'936' => 'gbk',
|
261
|
+
'cp936' => 'gbk',
|
262
|
+
'ms936' => 'gbk',
|
263
|
+
|
264
|
+
# hp-roman8 codec
|
265
|
+
'hp_roman8' => 'hp-roman8',
|
266
|
+
'roman8' => 'hp-roman8',
|
267
|
+
'r8' => 'hp-roman8',
|
268
|
+
'csHPRoman8' => 'hp-roman8',
|
269
|
+
|
270
|
+
# iso2022_jp codec
|
271
|
+
'iso2022_jp' => 'iso-2022-jp',
|
272
|
+
'csiso2022jp' => 'iso-2022-jp',
|
273
|
+
'iso2022jp' => 'iso-2022-jp',
|
274
|
+
'iso_2022_jp' => 'iso-2022-jp',
|
275
|
+
|
276
|
+
# iso2022_jp_1 codec
|
277
|
+
'iso2002_jp_1' => 'iso-2022-jp-1',
|
278
|
+
'iso2022jp_1' => 'iso-2022-jp-1',
|
279
|
+
'iso_2022_jp_1' => 'iso-2022-jp-1',
|
280
|
+
|
281
|
+
# iso2022_jp_2 codec
|
282
|
+
'iso2022_jp_2' => 'iso-2002-jp-2',
|
283
|
+
'iso2022jp_2' => 'iso-2022-jp-2',
|
284
|
+
'iso_2022_jp_2' => 'iso-2022-jp-2',
|
285
|
+
|
286
|
+
# iso2022_jp_3 codec
|
287
|
+
'iso2002_jp_3' => 'iso-2022-jp-3',
|
288
|
+
'iso2022jp_3' => 'iso-2022-jp-3',
|
289
|
+
'iso_2022_jp_3' => 'iso-2022-jp-3',
|
290
|
+
|
291
|
+
# iso2022_kr codec
|
292
|
+
'iso2022_kr' => 'iso-2022-kr',
|
293
|
+
'csiso2022kr' => 'iso-2022-kr',
|
294
|
+
'iso2022kr' => 'iso-2022-kr',
|
295
|
+
'iso_2022_kr' => 'iso-2022-kr',
|
296
|
+
|
297
|
+
# iso8859_10 codec
|
298
|
+
'iso8859_10' => 'iso-8859-10',
|
299
|
+
'csisolatin6' => 'iso-8859-10',
|
300
|
+
'iso_8859_10' => 'iso-8859-10',
|
301
|
+
'iso_8859_10_1992' => 'iso-8859-10',
|
302
|
+
'iso_ir_157' => 'iso-8859-10',
|
303
|
+
'l6' => 'iso-8859-10',
|
304
|
+
'latin6' => 'iso-8859-10',
|
305
|
+
|
306
|
+
# iso8859_13 codec
|
307
|
+
'iso8859_13' => 'iso-8859-13',
|
308
|
+
'iso_8859_13' => 'iso-8859-13',
|
309
|
+
|
310
|
+
# iso8859_14 codec
|
311
|
+
'iso8859_14' => 'iso-8859-14',
|
312
|
+
'iso_8859_14' => 'iso-8859-14',
|
313
|
+
'iso_8859_14_1998' => 'iso-8859-14',
|
314
|
+
'iso_celtic' => 'iso-8859-14',
|
315
|
+
'iso_ir_199' => 'iso-8859-14',
|
316
|
+
'l8' => 'iso-8859-14',
|
317
|
+
'latin8' => 'iso-8859-14',
|
318
|
+
|
319
|
+
# iso8859_15 codec
|
320
|
+
'iso8859_15' => 'iso-8859-15',
|
321
|
+
'iso_8859_15' => 'iso-8859-15',
|
322
|
+
|
323
|
+
# iso8859_1 codec
|
324
|
+
'latin_1' => 'iso-8859-1',
|
325
|
+
'cp819' => 'iso-8859-1',
|
326
|
+
'csisolatin1' => 'iso-8859-1',
|
327
|
+
'ibm819' => 'iso-8859-1',
|
328
|
+
'iso8859' => 'iso-8859-1',
|
329
|
+
'iso_8859_1' => 'iso-8859-1',
|
330
|
+
'iso_8859_1_1987' => 'iso-8859-1',
|
331
|
+
'iso_ir_100' => 'iso-8859-1',
|
332
|
+
'l1' => 'iso-8859-1',
|
333
|
+
'latin' => 'iso-8859-1',
|
334
|
+
'latin1' => 'iso-8859-1',
|
335
|
+
|
336
|
+
# iso8859_2 codec
|
337
|
+
'iso8859_2' => 'iso-8859-2',
|
338
|
+
'csisolatin2' => 'iso-8859-2',
|
339
|
+
'iso_8859_2' => 'iso-8859-2',
|
340
|
+
'iso_8859_2_1987' => 'iso-8859-2',
|
341
|
+
'iso_ir_101' => 'iso-8859-2',
|
342
|
+
'l2' => 'iso-8859-2',
|
343
|
+
'latin2' => 'iso-8859-2',
|
344
|
+
|
345
|
+
# iso8859_3 codec
|
346
|
+
'iso8859_3' => 'iso-8859-3',
|
347
|
+
'csisolatin3' => 'iso-8859-3',
|
348
|
+
'iso_8859_3' => 'iso-8859-3',
|
349
|
+
'iso_8859_3_1988' => 'iso-8859-3',
|
350
|
+
'iso_ir_109' => 'iso-8859-3',
|
351
|
+
'l3' => 'iso-8859-3',
|
352
|
+
'latin3' => 'iso-8859-3',
|
353
|
+
|
354
|
+
# iso8859_4 codec
|
355
|
+
'iso8849_4' => 'iso-8859-4',
|
356
|
+
'csisolatin4' => 'iso-8859-4',
|
357
|
+
'iso_8859_4' => 'iso-8859-4',
|
358
|
+
'iso_8859_4_1988' => 'iso-8859-4',
|
359
|
+
'iso_ir_110' => 'iso-8859-4',
|
360
|
+
'l4' => 'iso-8859-4',
|
361
|
+
'latin4' => 'iso-8859-4',
|
362
|
+
|
363
|
+
# iso8859_5 codec
|
364
|
+
'iso8859_5' => 'iso-8859-5',
|
365
|
+
'csisolatincyrillic' => 'iso-8859-5',
|
366
|
+
'cyrillic' => 'iso-8859-5',
|
367
|
+
'iso_8859_5' => 'iso-8859-5',
|
368
|
+
'iso_8859_5_1988' => 'iso-8859-5',
|
369
|
+
'iso_ir_144' => 'iso-8859-5',
|
370
|
+
|
371
|
+
# iso8859_6 codec
|
372
|
+
'iso8859_6' => 'iso-8859-6',
|
373
|
+
'arabic' => 'iso-8859-6',
|
374
|
+
'asmo_708' => 'iso-8859-6',
|
375
|
+
'csisolatinarabic' => 'iso-8859-6',
|
376
|
+
'ecma_114' => 'iso-8859-6',
|
377
|
+
'iso_8859_6' => 'iso-8859-6',
|
378
|
+
'iso_8859_6_1987' => 'iso-8859-6',
|
379
|
+
'iso_ir_127' => 'iso-8859-6',
|
380
|
+
|
381
|
+
# iso8859_7 codec
|
382
|
+
'iso8859_7' => 'iso-8859-7',
|
383
|
+
'csisolatingreek' => 'iso-8859-7',
|
384
|
+
'ecma_118' => 'iso-8859-7',
|
385
|
+
'elot_928' => 'iso-8859-7',
|
386
|
+
'greek' => 'iso-8859-7',
|
387
|
+
'greek8' => 'iso-8859-7',
|
388
|
+
'iso_8859_7' => 'iso-8859-7',
|
389
|
+
'iso_8859_7_1987' => 'iso-8859-7',
|
390
|
+
'iso_ir_126' => 'iso-8859-7',
|
391
|
+
|
392
|
+
# iso8859_8 codec
|
393
|
+
'iso8859_9' => 'iso8859_8',
|
394
|
+
'csisolatinhebrew' => 'iso-8859-8',
|
395
|
+
'hebrew' => 'iso-8859-8',
|
396
|
+
'iso_8859_8' => 'iso-8859-8',
|
397
|
+
'iso_8859_8_1988' => 'iso-8859-8',
|
398
|
+
'iso_ir_138' => 'iso-8859-8',
|
399
|
+
|
400
|
+
# iso8859_9 codec
|
401
|
+
'iso8859_9' => 'iso-8859-9',
|
402
|
+
'csisolatin5' => 'iso-8859-9',
|
403
|
+
'iso_8859_9' => 'iso-8859-9',
|
404
|
+
'iso_8859_9_1989' => 'iso-8859-9',
|
405
|
+
'iso_ir_148' => 'iso-8859-9',
|
406
|
+
'l5' => 'iso-8859-9',
|
407
|
+
'latin5' => 'iso-8859-9',
|
408
|
+
|
409
|
+
# iso8859_11 codec
|
410
|
+
'iso8859_11' => 'iso-8859-11',
|
411
|
+
'thai' => 'iso-8859-11',
|
412
|
+
'iso_8859_11' => 'iso-8859-11',
|
413
|
+
'iso_8859_11_2001' => 'iso-8859-11',
|
414
|
+
|
415
|
+
# iso8859_16 codec
|
416
|
+
'iso8859_16' => 'iso-8859-16',
|
417
|
+
'iso_8859_16' => 'iso-8859-16',
|
418
|
+
'iso_8859_16_2001' => 'iso-8859-16',
|
419
|
+
'iso_ir_226' => 'iso-8859-16',
|
420
|
+
'l10' => 'iso-8859-16',
|
421
|
+
'latin10' => 'iso-8859-16',
|
422
|
+
|
423
|
+
# cskoi8r codec
|
424
|
+
'koi8_r' => 'cskoi8r',
|
425
|
+
|
426
|
+
# mac_cyrillic codec
|
427
|
+
'mac_cyrillic' => 'maccyrillic',
|
428
|
+
|
429
|
+
# shift_jis codec
|
430
|
+
'csshiftjis' => 'shift_jis',
|
431
|
+
'shiftjis' => 'shift_jis',
|
432
|
+
'sjis' => 'shift_jis',
|
433
|
+
's_jis' => 'shift_jis',
|
434
|
+
|
435
|
+
# shift_jisx0213 codec
|
436
|
+
'shiftjisx0213' => 'shift_jisx0213',
|
437
|
+
'sjisx0213' => 'shift_jisx0213',
|
438
|
+
's_jisx0213' => 'shift_jisx0213',
|
439
|
+
|
440
|
+
# utf_16 codec
|
441
|
+
'utf_16' => 'utf-16',
|
442
|
+
'u16' => 'utf-16',
|
443
|
+
'utf16' => 'utf-16',
|
444
|
+
|
445
|
+
# utf_16_be codec
|
446
|
+
'utf_16_be' => 'utf-16be',
|
447
|
+
'unicodebigunmarked' => 'utf-16be',
|
448
|
+
'utf_16be' => 'utf-16be',
|
449
|
+
|
450
|
+
# utf_16_le codec
|
451
|
+
'utf_16_le' => 'utf-16le',
|
452
|
+
'unicodelittleunmarked' => 'utf-16le',
|
453
|
+
'utf_16le' => 'utf-16le',
|
454
|
+
|
455
|
+
# utf_7 codec
|
456
|
+
'utf_7' => 'utf-7',
|
457
|
+
'u7' => 'utf-7',
|
458
|
+
'utf7' => 'utf-7',
|
459
|
+
|
460
|
+
# utf_8 codec
|
461
|
+
'utf_8' => 'utf-8',
|
462
|
+
'u8' => 'utf-8',
|
463
|
+
'utf' => 'utf-8',
|
464
|
+
'utf8' => 'utf-8',
|
465
|
+
'utf8_ucs2' => 'utf-8',
|
466
|
+
'utf8_ucs4' => 'utf-8',
|
467
|
+
}
|
468
|
+
|
469
|
+
def unicode(data, from_encoding)
|
470
|
+
# Takes a single string and converts it from the encoding in
|
471
|
+
# from_encoding to unicode.
|
472
|
+
uconvert(data, from_encoding, 'unicode')
|
473
|
+
end
|
474
|
+
|
475
|
+
def uconvert(data, from_encoding, to_encoding = 'utf-8')
|
476
|
+
from_encoding = Encoding_Aliases[from_encoding] || from_encoding
|
477
|
+
to_encoding = Encoding_Aliases[to_encoding] || to_encoding
|
478
|
+
Iconv.iconv(to_encoding, from_encoding, data)[0]
|
479
|
+
end
|
480
|
+
|
481
|
+
def unichr(i)
|
482
|
+
[i].pack('U*')
|
483
|
+
end
|
484
|
+
|
485
|
+
def index_match(stri,regexp, offset)
|
486
|
+
if offset == 241
|
487
|
+
end
|
488
|
+
i = stri.index(regexp, offset)
|
489
|
+
|
490
|
+
return nil, nil unless i
|
491
|
+
|
492
|
+
full = stri[i..-1].match(regexp)
|
493
|
+
return i, full
|
494
|
+
end
|
495
|
+
|
496
|
+
def _ebcdic_to_ascii(s)
|
497
|
+
return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
|
498
|
+
end
|
499
|
+
|
500
|
+
def urljoin(base, uri)
|
501
|
+
urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
|
502
|
+
uri = uri.sub(urifixer, '\1\3')
|
503
|
+
begin
|
504
|
+
return URI.join(base, uri).to_s
|
505
|
+
rescue URI::BadURIError => e
|
506
|
+
if URI.parse(base).relative?
|
507
|
+
return URI::parse(uri).to_s
|
508
|
+
end
|
509
|
+
end
|
510
|
+
end
|
511
|
+
|
512
|
+
def py2rtime(pytuple)
|
513
|
+
Time.utc(pytuple[0..5])
|
514
|
+
end
|
515
|
+
|
516
|
+
# http://intertwingly.net/stories/2005/09/28/xchar.rb
|
517
|
+
module XChar
|
518
|
+
# http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
|
519
|
+
CP1252 = {
|
520
|
+
128 => 8364, # euro sign
|
521
|
+
130 => 8218, # single low-9 quotation mark
|
522
|
+
131 => 402, # latin small letter f with hook
|
523
|
+
132 => 8222, # double low-9 quotation mark
|
524
|
+
133 => 8230, # horizontal ellipsis
|
525
|
+
134 => 8224, # dagger
|
526
|
+
135 => 8225, # double dagger
|
527
|
+
136 => 710, # modifier letter circumflex accent
|
528
|
+
137 => 8240, # per mille sign
|
529
|
+
138 => 352, # latin capital letter s with caron
|
530
|
+
139 => 8249, # single left-pointing angle quotation mark
|
531
|
+
140 => 338, # latin capital ligature oe
|
532
|
+
142 => 381, # latin capital letter z with caron
|
533
|
+
145 => 8216, # left single quotation mark
|
534
|
+
146 => 8217, # right single quotation mark
|
535
|
+
147 => 8220, # left double quotation mark
|
536
|
+
148 => 8221, # right double quotation mark
|
537
|
+
149 => 8226, # bullet
|
538
|
+
150 => 8211, # en dash
|
539
|
+
151 => 8212, # em dash
|
540
|
+
152 => 732, # small tilde
|
541
|
+
153 => 8482, # trade mark sign
|
542
|
+
154 => 353, # latin small letter s with caron
|
543
|
+
155 => 8250, # single right-pointing angle quotation mark
|
544
|
+
156 => 339, # latin small ligature oe
|
545
|
+
158 => 382, # latin small letter z with caron
|
546
|
+
159 => 376} # latin capital letter y with diaeresis
|
547
|
+
|
548
|
+
# http://www.w3.org/TR/REC-xml/#dt-chardata
|
549
|
+
PREDEFINED = {
|
550
|
+
38 => '&', # ampersand
|
551
|
+
60 => '<', # left angle bracket
|
552
|
+
62 => '>'} # right angle bracket
|
553
|
+
|
554
|
+
# http://www.w3.org/TR/REC-xml/#charsets
|
555
|
+
VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
|
556
|
+
(0xE000..0xFFFD), (0x10000..0x10FFFF)]
|
557
|
+
end
|
558
|
+
|
559
|
+
class Fixnum
|
560
|
+
# xml escaped version of chr
|
561
|
+
def xchr
|
562
|
+
n = XChar::CP1252[self] || self
|
563
|
+
n = 42 unless XChar::VALID.find {|range| range.include? n}
|
564
|
+
XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
|
565
|
+
end
|
566
|
+
end
|
567
|
+
|
568
|
+
class String
|
569
|
+
alias :old_index :index
|
570
|
+
def to_xs
|
571
|
+
unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
|
572
|
+
rescue
|
573
|
+
unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
|
574
|
+
end
|
575
|
+
end
|
576
|
+
|
577
|
+
class BetterSGMLParserError < Exception; end;
|
578
|
+
class BetterSGMLParser < HTML::SGMLParser
|
579
|
+
# Replaced Tagfind and Charref Regexps with the ones in feedparser.py
|
580
|
+
# This makes things work.
|
581
|
+
Interesting = /[&<]/u
|
582
|
+
Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
|
583
|
+
'<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
|
584
|
+
'![^<>]*)?', 64) # 64 is the unicode flag
|
585
|
+
|
586
|
+
Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
|
587
|
+
Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
|
588
|
+
|
589
|
+
Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
|
590
|
+
Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
|
591
|
+
Endtagopen = /<\//u # Matching the Python SGMLParser
|
592
|
+
Endbracket = /[<>]/u
|
593
|
+
Declopen = /<!/u
|
594
|
+
Piopenbegin = /^<\?/u
|
595
|
+
Piclose = />/u
|
596
|
+
|
597
|
+
Commentopen = /<!--/u
|
598
|
+
Commentclose = /--\s*>/u
|
599
|
+
Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
|
600
|
+
Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
|
601
|
+
'(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
|
602
|
+
64)
|
603
|
+
Endtagfind = /\s*\/\s*>/u
|
604
|
+
def initialize(verbose=false)
|
605
|
+
super(verbose)
|
606
|
+
end
|
607
|
+
def feed(*args)
|
608
|
+
super(*args)
|
609
|
+
end
|
610
|
+
|
611
|
+
def goahead(_end)
|
612
|
+
rawdata = @rawdata # woo, utf-8 magic
|
613
|
+
i = 0
|
614
|
+
n = rawdata.length
|
615
|
+
while i < n
|
616
|
+
if @nomoretags
|
617
|
+
# handle_data_range does nothing more than set a "Range" that is never used. wtf?
|
618
|
+
handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
|
619
|
+
i = n
|
620
|
+
break
|
621
|
+
end
|
622
|
+
j = rawdata.index(Interesting, i)
|
623
|
+
j = n unless j
|
624
|
+
handle_data(rawdata[i...j]) if i < j
|
625
|
+
i = j
|
626
|
+
break if (i == n)
|
627
|
+
if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
|
628
|
+
if rawdata.index(Starttagopen,i) == i
|
629
|
+
if @literal
|
630
|
+
handle_data(rawdata[i..i])
|
631
|
+
i = i+1
|
632
|
+
next
|
633
|
+
end
|
634
|
+
k = parse_starttag(i)
|
635
|
+
break unless k
|
636
|
+
i = k
|
637
|
+
next
|
638
|
+
end
|
639
|
+
if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
|
640
|
+
k = parse_endtag(i)
|
641
|
+
break unless k
|
642
|
+
i = k
|
643
|
+
@literal = false
|
644
|
+
next
|
645
|
+
end
|
646
|
+
if @literal
|
647
|
+
if n > (i+1)
|
648
|
+
handle_data("<")
|
649
|
+
i = i+1
|
650
|
+
else
|
651
|
+
#incomplete
|
652
|
+
break
|
653
|
+
end
|
654
|
+
next
|
655
|
+
end
|
656
|
+
if rawdata.index(Commentopen,i) == i
|
657
|
+
k = parse_comment(i)
|
658
|
+
break unless k
|
659
|
+
i = k
|
660
|
+
next
|
661
|
+
end
|
662
|
+
if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
|
663
|
+
k = parse_pi(i)
|
664
|
+
break unless k
|
665
|
+
i += k
|
666
|
+
next
|
667
|
+
end
|
668
|
+
if rawdata.index(Declopen,i) == i
|
669
|
+
# This is some sort of declaration; in "HTML as
|
670
|
+
# deployed," this should only be the document type
|
671
|
+
# declaration ("<!DOCTYPE html...>").
|
672
|
+
k = parse_declaration(i)
|
673
|
+
break unless k
|
674
|
+
i = k
|
675
|
+
next
|
676
|
+
end
|
677
|
+
elsif rawdata[i..i] == '&'
|
678
|
+
if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
|
679
|
+
handle_data(rawdata[i..i])
|
680
|
+
i += 1
|
681
|
+
next
|
682
|
+
end
|
683
|
+
|
684
|
+
# the Char must come first as its #=~ method is the only one that is UTF-8 safe
|
685
|
+
ni,match = index_match(rawdata, Charref, i)
|
686
|
+
if ni and ni == i # See? Ugly
|
687
|
+
handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
|
688
|
+
i += match[0].length # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
|
689
|
+
i -= 1 unless rawdata[i-1..i-1] == ";"
|
690
|
+
next
|
691
|
+
end
|
692
|
+
ni,match = index_match(rawdata, Entityref, i)
|
693
|
+
if ni and ni == i
|
694
|
+
handle_entityref(match[1])
|
695
|
+
i += match[0].length
|
696
|
+
i -= 1 unless rawdata[i-1..i-1] == ";"
|
697
|
+
next
|
698
|
+
end
|
699
|
+
else
|
700
|
+
error('neither < nor & ??')
|
701
|
+
end
|
702
|
+
# We get here only if incomplete matches but
|
703
|
+
# nothing else
|
704
|
+
ni,match = index_match(rawdata,Incomplete,i)
|
705
|
+
unless ni and ni == 0
|
706
|
+
handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
|
707
|
+
i += 1
|
708
|
+
next
|
709
|
+
end
|
710
|
+
j = ni + match[0].length
|
711
|
+
break if j == n # Really incomplete
|
712
|
+
handle_data(rawdata[i...j])
|
713
|
+
i = j
|
714
|
+
end # end while
|
715
|
+
|
716
|
+
if _end and i < n
|
717
|
+
handle_data(rawdata[i...n])
|
718
|
+
i = n
|
719
|
+
end
|
720
|
+
|
721
|
+
@rawdata = rawdata[i..-1]
|
722
|
+
# @offset += i # FIXME BUGME another unused variable in SGMLParser?
|
723
|
+
end
|
724
|
+
|
725
|
+
|
726
|
+
# Internal -- parse processing instr, return length or -1 if not terminated
|
727
|
+
def parse_pi(i)
|
728
|
+
rawdata = @rawdata
|
729
|
+
if rawdata[i...i+2] != '<?'
|
730
|
+
error("unexpected call to parse_pi()")
|
731
|
+
end
|
732
|
+
ni,match = index_match(rawdata,Piclose,i+2)
|
733
|
+
return nil unless match
|
734
|
+
j = ni
|
735
|
+
handle_pi(rawdata[i+2...j])
|
736
|
+
j = (j + match[0].length)
|
737
|
+
return j-i
|
738
|
+
end
|
739
|
+
|
740
|
+
def parse_comment(i)
|
741
|
+
rawdata = @rawdata
|
742
|
+
if rawdata[i...i+4] != "<!--"
|
743
|
+
error("unexpected call to parse_comment()")
|
744
|
+
end
|
745
|
+
ni,match = index_match(rawdata, Commentclose,i)
|
746
|
+
return nil unless match
|
747
|
+
handle_comment(rawdata[i+4..(ni-1)])
|
748
|
+
return ni+match[0].length # Length from i to just past the closing comment tag
|
749
|
+
end
|
750
|
+
|
751
|
+
|
752
|
+
def parse_starttag(i)
|
753
|
+
@_starttag_text = nil
|
754
|
+
start_pos = i
|
755
|
+
rawdata = @rawdata
|
756
|
+
ni,match = index_match(rawdata,Shorttagopen,i)
|
757
|
+
if ni == i
|
758
|
+
# SGML shorthand: <tag/data/ == <tag>data</tag>
|
759
|
+
# XXX Can data contain &... (entity or char refs)?
|
760
|
+
# XXX Can data contain < or > (tag characters)?
|
761
|
+
# XXX Can there be whitespace before the first /?
|
762
|
+
k,match = index_match(rawdata,Shorttag,i)
|
763
|
+
return nil unless match
|
764
|
+
tag, data = match[1], match[2]
|
765
|
+
@_starttag_text = "<#{tag}/"
|
766
|
+
tag.downcase!
|
767
|
+
second_end = rawdata.index(Shorttagopen,k)
|
768
|
+
finish_shorttag(tag, data)
|
769
|
+
@_starttag_text = rawdata[start_pos...second_end+1]
|
770
|
+
return k
|
771
|
+
end
|
772
|
+
|
773
|
+
j = rawdata.index(Endbracket, i+1)
|
774
|
+
return nil unless j
|
775
|
+
attrsd = []
|
776
|
+
if rawdata[i...i+2] == '<>'
|
777
|
+
# SGML shorthand: <> == <last open tag seen>
|
778
|
+
k = j
|
779
|
+
tag = @lasttag
|
780
|
+
else
|
781
|
+
ni,match = index_match(rawdata,Tagfind,i+1)
|
782
|
+
unless match
|
783
|
+
error('unexpected call to parse_starttag')
|
784
|
+
end
|
785
|
+
k = ni+match[0].length+1
|
786
|
+
tag = match[0].downcase
|
787
|
+
@lasttag = tag
|
788
|
+
end
|
789
|
+
|
790
|
+
while k < j
|
791
|
+
break if rawdata.index(Endtagfind, k) == k
|
792
|
+
ni,match = index_match(rawdata,Attrfind,k)
|
793
|
+
break unless ni
|
794
|
+
matched_length = match[0].length
|
795
|
+
attrname, rest, attrvalue = match[1],match[2],match[3]
|
796
|
+
if rest.nil? or rest.empty?
|
797
|
+
attrvalue = '' # was: = attrname # Why the change?
|
798
|
+
elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
|
799
|
+
attrvalue = attrvalue[1...-1]
|
800
|
+
end
|
801
|
+
attrsd << [attrname.downcase, attrvalue]
|
802
|
+
k += matched_length
|
803
|
+
end
|
804
|
+
if rawdata[j..j] == ">"
|
805
|
+
j += 1
|
806
|
+
end
|
807
|
+
@_starttag_text = rawdata[start_pos...j]
|
808
|
+
finish_starttag(tag, attrsd)
|
809
|
+
return j
|
810
|
+
end
|
811
|
+
|
812
|
+
def parse_endtag(i)
|
813
|
+
rawdata = @rawdata
|
814
|
+
j, match = index_match(rawdata, /[<>]/,i+1)
|
815
|
+
return nil unless j
|
816
|
+
tag = rawdata[i+2...j].strip.downcase
|
817
|
+
if rawdata[j..j] == ">"
|
818
|
+
j += 1
|
819
|
+
end
|
820
|
+
finish_endtag(tag)
|
821
|
+
return j
|
822
|
+
end
|
823
|
+
|
824
|
+
def output
|
825
|
+
# Return processed HTML as a single string
|
826
|
+
return @pieces.map{|p| p.to_s}.join
|
827
|
+
end
|
828
|
+
|
829
|
+
def error(message)
|
830
|
+
raise BetterSGMLParserError.new(message)
|
831
|
+
end
|
832
|
+
def handle_pi(text)
|
833
|
+
end
|
834
|
+
def handle_decl(text)
|
835
|
+
end
|
836
|
+
end
|
837
|
+
|
838
|
+
# Add some helper methods to make AttributeList (all of those damn attrs
|
839
|
+
# and attrsD used by StrictFeedParser) act more like a Hash.
|
840
|
+
# NOTE AttributeList is still Read-Only (AFAICT).
|
841
|
+
# Monkey patching is terrible, and I have an addiction.
|
842
|
+
module XML
|
843
|
+
module SAX
|
844
|
+
module AttributeList # in xml/sax.rb
|
845
|
+
def [](key)
|
846
|
+
getValue(key)
|
847
|
+
end
|
848
|
+
|
849
|
+
def each(&blk)
|
850
|
+
(0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
|
851
|
+
end
|
852
|
+
|
853
|
+
def each_key(&blk)
|
854
|
+
(0...getLength).each{|pos| yield getName(pos) }
|
855
|
+
end
|
856
|
+
|
857
|
+
def each_value(&blk)
|
858
|
+
(0...getLength).each{|pos| yield getValue(pos) }
|
859
|
+
end
|
860
|
+
|
861
|
+
def to_a # Rather use collect? grep for to_a.collect
|
862
|
+
l = []
|
863
|
+
each{|k,v| l << [k,v]}
|
864
|
+
return l
|
865
|
+
end
|
866
|
+
|
867
|
+
def to_s
|
868
|
+
l = []
|
869
|
+
each{|k,v| l << "#{k} => #{v}"}
|
870
|
+
"{ "+l.join(", ")+" }"
|
871
|
+
end
|
872
|
+
end
|
873
|
+
end
|
874
|
+
end
|
875
|
+
# This adds a nice scrub method to Hpricot, so we don't need a _HTMLSanitizer class
# http://underpantsgnome.com/2007/01/20/hpricot-scrub
# I have modified it to check for attributes that are only allowed if they are in a certain tag
module Hpricot
  # Whitelist of HTML element names that survive sanitization.
  Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
    'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
    'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
    'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
    'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
    'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
    'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
    'ul', 'var'
  ]

  # Whitelist of HTML attribute names that survive sanitization.
  Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
    'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
    'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
    'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
    'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
    'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
    'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
    'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
    'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
    'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
    'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
  ]

  # Elements whose entire content (not just the tags) must be removed.
  Unacceptable_Elements_With_End_Tag = ['script', 'applet']

  # CSS property names allowed to remain in style attributes.
  Acceptable_Css_Properties = ['azimuth', 'background-color',
    'border-bottom-color', 'border-collapse', 'border-color',
    'border-left-color', 'border-right-color', 'border-top-color', 'clear',
    'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
    'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
    'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
    'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
    'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
    'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
    'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
    'white-space', 'width'
  ]

  # survey of common keywords found in feeds
  Acceptable_Css_Keywords = ['auto', 'aqua', 'black', 'block', 'blue',
    'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
    'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
    'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
    'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
    'transparent', 'underline', 'white', 'yellow'
  ]

  # MathML element whitelist (content MathML subset commonly seen in feeds).
  Mathml_Elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
    'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
    'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
    'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
    'munderover', 'none'
  ]

  # MathML attribute whitelist (duplicates kept from upstream list).
  Mathml_Attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
    'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
    'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
    'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
    'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
    'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
    'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
    'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
    'xlink:type', 'xmlns', 'xmlns:xlink'
  ]

  # svgtiny - foreignObject + linearGradient + radialGradient + stop
  Svg_Elements = ['a', 'animate', 'animateColor', 'animateMotion',
    'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
    'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
    'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path',
    'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
    'switch', 'text', 'title', 'use'
  ]

  # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
  Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
    'arabic-form', 'ascent', 'attributeName', 'attributeType',
    'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
    'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
    'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
    'font-size', 'font-stretch', 'font-style', 'font-variant',
    'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
    'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
    'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
    'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
    'origin', 'overline-position', 'overline-thickness', 'panose-1',
    'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
    'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
    'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
    'stop-color', 'stop-opacity', 'strikethrough-position',
    'strikethrough-thickness', 'stroke', 'stroke-dasharray',
    'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
    'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
    'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
    'underline-position', 'underline-thickness', 'unicode',
    'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
    'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
    'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
    'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
    'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
  ]

  # Placeholders kept from the Python original; unused here.
  Svg_Attr_Map = nil
  Svg_Elem_Map = nil

  # CSS properties allowed specifically inside SVG style attributes.
  Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
    'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
    'stroke-opacity'
  ]
|
989
|
+
|
990
|
+
unless $compatible
|
991
|
+
@@acceptable_tag_specific_attributes = {}
|
992
|
+
@@mathml_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@mathml_attributes }
|
993
|
+
@@svg_elements.each{|e| @@acceptable_tag_specific_attributes[e] = @@svg_attributes }
|
994
|
+
end
|
995
|
+
|
996
|
+
class Elements
|
997
|
+
def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
|
998
|
+
each { |x| x.strip(allowed_tags) }
|
999
|
+
end
|
1000
|
+
|
1001
|
+
def strip_attributes(safe=[])
|
1002
|
+
each { |x| x.strip_attributes(safe) }
|
1003
|
+
end
|
1004
|
+
|
1005
|
+
def strip_style(ok_props=[], ok_keywords=[])
|
1006
|
+
each { |x| x.strip_style(ok_props, ok_keywords) }
|
1007
|
+
end
|
1008
|
+
end
|
1009
|
+
|
1010
|
+
  # Null-object implementations: text nodes, comments and stray end tags
  # have no children or attributes, so the sanitizer's recursive
  # strip/strip_attributes calls must be safe no-ops on them.
  class Text
    def strip(foo)
    end
    def strip_attributes(foo)
    end
  end
  class Comment
    def strip(foo)
    end
    def strip_attributes(foo)
    end
  end
  class BogusETag
    def strip(foo)
    end
    def strip_attributes(foo)
    end
  end
|
1028
|
+
|
1029
|
+
class Elem
|
1030
|
+
def decode_entities
|
1031
|
+
children.each{ |x| x.decode_entities }
|
1032
|
+
end
|
1033
|
+
|
1034
|
+
def cull
|
1035
|
+
if children
|
1036
|
+
swap(children.to_s)
|
1037
|
+
end
|
1038
|
+
end
|
1039
|
+
|
1040
|
+
def strip
|
1041
|
+
if strip_removes?
|
1042
|
+
cull
|
1043
|
+
end
|
1044
|
+
end
|
1045
|
+
|
1046
|
+
def strip_attributes
|
1047
|
+
unless attributes.nil?
|
1048
|
+
attributes.each do |atr|
|
1049
|
+
unless Acceptable_Attributes.include?atr[0]
|
1050
|
+
remove_attribute(atr[0])
|
1051
|
+
end
|
1052
|
+
end
|
1053
|
+
end
|
1054
|
+
end
|
1055
|
+
|
1056
|
+
def strip_removes?
|
1057
|
+
# I'm sure there are others that shuould be ripped instead of stripped
|
1058
|
+
attributes && attributes['type'] =~ /script|css/
|
1059
|
+
end
|
1060
|
+
end
|
1061
|
+
end
|
1062
|
+
|
1063
|
+
module FeedParser
|
1064
|
+
Version = "0.1aleph_naught"
|
1065
|
+
|
1066
|
+
License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
1067
|
+
|
1068
|
+
Redistribution and use in source and binary forms, with or without modification,
|
1069
|
+
are permitted provided that the following conditions are met:
|
1070
|
+
|
1071
|
+
* Redistributions of source code must retain the above copyright notice,
|
1072
|
+
this list of conditions and the following disclaimer.
|
1073
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
1074
|
+
this list of conditions and the following disclaimer in the documentation
|
1075
|
+
and/or other materials provided with the distribution.
|
1076
|
+
|
1077
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
1078
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
1079
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
1080
|
+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
1081
|
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
1082
|
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
1083
|
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
1084
|
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
1085
|
+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
1086
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
1087
|
+
POSSIBILITY OF SUCH DAMAGE."""
|
1088
|
+
|
1089
|
+
Author = "Jeff Hodges <http://somethingsimilar.com>"
|
1090
|
+
Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
|
1091
|
+
Contributors = [ "Jason Diamond <http://injektilo.org/>",
|
1092
|
+
"John Beimler <http://john.beimler.org/>",
|
1093
|
+
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
|
1094
|
+
"Aaron Swartz <http://aaronsw.com/>",
|
1095
|
+
"Kevin Marks <http://epeus.blogspot.com/>"
|
1096
|
+
]
|
1097
|
+
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
1098
|
+
# If you are embedding feedparser in a larger application, you should
|
1099
|
+
# change this to your application name and URL.
|
1100
|
+
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
|
1101
|
+
|
1102
|
+
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
|
1103
|
+
# want to send an Accept header, set this to None.
|
1104
|
+
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
|
1105
|
+
|
1106
|
+
|
1107
|
+
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
|
1108
|
+
# this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
|
1109
|
+
# or utidylib <http://utidylib.berlios.de/>.
|
1110
|
+
TIDY_MARKUP = false #FIXME untranslated
|
1111
|
+
|
1112
|
+
# List of Python interfaces for HTML Tidy, in order of preference. Only useful
|
1113
|
+
# if TIDY_MARKUP = true
|
1114
|
+
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
|
1115
|
+
|
1116
|
+
# The original Python import. I'm using it to help translate
|
1117
|
+
#import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
|
1118
|
+
|
1119
|
+
|
1120
|
+
|
1121
|
+
# ---------- don't touch these ----------
|
1122
|
+
  # ---------- don't touch these ----------
  # Exception hierarchy for recoverable parsing conditions; parse() records
  # them in result['bozo_exception'] rather than letting them escape.
  # NOTE(review): subclassing Exception (not StandardError) is a Ruby
  # anti-pattern — a bare `rescue` will not catch these. Kept as-is because
  # existing rescuers may name these classes explicitly.
  class ThingsNobodyCaresAboutButMe < Exception
  end
  # The declared character encoding had to be overridden to parse the feed.
  class CharacterEncodingOverride < ThingsNobodyCaresAboutButMe
  end
  # No usable character encoding could be determined.
  class CharacterEncodingUnknown < ThingsNobodyCaresAboutButMe
  end
  # The HTTP Content-Type was not an XML type.
  class NonXMLContentType < ThingsNobodyCaresAboutButMe
  end
  # The feed used a namespace prefix it never declared.
  class UndeclaredNamespace < Exception
  end


  # Map of internal version codes to human-readable feed format names;
  # '' means the format could not be identified at all.
  SUPPORTED_VERSIONS = {'' => 'unknown',
    'rss090' => 'RSS 0.90',
    'rss091n' => 'RSS 0.91 (Netscape)',
    'rss091u' => 'RSS 0.91 (Userland)',
    'rss092' => 'RSS 0.92',
    'rss093' => 'RSS 0.93',
    'rss094' => 'RSS 0.94',
    'rss20' => 'RSS 2.0',
    'rss10' => 'RSS 1.0',
    'rss' => 'RSS (unknown version)',
    'atom01' => 'Atom 0.1',
    'atom02' => 'Atom 0.2',
    'atom03' => 'Atom 0.3',
    'atom10' => 'Atom 1.0',
    'atom' => 'Atom (unknown version)',
    'cdf' => 'CDF',
    'hotrss' => 'Hot RSS'
  }
|
1152
|
+
class FeedParserDict < Hash
|
1153
|
+
=begin
|
1154
|
+
The naming of a certain common attribute (such as, "When was the last
|
1155
|
+
time this feed was updated?") can have many different names depending
|
1156
|
+
on the type of feed we are handling. This class allows us to use
|
1157
|
+
both the attribute name a person, who has knowledge of the kind of
|
1158
|
+
feed being parsed, expects, as well as allowing a developer to rely
|
1159
|
+
on one name to contain the proper attribute no matter what kind of
|
1160
|
+
feed is being parsed. @@keymaps is a Hash that contains information
|
1161
|
+
on what certain attributes "really is" in each feed type. It does so
|
1162
|
+
by providing a common name that will map to any feed type in the keys,
|
1163
|
+
with possible "correct" attributes in the its values. the #[] and #[]=
|
1164
|
+
methods check with keymaps to see what attribute the developer "really
|
1165
|
+
means" if they've asked for one which happens to be in @@keymap's keys.
|
1166
|
+
=end
|
1167
|
+
@@keymap = {'channel' => 'feed',
|
1168
|
+
'items' => 'entries',
|
1169
|
+
'guid' => 'id',
|
1170
|
+
'date' => 'updated',
|
1171
|
+
'date_parsed' => 'updated_parsed',
|
1172
|
+
'description' => ['subtitle', 'summary'],
|
1173
|
+
'url' => ['href'],
|
1174
|
+
'modified' => 'updated',
|
1175
|
+
'modified_parsed' => 'updated_parsed',
|
1176
|
+
'issued' => 'published',
|
1177
|
+
'issued_parsed' => 'published_parsed',
|
1178
|
+
'copyright' => 'rights',
|
1179
|
+
'copyright_detail' => 'rights_detail',
|
1180
|
+
'tagline' => 'subtitle',
|
1181
|
+
'tagline_detail' => 'subtitle_detail'}
|
1182
|
+
|
1183
|
+
def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
|
1184
|
+
return self['entries']
|
1185
|
+
end
|
1186
|
+
# We could include the [] rewrite in new using Hash.new's fancy pants block thing
|
1187
|
+
# but we'd still have to overwrite []= and such.
|
1188
|
+
# I'm going to make it easy to turn lists of pairs into FeedParserDicts's though.
|
1189
|
+
def initialize(pairs=nil)
|
1190
|
+
if pairs.class == Array and pairs[0].class == Array and pairs[0].length == 2
|
1191
|
+
pairs.each do |l|
|
1192
|
+
k,v = l
|
1193
|
+
self[k] = v
|
1194
|
+
end
|
1195
|
+
elsif pairs.class == Hash
|
1196
|
+
self.merge!(pairs)
|
1197
|
+
end
|
1198
|
+
end
|
1199
|
+
|
1200
|
+
def [](key)
|
1201
|
+
if key == 'category'
|
1202
|
+
return self['tags'][0]['term']
|
1203
|
+
end
|
1204
|
+
if key == 'categories'
|
1205
|
+
return self['tags'].collect{|tag| [tag['scheme'],tag['term']]}
|
1206
|
+
end
|
1207
|
+
realkey = @@keymap[key] || key
|
1208
|
+
if realkey.class == Array
|
1209
|
+
realkey.each{ |key| return self[key] if has_key?key }
|
1210
|
+
end
|
1211
|
+
# Note that the original key is preferred over the realkey we (might
|
1212
|
+
# have) found in @@keymaps
|
1213
|
+
if has_key?(key)
|
1214
|
+
return super(key)
|
1215
|
+
end
|
1216
|
+
return super(realkey)
|
1217
|
+
end
|
1218
|
+
|
1219
|
+
def []=(key,value)
|
1220
|
+
if @@keymap.key?key
|
1221
|
+
key = @@keymap[key]
|
1222
|
+
if key.class == Array
|
1223
|
+
key = key[0]
|
1224
|
+
end
|
1225
|
+
end
|
1226
|
+
super(key,value)
|
1227
|
+
end
|
1228
|
+
|
1229
|
+
def method_missing(msym, *args)
|
1230
|
+
methodname = msym.to_s
|
1231
|
+
if methodname[-1] == '='
|
1232
|
+
return self[methodname[0..-2]] = args[0]
|
1233
|
+
elsif methodname[-1] != '!' and methodname[-1] != '?' and methodname[0] != "_" # FIXME implement with private
|
1234
|
+
return self[methodname]
|
1235
|
+
else
|
1236
|
+
raise NoMethodError, "whoops, we don't know about the attribute or method called `#{methodname}' for #{self}:#{self.class}"
|
1237
|
+
end
|
1238
|
+
end
|
1239
|
+
end
|
1240
|
+
|
1241
|
+
|
1242
|
+
|
1243
|
+
|
1244
|
+
module FeedParserMixin
|
1245
|
+
    # Per-parse state shared with the concrete strict/loose parsers.
    attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers

    # Initialize (or re-initialize) all per-feed parsing state.
    # baseuri/baselang seed xml:base / xml:lang resolution; encoding is the
    # character encoding the document was decoded with.
    def startup(baseuri=nil, baselang=nil, encoding='utf-8')
      $stderr << "initializing FeedParser\n" if $debug

      # Known namespace URIs => canonical element prefix ('' means the
      # namespace maps onto the unprefixed core feed elements).
      @namespaces = {'' => '',
        'http://backend.userland.com/rss' => '',
        'http://blogs.law.harvard.edu/tech/rss' => '',
        'http://purl.org/rss/1.0/' => '',
        'http://my.netscape.com/rdf/simple/0.9/' => '',
        'http://example.com/newformat#' => '',
        'http://example.com/necho' => '',
        'http://purl.org/echo/' => '',
        'uri/of/echo/namespace#' => '',
        'http://purl.org/pie/' => '',
        'http://purl.org/atom/ns#' => '',
        'http://www.w3.org/2005/Atom' => '',
        'http://purl.org/rss/1.0/modules/rss091#' => '',
        'http://webns.net/mvcb/' => 'admin',
        'http://purl.org/rss/1.0/modules/aggregation/' => 'ag',
        'http://purl.org/rss/1.0/modules/annotate/' => 'annotate',
        'http://media.tangent.org/rss/1.0/' => 'audio',
        'http://backend.userland.com/blogChannelModule' => 'blogChannel',
        'http://web.resource.org/cc/' => 'cc',
        'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
        'http://purl.org/rss/1.0/modules/company' => 'co',
        'http://purl.org/rss/1.0/modules/content/' => 'content',
        'http://my.theinfo.org/changed/1.0/rss/' => 'cp',
        'http://purl.org/dc/elements/1.1/' => 'dc',
        'http://purl.org/dc/terms/' => 'dcterms',
        'http://purl.org/rss/1.0/modules/email/' => 'email',
        'http://purl.org/rss/1.0/modules/event/' => 'ev',
        'http://rssnamespace.org/feedburner/ext/1.0' => 'feedburner',
        'http://freshmeat.net/rss/fm/' => 'fm',
        'http://xmlns.com/foaf/0.1/' => 'foaf',
        'http://www.w3.org/2003/01/geo/wgs84_pos#' => 'geo',
        'http://postneo.com/icbm/' => 'icbm',
        'http://purl.org/rss/1.0/modules/image/' => 'image',
        'http://www.itunes.com/DTDs/PodCast-1.0.dtd' => 'itunes',
        'http://example.com/DTDs/PodCast-1.0.dtd' => 'itunes',
        'http://purl.org/rss/1.0/modules/link/' => 'l',
        'http://search.yahoo.com/mrss' => 'media',
        'http://madskills.com/public/xml/rss/module/pingback/' => 'pingback',
        'http://prismstandard.org/namespaces/1.2/basic/' => 'prism',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#' => 'rdf',
        'http://www.w3.org/2000/01/rdf-schema#' => 'rdfs',
        'http://purl.org/rss/1.0/modules/reference/' => 'ref',
        'http://purl.org/rss/1.0/modules/richequiv/' => 'reqv',
        'http://purl.org/rss/1.0/modules/search/' => 'search',
        'http://purl.org/rss/1.0/modules/slash/' => 'slash',
        'http://schemas.xmlsoap.org/soap/envelope/' => 'soap',
        'http://purl.org/rss/1.0/modules/servicestatus/' => 'ss',
        'http://hacks.benhammersley.com/rss/streaming/' => 'str',
        'http://purl.org/rss/1.0/modules/subscription/' => 'sub',
        'http://purl.org/rss/1.0/modules/syndication/' => 'sy',
        'http://purl.org/rss/1.0/modules/taxonomy/' => 'taxo',
        'http://purl.org/rss/1.0/modules/threading/' => 'thr',
        'http://purl.org/rss/1.0/modules/textinput/' => 'ti',
        'http://madskills.com/public/xml/rss/module/trackback/' =>'trackback',
        'http://wellformedweb.org/commentAPI/' => 'wfw',
        'http://purl.org/rss/1.0/modules/wiki/' => 'wiki',
        'http://www.w3.org/1999/xhtml' => 'xhtml',
        'http://www.w3.org/XML/1998/namespace' => 'xml',
        'http://www.w3.org/1999/xlink' => 'xlink',
        'http://schemas.pocketsoap.com/rss/myDescModule/' => 'szf'
      }
      # Case-insensitive lookup table: downcased URI => canonical prefix.
      @matchnamespaces = {}
      @namespaces.each do |l|
        @matchnamespaces[l[0].downcase] = l[1]
      end
      # Elements whose text is itself a (possibly relative) URI.
      @can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
      # Elements whose markup content may embed relative URIs.
      @can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
      # Elements whose markup content must be sanitized.
      @can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
      @html_types = ['text/html', 'application/xhtml+xml']
      @feeddata = FeedParserDict.new # feed-level data
      @encoding = encoding # character encoding
      @entries = [] # list of entry-level data
      @version = '' # feed type/version, see SUPPORTED_VERSIONS
      @namespacesInUse = {} # hash of namespaces defined by the feed

      # the following are used internally to track state;
      # this is really out of control and should be refactored
      @infeed = false
      @inentry = false
      @incontent = 0 # Yes, this needs to be zero until I work out popContent and pushContent
      @intextinput = false
      @inimage = false
      @inauthor = false
      @incontributor = false
      @inpublisher = false
      @insource = false
      @sourcedata = FeedParserDict.new
      @contentparams = FeedParserDict.new
      @summaryKey = nil
      @namespacemap = {}
      @elementstack = []
      @basestack = []
      @langstack = []
      @baseuri = baseuri || ''
      @lang = baselang || nil
      if baselang
        @feeddata['language'] = baselang.gsub('_','-')
      end
      # Date-parsing strategies tried in order by _parse_date.
      @date_handlers = [:_parse_date_rfc822,
        :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
        :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
      ]
      $stderr << "Leaving startup\n" if $debug # My addition
    end
|
1354
|
+
|
1355
|
+
def unknown_starttag(tag, attrsd)
|
1356
|
+
$stderr << "start #{tag} with #{attrsd}\n" if $debug
|
1357
|
+
# normalize attrs
|
1358
|
+
attrsD = {}
|
1359
|
+
attrsd = Hash[*attrsd.flatten] if attrsd.class == Array # Magic! Asterisk!
|
1360
|
+
# LooseFeedParser needs the above because SGMLParser sends attrs as a
|
1361
|
+
# list of lists (like [['type','text/html'],['mode','escaped']])
|
1362
|
+
|
1363
|
+
attrsd.each do |old_k,value|
|
1364
|
+
# There has to be a better, non-ugly way of doing this
|
1365
|
+
k = old_k.downcase # Downcase all keys
|
1366
|
+
attrsD[k] = value
|
1367
|
+
if ['rel','type'].include?value
|
1368
|
+
attrsD[k].downcase! # Downcase the value if the key is 'rel' or 'type'
|
1369
|
+
end
|
1370
|
+
end
|
1371
|
+
|
1372
|
+
# track xml:base and xml:lang
|
1373
|
+
baseuri = attrsD['xml:base'] || attrsD['base'] || @baseuri
|
1374
|
+
@baseuri = urljoin(@baseuri, baseuri)
|
1375
|
+
lang = attrsD['xml:lang'] || attrsD['lang']
|
1376
|
+
if lang == '' # FIXME This next bit of code is right? Wtf?
|
1377
|
+
# xml:lang could be explicitly set to '', we need to capture that
|
1378
|
+
lang = nil
|
1379
|
+
elsif lang.nil?
|
1380
|
+
# if no xml:lang is specified, use parent lang
|
1381
|
+
lang = @lang
|
1382
|
+
end
|
1383
|
+
if lang and not lang.empty? # Seriously, this cannot be correct
|
1384
|
+
if ['feed', 'rss', 'rdf:RDF'].include?tag
|
1385
|
+
@feeddata['language'] = lang.gsub('_','-')
|
1386
|
+
end
|
1387
|
+
end
|
1388
|
+
@lang = lang
|
1389
|
+
@basestack << @baseuri
|
1390
|
+
@langstack << lang
|
1391
|
+
|
1392
|
+
# track namespaces
|
1393
|
+
attrsd.each do |prefix, uri|
|
1394
|
+
if /^xmlns:/ =~ prefix # prefix begins with xmlns:
|
1395
|
+
trackNamespace(prefix[6..-1], uri)
|
1396
|
+
elsif prefix == 'xmlns':
|
1397
|
+
trackNamespace(nil, uri)
|
1398
|
+
end
|
1399
|
+
end
|
1400
|
+
|
1401
|
+
# track inline content
|
1402
|
+
if @incontent != 0 and @contentparams.has_key?('type') and not ( /xml$/ =~ (@contentparams['type'] || 'xml') )
|
1403
|
+
# element declared itself as escaped markup, but isn't really
|
1404
|
+
|
1405
|
+
@contentparams['type'] = 'application/xhtml+xml'
|
1406
|
+
end
|
1407
|
+
if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
|
1408
|
+
# Note: probably shouldn't simply recreate localname here, but
|
1409
|
+
# our namespace handling isn't actually 100% correct in cases where
|
1410
|
+
# the feed redefines the default namespace (which is actually
|
1411
|
+
# the usual case for inline content, thanks Sam), so here we
|
1412
|
+
# cheat and just reconstruct the element based on localname
|
1413
|
+
# because that compensates for the bugs in our namespace handling.
|
1414
|
+
# This will horribly munge inline content with non-empty qnames,
|
1415
|
+
# but nobody actually does that, so I'm not fixing it.
|
1416
|
+
tag = tag.split(':')[-1]
|
1417
|
+
attrsA = attrsd.to_a.collect{|l| "#{l[0]}=\"#{l[1]}\""}
|
1418
|
+
attrsS = ' '+attrsA.join(' ')
|
1419
|
+
return handle_data("<#{tag}#{attrsS}>", escape=false)
|
1420
|
+
end
|
1421
|
+
|
1422
|
+
# match namespaces
|
1423
|
+
if /:/ =~ tag
|
1424
|
+
prefix, suffix = tag.split(':', 2)
|
1425
|
+
else
|
1426
|
+
prefix, suffix = '', tag
|
1427
|
+
end
|
1428
|
+
prefix = @namespacemap[prefix] || prefix
|
1429
|
+
if prefix and not prefix.empty?
|
1430
|
+
prefix = prefix + '_'
|
1431
|
+
end
|
1432
|
+
|
1433
|
+
# special hack for better tracking of empty textinput/image elements in illformed feeds
|
1434
|
+
if (not prefix and not prefix.empty?) and not (['title', 'link', 'description','name'].include?tag)
|
1435
|
+
@intextinput = false
|
1436
|
+
end
|
1437
|
+
if (prefix.nil? or prefix.empty?) and not (['title', 'link', 'description', 'url', 'href', 'width', 'height'].include?tag)
|
1438
|
+
@inimage = false
|
1439
|
+
end
|
1440
|
+
|
1441
|
+
# call special handler (if defined) or default handler
|
1442
|
+
begin
|
1443
|
+
return send('_start_'+prefix+suffix, attrsD)
|
1444
|
+
rescue NoMethodError
|
1445
|
+
return push(prefix + suffix, true)
|
1446
|
+
end
|
1447
|
+
end # End unknown_starttag
|
1448
|
+
|
1449
|
+
def unknown_endtag(tag)
|
1450
|
+
$stderr << "end #{tag}\n" if $debug
|
1451
|
+
# match namespaces
|
1452
|
+
if tag.index(':')
|
1453
|
+
prefix, suffix = tag.split(':',2)
|
1454
|
+
else
|
1455
|
+
prefix, suffix = '', tag
|
1456
|
+
end
|
1457
|
+
prefix = @namespacemap[prefix] || prefix
|
1458
|
+
if prefix and not prefix.empty?
|
1459
|
+
prefix = prefix + '_'
|
1460
|
+
end
|
1461
|
+
|
1462
|
+
# call special handler (if defined) or default handler
|
1463
|
+
begin
|
1464
|
+
send('_end_' + prefix + suffix) # NOTE no return here! do not add it!
|
1465
|
+
rescue NoMethodError => details
|
1466
|
+
pop(prefix + suffix)
|
1467
|
+
end
|
1468
|
+
|
1469
|
+
# track inline content
|
1470
|
+
if @incontent != 0 and @contentparams.has_key?'type' and /xml$/ =~ (@contentparams['type'] || 'xml')
|
1471
|
+
# element declared itself as escaped markup, but it isn't really
|
1472
|
+
@contentparams['type'] = 'application/xhtml+xml'
|
1473
|
+
end
|
1474
|
+
if @incontent != 0 and @contentparams['type'] == 'application/xhtml+xml'
|
1475
|
+
tag = tag.split(':')[-1]
|
1476
|
+
handle_data("</#{tag}>", escape=false)
|
1477
|
+
end
|
1478
|
+
|
1479
|
+
# track xml:base and xml:lang going out of scope
|
1480
|
+
if @basestack and not @basestack.empty?
|
1481
|
+
@basestack.pop
|
1482
|
+
if @basestack and @basestack[-1] and not (@basestack.empty? or @basestack[-1].empty?)
|
1483
|
+
@baseuri = @basestack[-1]
|
1484
|
+
end
|
1485
|
+
end
|
1486
|
+
if @langstack and not @langstack.empty?
|
1487
|
+
@langstack.pop
|
1488
|
+
if @langstack and not @langstack.empty? # and @langstack[-1] and not @langstack.empty?
|
1489
|
+
@lang = @langstack[-1]
|
1490
|
+
end
|
1491
|
+
end
|
1492
|
+
end
|
1493
|
+
|
1494
|
+
    # LooseParserOnly
    # Called for each numeric character reference, e.g. for '&#160;' ref
    # will be '160' (hex references arrive as e.g. 'x22'). XML-significant
    # characters are kept as references; everything else is decoded to
    # UTF-8 text. NOTE: downcase! mutates the caller's ref string in place.
    def handle_charref(ref)
      $stderr << "entering handle_charref with #{ref}\n" if $debug
      # No open element to receive text — nothing to do.
      return if @elementstack.nil? or @elementstack.empty?
      ref.downcase!
      # Quote, amp, apos, lt, gt (decimal and hex forms): keep escaped so
      # re-serialized markup stays well-formed.
      chars = ['34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e']
      if chars.include?ref
        text = "&##{ref};"
      else
        if ref[0..0] == 'x'
          c = (ref[1..-1]).to_i(16) # hex reference
        else
          c = ref.to_i
        end
        # uconvert/unichr are project encoding helpers (encoding_helpers) —
        # presumably codepoint -> UTF-8 string; defined outside this chunk.
        text = uconvert(unichr(c),'unicode')
      end
      # Append to the text buffer of the innermost open element.
      @elementstack[-1][2] << text
    end
|
1513
|
+
|
1514
|
+
    # LooseParserOnly
    # Called for each named entity reference, e.g. for '&copy;' ref will be
    # 'copy'. XML predefined entities stay escaped; anything else is decoded
    # to its literal character.
    def handle_entityref(ref)
      # No open element to receive text — nothing to do.
      return if @elementstack.nil? or @elementstack.empty?
      $stderr << "entering handle_entityref with #{ref}\n" if $debug
      ents = ['lt', 'gt', 'quot', 'amp', 'apos']
      if ents.include?ref
        text = "&#{ref};"
      else
        # NOTE(review): the gemspec requires htmlentities >= 4.0.0, whose
        # public API is HTMLEntities.new.decode; verify that a
        # decode_entities module function actually exists in the installed
        # version — this looks like a pre-4.x call.
        text = HTMLEntities::decode_entities("&#{ref};")
      end
      # Append to the text buffer of the innermost open element.
      @elementstack[-1][2] << text
    end
|
1528
|
+
|
1529
|
+
def handle_data(text, escape=true)
|
1530
|
+
# called for each block of plain text, i.e. outside of any tag and
|
1531
|
+
# not containing any character or entity references
|
1532
|
+
return if @elementstack.nil? or @elementstack.empty?
|
1533
|
+
if escape and @contentparams['type'] == 'application/xhtml+xml'
|
1534
|
+
text = text.to_xs
|
1535
|
+
end
|
1536
|
+
@elementstack[-1][2] << text
|
1537
|
+
end
|
1538
|
+
|
1539
|
+
    # called for each comment, e.g. <!-- insert message here -->
    # Comments carry no feed data: deliberately a no-op.
    def handle_comment(comment)
    end

    # Processing instructions are ignored.
    def handle_pi(text)
    end

    # Markup declarations are ignored.
    def handle_decl(text)
    end
|
1548
|
+
|
1549
|
+
    # for LooseFeedParser
    # Handle a markup declaration starting at index i of @rawdata. CDATA
    # sections are emitted as escaped text; any other declaration is
    # skipped. Returns the index just past the consumed declaration.
    def parse_declaration(i)
      $stderr << "entering parse_declaration\n" if $debug
      if @rawdata[i...i+9] == '<![CDATA['
        k = @rawdata.index(/\]\]>/u,i+9)
        k = @rawdata.length unless k # unterminated CDATA: consume the rest
        # escape the CDATA payload and emit it without re-escaping
        handle_data(@rawdata[i+9...k].to_xs,false)
        return k+3 # skip past ']]>'
      else
        # skip to just past the closing '>'
        # NOTE(review): when no '>' exists, index returns nil and to_i makes
        # this 0, so we return 1 — mirrors the odd edge case of the Python
        # original (find -> -1, return 0); confirm callers tolerate it.
        k = @rawdata.index(/>/,i).to_i
        return k+1
      end
    end
|
1562
|
+
|
1563
|
+
def mapContentType(contentType)
|
1564
|
+
contentType.downcase!
|
1565
|
+
case contentType
|
1566
|
+
when 'text'
|
1567
|
+
contentType = 'text/plain'
|
1568
|
+
when 'html'
|
1569
|
+
contentType = 'text/html'
|
1570
|
+
when 'xhtml'
|
1571
|
+
contentType = 'application/xhtml+xml'
|
1572
|
+
end
|
1573
|
+
return contentType
|
1574
|
+
end
|
1575
|
+
|
1576
|
+
# Records an XML namespace declaration in @namespacesInUse (canonicalizing
# the prefix via @matchnamespaces when known) and, as a side effect,
# sniffs the feed version from well-known namespace URIs.
def trackNamespace(prefix, uri)
  loweruri = uri.downcase.strip
  unversioned = @version.nil? || @version.empty?
  if prefix.nil? and loweruri == 'http://my.netscape.com/rdf/simple/0.9/' and unversioned
    @version = 'rss090'
  elsif loweruri == 'http://purl.org/rss/1.0/' and unversioned
    @version = 'rss10'
  elsif loweruri == 'http://www.w3.org/2005/atom' and unversioned
    @version = 'atom10'
  elsif loweruri =~ /backend\.userland\.com\/rss/
    # collapse any backend.userland.com namespace onto the canonical one
    uri = 'http://backend.userland.com/rss'
    loweruri = uri
  end
  if @matchnamespaces.has_key? loweruri
    @namespacemap[prefix] = @matchnamespaces[loweruri]
    @namespacesInUse[@matchnamespaces[loweruri]] = uri
  else
    @namespacesInUse[prefix || ''] = uri
  end
end
|
1597
|
+
|
1598
|
+
# Joins a possibly-relative URI against the document base URI.
def resolveURI(uri)
  urljoin(@baseuri || '', uri)
end

# Entity-decoding hook for subclasses; the strict parser is a no-op.
def decodeEntities(element, data)
  data
end

# Opens a new element frame on the stack: [name, expectingText, pieces].
def push(element, expectingText)
  @elementstack.push([element, expectingText, []])
end
|
1609
|
+
|
1610
|
+
# Closes the innermost element frame (if it matches +element+) and
# post-processes its accumulated text: base64 decoding, relative-URI
# resolution, entity decoding, HTML sanitization and charset conversion.
# The finished value is then stored into the current entry or feed dict.
# Returns the processed output string, or nil when the stack is empty or
# the top frame does not match.
def pop(element, stripWhitespace=true)
  return if @elementstack.nil? or @elementstack.empty?
  return if @elementstack[-1][0] != element
  element, expectingText, pieces = @elementstack.pop
  # pieces is normally the Array built by push/handle_data, but
  # _cdf_common assigns a bare string into the frame, hence this branch.
  if pieces.class == Array
    output = pieces.join('')
  else
    output = pieces
  end
  if stripWhitespace
    output.strip!
  end
  # Frames opened with expectingText=false skip all post-processing.
  return output if not expectingText

  # decode base64 content
  if @contentparams['base64']
    out64 = Base64::decode64(output) # a.k.a. [output].unpack('m')[0]
    if not output.empty? and not out64.empty?
      output = out64
    end
  end

  # resolve relative URIs (only for elements whose value IS a URI)
  if @can_be_relative_uri.include?element and output and not output.empty?
    output = resolveURI(output)
  end

  # decode entities within embedded markup
  if not @contentparams['base64']
    output = decodeEntities(element, output)
  end

  # remove temporary cruft from contentparams
  @contentparams.delete('mode')
  @contentparams.delete('base64')

  # resolve relative URIs within embedded markup
  if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
    if @can_contain_relative_uris.include?element
      output = FeedParser.resolveRelativeURIs(output, @baseuri, @encoding)
    end
  end
  # sanitize embedded markup
  if @html_types.include?mapContentType(@contentparams['type'] || 'text/html')
    if @can_contain_dangerous_markup.include?element
      output = FeedParser.sanitizeHTML(output, @encoding)
    end
  end

  if @encoding and not @encoding.empty? and @encoding != 'utf-8'
    output = uconvert(output, @encoding, 'utf-8')
    # FIXME I turn everything into utf-8, not unicode, originally because REXML was being used but now beause I haven't tested it out yet.
  end

  # categories/tags/keywords/whatever are handled in _end_category
  return output if element == 'category'

  # store output in appropriate place(s)
  if @inentry and not @insource
    # inside an item/entry: content accumulates as a list of dicts,
    # links update the most recent link dict, everything else is a
    # scalar field (with a *_detail dict while inside content).
    if element == 'content'
      @entries[-1][element] ||= []
      contentparams = Marshal.load(Marshal.dump(@contentparams)) # deepcopy
      contentparams['value'] = output
      @entries[-1][element] << contentparams
    elsif element == 'link'
      @entries[-1][element] = output
      if output and not output.empty?
        @entries[-1]['links'][-1]['href'] = output
      end
    else
      element = 'summary' if element == 'description'
      @entries[-1][element] = output
      if @incontent != 0
        contentparams = Marshal.load(Marshal.dump(@contentparams))
        contentparams['value'] = output
        @entries[-1][element + '_detail'] = contentparams
      end
    end
  elsif (@infeed or @insource) and not @intextinput and not @inimage
    # feed-level (or <source>) scope; image/textinput values are stored
    # by their own _end_* handlers instead.
    context = getContext()
    element = 'subtitle' if element == 'description'
    context[element] = output
    if element == 'link'
      context['links'][-1]['href'] = output
    elsif @incontent != 0
      contentparams = Marshal.load(Marshal.dump(@contentparams))
      contentparams['value'] = output
      context[element + '_detail'] = contentparams
    end
  end
  return output
end
|
1702
|
+
|
1703
|
+
# Begins a content-bearing element: bumps the @incontent nesting counter,
# records type/language/base metadata in @contentparams (plus whether the
# payload is base64), and opens an element frame.
def pushContent(tag, attrsD, defaultContentType, expectingText)
  @incontent += 1 # Yes, I hate this.
  @contentparams = FeedParserDict.new(
    'type' => mapContentType(attrsD['type'] || defaultContentType),
    'language' => @lang,
    'base' => @baseuri)
  @contentparams['base64'] = isBase64(attrsD, @contentparams)
  push(tag, expectingText)
end

# Closes a content-bearing element, returning its processed value and
# resetting the content bookkeeping.
def popContent(tag)
  popped = pop(tag)
  @incontent -= 1
  @contentparams.clear
  popped
end
|
1717
|
+
|
1718
|
+
# Rewrites "prefix:suffix" names so the prefix is the canonical one
# registered in @namespacemap; names without a colon pass through.
def mapToStandardPrefix(name)
  colonpos = name.index(':')
  return name unless colonpos
  prefix = name[0..colonpos-1]
  suffix = name[colonpos+1..-1]
  canonical = @namespacemap[prefix] || prefix
  canonical + ':' + suffix
end

# Fetches an attribute after normalizing its namespace prefix.
def getAttribute(attrsD, name)
  attrsD[mapToStandardPrefix(name)]
end

# Decides whether element content should be treated as base64-encoded:
# an explicit mode="base64" wins; textual and XML media types are plain;
# anything else (including a missing type) is assumed base64.
def isBase64(attrsD, contentparams)
  return true if attrsD['mode'] == 'base64'
  case contentparams['type']
  when /(^text\/)|(\+xml$)|(\/xml$)/
    false
  else
    true
  end
end
|
1740
|
+
|
1741
|
+
# Atom uses href; RSS dialects use url or uri. Collapse all three
# spellings onto 'href' so downstream code deals with only one.
def itsAnHrefDamnIt(attrsD)
  link = attrsD['url'] || attrsD['uri'] || attrsD['href']
  if link
    attrsD.delete('url')
    attrsD.delete('uri')
    attrsD['href'] = link
  end
  attrsD
end


# Stores value under key in the current context unless already present.
def _save(key, value)
  getContext()[key] ||= value
end
|
1756
|
+
|
1757
|
+
# Determines the RSS flavor from the <rss version="..."> attribute:
# known point versions map to exact labels, any 2.x becomes 'rss20',
# and anything unrecognized degrades to plain 'rss'. A version sniffed
# earlier (e.g. from a namespace) is never overwritten.
def _start_rss(attrsD)
  versionmap = {
    '0.91' => 'rss091u',
    '0.92' => 'rss092',
    '0.93' => 'rss093',
    '0.94' => 'rss094'
  }

  if @version.nil? or @version.empty?
    declared = attrsD['version'] || ''
    mapped = versionmap[declared]
    @version =
      if mapped and not mapped.empty?
        mapped
      elsif declared =~ /^2\./
        'rss20'
      else
        'rss'
      end
  end
end

# Netscape "hot titles" feeds get their own version tag.
def _start_dlhottitles(attrsD)
  @version = 'hotrss'
end
|
1780
|
+
|
1781
|
+
# <channel> (RSS) / <feedinfo> (CDF) opens the feed-level scope.
def _start_channel(attrsD)
  @infeed = true
  _cdf_common(attrsD)
end
alias :_start_feedinfo :_start_channel

# CDF carries its modification date and link as attributes rather than
# child elements, so synthesize the equivalent start/data/end events by
# writing the attribute value straight into the open element frame.
def _cdf_common(attrsD)
  if attrsD.has_key? 'lastmod'
    _start_modified({})
    @elementstack[-1][-1] = attrsD['lastmod']
    _end_modified
  end
  if attrsD.has_key? 'href'
    _start_link({})
    @elementstack[-1][-1] = attrsD['href']
    _end_link
  end
end
|
1799
|
+
|
1800
|
+
# <feed> opens an Atom document. The version attribute maps 0.1/0.2/0.3
# to exact labels; anything else (including Atom 1.0, whose <feed> has
# no version attribute) falls back to plain 'atom' here — namespace
# tracking upgrades it to 'atom10' when applicable.
def _start_feed(attrsD)
  @infeed = true
  versionmap = {'0.1' => 'atom01',
                '0.2' => 'atom02',
                '0.3' => 'atom03'
               }

  if not @version or @version.empty?
    attr_version = attrsD['version']
    version = versionmap[attr_version]
    # BUGFIX: this previously tested @version — known to be unset inside
    # this branch — instead of the local lookup result, so the mapped
    # label was never applied and every Atom feed was tagged 'atom'.
    if version and not version.empty?
      @version = version
    else
      @version = 'atom'
    end
  end
end
|
1817
|
+
|
1818
|
+
# Closing </channel> or </feed> leaves the feed-level scope.
def _end_channel
  @infeed = false
end
alias :_end_feed :_end_channel
|
1822
|
+
|
1823
|
+
# <image> — child elements accumulate into context['image'].
def _start_image(attrsD)
  @inimage = true
  push('image', false)
  getContext()['image'] ||= FeedParserDict.new
end

def _end_image
  pop('image')
  @inimage = false
end

# <textinput> / <textInput> — child elements accumulate into
# context['textinput'].
def _start_textinput(attrsD)
  @intextinput = true
  push('textinput', false)
  getContext()['textinput'] ||= FeedParserDict.new
end
alias :_start_textInput :_start_textinput

def _end_textinput
  pop('textinput')
  @intextinput = false
end
alias :_end_textInput :_end_textinput
|
1848
|
+
|
1849
|
+
# Author-like elements (author, managingEditor, dc:author, dc:creator,
# itunes:author) all collect text under the 'author' key.
def _start_author(attrsD)
  @inauthor = true
  push('author', true)
end
alias :_start_managingeditor :_start_author
alias :_start_dc_author :_start_author
alias :_start_dc_creator :_start_author
alias :_start_itunes_author :_start_author

# Closing an author element re-derives author_detail from the text.
def _end_author
  pop('author')
  @inauthor = false
  _sync_author_detail()
end
alias :_end_managingeditor :_end_author
alias :_end_dc_author :_end_author
alias :_end_dc_creator :_end_author
alias :_end_itunes_author :_end_author

# <itunes:owner> is stored under the 'publisher' key.
def _start_itunes_owner(attrsD)
  @inpublisher = true
  push('publisher', false)
end

def _end_itunes_owner
  pop('publisher')
  @inpublisher = false
  _sync_author_detail('publisher')
end
|
1878
|
+
|
1879
|
+
# <contributor> — each occurrence appends a fresh dict to the context's
# 'contributors' list; sub-elements fill in its fields.
def _start_contributor(attrsD)
  @incontributor = true
  (getContext()['contributors'] ||= []) << FeedParserDict.new
  push('contributor', false)
end

def _end_contributor
  pop('contributor')
  @incontributor = false
end

# <dc:contributor> carries a bare name instead of sub-elements.
def _start_dc_contributor(attrsD)
  @incontributor = true
  (getContext()['contributors'] ||= []) << FeedParserDict.new
  push('name', false)
end

def _end_dc_contributor
  _end_name
  @incontributor = false
end
|
1904
|
+
|
1905
|
+
def _start_name(attrsD)
  push('name', false)
end
alias :_start_itunes_name :_start_name

# Routes a closing <name> to whichever construct is currently open:
# publisher, author, contributor or textinput.
def _end_name
  value = pop('name')
  if @inpublisher
    _save_author('name', value, 'publisher')
  elsif @inauthor
    _save_author('name', value)
  elsif @incontributor
    _save_contributor('name', value)
  elsif @intextinput
    getContext()['textinput']['name'] = value
  end
end
alias :_end_itunes_name :_end_name

def _start_width(attrsD)
  push('width', false)
end

# Image width, coerced to an integer.
def _end_width
  value = pop('width').to_i
  getContext()['image']['width'] = value if @inimage
end

def _start_height(attrsD)
  push('height', false)
end

# Image height, coerced to an integer.
def _end_height
  value = pop('height').to_i
  getContext()['image']['height'] = value if @inimage
end
|
1948
|
+
|
1949
|
+
# url / homepage / uri all collect an href value.
def _start_url(attrsD)
  push('href', true)
end
alias :_start_homepage :_start_url
alias :_start_uri :_start_url

# Routes a closing url-ish element to the open construct.
def _end_url
  value = pop('href')
  if @inauthor
    _save_author('href', value)
  elsif @incontributor
    _save_contributor('href', value)
  elsif @inimage
    getContext()['image']['href'] = value
  elsif @intextinput
    getContext()['textinput']['link'] = value
  end
end
alias :_end_homepage :_end_url
alias :_end_uri :_end_url

def _start_email(attrsD)
  push('email', false)
end
alias :_start_itunes_email :_start_email

# Routes a closing <email> to publisher, author or contributor.
def _end_email
  value = pop('email')
  if @inpublisher
    _save_author('email', value, 'publisher')
  elsif @inauthor
    _save_author('email', value)
  elsif @incontributor
    _save_contributor('email', value)
  end
end
alias :_end_itunes_email :_end_email
|
1988
|
+
|
1989
|
+
# Returns the dict currently being populated: the <source> scratch dict
# while inside a source element, the newest entry while inside an
# item/entry, and the top-level feed dict otherwise.
def getContext
  return @sourcedata if @insource
  return @entries[-1] if @inentry
  @feeddata
end
|
1999
|
+
|
2000
|
+
# Records one field of the author (or publisher) detail dict, then
# re-derives the combined display string.
def _save_author(key, value, prefix='author')
  detail_key = prefix + '_detail'
  context = getContext()
  context[detail_key] ||= FeedParserDict.new
  context[detail_key][key] = value
  _sync_author_detail()
end

# Records one field on the most recently opened contributor.
def _save_contributor(key, value)
  context = getContext
  context['contributors'] ||= [FeedParserDict.new]
  context['contributors'][-1][key] = value
end
|
2012
|
+
|
2013
|
+
# Keeps context[key] (a display string such as "Name (email)") and
# context["#{key}_detail"] ({'name' => ..., 'email' => ...}) in sync,
# deriving whichever of the two was not parsed directly.
def _sync_author_detail(key='author')
  context = getContext()
  detail = context["#{key}_detail"]
  if detail and not detail.empty?
    # detail dict -> display string
    name = detail['name']
    email = detail['email']

    # BUGFIX: the emptiness test used to check name twice instead of
    # name and email, so an empty email could yield "Name ()".
    if name and email and not (name.empty? or email.empty?)
      context[key] = "#{name} (#{email})"
    elsif name and not name.empty?
      context[key] = name
    elsif email and not email.empty?
      context[key] = email
    end
  else
    # display string -> detail dict
    author = context[key].dup unless context[key].nil?
    return if not author or author.empty?
    emailmatch = author.match(/(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))/)
    email = nil
    # BUGFIX: guard the match — authors without an email address used to
    # raise NoMethodError on emailmatch[1].
    if emailmatch
      email = emailmatch[1]
      author.gsub!(email, '')
      author.gsub!('()', '')
      author.strip!
      author.gsub!(/^\(/, '')
      author.gsub!(/\)$/, '')
      author.strip!
    end
    context["#{key}_detail"] ||= FeedParserDict.new
    context["#{key}_detail"]['name'] = author
    context["#{key}_detail"]['email'] = email if email
  end
end
|
2043
|
+
|
2044
|
+
# subtitle / tagline / itunes:subtitle — a plain-text content element.
def _start_subtitle(attrsD)
  pushContent('subtitle', attrsD, 'text/plain', true)
end
alias :_start_tagline :_start_subtitle
alias :_start_itunes_subtitle :_start_subtitle

def _end_subtitle
  popContent('subtitle')
end
alias :_end_tagline :_end_subtitle
alias :_end_itunes_subtitle :_end_subtitle

# rights / dc:rights / copyright — a plain-text content element.
def _start_rights(attrsD)
  pushContent('rights', attrsD, 'text/plain', true)
end
alias :_start_dc_rights :_start_rights
alias :_start_copyright :_start_rights

def _end_rights
  popContent('rights')
end
alias :_end_dc_rights :_end_rights
alias :_end_copyright :_end_rights
|
2067
|
+
|
2068
|
+
# <item>/<entry>/<product> — appends a fresh entry dict and enters entry
# scope; an rdf:about attribute doubles as the entry id.
def _start_item(attrsD)
  @entries << FeedParserDict.new
  push('item', false)
  @inentry = true
  @guidislink = false
  id = getAttribute(attrsD, 'rdf:about')
  getContext()['id'] = id if id and not id.empty?
  _cdf_common(attrsD)
end
alias :_start_entry :_start_item
alias :_start_product :_start_item

def _end_item
  pop('item')
  @inentry = false
end
alias :_end_entry :_end_item
|
2088
|
+
|
2089
|
+
# dc:language / language — sets the document language used by
# pushContent for subsequent content elements.
def _start_dc_language(attrsD)
  push('language', true)
end
alias :_start_language :_start_dc_language

def _end_dc_language
  @lang = pop('language')
end
alias :_end_language :_end_dc_language

# dc:publisher / webMaster — treated like an author-style field under
# the 'publisher' key.
def _start_dc_publisher(attrsD)
  push('publisher', true)
end
alias :_start_webmaster :_start_dc_publisher

def _end_dc_publisher
  pop('publisher')
  _sync_author_detail('publisher')
end
alias :_end_webmaster :_end_dc_publisher
|
2109
|
+
|
2110
|
+
# published / issued / dcterms:issued — publication timestamp.
def _start_published(attrsD)
  push('published', true)
end
alias :_start_dcterms_issued :_start_published
alias :_start_issued :_start_published

def _end_published
  _save('published_parsed', parse_date(pop('published')))
end
alias :_end_dcterms_issued :_end_published
alias :_end_issued :_end_published

# updated / modified / pubDate / dc:date — last-modified timestamp.
def _start_updated(attrsD)
  push('updated', true)
end
alias :_start_modified :_start_updated
alias :_start_dcterms_modified :_start_updated
alias :_start_pubdate :_start_updated
alias :_start_dc_date :_start_updated

def _end_updated
  _save('updated_parsed', parse_date(pop('updated')))
end
alias :_end_modified :_end_updated
alias :_end_dcterms_modified :_end_updated
alias :_end_pubdate :_end_updated
alias :_end_dc_date :_end_updated

# created / dcterms:created — creation timestamp.
def _start_created(attrsD)
  push('created', true)
end
alias :_start_dcterms_created :_start_created

def _end_created
  _save('created_parsed', parse_date(pop('created')))
end
alias :_end_dcterms_created :_end_created

# expirationDate — expiry timestamp.
def _start_expirationdate(attrsD)
  push('expired', true)
end
def _end_expirationdate
  _save('expired_parsed', parse_date(pop('expired')))
end
|
2157
|
+
|
2158
|
+
# <cc:license rdf:resource="..."/> — the license URL lives in an
# attribute, so inject it into the freshly pushed frame and pop
# immediately.
def _start_cc_license(attrsD)
  push('license', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    # BUGFIX: this read the bare name `elementstack` (an undefined
    # method) instead of the @elementstack instance variable, raising
    # NameError whenever cc:license carried an rdf:resource.
    @elementstack[-1][2] << value
  end
  # Pop unconditionally (as upstream feedparser does) so an empty
  # rdf:resource does not leave a stale 'license' frame on the stack.
  pop('license')
end
|
2166
|
+
|
2167
|
+
# <creativeCommons:license> — license URL arrives as element text.
def _start_creativecommons_license(attrsD)
  push('license', true)
end

def _end_creativecommons_license
  pop('license')
end
|
2174
|
+
|
2175
|
+
# Appends a {term, scheme, label} tag dict to the current context,
# skipping triples that are entirely blank and exact duplicates.
def addTag(term, scheme, label)
  context = getContext()
  context['tags'] ||= []
  tags = context['tags']
  blank = lambda { |v| v.nil? or v.empty? }
  return if blank.call(term) and blank.call(scheme) and blank.call(label)
  value = FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  unless tags.include? value
    context['tags'] << FeedParserDict.new({'term' => term, 'scheme' => scheme, 'label' => label})
  end
end
|
2187
|
+
|
2188
|
+
# <category> and friends — the attributes may already carry the whole
# tag; any element text is merged in by _end_category.
def _start_category(attrsD)
  $stderr << "entering _start_category with #{attrsD}\n" if $debug
  addTag(attrsD['term'], attrsD['scheme'] || attrsD['domain'], attrsD['label'])
  push('category', true)
end
alias :_start_dc_subject :_start_category
alias :_start_keywords :_start_category

# <itunes:keywords> holds a whitespace-separated list of terms.
def _end_itunes_keywords
  pop('itunes_keywords').split.each do |term|
    addTag(term, 'http://www.itunes.com/', nil)
  end
end

# <itunes:category text="..."/>
def _start_itunes_category(attrsD)
  addTag(attrsD['text'], 'http://www.itunes.com/', nil)
  push('category', true)
end
|
2210
|
+
|
2211
|
+
# Closing a category element: the element text fills in the term of the
# tag opened by _start_category when that tag had no term attribute;
# otherwise it becomes a brand-new tag.
def _end_category
  value = pop('category')
  return if value.nil? or value.empty?
  context = getContext()
  tags = context['tags']
  # BUGFIX: the condition used Ruby 1.8's `if cond:` then-colon form,
  # which is a syntax error from Ruby 1.9 on; the colon is removed.
  if value and not value.empty? and not tags.empty? and not tags[-1]['term']
    tags[-1]['term'] = value
  else
    addTag(value, nil, nil)
  end
end
alias :_end_dc_subject :_end_category
alias :_end_keywords :_end_category
alias :_end_itunes_category :_end_category
|
2225
|
+
|
2226
|
+
# <cloud> carries its data entirely in attributes.
def _start_cloud(attrsD)
  getContext()['cloud'] = FeedParserDict.new(attrsD)
end

# <link> (and CDF ProductURL): defaults rel=alternate, type=text/html,
# normalizes url/uri spellings onto href, resolves it against the base,
# and records the dict in context['links']. When an href is present the
# element body is ignored (and an alternate html link becomes the main
# 'link'); otherwise the body supplies the link text.
def _start_link(attrsD)
  attrsD['rel'] ||= 'alternate'
  attrsD['type'] ||= 'text/html'
  attrsD = itsAnHrefDamnIt(attrsD)
  if attrsD.has_key? 'href'
    attrsD['href'] = resolveURI(attrsD['href'])
  end
  expectingText = @infeed || @inentry || @insource
  context = getContext()
  context['links'] ||= []
  context['links'] << FeedParserDict.new(attrsD)
  _start_enclosure(attrsD) if attrsD['rel'] == 'enclosure'
  if attrsD.has_key? 'href'
    expectingText = false
    if (attrsD['rel'] == 'alternate') and @html_types.include? mapContentType(attrsD['type'])
      context['link'] = attrsD['href']
    end
  else
    push('link', expectingText)
  end
end
alias :_start_producturl :_start_link
|
2254
|
+
|
2255
|
+
# Closing </link> — the collected text also serves as the link of an
# open textinput or image construct.
def _end_link
  value = pop('link')
  context = getContext()
  context['textinput']['link'] = value if @intextinput
  context['image']['link'] = value if @inimage
end
alias :_end_producturl :_end_link

# <guid isPermaLink="..."> — a permalink guid can double as the link.
def _start_guid(attrsD)
  @guidislink = ((attrsD['ispermalink'] || 'true') == 'true')
  push('id', true)
end
|
2271
|
+
|
2272
|
+
# Closing </guid>: stores the id, and promotes it to the entry link when
# the guid is a permalink and no explicit link exists yet.
def _end_guid
  value = pop('id')
  _save('guidislink', (@guidislink and not getContext().has_key?('link')))
  # BUGFIX: `if @guidislink:` used Ruby 1.8's then-colon form, a syntax
  # error from Ruby 1.9 on; the colon is removed.
  if @guidislink
    # guid acts as link, but only if 'ispermalink' is not present or is 'true',
    # and only if the item doesn't already have a link element
    _save('link', value)
  end
end
|
2281
|
+
|
2282
|
+
|
2283
|
+
# title / dc:title / media:title — a plain-text content element.
def _start_title(attrsD)
  pushContent('title', attrsD, 'text/plain', @infeed || @inentry || @insource)
end
alias :_start_dc_title :_start_title
alias :_start_media_title :_start_title

# Closing a title also serves an open textinput or image construct.
def _end_title
  value = popContent('title')
  context = getContext()
  if @intextinput
    context['textinput']['title'] = value
  elsif @inimage
    context['image']['title'] = value
  end
end
alias :_end_dc_title :_end_title
alias :_end_media_title :_end_title

# A second description inside one construct is really content.
def _start_description(attrsD)
  if getContext().has_key?('summary')
    @summaryKey = 'content'
    _start_content(attrsD)
  else
    pushContent('description', attrsD, 'text/html', @infeed || @inentry || @insource)
  end
end

# <abstract> — a plain-text description.
def _start_abstract(attrsD)
  pushContent('description', attrsD, 'text/plain', @infeed || @inentry || @insource)
end
|
2314
|
+
|
2315
|
+
# Closing a description: if _start_description rerouted it to content,
# close that instead; otherwise store the value (also serving an open
# textinput or image construct).
def _end_description
  if @summaryKey == 'content'
    _end_content()
  else
    value = popContent('description')
    context = getContext()
    if @intextinput
      context['textinput']['description'] = value
    # BUGFIX: `elsif @inimage:` used Ruby 1.8's then-colon form, a
    # syntax error from Ruby 1.9 on; the colon is removed.
    elsif @inimage
      context['image']['description'] = value
    end
  end
  @summaryKey = nil
end
alias :_end_abstract :_end_description
|
2330
|
+
|
2331
|
+
# info / feedburner:browserFriendly — a plain-text content element.
def _start_info(attrsD)
  pushContent('info', attrsD, 'text/plain', true)
end
alias :_start_feedburner_browserfriendly :_start_info

def _end_info
  popContent('info')
end
alias :_end_feedburner_browserfriendly :_end_info

# <generator> — attributes populate generator_detail (with url/uri
# normalized to a resolved href); the element text becomes its name.
def _start_generator(attrsD)
  if attrsD and not attrsD.empty?
    attrsD = itsAnHrefDamnIt(attrsD)
    attrsD['href'] = resolveURI(attrsD['href']) if attrsD.has_key?('href')
  end
  getContext()['generator_detail'] = FeedParserDict.new(attrsD)
  push('generator', true)
end

def _end_generator
  value = pop('generator')
  context = getContext()
  context['generator_detail']['name'] = value if context.has_key?('generator_detail')
end
|
2359
|
+
|
2360
|
+
# <admin:generatorAgent rdf:resource="..."/> — the generator URL is an
# attribute, so feed it through a synthetic push/pop and store it as
# generator_detail.
def _start_admin_generatoragent(attrsD)
  push('generator', true)
  value = getAttribute(attrsD, 'rdf:resource')
  if value and not value.empty?
    # BUGFIX: this read the bare name `elementstack` (an undefined
    # method) instead of @elementstack, raising NameError whenever the
    # element carried an rdf:resource (compare
    # _start_admin_errorreportsto, which uses @elementstack correctly).
    @elementstack[-1][2] << value
  end
  pop('generator')
  getContext()['generator_detail'] = FeedParserDict.new({'href' => value})
end
|
2369
|
+
|
2370
|
+
# <admin:errorReportsTo rdf:resource="..."/> — attribute value stored
# via a synthetic push/pop.
def _start_admin_errorreportsto(attrsD)
  push('errorreportsto', true)
  value = getAttribute(attrsD, 'rdf:resource')
  @elementstack[-1][2] << value if value and not value.empty?
  pop('errorreportsto')
end

# summary / itunes:summary — a second summary in one construct is
# really content.
def _start_summary(attrsD)
  if getContext().has_key? 'summary'
    @summaryKey = 'content'
    _start_content(attrsD)
  else
    @summaryKey = 'summary'
    pushContent(@summaryKey, attrsD, 'text/plain', true)
  end
end
alias :_start_itunes_summary :_start_summary
|
2391
|
+
# Closing a summary: close the content element it was rerouted to, or
# pop the summary itself.
def _end_summary
  # BUGFIX: `if ...:` used Ruby 1.8's then-colon form, a syntax error
  # from Ruby 1.9 on; the colon is removed.
  if @summaryKey == 'content'
    _end_content()
  else
    popContent(@summaryKey || 'summary')
  end
  @summaryKey = nil
end
alias :_end_itunes_summary :_end_summary
|
2400
|
+
|
2401
|
+
# <enclosure> — recorded as a dict in context['enclosures']; its href
# doubles as the entry id when none exists yet.
def _start_enclosure(attrsD)
  attrsD = itsAnHrefDamnIt(attrsD)
  getContext()['enclosures'] ||= []
  getContext()['enclosures'] << FeedParserDict.new(attrsD)
  href = attrsD['href']
  if href and not href.empty?
    context = getContext()
    context['id'] = href unless context['id']
  end
end

# <source> — sub-elements accumulate in @sourcedata until the element
# closes, then a deep copy is stored on the current context.
def _start_source(attrsD)
  @insource = true
end

def _end_source
  @insource = false
  getContext()['source'] = Marshal.load(Marshal.dump(@sourcedata))
  @sourcedata.clear()
end
|
2423
|
+
|
2424
|
+
# <content> — opens a content element; an out-of-line src attribute is
# remembered in @contentparams.
def _start_content(attrsD)
  pushContent('content', attrsD, 'text/plain', true)
  src = attrsD['src']
  # BUGFIX: `if ...:` used Ruby 1.8's then-colon form, a syntax error
  # from Ruby 1.9 on; the colon is removed.
  if src and not src.empty?
    @contentparams['src'] = src
  end
  push('content', true)
end
|
2432
|
+
|
2433
|
+
# CDF ProdLink — html content.
def _start_prodlink(attrsD)
  pushContent('content', attrsD, 'text/html', true)
end

# <body>/<xhtml:body> — xhtml content.
def _start_body(attrsD)
  pushContent('content', attrsD, 'application/xhtml+xml', true)
end
alias :_start_xhtml_body :_start_body

# <content:encoded>/<fullitem> — escaped html content.
def _start_content_encoded(attrsD)
  pushContent('content', attrsD, 'text/html', true)
end
alias :_start_fullitem :_start_content_encoded
|
2447
|
+
# Closes a content element; plain-text and html content is mirrored
# into the description field.
def _end_content
  copyToDescription = (['text/plain'] + @html_types).include? mapContentType(@contentparams['type'])
  value = popContent('content')
  if copyToDescription
    _save('description', value)
  end
end
# BUGFIX: these aliases were previously INSIDE the method body (the
# method's `end` came after them), so _end_body & co. only came into
# existence after _end_content had executed once — any feed closing
# <body>, <content:encoded>, <fullitem> or ProdLink first raised
# NameError. They now run at definition time.
alias :_end_body :_end_content
alias :_end_xhtml_body :_end_content
alias :_end_content_encoded :_end_content
alias :_end_fullitem :_end_content
alias :_end_prodlink :_end_content
|
2459
|
+
|
2460
|
+
# <itunes:image href="..."/> — stored as the context image.
def _start_itunes_image(attrsD)
  push('itunes_image', false)
  getContext()['image'] = FeedParserDict.new({'href' => attrsD['href']})
end
alias :_start_itunes_link :_start_itunes_image

# <itunes:block> — 'yes' means blocked.
def _end_itunes_block
  value = pop('itunes_block', false)
  # BUGFIX(clarity): `h[k] = x and true or false` parses as
  # `(h[k] = x) and true or false` — the trailing `and true or false`
  # never reached the stored value; it only worked because the
  # comparison already yields a boolean. Store the boolean directly.
  getContext()['itunes_block'] = (value == 'yes')
end

# <itunes:explicit> — 'yes' means explicit content.
def _end_itunes_explicit
  value = pop('itunes_explicit', false)
  getContext()['itunes_explicit'] = (value == 'yes')
end
|
2475
|
+
|
2476
|
+
|
2477
|
+
# ISO-8601 date parsing routines written by Fazal Majid.
|
2478
|
+
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
|
2479
|
+
# parser is beyond the scope of feedparser and the current Time.iso8601
|
2480
|
+
# method does not work.
|
2481
|
+
# A single regular expression cannot parse ISO 8601 date formats into groups
|
2482
|
+
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
|
2483
|
+
# 0301-04-01), so we use templates instead.
|
2484
|
+
# Please note the order in templates is significant because we need a
|
2485
|
+
# greedy match.
|
2486
|
+
def _parse_date_iso8601(dateString)
|
2487
|
+
# Parse a variety of ISO-8601-compatible formats like 20040105
|
2488
|
+
|
2489
|
+
# What I'm about to show you may be the ugliest code in all of
|
2490
|
+
# rfeedparser.
|
2491
|
+
# FIXME The century regexp maybe not work ('\d\d$' says "two numbers at
|
2492
|
+
# end of line" but we then attach more of a regexp.
|
2493
|
+
iso8601_regexps = [ '^(\d{4})-?([01]\d)-([0123]\d)',
|
2494
|
+
'^(\d{4})-([01]\d)',
|
2495
|
+
'^(\d{4})-?([0123]\d\d)',
|
2496
|
+
'^(\d\d)-?([01]\d)-?([0123]\d)',
|
2497
|
+
'^(\d\d)-?([0123]\d\d)',
|
2498
|
+
'^(\d{4})',
|
2499
|
+
'-(\d\d)-?([01]\d)',
|
2500
|
+
'-([0123]\d\d)',
|
2501
|
+
'-(\d\d)',
|
2502
|
+
'--([01]\d)-?([0123]\d)',
|
2503
|
+
'--([01]\d)',
|
2504
|
+
'---([0123]\d)',
|
2505
|
+
'(\d\d$)',
|
2506
|
+
''
|
2507
|
+
]
|
2508
|
+
iso8601_values = { '^(\d{4})-?([01]\d)-([0123]\d)' => ['year', 'month', 'day'],
|
2509
|
+
'^(\d{4})-([01]\d)' => ['year','month'],
|
2510
|
+
'^(\d{4})-?([0123]\d\d)' => ['year', 'ordinal'],
|
2511
|
+
'^(\d\d)-?([01]\d)-?([0123]\d)' => ['year','month','day'],
|
2512
|
+
'^(\d\d)-?([0123]\d\d)' => ['year','ordinal'],
|
2513
|
+
'^(\d{4})' => ['year'],
|
2514
|
+
'-(\d\d)-?([01]\d)' => ['year','month'],
|
2515
|
+
'-([0123]\d\d)' => ['ordinal'],
|
2516
|
+
'-(\d\d)' => ['year'],
|
2517
|
+
'--([01]\d)-?([0123]\d)' => ['month','day'],
|
2518
|
+
'--([01]\d)' => ['month'],
|
2519
|
+
'---([0123]\d)' => ['day'],
|
2520
|
+
'(\d\d$)' => ['century'],
|
2521
|
+
'' => []
|
2522
|
+
}
|
2523
|
+
add_to_all = '(T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
|
2524
|
+
add_to_all_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']
|
2525
|
+
# NOTE We use '(?:' to prevent grouping of optional matches (ones trailed
|
2526
|
+
# by '?'). The second ':' *are* matched.
|
2527
|
+
m = nil
|
2528
|
+
param_keys = []
|
2529
|
+
iso8601_regexps.each do |s|
|
2530
|
+
$stderr << "Trying iso8601 regexp: #{s+add_to_all}\n" if $debug
|
2531
|
+
param_keys = iso8601_values[s] + add_to_all_fields
|
2532
|
+
m = dateString.match(Regexp.new(s+add_to_all))
|
2533
|
+
break if m
|
2534
|
+
end
|
2535
|
+
return if m.nil? or (m.begin(0).zero? and m.end(0).zero?)
|
2536
|
+
|
2537
|
+
param_values = m.to_a
|
2538
|
+
param_values = param_values[1..-1]
|
2539
|
+
params = {}
|
2540
|
+
param_keys.each_with_index do |key,i|
|
2541
|
+
params[key] = param_values[i]
|
2542
|
+
end
|
21
2543
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
2544
|
+
ordinal = params['ordinal'].to_i unless params['ordinal'].nil?
|
2545
|
+
year = params['year'] || '--'
|
2546
|
+
if year.nil? or year.empty? or year == '--' # FIXME When could the regexp ever return a year equal to '--'?
|
2547
|
+
year = Time.now.utc.year
|
2548
|
+
elsif year.length == 2
|
2549
|
+
# ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
|
2550
|
+
year = 100 * (Time.now.utc.year / 100) + year.to_i
|
2551
|
+
else
|
2552
|
+
year = year.to_i
|
2553
|
+
end
|
28
2554
|
|
29
|
-
|
30
|
-
|
2555
|
+
month = params['month'] || '-'
|
2556
|
+
if month.nil? or month.empty? or month == '-'
|
2557
|
+
# ordinals are NOT normalized by mktime, we simulate them
|
2558
|
+
# by setting month=1, day=ordinal
|
2559
|
+
if ordinal
|
2560
|
+
month = DateTime.ordinal(year,ordinal).month
|
2561
|
+
else
|
2562
|
+
month = Time.now.utc.month
|
2563
|
+
end
|
2564
|
+
end
|
2565
|
+
month = month.to_i unless month.nil?
|
2566
|
+
day = params['day']
|
2567
|
+
if day.nil? or day.empty?
|
2568
|
+
# see above
|
2569
|
+
if ordinal
|
2570
|
+
day = DateTime.ordinal(year,ordinal).day
|
2571
|
+
elsif params['century'] or params['year'] or params['month']
|
2572
|
+
day = 1
|
2573
|
+
else
|
2574
|
+
day = Time.now.utc.day
|
2575
|
+
end
|
2576
|
+
else
|
2577
|
+
day = day.to_i
|
2578
|
+
end
|
2579
|
+
# special case of the century - is the first year of the 21st century
|
2580
|
+
# 2000 or 2001 ? The debate goes on...
|
2581
|
+
if params.has_key? 'century'
|
2582
|
+
year = (params['century'].to_i - 1) * 100 + 1
|
2583
|
+
end
|
2584
|
+
# in ISO 8601 most fields are optional
|
2585
|
+
hour = params['hour'].to_i
|
2586
|
+
minute = params['minute'].to_i
|
2587
|
+
second = params['second'].to_i
|
2588
|
+
weekday = nil
|
2589
|
+
# daylight savings is complex, but not needed for feedparser's purposes
|
2590
|
+
# as time zones, if specified, include mention of whether it is active
|
2591
|
+
# (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
|
2592
|
+
# and most implementations have DST bugs
|
2593
|
+
tm = [second, minute, hour, day, month, year, nil, ordinal, false, nil]
|
2594
|
+
tz = params['tz']
|
2595
|
+
if tz and not tz.empty? and tz != 'Z'
|
2596
|
+
# FIXME does this cross over days?
|
2597
|
+
if tz[0] == '-'
|
2598
|
+
tm[3] += params['tzhour'].to_i
|
2599
|
+
tm[4] += params['tzmin'].to_i
|
2600
|
+
elsif tz[0] == '+'
|
2601
|
+
tm[3] -= params['tzhour'].to_i
|
2602
|
+
tm[4] -= params['tzmin'].to_i
|
2603
|
+
else
|
2604
|
+
return nil
|
2605
|
+
end
|
2606
|
+
end
|
2607
|
+
return Time.utc(*tm) # Magic!
|
31
2608
|
|
32
|
-
|
33
|
-
require 'html/sgml-parser'
|
34
|
-
require 'htmlentities'
|
35
|
-
require 'active_support'
|
36
|
-
require 'open-uri'
|
37
|
-
include OpenURI
|
2609
|
+
end
|
38
2610
|
|
39
|
-
|
40
|
-
|
2611
|
+
def _parse_date_onblog(dateString)
|
2612
|
+
# Parse a string according to the OnBlog 8-bit date format
|
2613
|
+
# 8-bit date handling routes written by ytrewq1
|
2614
|
+
korean_year = u("년") # b3e2 in euc-kr
|
2615
|
+
korean_month = u("월") # bff9 in euc-kr
|
2616
|
+
korean_day = u("일") # c0cf in euc-kr
|
41
2617
|
|
42
|
-
$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
|
43
|
-
require 'rfeedparser/forgiving_uri'
|
44
|
-
require 'rfeedparser/aliases'
|
45
|
-
require 'rfeedparser/encoding_helpers'
|
46
|
-
require 'rfeedparser/better_sgmlparser'
|
47
|
-
require 'rfeedparser/better_attributelist'
|
48
|
-
require 'rfeedparser/scrub'
|
49
|
-
require 'rfeedparser/time_helpers'
|
50
|
-
require 'rfeedparser/feedparserdict'
|
51
|
-
require 'rfeedparser/parser_mixin'
|
52
|
-
require 'rfeedparser/parsers'
|
53
|
-
require 'rfeedparser/markup_helpers'
|
54
2618
|
|
55
|
-
|
2619
|
+
korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
|
56
2620
|
|
57
2621
|
|
58
|
-
|
59
|
-
|
2622
|
+
m = korean_onblog_date_re.match(dateString)
|
2623
|
+
return unless m
|
2624
|
+
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
|
60
2625
|
|
61
|
-
|
2626
|
+
$stderr << "OnBlog date parsed as: %s\n" % w3dtfdate if $debug
|
2627
|
+
return _parse_date_w3dtf(w3dtfdate)
|
2628
|
+
end
|
62
2629
|
|
63
|
-
|
64
|
-
|
2630
|
+
def _parse_date_nate(dateString)
|
2631
|
+
# Parse a string according to the Nate 8-bit date format
|
2632
|
+
# 8-bit date handling routes written by ytrewq1
|
2633
|
+
korean_am = u("오전") # bfc0 c0fc in euc-kr
|
2634
|
+
korean_pm = u("오후") # bfc0 c8c4 in euc-kr
|
65
2635
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
2636
|
+
korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
|
2637
|
+
m = korean_nate_date_re.match(dateString)
|
2638
|
+
return unless m
|
2639
|
+
hour = m[5].to_i
|
2640
|
+
ampm = m[4]
|
2641
|
+
if ampm == korean_pm
|
2642
|
+
hour += 12
|
2643
|
+
end
|
2644
|
+
hour = hour.to_s.rjust(2,'0')
|
2645
|
+
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{hour}:#{m[6]}:#{m[7]}+09:00"
|
2646
|
+
$stderr << "Nate date parsed as: %s\n" % w3dtfdate if $debug
|
2647
|
+
return _parse_date_w3dtf(w3dtfdate)
|
2648
|
+
end
|
71
2649
|
|
72
|
-
|
73
|
-
|
74
|
-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
75
|
-
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
76
|
-
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
77
|
-
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
78
|
-
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
79
|
-
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
80
|
-
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
81
|
-
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
82
|
-
POSSIBILITY OF SUCH DAMAGE."""
|
2650
|
+
def _parse_date_mssql(dateString)
|
2651
|
+
mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
|
83
2652
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
"Kevin Marks <http://epeus.blogspot.com/>"
|
91
|
-
]
|
92
|
-
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
93
|
-
# If you are embedding feedparser in a larger application, you should
|
94
|
-
# change this to your application name and URL.
|
95
|
-
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
|
2653
|
+
m = mssql_date_re.match(dateString)
|
2654
|
+
return unless m
|
2655
|
+
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
|
2656
|
+
$stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
|
2657
|
+
return _parse_date_w3dtf(w3dtfdate)
|
2658
|
+
end
|
96
2659
|
|
97
|
-
|
98
|
-
|
99
|
-
|
2660
|
+
def _parse_date_greek(dateString)
|
2661
|
+
# Parse a string according to a Greek 8-bit date format
|
2662
|
+
# Unicode strings for Greek date strings
|
2663
|
+
greek_months = {
|
2664
|
+
u("Ιαν") => u("Jan"), # c9e1ed in iso-8859-7
|
2665
|
+
u("Φεβ") => u("Feb"), # d6e5e2 in iso-8859-7
|
2666
|
+
u("Μάώ") => u("Mar"), # ccdcfe in iso-8859-7
|
2667
|
+
u("Μαώ") => u("Mar"), # cce1fe in iso-8859-7
|
2668
|
+
u("Απρ") => u("Apr"), # c1f0f1 in iso-8859-7
|
2669
|
+
u("Μάι") => u("May"), # ccdce9 in iso-8859-7
|
2670
|
+
u("Μαϊ") => u("May"), # cce1fa in iso-8859-7
|
2671
|
+
u("Μαι") => u("May"), # cce1e9 in iso-8859-7
|
2672
|
+
u("Ιούν") => u("Jun"), # c9effded in iso-8859-7
|
2673
|
+
u("Ιον") => u("Jun"), # c9efed in iso-8859-7
|
2674
|
+
u("Ιούλ") => u("Jul"), # c9effdeb in iso-8859-7
|
2675
|
+
u("Ιολ") => u("Jul"), # c9f9eb in iso-8859-7
|
2676
|
+
u("Αύγ") => u("Aug"), # c1fde3 in iso-8859-7
|
2677
|
+
u("Αυγ") => u("Aug"), # c1f5e3 in iso-8859-7
|
2678
|
+
u("Σεπ") => u("Sep"), # d3e5f0 in iso-8859-7
|
2679
|
+
u("Οκτ") => u("Oct"), # cfeaf4 in iso-8859-7
|
2680
|
+
u("Νοέ") => u("Nov"), # cdefdd in iso-8859-7
|
2681
|
+
u("Νοε") => u("Nov"), # cdefe5 in iso-8859-7
|
2682
|
+
u("Δεκ") => u("Dec"), # c4e5ea in iso-8859-7
|
2683
|
+
}
|
100
2684
|
|
2685
|
+
greek_wdays = {
|
2686
|
+
u("Κυρ") => u("Sun"), # caf5f1 in iso-8859-7
|
2687
|
+
u("Δευ") => u("Mon"), # c4e5f5 in iso-8859-7
|
2688
|
+
u("Τρι") => u("Tue"), # d4f1e9 in iso-8859-7
|
2689
|
+
u("Τετ") => u("Wed"), # d4e5f4 in iso-8859-7
|
2690
|
+
u("Πεμ") => u("Thu"), # d0e5ec in iso-8859-7
|
2691
|
+
u("Παρ") => u("Fri"), # d0e1f1 in iso-8859-7
|
2692
|
+
u("Σαβ") => u("Sat"), # d3e1e2 in iso-8859-7
|
2693
|
+
}
|
101
2694
|
|
102
|
-
|
103
|
-
# this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
|
104
|
-
# or utidylib <http://utidylib.berlios.de/>.
|
105
|
-
#TIDY_MARKUP = false #FIXME untranslated
|
2695
|
+
greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
|
106
2696
|
|
107
|
-
|
108
|
-
|
109
|
-
|
2697
|
+
m = greek_date_format.match(dateString)
|
2698
|
+
return unless m
|
2699
|
+
begin
|
2700
|
+
wday = greek_wdays[m[1]]
|
2701
|
+
month = greek_months[m[3]]
|
2702
|
+
rescue
|
2703
|
+
return nil
|
2704
|
+
end
|
2705
|
+
rfc822date = "#{wday}, #{m[2]} #{month} #{m[4]} #{m[5]}:#{m[6]}:#{m[7]} #{m[8]}"
|
2706
|
+
$stderr << "Greek date parsed as: #{rfc822date}\n" if $debug
|
2707
|
+
return _parse_date_rfc822(rfc822date)
|
2708
|
+
end
|
110
2709
|
|
2710
|
+
def _parse_date_hungarian(dateString)
|
2711
|
+
# Parse a string according to a Hungarian 8-bit date format.
|
2712
|
+
hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
|
2713
|
+
m = hungarian_date_format_re.match(dateString)
|
2714
|
+
return unless m
|
111
2715
|
|
112
|
-
|
113
|
-
|
2716
|
+
# Unicode strings for Hungarian date strings
|
2717
|
+
hungarian_months = {
|
2718
|
+
u("január") => u("01"), # e1 in iso-8859-2
|
2719
|
+
u("februári") => u("02"), # e1 in iso-8859-2
|
2720
|
+
u("március") => u("03"), # e1 in iso-8859-2
|
2721
|
+
u("április") => u("04"), # e1 in iso-8859-2
|
2722
|
+
u("máujus") => u("05"), # e1 in iso-8859-2
|
2723
|
+
u("június") => u("06"), # fa in iso-8859-2
|
2724
|
+
u("július") => u("07"), # fa in iso-8859-2
|
2725
|
+
u("augusztus") => u("08"),
|
2726
|
+
u("szeptember") => u("09"),
|
2727
|
+
u("október") => u("10"), # f3 in iso-8859-2
|
2728
|
+
u("november") => u("11"),
|
2729
|
+
u("december") => u("12"),
|
2730
|
+
}
|
2731
|
+
begin
|
2732
|
+
month = hungarian_months[m[2]]
|
2733
|
+
day = m[3].rjust(2,'0')
|
2734
|
+
hour = m[4].rjust(2,'0')
|
2735
|
+
rescue
|
2736
|
+
return
|
2737
|
+
end
|
2738
|
+
|
2739
|
+
w3dtfdate = "#{m[1]}-#{month}-#{day}T#{hour}:#{m[5]}:00#{m[6]}"
|
2740
|
+
$stderr << "Hungarian date parsed as: #{w3dtfdate}\n" if $debug
|
2741
|
+
return _parse_date_w3dtf(w3dtfdate)
|
2742
|
+
end
|
2743
|
+
|
2744
|
+
def rollover(num, modulus)
|
2745
|
+
return num % modulus, num / modulus
|
2746
|
+
end
|
2747
|
+
|
2748
|
+
def set_self(num, modulus)
|
2749
|
+
r = num / modulus
|
2750
|
+
if r == 0
|
2751
|
+
return num
|
2752
|
+
end
|
2753
|
+
return r
|
2754
|
+
end
|
2755
|
+
# W3DTF-style date parsing
|
2756
|
+
# FIXME shouldn't it be "W3CDTF"?
|
2757
|
+
def _parse_date_w3dtf(dateString)
|
2758
|
+
# Ruby's Time docs claim w3cdtf is an alias for iso8601 which is an alias for xmlschema
|
2759
|
+
# Whatever it is, it doesn't work. This has been fixed in Ruby 1.9 and
|
2760
|
+
# in Ruby on Rails, but not really. They don't fix the 25 hour or 61 minute or 61 second rollover and fail in other ways.
|
2761
|
+
|
2762
|
+
m = dateString.match(/^(\d{4})-?(?:(?:([01]\d)-?(?:([0123]\d)(?:T(\d\d):(\d\d):(\d\d)([+-]\d\d:\d\d|Z))?)?)?)?/)
|
2763
|
+
|
2764
|
+
w3 = m[1..3].map{|s| s=s.to_i; s += 1 if s == 0;s} # Map the year, month and day to integers and, if they were nil, set them to 1
|
2765
|
+
w3 += m[4..6].map{|s| s.to_i} # Map the hour, minute and second to integers
|
2766
|
+
w3 << m[-1] # Leave the timezone as a String
|
2767
|
+
|
2768
|
+
# FIXME this next bit needs some serious refactoring
|
2769
|
+
# Rollover times. 0 minutes and 61 seconds -> 1 minute and 1 second
|
2770
|
+
w3[5],r = rollover(w3[5], 60) # rollover seconds
|
2771
|
+
w3[4] += r
|
2772
|
+
w3[4],r = rollover(w3[4], 60) # rollover minutes
|
2773
|
+
w3[3] += r
|
2774
|
+
w3[3],r = rollover(w3[3], 24) # rollover hours
|
2775
|
+
|
2776
|
+
w3[2] = w3[2] + r
|
2777
|
+
if w3[1] > 12
|
2778
|
+
w3[1],r = rollover(w3[1],12)
|
2779
|
+
w3[1] = 12 if w3[1] == 0
|
2780
|
+
w3[0] += r
|
2781
|
+
end
|
2782
|
+
|
2783
|
+
num_days = Time.days_in_month(w3[1], w3[0])
|
2784
|
+
while w3[2] > num_days
|
2785
|
+
w3[2] -= num_days
|
2786
|
+
w3[1] += 1
|
2787
|
+
if w3[1] > 12
|
2788
|
+
w3[0] += 1
|
2789
|
+
w3[1] = set_self(w3[1], 12)
|
2790
|
+
end
|
2791
|
+
num_days = Time.days_in_month(w3[1], w3[0])
|
2792
|
+
end
|
2793
|
+
|
2794
|
+
|
2795
|
+
unless w3[6].class != String
|
2796
|
+
if /^-/ =~ w3[6] # Zone offset goes backwards
|
2797
|
+
w3[6][0] = '+'
|
2798
|
+
elsif /^\+/ =~ w3[6]
|
2799
|
+
w3[6][0] = '-'
|
2800
|
+
end
|
2801
|
+
end
|
2802
|
+
return Time.utc(w3[0], w3[1], w3[2] , w3[3], w3[4], w3[5])+Time.zone_offset(w3[6] || "UTC")
|
2803
|
+
end
|
2804
|
+
|
2805
|
+
def _parse_date_rfc822(dateString)
|
2806
|
+
# Parse an RFC822, RFC1123, RFC2822 or asctime-style date
|
2807
|
+
# These first few lines are to fix up the stupid proprietary format from Disney
|
2808
|
+
unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
|
2809
|
+
'CT' => 'CST', 'MT' => 'MST',
|
2810
|
+
'PT' => 'PST'
|
2811
|
+
}
|
2812
|
+
|
2813
|
+
mon = dateString.split[2]
|
2814
|
+
if mon.length > 3 and Time::RFC2822_MONTH_NAME.include?mon[0..2]
|
2815
|
+
dateString.sub!(mon,mon[0..2])
|
2816
|
+
end
|
2817
|
+
if dateString[-3..-1] != "GMT" and unknown_timezones[dateString[-2..-1]]
|
2818
|
+
dateString[-2..-1] = unknown_timezones[dateString[-2..-1]]
|
2819
|
+
end
|
2820
|
+
# Okay, the Disney date format should be fixed up now.
|
2821
|
+
rfc = dateString.match(/([A-Za-z]{3}), ([0123]\d) ([A-Za-z]{3}) (\d{4})( (\d\d):(\d\d)(?::(\d\d))? ([A-Za-z]{3}))?/)
|
2822
|
+
if rfc.to_a.length > 1 and rfc.to_a.include? nil
|
2823
|
+
dow, day, mon, year, hour, min, sec, tz = rfc[1..-1]
|
2824
|
+
hour,min,sec = [hour,min,sec].map{|e| e.to_s.rjust(2,'0') }
|
2825
|
+
tz ||= "GMT"
|
2826
|
+
end
|
2827
|
+
asctime_match = dateString.match(/([A-Za-z]{3}) ([A-Za-z]{3}) (\d?\d) (\d\d):(\d\d):(\d\d) ([A-Za-z]{3}) (\d\d\d\d)/).to_a
|
2828
|
+
if asctime_match.to_a.length > 1
|
2829
|
+
# Month-abbr dayofmonth hour:minute:second year
|
2830
|
+
dow, mon, day, hour, min, sec, tz, year = asctime_match[1..-1]
|
2831
|
+
day.to_s.rjust(2,'0')
|
2832
|
+
end
|
2833
|
+
if (rfc.to_a.length > 1 and rfc.to_a.include? nil) or asctime_match.to_a.length > 1
|
2834
|
+
ds = "#{dow}, #{day} #{mon} #{year} #{hour}:#{min}:#{sec} #{tz}"
|
2835
|
+
else
|
2836
|
+
ds = dateString
|
2837
|
+
end
|
2838
|
+
t = Time.rfc2822(ds).utc
|
2839
|
+
return t
|
2840
|
+
end
|
2841
|
+
|
2842
|
+
def _parse_date_perforce(aDateString) # FIXME not in 4.1?
|
2843
|
+
# Parse a date in yyyy/mm/dd hh:mm:ss TTT format
|
2844
|
+
# Note that there is a day of the week at the beginning
|
2845
|
+
# Ex. Fri, 2006/09/15 08:19:53 EDT
|
2846
|
+
return Time.parse(aDateString).utc
|
2847
|
+
end
|
2848
|
+
|
2849
|
+
def extract_tuple(atime)
|
2850
|
+
# NOTE leave the error handling to parse_date
|
2851
|
+
t = [atime.year, atime.month, atime.mday, atime.hour,
|
2852
|
+
atime.min, atime.sec, (atime.wday-1) % 7, atime.yday,
|
2853
|
+
atime.isdst
|
2854
|
+
]
|
2855
|
+
# yay for modulus! yaaaaaay! its 530 am and i should be sleeping! yaay!
|
2856
|
+
t[0..-2].map!{|s| s.to_i}
|
2857
|
+
t[-1] = t[-1] ? 1 : 0
|
2858
|
+
return t
|
2859
|
+
end
|
2860
|
+
|
2861
|
+
def parse_date(dateString)
|
2862
|
+
@date_handlers.each do |handler|
|
2863
|
+
begin
|
2864
|
+
$stderr << "Trying date_handler #{handler}\n" if $debug
|
2865
|
+
datething = extract_tuple(send(handler,dateString))
|
2866
|
+
return datething
|
2867
|
+
rescue Exception => e
|
2868
|
+
$stderr << "#{handler} raised #{e}\n" if $debug
|
2869
|
+
end
|
2870
|
+
end
|
2871
|
+
return nil
|
2872
|
+
end
|
2873
|
+
|
2874
|
+
end # End FeedParserMixin
|
2875
|
+
|
2876
|
+
class StrictFeedParser < XML::SAX::HandlerBase # expat
|
2877
|
+
include FeedParserMixin
|
2878
|
+
|
2879
|
+
attr_accessor :bozo, :entries, :feeddata, :exc
|
2880
|
+
def initialize(baseuri, baselang, encoding)
|
2881
|
+
$stderr << "trying StrictFeedParser\n" if $debug
|
2882
|
+
startup(baseuri, baselang, encoding)
|
2883
|
+
@bozo = false
|
2884
|
+
@exc = nil
|
2885
|
+
super()
|
2886
|
+
end
|
2887
|
+
|
2888
|
+
def getPos
|
2889
|
+
[@locator.getSystemId, @locator.getLineNumber]
|
2890
|
+
end
|
2891
|
+
|
2892
|
+
def getAttrs(attrs)
|
2893
|
+
ret = []
|
2894
|
+
for i in 0..attrs.getLength
|
2895
|
+
ret.push([attrs.getName(i), attrs.getValue(i)])
|
2896
|
+
end
|
2897
|
+
ret
|
2898
|
+
end
|
2899
|
+
|
2900
|
+
def setDocumentLocator(loc)
|
2901
|
+
@locator = loc
|
2902
|
+
end
|
2903
|
+
|
2904
|
+
def startDoctypeDecl(name, pub_sys, long_name, uri)
|
2905
|
+
#Nothing is done here. What could we do that is neat and useful?
|
2906
|
+
end
|
2907
|
+
|
2908
|
+
def startNamespaceDecl(prefix, uri)
|
2909
|
+
trackNamespace(prefix, uri)
|
2910
|
+
end
|
2911
|
+
|
2912
|
+
def endNamespaceDecl(prefix)
|
2913
|
+
end
|
2914
|
+
|
2915
|
+
def startElement(name, attrs)
|
2916
|
+
name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
|
2917
|
+
namespaceuri = ($2 || '').downcase
|
2918
|
+
name = $3
|
2919
|
+
if /backend\.userland\.com\/rss/ =~ namespaceuri
|
2920
|
+
# match any backend.userland.com namespace
|
2921
|
+
namespaceuri = 'http://backend.userland.com/rss'
|
2922
|
+
end
|
2923
|
+
prefix = @matchnamespaces[namespaceuri]
|
2924
|
+
# No need to raise UndeclaredNamespace, Expat does that for us with
|
2925
|
+
"unbound prefix (XMLParserError)"
|
2926
|
+
if prefix and not prefix.empty?
|
2927
|
+
name = prefix + ':' + name
|
2928
|
+
end
|
2929
|
+
name.downcase!
|
2930
|
+
unknown_starttag(name, attrs)
|
2931
|
+
end
|
2932
|
+
|
2933
|
+
def character(text, start, length)
|
2934
|
+
#handle_data(CGI.unescapeHTML(text))
|
2935
|
+
handle_data(text)
|
2936
|
+
end
|
2937
|
+
# expat provides "character" not "characters"!
|
2938
|
+
alias :characters :character # Just in case.
|
2939
|
+
|
2940
|
+
def startCdata(content)
|
2941
|
+
handle_data(content)
|
2942
|
+
end
|
2943
|
+
|
2944
|
+
def endElement(name)
|
2945
|
+
name =~ /^(([^;]*);)?(.+)$/ # Snag namespaceuri from name
|
2946
|
+
namespaceuri = ($2 || '').downcase
|
2947
|
+
prefix = @matchnamespaces[namespaceuri]
|
2948
|
+
if prefix and not prefix.empty?
|
2949
|
+
localname = prefix + ':' + name
|
2950
|
+
end
|
2951
|
+
name.downcase!
|
2952
|
+
unknown_endtag(name)
|
2953
|
+
end
|
2954
|
+
|
2955
|
+
def comment(comment)
|
2956
|
+
handle_comment(comment)
|
2957
|
+
end
|
2958
|
+
|
2959
|
+
def entityDecl(*foo)
|
2960
|
+
end
|
2961
|
+
|
2962
|
+
def unparsedEntityDecl(*foo)
|
2963
|
+
end
|
2964
|
+
def error(exc)
|
2965
|
+
@bozo = true
|
2966
|
+
@exc = exc
|
2967
|
+
end
|
2968
|
+
|
2969
|
+
def fatalError(exc)
|
2970
|
+
error(exc)
|
2971
|
+
raise exc
|
2972
|
+
end
|
114
2973
|
end
|
115
|
-
|
2974
|
+
|
2975
|
+
class LooseFeedParser < BetterSGMLParser
|
2976
|
+
include FeedParserMixin
|
2977
|
+
# We write the methods that were in BaseHTMLProcessor in the python code
|
2978
|
+
# in here directly. We do this because if we inherited from
|
2979
|
+
# BaseHTMLProcessor but then included from FeedParserMixin, the methods
|
2980
|
+
# of Mixin would overwrite the methods we inherited from
|
2981
|
+
# BaseHTMLProcessor. This is exactly the opposite of what we want to
|
2982
|
+
# happen!
|
2983
|
+
|
2984
|
+
attr_accessor :encoding, :bozo, :feeddata, :entries, :namespacesInUse
|
2985
|
+
|
2986
|
+
Elements_No_End_Tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
|
2987
|
+
'img', 'input', 'isindex', 'link', 'meta', 'param']
|
2988
|
+
New_Declname_Re = /[a-zA-Z][-_.a-zA-Z0-9:]*\s*/
|
2989
|
+
alias :sgml_feed :feed # feed needs to mapped to feeddata, not the SGMLParser method feed. I think.
|
2990
|
+
def feed
|
2991
|
+
@feeddata
|
2992
|
+
end
|
2993
|
+
def feed=(data)
|
2994
|
+
@feeddata = data
|
2995
|
+
end
|
2996
|
+
|
2997
|
+
def initialize(baseuri, baselang, encoding)
|
2998
|
+
startup(baseuri, baselang, encoding)
|
2999
|
+
super() # Keep the parentheses! No touchy.
|
3000
|
+
end
|
3001
|
+
|
3002
|
+
def reset
|
3003
|
+
@pieces = []
|
3004
|
+
super
|
3005
|
+
end
|
3006
|
+
|
3007
|
+
def parse(data)
|
3008
|
+
data.gsub!(/<!((?!DOCTYPE|--|\[))/i, '<!\1')
|
3009
|
+
data.gsub!(/<([^<\s]+?)\s*\/>/) do |tag|
|
3010
|
+
clean = tag[1..-3].strip
|
3011
|
+
if Elements_No_End_Tag.include?clean
|
3012
|
+
tag
|
3013
|
+
else
|
3014
|
+
'<'+clean+'></'+clean+'>'
|
3015
|
+
end
|
3016
|
+
end
|
3017
|
+
|
3018
|
+
data.gsub!(/'/, "'")
|
3019
|
+
data.gsub!(/"/, "'")
|
3020
|
+
if @encoding and not @encoding.empty? # FIXME unicode check type(u'')
|
3021
|
+
data = uconvert(data,'utf-8',@encoding)
|
3022
|
+
end
|
3023
|
+
sgml_feed(data) # see the alias above
|
3024
|
+
end
|
3025
|
+
|
3026
|
+
|
3027
|
+
def decodeEntities(element, data)
|
3028
|
+
data.gsub!('<', '<')
|
3029
|
+
data.gsub!('<', '<')
|
3030
|
+
data.gsub!('>', '>')
|
3031
|
+
data.gsub!('>', '>')
|
3032
|
+
data.gsub!('&', '&')
|
3033
|
+
data.gsub!('&', '&')
|
3034
|
+
data.gsub!('"', '"')
|
3035
|
+
data.gsub!('"', '"')
|
3036
|
+
data.gsub!(''', ''')
|
3037
|
+
data.gsub!(''', ''')
|
3038
|
+
if @contentparams.has_key? 'type' and not ((@contentparams['type'] || 'xml') =~ /xml$/u)
|
3039
|
+
data.gsub!('<', '<')
|
3040
|
+
data.gsub!('>', '>')
|
3041
|
+
data.gsub!('&', '&')
|
3042
|
+
data.gsub!('"', '"')
|
3043
|
+
data.gsub!(''', "'")
|
3044
|
+
end
|
3045
|
+
return data
|
3046
|
+
end
|
116
3047
|
end
|
117
|
-
|
3048
|
+
|
3049
|
+
def FeedParser.resolveRelativeURIs(htmlSource, baseURI, encoding)
|
3050
|
+
$stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
|
3051
|
+
relative_uris = [ ['a','href'],
|
3052
|
+
['applet','codebase'],
|
3053
|
+
['area','href'],
|
3054
|
+
['blockquote','cite'],
|
3055
|
+
['body','background'],
|
3056
|
+
['del','cite'],
|
3057
|
+
['form','action'],
|
3058
|
+
['frame','longdesc'],
|
3059
|
+
['frame','src'],
|
3060
|
+
['iframe','longdesc'],
|
3061
|
+
['iframe','src'],
|
3062
|
+
['head','profile'],
|
3063
|
+
['img','longdesc'],
|
3064
|
+
['img','src'],
|
3065
|
+
['img','usemap'],
|
3066
|
+
['input','src'],
|
3067
|
+
['input','usemap'],
|
3068
|
+
['ins','cite'],
|
3069
|
+
['link','href'],
|
3070
|
+
['object','classid'],
|
3071
|
+
['object','codebase'],
|
3072
|
+
['object','data'],
|
3073
|
+
['object','usemap'],
|
3074
|
+
['q','cite'],
|
3075
|
+
['script','src'],
|
3076
|
+
]
|
3077
|
+
h = Hpricot(htmlSource)
|
3078
|
+
relative_uris.each do |l|
|
3079
|
+
ename, eattr = l
|
3080
|
+
h.search(ename).each do |elem|
|
3081
|
+
euri = elem.attributes[eattr]
|
3082
|
+
if euri and not euri.empty? and URI.parse(euri).relative?
|
3083
|
+
elem.attributes[eattr] = urljoin(baseURI, euri)
|
3084
|
+
end
|
3085
|
+
end
|
3086
|
+
end
|
3087
|
+
return h.to_html
|
118
3088
|
end
|
119
|
-
|
3089
|
+
|
3090
|
+
class SanitizerDoc < Hpricot::Doc
|
3091
|
+
|
3092
|
+
def scrub
|
3093
|
+
traverse_all_element do |e|
|
3094
|
+
if e.elem?
|
3095
|
+
if Acceptable_Elements.include?e.name
|
3096
|
+
e.strip_attributes
|
3097
|
+
else
|
3098
|
+
if Unacceptable_Elements_With_End_Tag.include?e.name
|
3099
|
+
e.inner_html = ''
|
3100
|
+
end
|
3101
|
+
e.swap(SanitizerDoc.new(e.children).scrub.to_html)
|
3102
|
+
# This works because the children swapped in are brought in "after" the current element.
|
3103
|
+
end
|
3104
|
+
elsif e.doctype?
|
3105
|
+
e.parent.children.delete(e)
|
3106
|
+
elsif e.text?
|
3107
|
+
ets = e.to_s
|
3108
|
+
ets.gsub!(/'/, "'")
|
3109
|
+
ets.gsub!(/"/, '"')
|
3110
|
+
ets.gsub!(/\r/,'')
|
3111
|
+
e.swap(ets)
|
3112
|
+
else
|
3113
|
+
end
|
3114
|
+
end
|
3115
|
+
# yes, that '/' should be there. It's a search method. See the Hpricot docs.
|
3116
|
+
|
3117
|
+
unless $compatible # FIXME not properly recursive, see comment in recursive_strip
|
3118
|
+
(self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
|
3119
|
+
end
|
3120
|
+
return self
|
3121
|
+
end
|
120
3122
|
end
|
121
|
-
|
3123
|
+
|
3124
|
+
def SanitizerDoc(html)
|
3125
|
+
FeedParser::SanitizerDoc.new(Hpricot.make(html))
|
3126
|
+
end
|
3127
|
+
module_function(:SanitizerDoc)
|
3128
|
+
def self.sanitizeHTML(html,encoding)
|
3129
|
+
# FIXME Tidy not yet supported
|
3130
|
+
html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '<!\1')
|
3131
|
+
h = SanitizerDoc(html)
|
3132
|
+
h = h.scrub
|
3133
|
+
return h.to_html.strip
|
122
3134
|
end
|
123
3135
|
|
124
3136
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
3137
|
+
|
3138
|
+
def self.getCharacterEncoding(feed, xml_data)
|
3139
|
+
# Get the character encoding of the XML document
|
3140
|
+
$stderr << "In getCharacterEncoding\n" if $debug
|
3141
|
+
sniffed_xml_encoding = nil
|
3142
|
+
xml_encoding = nil
|
3143
|
+
true_encoding = nil
|
3144
|
+
begin
|
3145
|
+
http_headers = feed.meta
|
3146
|
+
http_content_type = feed.meta['content-type'].split(';')[0]
|
3147
|
+
encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
|
3148
|
+
http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
|
3149
|
+
http_encoding = nil if http_encoding.empty?
|
3150
|
+
# FIXME Open-Uri returns iso8859-1 if there is no charset header,
|
3151
|
+
# but that doesn't pass the tests. Open-Uri claims its following
|
3152
|
+
# the right RFC. Are they wrong or do we need to change the tests?
|
3153
|
+
rescue NoMethodError
|
3154
|
+
http_headers = {}
|
3155
|
+
http_content_type = nil
|
3156
|
+
http_encoding = nil
|
3157
|
+
end
|
3158
|
+
# Must sniff for non-ASCII-compatible character encodings before
|
3159
|
+
# searching for XML declaration. This heuristic is defined in
|
3160
|
+
# section F of the XML specification:
|
3161
|
+
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
|
3162
|
+
begin
|
3163
|
+
if xml_data[0..3] == "\x4c\x6f\xa7\x94"
|
3164
|
+
# EBCDIC
|
3165
|
+
xml_data = _ebcdic_to_ascii(xml_data)
|
3166
|
+
elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
|
3167
|
+
# UTF-16BE
|
3168
|
+
sniffed_xml_encoding = 'utf-16be'
|
3169
|
+
xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
|
3170
|
+
elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
|
3171
|
+
# UTF-16BE with BOM
|
3172
|
+
sniffed_xml_encoding = 'utf-16be'
|
3173
|
+
xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
|
3174
|
+
elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
|
3175
|
+
# UTF-16LE
|
3176
|
+
sniffed_xml_encoding = 'utf-16le'
|
3177
|
+
xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
|
3178
|
+
elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
|
3179
|
+
# UTF-16LE with BOM
|
3180
|
+
sniffed_xml_encoding = 'utf-16le'
|
3181
|
+
xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
|
3182
|
+
elsif xml_data[0..3] == "\x00\x00\x00\x3c"
|
3183
|
+
# UTF-32BE
|
3184
|
+
sniffed_xml_encoding = 'utf-32be'
|
3185
|
+
xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
|
3186
|
+
elsif xml_data[0..3] == "\x3c\x00\x00\x00"
|
3187
|
+
# UTF-32LE
|
3188
|
+
sniffed_xml_encoding = 'utf-32le'
|
3189
|
+
xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
|
3190
|
+
elsif xml_data[0..3] == "\x00\x00\xfe\xff"
|
3191
|
+
# UTF-32BE with BOM
|
3192
|
+
sniffed_xml_encoding = 'utf-32be'
|
3193
|
+
xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
|
3194
|
+
elsif xml_data[0..3] == "\xff\xfe\x00\x00"
|
3195
|
+
# UTF-32LE with BOM
|
3196
|
+
sniffed_xml_encoding = 'utf-32le'
|
3197
|
+
xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
|
3198
|
+
elsif xml_data[0..2] == "\xef\xbb\xbf"
|
3199
|
+
# UTF-8 with BOM
|
3200
|
+
sniffed_xml_encoding = 'utf-8'
|
3201
|
+
xml_data = xml_data[3..-1]
|
3202
|
+
else
|
3203
|
+
# ASCII-compatible
|
3204
|
+
end
|
3205
|
+
xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
|
3206
|
+
rescue
|
3207
|
+
xml_encoding_match = nil
|
3208
|
+
end
|
3209
|
+
if xml_encoding_match
|
3210
|
+
xml_encoding = xml_encoding_match[1].downcase
|
3211
|
+
xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
|
3212
|
+
if sniffed_xml_encoding and xencodings.include?xml_encoding
|
3213
|
+
xml_encoding = sniffed_xml_encoding
|
3214
|
+
end
|
3215
|
+
end
|
3216
|
+
|
3217
|
+
acceptable_content_type = false
|
3218
|
+
application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
|
3219
|
+
text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
|
3220
|
+
|
3221
|
+
if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
|
3222
|
+
acceptable_content_type = true
|
3223
|
+
true_encoding = http_encoding || xml_encoding || 'utf-8'
|
3224
|
+
elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
|
3225
|
+
acceptable_content_type = true
|
3226
|
+
true_encoding = http_encoding || 'us-ascii'
|
3227
|
+
elsif /^text\// =~ http_content_type
|
3228
|
+
true_encoding = http_encoding || 'us-ascii'
|
3229
|
+
elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
|
3230
|
+
true_encoding = xml_encoding || 'iso-8859-1'
|
3231
|
+
else
|
3232
|
+
true_encoding = xml_encoding || 'utf-8'
|
3233
|
+
end
|
3234
|
+
return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
|
3235
|
+
end
|
3236
|
+
|
3237
|
+
# Changes an XML data stream on the fly to specify a new encoding.
#
# data     -- a raw sequence of bytes (not Unicode) presumed to be in
#             +encoding+ already
# encoding -- a string naming that encoding (recognized by uconvert/iconv)
#
# Returns the data transcoded to utf-8, with any leading byte-order mark
# stripped and the XML declaration rewritten (or prepended) to declare
# utf-8. Raises the underlying conversion error when the data cannot be
# transcoded -- callers try a list of candidate encodings and rescue around
# this method, so propagating is the correct failure mode.
def self.toUTF8(data, encoding)
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
  # A byte-order mark, when present, is more authoritative than the
  # caller-supplied encoding: switch to the encoding it implies, then
  # strip it so the converter never sees it.
  # NOTE we must use double quotes when dealing with \x encodings!
  if data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00"
    # UTF-16 big-endian BOM
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-16be instead\n" if encoding != 'utf-16be'
    end
    encoding = 'utf-16be'
    data = data[2..-1]
  elsif data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00"
    # UTF-16 little-endian BOM
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
    end
    encoding = 'utf-16le'
    data = data[2..-1]
  elsif data[0..2] == "\xef\xbb\xbf"
    # UTF-8 BOM
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
    end
    encoding = 'utf-8'
    data = data[3..-1]
  elsif data[0..3] == "\x00\x00\xfe\xff"
    # UTF-32 big-endian BOM
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-32be instead\n" if encoding != 'utf-32be'
    end
    encoding = 'utf-32be'
    data = data[4..-1]
  elsif data[0..3] == "\xff\xfe\x00\x00"
    # UTF-32 little-endian BOM
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying utf-32le instead\n" if encoding != 'utf-32le'
    end
    encoding = 'utf-32le'
    data = data[4..-1]
  end
  begin
    newdata = uconvert(data, encoding, 'utf-8')
  rescue => details
    # BUGFIX: the old code rescued here and fell through with a nil
    # newdata, crashing below with a misleading TypeError. Propagate the
    # real conversion error instead; callers rescue and try the next
    # candidate encoding.
    $stderr << "uconvert from #{encoding} failed: #{details.message}\n" if $debug
    raise
  end
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
  # Rewrite (or add) the XML declaration so downstream parsers see the
  # encoding the data is actually in now.
  declmatch = /^<\?xml[^>]*?>/
  newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
  if declmatch =~ newdata
    newdata.sub!(declmatch, newdecl)
  else
    newdata = newdecl + "\n" + newdata
  end
  return newdata
end
|
3302
|
+
|
3303
|
+
# Strips DOCTYPE declarations (and inline ENTITY definitions) from an XML
# document.
#
# data -- the XML document as a String
#
# Returns [rss_version, stripped_data]:
#   rss_version   -- 'rss091n' when the DOCTYPE references Netscape's
#                    RSS 0.91 DTD, otherwise nil
#   stripped_data -- the same XML document, minus every DOCTYPE/ENTITY
def self.stripDoctype(data)
  # Remove inline entity declarations first. /m (Regexp::MULTILINE) lets
  # `.` span newlines inside a declaration.
  entity_pattern = /<!ENTITY(.*?)>/m
  data = data.gsub(entity_pattern, '')

  doctype_pattern = /<!DOCTYPE(.*?)>/m
  doctype_results = data.scan(doctype_pattern)
  if doctype_results and doctype_results[0]
    doctype = doctype_results[0][0]
  else
    doctype = ''
  end

  # Netscape's DTD identifies the nonstandard RSS 0.91 dialect.
  if /netscape/ =~ doctype.downcase
    version = 'rss091n'
  else
    version = nil
  end

  # BUGFIX: use gsub, not sub. The Python original used re.sub, which
  # removes *every* DOCTYPE; sub left any extras behind for the parser.
  data = data.gsub(doctype_pattern, '')
  return version, data
end
3329
|
+
|
3330
|
+
# Top-level convenience wrapper: delegates straight to FeedParser.parse so
# callers can write parse(url, options) without naming the module.
def parse(*args)
  FeedParser.parse(*args)
end
|
3331
|
+
def FeedParser.parse(furi, options={})
|
145
3332
|
# Parse a feed from a URL, file, stream or string
|
146
3333
|
$compatible = options[:compatible] || $compatible # Use the default compatibility if compatible is nil
|
147
|
-
strictklass = options[:strict] || StrictFeedParser
|
148
|
-
looseklass = options[:loose] || LooseFeedParser
|
149
3334
|
result = FeedParserDict.new
|
150
3335
|
result['feed'] = FeedParserDict.new
|
151
3336
|
result['entries'] = []
|
@@ -155,12 +3340,13 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
155
3340
|
end
|
156
3341
|
result['bozo'] = false
|
157
3342
|
handlers = options[:handlers]
|
3343
|
+
|
158
3344
|
if handlers.class != Array # FIXME why does this happen?
|
159
3345
|
handlers = [handlers]
|
160
3346
|
end
|
161
3347
|
|
162
3348
|
begin
|
163
|
-
if
|
3349
|
+
if URI::parse(furi).class == URI::Generic
|
164
3350
|
f = open(furi) # OpenURI doesn't behave well when passing HTTP options to a file.
|
165
3351
|
else
|
166
3352
|
# And when you do pass them, make sure they aren't just nil (this still true?)
|
@@ -327,7 +3513,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
327
3513
|
if use_strict_parser
|
328
3514
|
# initialize the SAX parser
|
329
3515
|
saxparser = XML::SAX::Helpers::ParserFactory.makeParser("XML::Parser::SAXDriver")
|
330
|
-
feedparser =
|
3516
|
+
feedparser = StrictFeedParser.new(baseuri, baselang, 'utf-8')
|
331
3517
|
saxparser.setDocumentHandler(feedparser)
|
332
3518
|
saxparser.setDTDHandler(feedparser)
|
333
3519
|
saxparser.setEntityResolver(feedparser)
|
@@ -348,7 +3534,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
348
3534
|
end
|
349
3535
|
end
|
350
3536
|
if not use_strict_parser
|
351
|
-
feedparser =
|
3537
|
+
feedparser = LooseFeedParser.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
|
352
3538
|
feedparser.parse(data)
|
353
3539
|
$stderr << "Using LooseFeed\n\n" if $debug
|
354
3540
|
end
|
@@ -358,7 +3544,6 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
358
3544
|
result['namespaces'] = feedparser.namespacesInUse
|
359
3545
|
return result
|
360
3546
|
end
|
361
|
-
module_function(:parse)
|
362
3547
|
end # End FeedParser module
|
363
3548
|
|
364
3549
|
class Serializer
|
@@ -398,7 +3583,7 @@ class TextSerializer < Serializer
|
|
398
3583
|
end
|
399
3584
|
end
|
400
3585
|
|
401
|
-
class PprintSerializer < Serializer # FIXME use pp instead
|
3586
|
+
class PprintSerializer < Serializer # FIXME ? use pp instead?
|
402
3587
|
def write(stream = $stdout)
|
403
3588
|
stream << @results['href'].to_s + "\n\n"
|
404
3589
|
pp(@results)
|
@@ -406,88 +3591,87 @@ class PprintSerializer < Serializer # FIXME use pp instead
|
|
406
3591
|
end
|
407
3592
|
end
|
408
3593
|
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
3594
|
+
|
3595
|
+
# ---------------------------------------------------------------------------
# Command-line driver: parse each URL left on the command line and dump the
# result with the chosen serializer.
# NOTE(review): this runs unconditionally at require time; it likely belongs
# behind an `if $0 == __FILE__` guard -- left unchanged here to avoid
# altering module-level behavior.
require 'optparse'
require 'ostruct'
options = OpenStruct.new
options.etag = options.modified = options.agent = options.referrer = nil
options.content_language = options.content_location = options.ctype = nil
options.format = 'pprint'          # default; becomes :text/:pprint when set via -f
options.compatible = $compatible
options.verbose = false

opts = OptionParser.new do |parser|  # renamed block param: the old `opts` shadowed the outer variable
  parser.banner
  parser.separator ""
  parser.on("-A", "--user-agent [AGENT]",
            "User-Agent for HTTP URLs") { |agent|
    options.agent = agent
  }
  parser.on("-e", "--referrer [URL]",
            "Referrer for HTTP URLs") { |referrer|
    options.referrer = referrer
  }
  parser.on("-t", "--etag [TAG]",
            "ETag/If-None-Match for HTTP URLs") { |etag|
    options.etag = etag
  }
  parser.on("-m", "--last-modified [DATE]",
            "Last-modified/If-Modified-Since for HTTP URLs (any supported date format)") { |modified|
    options.modified = modified
  }
  parser.on("-f", "--format [FORMAT]", [:text, :pprint],
            "output results in FORMAT (text, pprint)") { |format|
    options.format = format
  }
  parser.on("-v", "--[no-]verbose",
            "write debugging information to stderr") { |v|
    options.verbose = v
  }
  parser.on("-c", "--[no-]compatible",
            "strip element attributes like feedparser.py 4.1 (default)") { |comp|
    options.compatible = comp
  }
  parser.on("-l", "--content-location [LOCATION]",
            "default Content-Location HTTP header") { |loc|
    options.content_location = loc
  }
  parser.on("-a", "--content-language [LANG]",
            "default Content-Language HTTP header") { |lang|
    options.content_language = lang
  }
  # BUGFIX: this switch previously reused "-t", silently clobbering the
  # --etag short option registered above; give it its own short flag.
  parser.on("-T", "--content-type [TYPE]",
            "default Content-type HTTP header") { |ctype|
    options.ctype = ctype
  }
end

opts.parse!(ARGV)
$debug = true if options.verbose
$compatible = options.compatible unless options.compatible.nil?

if options.format == :text
  serializer = TextSerializer
else
  serializer = PprintSerializer
end
# opts.parse! removed the switches, leaving only the URLs in ARGV.
# BUGFIX: plain dup instead of `args = *ARGV.dup` -- the splat assignment
# yields a bare String (not a one-element Array) for a single URL on 1.8.
args = ARGV.dup
unless args.nil?
  args.each do |url|
    results = FeedParser.parse(url, :etag => options.etag,
                               :modified => options.modified,
                               :agent => options.agent,
                               :referrer => options.referrer,
                               :content_location => options.content_location,
                               :content_language => options.content_language,
                               :content_type => options.ctype
                               )
    serializer.new(results).write($stdout)
  end
end
|