rfeedparser 0.9.92 → 0.9.93
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rfeedparser.rb +106 -105
- data/lib/rfeedparser/better_sgmlparser.rb +84 -84
- data/lib/rfeedparser/encoding_helpers.rb +4 -3
- data/lib/rfeedparser/parser_mixin.rb +121 -118
- data/lib/rfeedparser/parsers.rb +31 -30
- data/lib/rfeedparser/scrub.rb +1 -1
- data/lib/rfeedparser/time_helpers.rb +52 -54
- data/tests/rfponly/wellformed/mrss/mrss_media_content.xml +20 -0
- data/tests/rfponly/wellformed/mrss/mrss_thumbnail.xml +21 -0
- metadata +10 -5
data/lib/rfeedparser/time_helpers.rb
CHANGED
@@ -20,35 +20,33 @@ module FeedParserMixin
|
|
20
20
|
# FIXME The century regexp maybe not work ('\d\d$' says "two numbers at
|
21
21
|
# end of line" but we then attach more of a regexp.
|
22
22
|
iso8601_regexps = [ '^(\d{4})-?([01]\d)-([0123]\d)',
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
]
|
23
|
+
'^(\d{4})-([01]\d)',
|
24
|
+
'^(\d{4})-?([0123]\d\d)',
|
25
|
+
'^(\d\d)-?([01]\d)-?([0123]\d)',
|
26
|
+
'^(\d\d)-?([0123]\d\d)',
|
27
|
+
'^(\d{4})',
|
28
|
+
'-(\d\d)-?([01]\d)',
|
29
|
+
'-([0123]\d\d)',
|
30
|
+
'-(\d\d)',
|
31
|
+
'--([01]\d)-?([0123]\d)',
|
32
|
+
'--([01]\d)',
|
33
|
+
'---([0123]\d)',
|
34
|
+
'(\d\d$)',
|
35
|
+
'' ]
|
37
36
|
iso8601_values = { '^(\d{4})-?([01]\d)-([0123]\d)' => ['year', 'month', 'day'],
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
}
|
37
|
+
'^(\d{4})-([01]\d)' => ['year','month'],
|
38
|
+
'^(\d{4})-?([0123]\d\d)' => ['year', 'ordinal'],
|
39
|
+
'^(\d\d)-?([01]\d)-?([0123]\d)' => ['year','month','day'],
|
40
|
+
'^(\d\d)-?([0123]\d\d)' => ['year','ordinal'],
|
41
|
+
'^(\d{4})' => ['year'],
|
42
|
+
'-(\d\d)-?([01]\d)' => ['year','month'],
|
43
|
+
'-([0123]\d\d)' => ['ordinal'],
|
44
|
+
'-(\d\d)' => ['year'],
|
45
|
+
'--([01]\d)-?([0123]\d)' => ['month','day'],
|
46
|
+
'--([01]\d)' => ['month'],
|
47
|
+
'---([0123]\d)' => ['day'],
|
48
|
+
'(\d\d$)' => ['century'],
|
49
|
+
'' => [] }
|
52
50
|
add_to_all = '(T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
|
53
51
|
add_to_all_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']
|
54
52
|
# NOTE We use '(?:' to prevent grouping of optional matches (ones trailed
|
@@ -86,9 +84,9 @@ module FeedParserMixin
|
|
86
84
|
# ordinals are NOT normalized by mktime, we simulate them
|
87
85
|
# by setting month=1, day=ordinal
|
88
86
|
if ordinal
|
89
|
-
|
87
|
+
month = DateTime.ordinal(year,ordinal).month
|
90
88
|
else
|
91
|
-
|
89
|
+
month = Time.now.utc.month
|
92
90
|
end
|
93
91
|
end
|
94
92
|
month = month.to_i unless month.nil?
|
@@ -96,11 +94,11 @@ module FeedParserMixin
|
|
96
94
|
if day.nil? or day.empty?
|
97
95
|
# see above
|
98
96
|
if ordinal
|
99
|
-
|
97
|
+
day = DateTime.ordinal(year,ordinal).day
|
100
98
|
elsif params['century'] or params['year'] or params['month']
|
101
|
-
|
99
|
+
day = 1
|
102
100
|
else
|
103
|
-
|
101
|
+
day = Time.now.utc.day
|
104
102
|
end
|
105
103
|
else
|
106
104
|
day = day.to_i
|
@@ -124,13 +122,13 @@ module FeedParserMixin
|
|
124
122
|
if tz and not tz.empty? and tz != 'Z'
|
125
123
|
# FIXME does this cross over days?
|
126
124
|
if tz[0] == '-'
|
127
|
-
|
128
|
-
|
125
|
+
tm[3] += params['tzhour'].to_i
|
126
|
+
tm[4] += params['tzmin'].to_i
|
129
127
|
elsif tz[0] == '+'
|
130
|
-
|
131
|
-
|
128
|
+
tm[3] -= params['tzhour'].to_i
|
129
|
+
tm[4] -= params['tzmin'].to_i
|
132
130
|
else
|
133
|
-
|
131
|
+
return nil
|
134
132
|
end
|
135
133
|
end
|
136
134
|
return Time.utc(*tm) # Magic!
|
@@ -148,7 +146,7 @@ module FeedParserMixin
|
|
148
146
|
korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
|
149
147
|
|
150
148
|
|
151
|
-
|
149
|
+
m = korean_onblog_date_re.match(dateString)
|
152
150
|
return unless m
|
153
151
|
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
|
154
152
|
|
@@ -163,7 +161,7 @@ module FeedParserMixin
|
|
163
161
|
korean_pm = u("오후") # bfc0 c8c4 in euc-kr
|
164
162
|
|
165
163
|
korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
|
166
|
-
|
164
|
+
m = korean_nate_date_re.match(dateString)
|
167
165
|
return unless m
|
168
166
|
hour = m[5].to_i
|
169
167
|
ampm = m[4]
|
@@ -179,7 +177,7 @@ module FeedParserMixin
|
|
179
177
|
def _parse_date_mssql(dateString)
|
180
178
|
mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
|
181
179
|
|
182
|
-
|
180
|
+
m = mssql_date_re.match(dateString)
|
183
181
|
return unless m
|
184
182
|
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
|
185
183
|
$stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
|
@@ -223,7 +221,7 @@ module FeedParserMixin
|
|
223
221
|
|
224
222
|
greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
|
225
223
|
|
226
|
-
|
224
|
+
m = greek_date_format.match(dateString)
|
227
225
|
return unless m
|
228
226
|
begin
|
229
227
|
wday = greek_wdays[m[1]]
|
@@ -239,7 +237,7 @@ module FeedParserMixin
|
|
239
237
|
def _parse_date_hungarian(dateString)
|
240
238
|
# Parse a string according to a Hungarian 8-bit date format.
|
241
239
|
hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
|
242
|
-
|
240
|
+
m = hungarian_date_format_re.match(dateString)
|
243
241
|
return unless m
|
244
242
|
|
245
243
|
# Unicode strings for Hungarian date strings
|
@@ -314,8 +312,8 @@ module FeedParserMixin
|
|
314
312
|
w3[2] -= num_days
|
315
313
|
w3[1] += 1
|
316
314
|
if w3[1] > 12
|
317
|
-
|
318
|
-
|
315
|
+
w3[0] += 1
|
316
|
+
w3[1] = set_self(w3[1], 12)
|
319
317
|
end
|
320
318
|
num_days = Time.days_in_month(w3[1], w3[0])
|
321
319
|
end
|
@@ -323,9 +321,9 @@ module FeedParserMixin
|
|
323
321
|
|
324
322
|
unless w3[6].class != String
|
325
323
|
if /^-/ =~ w3[6] # Zone offset goes backwards
|
326
|
-
|
324
|
+
w3[6][0] = '+'
|
327
325
|
elsif /^\+/ =~ w3[6]
|
328
|
-
|
326
|
+
w3[6][0] = '-'
|
329
327
|
end
|
330
328
|
end
|
331
329
|
return Time.utc(w3[0], w3[1], w3[2] , w3[3], w3[4], w3[5])+Time.zone_offset(w3[6] || "UTC")
|
@@ -335,8 +333,8 @@ module FeedParserMixin
|
|
335
333
|
# Parse an RFC822, RFC1123, RFC2822 or asctime-style date
|
336
334
|
# These first few lines are to fix up the stupid proprietary format from Disney
|
337
335
|
unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
|
338
|
-
|
339
|
-
|
336
|
+
'CT' => 'CST', 'MT' => 'MST',
|
337
|
+
'PT' => 'PST'
|
340
338
|
}
|
341
339
|
|
342
340
|
mon = dateString.split[2]
|
@@ -390,11 +388,11 @@ module FeedParserMixin
|
|
390
388
|
def parse_date(dateString)
|
391
389
|
@date_handlers.each do |handler|
|
392
390
|
begin
|
393
|
-
|
394
|
-
|
395
|
-
|
391
|
+
$stderr << "Trying date_handler #{handler}\n" if $debug
|
392
|
+
datething = extract_tuple(send(handler,dateString))
|
393
|
+
return datething
|
396
394
|
rescue Exception => e
|
397
|
-
|
395
|
+
$stderr << "#{handler} raised #{e}\n" if $debug
|
398
396
|
end
|
399
397
|
end
|
400
398
|
return nil
|
@@ -403,6 +401,6 @@ end
|
|
403
401
|
|
404
402
|
module FeedParserUtilities
|
405
403
|
def py2rtime(pytuple)
|
406
|
-
Time.utc(pytuple[0..5])
|
404
|
+
return Time.utc(*pytuple[0..5]) unless pytuple.blank?
|
407
405
|
end
|
408
406
|
end
|
data/tests/rfponly/wellformed/mrss/mrss_media_content.xml
@@ -0,0 +1,20 @@
|
|
1
|
+
<!--
|
2
|
+
Description: interprets media:content
|
3
|
+
Expect: not bozo and entries[0]['enclosures'][0]['href'] == u'http://www.webmonkey.com/monkeyrock.mpg'
|
4
|
+
-->
|
5
|
+
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
|
6
|
+
<channel>
|
7
|
+
<title>Some Bands I Like</title>
|
8
|
+
<link>http://www.andyvolk.com/webmonkey/bands/</link>
|
9
|
+
<description>A list of some bands I like (or have been a member of).</description>
|
10
|
+
|
11
|
+
<item>
|
12
|
+
<title>Rocking Webmonkey Garage Band</title>
|
13
|
+
<link>http://www.webmonkey.com/ourband.html</link>
|
14
|
+
<description>The best ever garage band on the Internet.</description>
|
15
|
+
<guid isPermaLink="false"> http://www.webmonkey.com/ourband.html</guid>
|
16
|
+
<media:content url="http://www.webmonkey.com/monkeyrock.mpg" fileSize="2471632" type="video/mpeg" height="240" width="320" duration="147" medium="video" isDefault="true">
|
17
|
+
</media:content>
|
18
|
+
</item>
|
19
|
+
</channel>
|
20
|
+
</rss>
|
data/tests/rfponly/wellformed/mrss/mrss_thumbnail.xml
@@ -0,0 +1,21 @@
|
|
1
|
+
<!--
|
2
|
+
Description: interprets media:thumbnail
|
3
|
+
Expect: not bozo and entries[0]['enclosures'][1]['href'] == u'http://www.webmonkey.com/images/monkeyrock-thumb.jpg'
|
4
|
+
-->
|
5
|
+
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
|
6
|
+
<channel>
|
7
|
+
<title>Some Bands I Like</title>
|
8
|
+
<link>http://www.andyvolk.com/webmonkey/bands/</link>
|
9
|
+
<description>A list of some bands I like (or have been a member of).</description>
|
10
|
+
|
11
|
+
<item>
|
12
|
+
<title>Rocking Webmonkey Garage Band</title>
|
13
|
+
<link>http://www.webmonkey.com/ourband.html</link>
|
14
|
+
<description>The best ever garage band on the Internet.</description>
|
15
|
+
<guid isPermaLink="false"> http://www.webmonkey.com/ourband.html</guid>
|
16
|
+
<media:content url="http://www.webmonkey.com/monkeyrock.mpg" fileSize="2471632" type="video/mpeg" height="240" width="320" duration="147" medium="video" isDefault="true">
|
17
|
+
<media:thumbnail url="http://www.webmonkey.com/images/monkeyrock-thumb.jpg" height="98" width="145"/>
|
18
|
+
</media:content>
|
19
|
+
</item>
|
20
|
+
</channel>
|
21
|
+
</rss>
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.
|
2
|
+
rubygems_version: 0.9.4
|
3
3
|
specification_version: 1
|
4
4
|
name: rfeedparser
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.92
|
7
|
-
date: 2007-
|
6
|
+
version: 0.9.93
|
7
|
+
date: 2007-07-21 00:00:00 -07:00
|
8
8
|
summary: Parse RSS and Atom feeds in Ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -1822,6 +1822,11 @@ files:
|
|
1822
1822
|
- tests/illformed/sanitize/item_xhtml_body_style.xml
|
1823
1823
|
- tests/rfeedparserserver.rb
|
1824
1824
|
- tests/rfeedparsertest.rb
|
1825
|
+
- tests/rfponly
|
1826
|
+
- tests/rfponly/wellformed
|
1827
|
+
- tests/rfponly/wellformed/mrss
|
1828
|
+
- tests/rfponly/wellformed/mrss/mrss_media_content.xml
|
1829
|
+
- tests/rfponly/wellformed/mrss/mrss_thumbnail.xml
|
1825
1830
|
- tests/wellformed
|
1826
1831
|
- tests/wellformed/amp
|
1827
1832
|
- tests/wellformed/amp/amp01.xml
|
@@ -3426,7 +3431,7 @@ dependencies:
|
|
3426
3431
|
requirements:
|
3427
3432
|
- - ">="
|
3428
3433
|
- !ruby/object:Gem::Version
|
3429
|
-
version: "1.
|
3434
|
+
version: "1.1"
|
3430
3435
|
version:
|
3431
3436
|
- !ruby/object:Gem::Dependency
|
3432
3437
|
name: activesupport
|
@@ -3442,7 +3447,7 @@ dependencies:
|
|
3442
3447
|
version_requirement:
|
3443
3448
|
version_requirements: !ruby/object:Gem::Version::Requirement
|
3444
3449
|
requirements:
|
3445
|
-
- - "
|
3450
|
+
- - "="
|
3446
3451
|
- !ruby/object:Gem::Version
|
3447
3452
|
version: "0.5"
|
3448
3453
|
version:
|