rfeedparser 0.9.92 → 0.9.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +106 -105
- data/lib/rfeedparser/better_sgmlparser.rb +84 -84
- data/lib/rfeedparser/encoding_helpers.rb +4 -3
- data/lib/rfeedparser/parser_mixin.rb +121 -118
- data/lib/rfeedparser/parsers.rb +31 -30
- data/lib/rfeedparser/scrub.rb +1 -1
- data/lib/rfeedparser/time_helpers.rb +52 -54
- data/tests/rfponly/wellformed/mrss/mrss_media_content.xml +20 -0
- data/tests/rfponly/wellformed/mrss/mrss_thumbnail.xml +21 -0
- metadata +10 -5
data/lib/rfeedparser/time_helpers.rb
CHANGED
@@ -20,35 +20,33 @@ module FeedParserMixin
|
|
20
20
|
# FIXME The century regexp maybe not work ('\d\d$' says "two numbers at
|
21
21
|
# end of line" but we then attach more of a regexp.
|
22
22
|
iso8601_regexps = [ '^(\d{4})-?([01]\d)-([0123]\d)',
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
]
|
23
|
+
'^(\d{4})-([01]\d)',
|
24
|
+
'^(\d{4})-?([0123]\d\d)',
|
25
|
+
'^(\d\d)-?([01]\d)-?([0123]\d)',
|
26
|
+
'^(\d\d)-?([0123]\d\d)',
|
27
|
+
'^(\d{4})',
|
28
|
+
'-(\d\d)-?([01]\d)',
|
29
|
+
'-([0123]\d\d)',
|
30
|
+
'-(\d\d)',
|
31
|
+
'--([01]\d)-?([0123]\d)',
|
32
|
+
'--([01]\d)',
|
33
|
+
'---([0123]\d)',
|
34
|
+
'(\d\d$)',
|
35
|
+
'' ]
|
37
36
|
iso8601_values = { '^(\d{4})-?([01]\d)-([0123]\d)' => ['year', 'month', 'day'],
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
}
|
37
|
+
'^(\d{4})-([01]\d)' => ['year','month'],
|
38
|
+
'^(\d{4})-?([0123]\d\d)' => ['year', 'ordinal'],
|
39
|
+
'^(\d\d)-?([01]\d)-?([0123]\d)' => ['year','month','day'],
|
40
|
+
'^(\d\d)-?([0123]\d\d)' => ['year','ordinal'],
|
41
|
+
'^(\d{4})' => ['year'],
|
42
|
+
'-(\d\d)-?([01]\d)' => ['year','month'],
|
43
|
+
'-([0123]\d\d)' => ['ordinal'],
|
44
|
+
'-(\d\d)' => ['year'],
|
45
|
+
'--([01]\d)-?([0123]\d)' => ['month','day'],
|
46
|
+
'--([01]\d)' => ['month'],
|
47
|
+
'---([0123]\d)' => ['day'],
|
48
|
+
'(\d\d$)' => ['century'],
|
49
|
+
'' => [] }
|
52
50
|
add_to_all = '(T?(\d\d):(\d\d)(?::(\d\d))?([+-](\d\d)(?::(\d\d))?|Z)?)?'
|
53
51
|
add_to_all_fields = ['hour', 'minute', 'second', 'tz', 'tzhour', 'tzmin']
|
54
52
|
# NOTE We use '(?:' to prevent grouping of optional matches (ones trailed
|
@@ -86,9 +84,9 @@ module FeedParserMixin
|
|
86
84
|
# ordinals are NOT normalized by mktime, we simulate them
|
87
85
|
# by setting month=1, day=ordinal
|
88
86
|
if ordinal
|
89
|
-
|
87
|
+
month = DateTime.ordinal(year,ordinal).month
|
90
88
|
else
|
91
|
-
|
89
|
+
month = Time.now.utc.month
|
92
90
|
end
|
93
91
|
end
|
94
92
|
month = month.to_i unless month.nil?
|
@@ -96,11 +94,11 @@ module FeedParserMixin
|
|
96
94
|
if day.nil? or day.empty?
|
97
95
|
# see above
|
98
96
|
if ordinal
|
99
|
-
|
97
|
+
day = DateTime.ordinal(year,ordinal).day
|
100
98
|
elsif params['century'] or params['year'] or params['month']
|
101
|
-
|
99
|
+
day = 1
|
102
100
|
else
|
103
|
-
|
101
|
+
day = Time.now.utc.day
|
104
102
|
end
|
105
103
|
else
|
106
104
|
day = day.to_i
|
@@ -124,13 +122,13 @@ module FeedParserMixin
|
|
124
122
|
if tz and not tz.empty? and tz != 'Z'
|
125
123
|
# FIXME does this cross over days?
|
126
124
|
if tz[0] == '-'
|
127
|
-
|
128
|
-
|
125
|
+
tm[3] += params['tzhour'].to_i
|
126
|
+
tm[4] += params['tzmin'].to_i
|
129
127
|
elsif tz[0] == '+'
|
130
|
-
|
131
|
-
|
128
|
+
tm[3] -= params['tzhour'].to_i
|
129
|
+
tm[4] -= params['tzmin'].to_i
|
132
130
|
else
|
133
|
-
|
131
|
+
return nil
|
134
132
|
end
|
135
133
|
end
|
136
134
|
return Time.utc(*tm) # Magic!
|
@@ -148,7 +146,7 @@ module FeedParserMixin
|
|
148
146
|
korean_onblog_date_re = /(\d{4})#{korean_year}\s+(\d{2})#{korean_month}\s+(\d{2})#{korean_day}\s+(\d{2}):(\d{2}):(\d{2})/
|
149
147
|
|
150
148
|
|
151
|
-
|
149
|
+
m = korean_onblog_date_re.match(dateString)
|
152
150
|
return unless m
|
153
151
|
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
|
154
152
|
|
@@ -163,7 +161,7 @@ module FeedParserMixin
|
|
163
161
|
korean_pm = u("오후") # bfc0 c8c4 in euc-kr
|
164
162
|
|
165
163
|
korean_nate_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(#{korean_am}|#{korean_pm})\s+(\d{0,2}):(\d{0,2}):(\d{0,2})/
|
166
|
-
|
164
|
+
m = korean_nate_date_re.match(dateString)
|
167
165
|
return unless m
|
168
166
|
hour = m[5].to_i
|
169
167
|
ampm = m[4]
|
@@ -179,7 +177,7 @@ module FeedParserMixin
|
|
179
177
|
def _parse_date_mssql(dateString)
|
180
178
|
mssql_date_re = /(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?/
|
181
179
|
|
182
|
-
|
180
|
+
m = mssql_date_re.match(dateString)
|
183
181
|
return unless m
|
184
182
|
w3dtfdate = "#{m[1]}-#{m[2]}-#{m[3]}T#{m[4]}:#{m[5]}:#{m[6]}+09:00"
|
185
183
|
$stderr << "MS SQL date parsed as: %s\n" % w3dtfdate if $debug
|
@@ -223,7 +221,7 @@ module FeedParserMixin
|
|
223
221
|
|
224
222
|
greek_date_format = /([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)/
|
225
223
|
|
226
|
-
|
224
|
+
m = greek_date_format.match(dateString)
|
227
225
|
return unless m
|
228
226
|
begin
|
229
227
|
wday = greek_wdays[m[1]]
|
@@ -239,7 +237,7 @@ module FeedParserMixin
|
|
239
237
|
def _parse_date_hungarian(dateString)
|
240
238
|
# Parse a string according to a Hungarian 8-bit date format.
|
241
239
|
hungarian_date_format_re = /(\d{4})-([^-]+)-(\d{0,2})T(\d{0,2}):(\d{2})((\+|-)(\d{0,2}:\d{2}))/
|
242
|
-
|
240
|
+
m = hungarian_date_format_re.match(dateString)
|
243
241
|
return unless m
|
244
242
|
|
245
243
|
# Unicode strings for Hungarian date strings
|
@@ -314,8 +312,8 @@ module FeedParserMixin
|
|
314
312
|
w3[2] -= num_days
|
315
313
|
w3[1] += 1
|
316
314
|
if w3[1] > 12
|
317
|
-
|
318
|
-
|
315
|
+
w3[0] += 1
|
316
|
+
w3[1] = set_self(w3[1], 12)
|
319
317
|
end
|
320
318
|
num_days = Time.days_in_month(w3[1], w3[0])
|
321
319
|
end
|
@@ -323,9 +321,9 @@ module FeedParserMixin
|
|
323
321
|
|
324
322
|
unless w3[6].class != String
|
325
323
|
if /^-/ =~ w3[6] # Zone offset goes backwards
|
326
|
-
|
324
|
+
w3[6][0] = '+'
|
327
325
|
elsif /^\+/ =~ w3[6]
|
328
|
-
|
326
|
+
w3[6][0] = '-'
|
329
327
|
end
|
330
328
|
end
|
331
329
|
return Time.utc(w3[0], w3[1], w3[2] , w3[3], w3[4], w3[5])+Time.zone_offset(w3[6] || "UTC")
|
@@ -335,8 +333,8 @@ module FeedParserMixin
|
|
335
333
|
# Parse an RFC822, RFC1123, RFC2822 or asctime-style date
|
336
334
|
# These first few lines are to fix up the stupid proprietary format from Disney
|
337
335
|
unknown_timezones = { 'AT' => 'EDT', 'ET' => 'EST',
|
338
|
-
|
339
|
-
|
336
|
+
'CT' => 'CST', 'MT' => 'MST',
|
337
|
+
'PT' => 'PST'
|
340
338
|
}
|
341
339
|
|
342
340
|
mon = dateString.split[2]
|
@@ -390,11 +388,11 @@ module FeedParserMixin
|
|
390
388
|
def parse_date(dateString)
|
391
389
|
@date_handlers.each do |handler|
|
392
390
|
begin
|
393
|
-
|
394
|
-
|
395
|
-
|
391
|
+
$stderr << "Trying date_handler #{handler}\n" if $debug
|
392
|
+
datething = extract_tuple(send(handler,dateString))
|
393
|
+
return datething
|
396
394
|
rescue Exception => e
|
397
|
-
|
395
|
+
$stderr << "#{handler} raised #{e}\n" if $debug
|
398
396
|
end
|
399
397
|
end
|
400
398
|
return nil
|
@@ -403,6 +401,6 @@ end
|
|
403
401
|
|
404
402
|
module FeedParserUtilities
|
405
403
|
def py2rtime(pytuple)
|
406
|
-
Time.utc(pytuple[0..5])
|
404
|
+
return Time.utc(*pytuple[0..5]) unless pytuple.blank?
|
407
405
|
end
|
408
406
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
<!--
|
2
|
+
Description: interprets media:content
|
3
|
+
Expect: not bozo and entries[0]['enclosures'][0]['href'] == u'http://www.webmonkey.com/monkeyrock.mpg'
|
4
|
+
-->
|
5
|
+
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
|
6
|
+
<channel>
|
7
|
+
<title>Some Bands I Like</title>
|
8
|
+
<link>http://www.andyvolk.com/webmonkey/bands/</link>
|
9
|
+
<description>A list of some bands I like (or have been a member of).</description>
|
10
|
+
|
11
|
+
<item>
|
12
|
+
<title>Rocking Webmonkey Garage Band</title>
|
13
|
+
<link>http://www.webmonkey.com/ourband.html</link>
|
14
|
+
<description>The best ever garage band on the Internet.</description>
|
15
|
+
<guid isPermaLink="false"> http://www.webmonkey.com/ourband.html</guid>
|
16
|
+
<media:content url="http://www.webmonkey.com/monkeyrock.mpg" fileSize="2471632" type="video/mpeg" height="240" width="320" duration="147" medium="video" isDefault="true">
|
17
|
+
</media:content>
|
18
|
+
</item>
|
19
|
+
</channel>
|
20
|
+
</rss>
|
@@ -0,0 +1,21 @@
|
|
1
|
+
<!--
|
2
|
+
Description: interprets media:content
|
3
|
+
Expect: not bozo and entries[0]['enclosures'][1]['href'] == u'http://www.webmonkey.com/images/monkeyrock-thumb.jpg'
|
4
|
+
-->
|
5
|
+
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
|
6
|
+
<channel>
|
7
|
+
<title>Some Bands I Like</title>
|
8
|
+
<link>http://www.andyvolk.com/webmonkey/bands/</link>
|
9
|
+
<description>A list of some bands I like (or have been a member of).</description>
|
10
|
+
|
11
|
+
<item>
|
12
|
+
<title>Rocking Webmonkey Garage Band</title>
|
13
|
+
<link>http://www.webmonkey.com/ourband.html</link>
|
14
|
+
<description>The best ever garage band on the Internet.</description>
|
15
|
+
<guid isPermaLink="false"> http://www.webmonkey.com/ourband.html</guid>
|
16
|
+
<media:content url="http://www.webmonkey.com/monkeyrock.mpg" fileSize="2471632" type="video/mpeg" height="240" width="320" duration="147" medium="video" isDefault="true">
|
17
|
+
<media:thumbnail url="http://www.webmonkey.com/images/monkeyrock-thumb.jpg" height="98" width="145"/>
|
18
|
+
</media:content>
|
19
|
+
</item>
|
20
|
+
</channel>
|
21
|
+
</rss>
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.
|
2
|
+
rubygems_version: 0.9.4
|
3
3
|
specification_version: 1
|
4
4
|
name: rfeedparser
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.92
|
7
|
-
date: 2007-
|
6
|
+
version: 0.9.93
|
7
|
+
date: 2007-07-21 00:00:00 -07:00
|
8
8
|
summary: Parse RSS and Atom feeds in Ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -1822,6 +1822,11 @@ files:
|
|
1822
1822
|
- tests/illformed/sanitize/item_xhtml_body_style.xml
|
1823
1823
|
- tests/rfeedparserserver.rb
|
1824
1824
|
- tests/rfeedparsertest.rb
|
1825
|
+
- tests/rfponly
|
1826
|
+
- tests/rfponly/wellformed
|
1827
|
+
- tests/rfponly/wellformed/mrss
|
1828
|
+
- tests/rfponly/wellformed/mrss/mrss_media_content.xml
|
1829
|
+
- tests/rfponly/wellformed/mrss/mrss_thumbnail.xml
|
1825
1830
|
- tests/wellformed
|
1826
1831
|
- tests/wellformed/amp
|
1827
1832
|
- tests/wellformed/amp/amp01.xml
|
@@ -3426,7 +3431,7 @@ dependencies:
|
|
3426
3431
|
requirements:
|
3427
3432
|
- - ">="
|
3428
3433
|
- !ruby/object:Gem::Version
|
3429
|
-
version: "1.
|
3434
|
+
version: "1.1"
|
3430
3435
|
version:
|
3431
3436
|
- !ruby/object:Gem::Dependency
|
3432
3437
|
name: activesupport
|
@@ -3442,7 +3447,7 @@ dependencies:
|
|
3442
3447
|
version_requirement:
|
3443
3448
|
version_requirements: !ruby/object:Gem::Version::Requirement
|
3444
3449
|
requirements:
|
3445
|
-
- - "
|
3450
|
+
- - "="
|
3446
3451
|
- !ruby/object:Gem::Version
|
3447
3452
|
version: "0.5"
|
3448
3453
|
version:
|