rfeedparser 0.9.931 → 0.9.940
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rfeedparser.rb +143 -58
- data/lib/rfeedparser/aliases.rb +1 -1
- data/lib/rfeedparser/better_attributelist.rb +11 -11
- data/lib/rfeedparser/better_sgmlparser.rb +1 -1
- data/lib/rfeedparser/encoding_helpers.rb +120 -127
- data/lib/rfeedparser/feedparserdict.rb +30 -20
- data/lib/rfeedparser/forgiving_uri.rb +9 -7
- data/lib/rfeedparser/markup_helpers.rb +11 -14
- data/lib/rfeedparser/parser_mixin.rb +16 -11
- data/lib/rfeedparser/parsers.rb +1 -2
- data/lib/rfeedparser/scrub.rb +95 -90
- data/lib/rfeedparser/time_helpers.rb +379 -379
- data/lib/rfeedparser/utilities.rb +23 -0
- data/tests/rfeedparser_test_helper.rb +262 -0
- data/tests/rfeedparserserver.rb +3 -109
- data/tests/rfeedparsertest.rb +6 -165
- data/tests/rfponly/http/200.xml +30 -0
- data/tests/rfponly/http/220.xml +28 -0
- data/tests/rfponly/http/300.xml +8 -0
- data/tests/rfponly/http/300.xml_redirect +25 -0
- data/tests/rfponly/http/301.xml +8 -0
- data/tests/rfponly/http/301.xml_redirect +25 -0
- data/tests/rfponly/http/302.xml +8 -0
- data/tests/rfponly/http/302.xml_redirect +25 -0
- data/tests/rfponly/http/307.xml +8 -0
- data/tests/rfponly/http/307.xml_redirect +25 -0
- data/tests/rfponly/http/320.xml +8 -0
- data/tests/rfponly/http/320.xml_redirect +25 -0
- data/tests/rfponly/http/400.xml +7 -0
- data/tests/rfponly/http/404.xml +7 -0
- data/tests/rfponly/http/410.xml +7 -0
- data/tests/rfponly/http/420.xml +7 -0
- data/tests/rfponly/http/500.xml +7 -0
- data/tests/rfponly/http/520.xml +7 -0
- data/tests/rfponly/http/etag.xml +28 -0
- data/tests/rfponly/http/lastmodified.xml +29 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
- metadata +31 -3
@@ -1,5 +1,5 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
module
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
module FeedParser
|
3
3
|
class FeedParserDict < Hash
|
4
4
|
=begin
|
5
5
|
The naming of a certain common attribute (such as, "When was the last
|
@@ -16,26 +16,36 @@ module FeedParserUtilities
|
|
16
16
|
methods check with keymaps to see what attribute the developer "really
|
17
17
|
means" if they've asked for one which happens to be in @@keymap's keys.
|
18
18
|
=end
|
19
|
-
@@keymap = {
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
19
|
+
@@keymap = {
|
20
|
+
'channel' => 'feed',
|
21
|
+
'items' => 'entries',
|
22
|
+
'guid' => 'id',
|
23
|
+
'date' => 'updated',
|
24
|
+
'date_parsed' => 'updated_parsed',
|
25
|
+
'description' => ['subtitle', 'summary'],
|
26
|
+
'url' => ['href'],
|
27
|
+
'modified' => 'updated',
|
28
|
+
'modified_parsed' => 'updated_parsed',
|
29
|
+
'issued' => 'published',
|
30
|
+
'issued_parsed' => 'published_parsed',
|
31
|
+
'copyright' => 'rights',
|
32
|
+
'copyright_detail' => 'rights_detail',
|
33
|
+
'tagline' => 'subtitle',
|
34
|
+
'tagline_detail' => 'subtitle_detail'
|
35
|
+
}
|
36
|
+
|
37
|
+
# Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
|
38
|
+
alias :hash_entries :entries
|
39
|
+
def entries
|
40
|
+
self['entries']
|
37
41
|
end
|
38
42
|
|
43
|
+
# Added to avoid deprecated method warnings
|
44
|
+
alias :object_type :type
|
45
|
+
def type
|
46
|
+
self['type']
|
47
|
+
end
|
48
|
+
|
39
49
|
# We could include the [] rewrite in new using Hash.new's fancy pants block thing
|
40
50
|
# but we'd still have to overwrite []= and such.
|
41
51
|
# I'm going to make it easy to turn lists of pairs into FeedParserDicts though.
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
# From Robert Aman's GentleCMS URI.
|
3
3
|
# GentleCMS, Copyright (c) 2006 Robert Aman
|
4
4
|
#
|
@@ -535,7 +535,7 @@ class ForgivingURI
|
|
535
535
|
|
536
536
|
# Merges two URIs together.
|
537
537
|
def merge(uri)
|
538
|
-
return self + uri
|
538
|
+
return (self + uri)
|
539
539
|
end
|
540
540
|
|
541
541
|
# Destructive form of merge.
|
@@ -940,12 +940,14 @@ class ForgivingURI
|
|
940
940
|
def urljoin(base, uri)
|
941
941
|
urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
|
942
942
|
uri = uri.sub(urifixer, '\1\3')
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
if
|
947
|
-
|
943
|
+
pbase = ForgivingURI.parse(base) rescue nil
|
944
|
+
if pbase && pbase.absolute?
|
945
|
+
puri = ForgivingURI.parse(uri) rescue nil
|
946
|
+
if puri && puri.relative?
|
947
|
+
# ForgivingURI.join does the wrong thing. What the hell.
|
948
|
+
return ForgivingURI.join(base, uri).to_s.gsub(/[^:]\/{2,}/, '')
|
948
949
|
end
|
949
950
|
end
|
951
|
+
return uri
|
950
952
|
end
|
951
953
|
|
@@ -1,14 +1,11 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
module FeedParserUtilities
|
3
|
-
#FIXME we need to find a better place for this method
|
4
3
|
def stripDoctype(data)
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
rss_version may be 'rss091n' or None
|
9
|
-
stripped_data is the same XML document, minus the DOCTYPE
|
10
|
-
=end
|
4
|
+
#Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
|
5
|
+
#rss_version may be 'rss091n' or nil
|
6
|
+
#stripped_data is the same XML document, minus the DOCTYPE
|
11
7
|
entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
|
8
|
+
|
12
9
|
data = data.gsub(entity_pattern,'')
|
13
10
|
|
14
11
|
doctype_pattern = /<!DOCTYPE(.*?)>/m
|
@@ -27,7 +24,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
|
|
27
24
|
data = data.sub(doctype_pattern, '')
|
28
25
|
return version, data
|
29
26
|
end
|
30
|
-
|
27
|
+
|
31
28
|
def resolveRelativeURIs(htmlSource, baseURI, encoding)
|
32
29
|
$stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
|
33
30
|
relative_uris = [ ['a','href'],
|
@@ -60,11 +57,11 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
|
|
60
57
|
relative_uris.each do |l|
|
61
58
|
ename, eattr = l
|
62
59
|
h.search(ename).each do |elem|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
euri = elem.attributes[eattr]
|
61
|
+
uri = ForgivingURI.parse(URI.encode(euri)) rescue nil
|
62
|
+
if euri and not euri.empty? and uri and uri.relative?
|
63
|
+
elem.raw_attributes[eattr] = urljoin(baseURI, euri)
|
64
|
+
end
|
68
65
|
end
|
69
66
|
end
|
70
67
|
return h.to_html
|
@@ -1,5 +1,7 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
module FeedParser
|
2
3
|
module FeedParserMixin
|
4
|
+
include FeedParserUtilities
|
3
5
|
attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
|
4
6
|
|
5
7
|
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
|
@@ -103,10 +105,6 @@ module FeedParserMixin
|
|
103
105
|
if baselang
|
104
106
|
@feeddata['language'] = baselang.gsub('_','-')
|
105
107
|
end
|
106
|
-
@date_handlers = [:_parse_date_rfc822,
|
107
|
-
:_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
|
108
|
-
:_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
|
109
|
-
]
|
110
108
|
$stderr << "Leaving startup\n" if $debug # My addition
|
111
109
|
end
|
112
110
|
|
@@ -873,7 +871,9 @@ module FeedParserMixin
|
|
873
871
|
|
874
872
|
def _end_published
|
875
873
|
value = pop('published')
|
876
|
-
|
874
|
+
d = parse_date(value)
|
875
|
+
_save('published_parsed', extract_tuple(d))
|
876
|
+
_save('published_time', d)
|
877
877
|
end
|
878
878
|
alias :_end_dcterms_issued :_end_published
|
879
879
|
alias :_end_issued :_end_published
|
@@ -888,7 +888,9 @@ module FeedParserMixin
|
|
888
888
|
|
889
889
|
def _end_updated
|
890
890
|
value = pop('updated')
|
891
|
-
|
891
|
+
d = parse_date(value)
|
892
|
+
_save('updated_parsed', extract_tuple(d))
|
893
|
+
_save('updated_time', d)
|
892
894
|
end
|
893
895
|
alias :_end_modified :_end_updated
|
894
896
|
alias :_end_dcterms_modified :_end_updated
|
@@ -902,7 +904,9 @@ module FeedParserMixin
|
|
902
904
|
|
903
905
|
def _end_created
|
904
906
|
value = pop('created')
|
905
|
-
|
907
|
+
d = parse_date(value)
|
908
|
+
_save('created_parsed', extract_tuple(d))
|
909
|
+
_save('created_time', d)
|
906
910
|
end
|
907
911
|
alias :_end_dcterms_created :_end_created
|
908
912
|
|
@@ -910,7 +914,9 @@ module FeedParserMixin
|
|
910
914
|
push('expired', true)
|
911
915
|
end
|
912
916
|
def _end_expirationdate
|
913
|
-
|
917
|
+
d = parse_date(pop('expired'))
|
918
|
+
_save('expired_parsed', extract_tuple(d))
|
919
|
+
_save('expired_time', d)
|
914
920
|
end
|
915
921
|
|
916
922
|
def _start_cc_license(attrsD)
|
@@ -1234,5 +1240,4 @@ module FeedParserMixin
|
|
1234
1240
|
end
|
1235
1241
|
|
1236
1242
|
end # End FeedParserMixin
|
1237
|
-
|
1238
|
-
|
1243
|
+
end
|
data/lib/rfeedparser/parsers.rb
CHANGED
data/lib/rfeedparser/scrub.rb
CHANGED
@@ -1,47 +1,45 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
gem 'hpricot', "=0.5"
|
3
|
-
require 'hpricot'
|
1
|
+
#!/usr/bin/env ruby
|
4
2
|
# This used to be based on Michael Moen's Hpricot#scrub, but that seems to
|
5
3
|
# have only been part of its evolution. Hpricot#scrub is cool code, though.
|
6
4
|
# http://underpantsgnome.com/2007/01/20/hpricot-scrub
|
7
5
|
module Hpricot
|
8
6
|
Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
7
|
+
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
8
|
+
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
9
|
+
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
10
|
+
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
11
|
+
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
12
|
+
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
13
|
+
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
14
|
+
'ul', 'var'
|
17
15
|
]
|
18
16
|
|
19
17
|
Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
18
|
+
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
19
|
+
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
20
|
+
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
21
|
+
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
22
|
+
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
23
|
+
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
24
|
+
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
25
|
+
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
26
|
+
'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
|
27
|
+
'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
|
30
28
|
]
|
31
29
|
|
32
30
|
Unacceptable_Elements_With_End_Tag = ['script', 'applet']
|
33
31
|
|
34
32
|
Acceptable_Css_Properties = ['azimuth', 'background-color',
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
33
|
+
'border-bottom-color', 'border-collapse', 'border-color',
|
34
|
+
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
35
|
+
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
36
|
+
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
37
|
+
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
38
|
+
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
39
|
+
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
40
|
+
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
41
|
+
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
42
|
+
'white-space', 'width'
|
45
43
|
]
|
46
44
|
|
47
45
|
# survey of common keywords found in feeds
|
@@ -82,38 +80,38 @@ module Hpricot
|
|
82
80
|
|
83
81
|
# svgtiny + class + opacity + offset + xmlns + xmlns:xlink
|
84
82
|
Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
83
|
+
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
84
|
+
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
85
|
+
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
|
86
|
+
'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
|
87
|
+
'font-size', 'font-stretch', 'font-style', 'font-variant',
|
88
|
+
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
|
89
|
+
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
|
90
|
+
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
|
91
|
+
'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
|
92
|
+
'origin', 'overline-position', 'overline-thickness', 'panose-1',
|
93
|
+
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
|
94
|
+
'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
|
95
|
+
'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
|
96
|
+
'stop-color', 'stop-opacity', 'strikethrough-position',
|
97
|
+
'strikethrough-thickness', 'stroke', 'stroke-dasharray',
|
98
|
+
'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
|
99
|
+
'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
|
100
|
+
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
|
101
|
+
'underline-position', 'underline-thickness', 'unicode',
|
102
|
+
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
|
103
|
+
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
|
104
|
+
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
|
105
|
+
'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
|
106
|
+
'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
|
109
107
|
]
|
110
108
|
|
111
109
|
Svg_Attr_Map = nil
|
112
110
|
Svg_Elem_Map = nil
|
113
111
|
|
114
112
|
Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
115
|
-
|
116
|
-
|
113
|
+
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
114
|
+
'stroke-opacity'
|
117
115
|
]
|
118
116
|
|
119
117
|
unless $compatible
|
@@ -148,11 +146,9 @@ module Hpricot
|
|
148
146
|
class Elem
|
149
147
|
def strip_attributes
|
150
148
|
unless attributes.nil?
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
end
|
155
|
-
end
|
149
|
+
ra = {}
|
150
|
+
raw_attributes.keys.each{|atr| ra[atr] = raw_attributes[atr] if Acceptable_Attributes.include?(atr) }
|
151
|
+
self.raw_attributes = ra
|
156
152
|
end
|
157
153
|
end
|
158
154
|
end
|
@@ -160,35 +156,44 @@ end
|
|
160
156
|
|
161
157
|
module FeedParserUtilities
|
162
158
|
class SanitizerDoc < Hpricot::Doc
|
163
|
-
|
159
|
+
|
164
160
|
def scrub
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
161
|
+
others = children.map do |e|
|
162
|
+
if e.elem?
|
163
|
+
if Acceptable_Elements.include?e.name
|
164
|
+
e.strip_attributes
|
165
|
+
e.inner_html = SanitizerDoc.new(e.children).scrub
|
166
|
+
result = e
|
167
|
+
else
|
168
|
+
result = e
|
169
|
+
|
170
|
+
if Unacceptable_Elements_With_End_Tag.include?e.name
|
171
|
+
result = nil
|
172
|
+
end
|
173
|
+
|
174
|
+
if result
|
175
|
+
result = SanitizerDoc.new(result.children).scrub # The important part
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
elsif e.doctype?
|
180
|
+
result = nil
|
181
|
+
|
182
|
+
elsif e.text?
|
183
|
+
ets = e.to_html
|
184
|
+
ets.gsub!(/'/, "'")
|
185
|
+
ets.gsub!(/"/, '"')
|
186
|
+
ets.gsub!(/\r/,'')
|
187
|
+
result = ets
|
188
|
+
end
|
189
|
+
result
|
185
190
|
end
|
186
|
-
|
191
|
+
|
187
192
|
unless $compatible # FIXME nonworking
|
188
|
-
|
189
|
-
|
193
|
+
# yes, that '/' should be there. It's a search method. See the Hpricot docs.
|
194
|
+
(self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
|
190
195
|
end
|
191
|
-
return
|
196
|
+
return others.compact.join
|
192
197
|
end
|
193
198
|
end
|
194
199
|
|
@@ -200,8 +205,8 @@ module FeedParserUtilities
|
|
200
205
|
def sanitizeHTML(html,encoding)
|
201
206
|
# FIXME Tidy not yet supported
|
202
207
|
html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '<!\1')
|
203
|
-
|
204
|
-
|
205
|
-
|
208
|
+
h = SanitizerDoc(html)
|
209
|
+
h = h.scrub
|
210
|
+
return h.strip
|
206
211
|
end
|
207
212
|
end
|