rfeedparser 0.9.931 → 0.9.940
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +143 -58
- data/lib/rfeedparser/aliases.rb +1 -1
- data/lib/rfeedparser/better_attributelist.rb +11 -11
- data/lib/rfeedparser/better_sgmlparser.rb +1 -1
- data/lib/rfeedparser/encoding_helpers.rb +120 -127
- data/lib/rfeedparser/feedparserdict.rb +30 -20
- data/lib/rfeedparser/forgiving_uri.rb +9 -7
- data/lib/rfeedparser/markup_helpers.rb +11 -14
- data/lib/rfeedparser/parser_mixin.rb +16 -11
- data/lib/rfeedparser/parsers.rb +1 -2
- data/lib/rfeedparser/scrub.rb +95 -90
- data/lib/rfeedparser/time_helpers.rb +379 -379
- data/lib/rfeedparser/utilities.rb +23 -0
- data/tests/rfeedparser_test_helper.rb +262 -0
- data/tests/rfeedparserserver.rb +3 -109
- data/tests/rfeedparsertest.rb +6 -165
- data/tests/rfponly/http/200.xml +30 -0
- data/tests/rfponly/http/220.xml +28 -0
- data/tests/rfponly/http/300.xml +8 -0
- data/tests/rfponly/http/300.xml_redirect +25 -0
- data/tests/rfponly/http/301.xml +8 -0
- data/tests/rfponly/http/301.xml_redirect +25 -0
- data/tests/rfponly/http/302.xml +8 -0
- data/tests/rfponly/http/302.xml_redirect +25 -0
- data/tests/rfponly/http/307.xml +8 -0
- data/tests/rfponly/http/307.xml_redirect +25 -0
- data/tests/rfponly/http/320.xml +8 -0
- data/tests/rfponly/http/320.xml_redirect +25 -0
- data/tests/rfponly/http/400.xml +7 -0
- data/tests/rfponly/http/404.xml +7 -0
- data/tests/rfponly/http/410.xml +7 -0
- data/tests/rfponly/http/420.xml +7 -0
- data/tests/rfponly/http/500.xml +7 -0
- data/tests/rfponly/http/520.xml +7 -0
- data/tests/rfponly/http/etag.xml +28 -0
- data/tests/rfponly/http/lastmodified.xml +29 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
- metadata +31 -3
@@ -1,5 +1,5 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
module
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
module FeedParser
|
3
3
|
class FeedParserDict < Hash
|
4
4
|
=begin
|
5
5
|
The naming of a certain common attribute (such as, "When was the last
|
@@ -16,26 +16,36 @@ module FeedParserUtilities
|
|
16
16
|
methods check with keymaps to see what attribute the developer "really
|
17
17
|
means" if they've asked for one which happens to be in @@keymap's keys.
|
18
18
|
=end
|
19
|
-
@@keymap = {
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
19
|
+
@@keymap = {
|
20
|
+
'channel' => 'feed',
|
21
|
+
'items' => 'entries',
|
22
|
+
'guid' => 'id',
|
23
|
+
'date' => 'updated',
|
24
|
+
'date_parsed' => 'updated_parsed',
|
25
|
+
'description' => ['subtitle', 'summary'],
|
26
|
+
'url' => ['href'],
|
27
|
+
'modified' => 'updated',
|
28
|
+
'modified_parsed' => 'updated_parsed',
|
29
|
+
'issued' => 'published',
|
30
|
+
'issued_parsed' => 'published_parsed',
|
31
|
+
'copyright' => 'rights',
|
32
|
+
'copyright_detail' => 'rights_detail',
|
33
|
+
'tagline' => 'subtitle',
|
34
|
+
'tagline_detail' => 'subtitle_detail'
|
35
|
+
}
|
36
|
+
|
37
|
+
# Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
|
38
|
+
alias :hash_entries :entries
|
39
|
+
def entries
|
40
|
+
self['entries']
|
37
41
|
end
|
38
42
|
|
43
|
+
# Added to avoid deprecated method warnings
|
44
|
+
alias :object_type :type
|
45
|
+
def type
|
46
|
+
self['type']
|
47
|
+
end
|
48
|
+
|
39
49
|
# We could include the [] rewrite in new using Hash.new's fancy pants block thing
|
40
50
|
# but we'd still have to overwrite []= and such.
|
41
51
|
# I'm going to make it easy to turn lists of pairs into FeedParserDicts, though.
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
# From Robert Aman's GentleCMS URI.
|
3
3
|
# GentleCMS, Copyright (c) 2006 Robert Aman
|
4
4
|
#
|
@@ -535,7 +535,7 @@ class ForgivingURI
|
|
535
535
|
|
536
536
|
# Merges two URIs together.
|
537
537
|
def merge(uri)
|
538
|
-
return self + uri
|
538
|
+
return (self + uri)
|
539
539
|
end
|
540
540
|
|
541
541
|
# Destructive form of merge.
|
@@ -940,12 +940,14 @@ class ForgivingURI
|
|
940
940
|
def urljoin(base, uri)
|
941
941
|
urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
|
942
942
|
uri = uri.sub(urifixer, '\1\3')
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
if
|
947
|
-
|
943
|
+
pbase = ForgivingURI.parse(base) rescue nil
|
944
|
+
if pbase && pbase.absolute?
|
945
|
+
puri = ForgivingURI.parse(uri) rescue nil
|
946
|
+
if puri && puri.relative?
|
947
|
+
# ForgivingURI.join does the wrong thing. What the hell.
|
948
|
+
return ForgivingURI.join(base, uri).to_s.gsub(/[^:]\/{2,}/, '')
|
948
949
|
end
|
949
950
|
end
|
951
|
+
return uri
|
950
952
|
end
|
951
953
|
|
@@ -1,14 +1,11 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
module FeedParserUtilities
|
3
|
-
#FIXME we need to find a better place for this method
|
4
3
|
def stripDoctype(data)
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
rss_version may be 'rss091n' or None
|
9
|
-
stripped_data is the same XML document, minus the DOCTYPE
|
10
|
-
=end
|
4
|
+
#Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
|
5
|
+
#rss_version may be 'rss091n' or None
|
6
|
+
#stripped_data is the same XML document, minus the DOCTYPE
|
11
7
|
entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
|
8
|
+
|
12
9
|
data = data.gsub(entity_pattern,'')
|
13
10
|
|
14
11
|
doctype_pattern = /<!DOCTYPE(.*?)>/m
|
@@ -27,7 +24,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
|
|
27
24
|
data = data.sub(doctype_pattern, '')
|
28
25
|
return version, data
|
29
26
|
end
|
30
|
-
|
27
|
+
|
31
28
|
def resolveRelativeURIs(htmlSource, baseURI, encoding)
|
32
29
|
$stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
|
33
30
|
relative_uris = [ ['a','href'],
|
@@ -60,11 +57,11 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
|
|
60
57
|
relative_uris.each do |l|
|
61
58
|
ename, eattr = l
|
62
59
|
h.search(ename).each do |elem|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
euri = elem.attributes[eattr]
|
61
|
+
uri = ForgivingURI.parse(URI.encode(euri)) rescue nil
|
62
|
+
if euri and not euri.empty? and uri and uri.relative?
|
63
|
+
elem.raw_attributes[eattr] = urljoin(baseURI, euri)
|
64
|
+
end
|
68
65
|
end
|
69
66
|
end
|
70
67
|
return h.to_html
|
@@ -1,5 +1,7 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
module FeedParser
|
2
3
|
module FeedParserMixin
|
4
|
+
include FeedParserUtilities
|
3
5
|
attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
|
4
6
|
|
5
7
|
def startup(baseuri=nil, baselang=nil, encoding='utf-8')
|
@@ -103,10 +105,6 @@ module FeedParserMixin
|
|
103
105
|
if baselang
|
104
106
|
@feeddata['language'] = baselang.gsub('_','-')
|
105
107
|
end
|
106
|
-
@date_handlers = [:_parse_date_rfc822,
|
107
|
-
:_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
|
108
|
-
:_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
|
109
|
-
]
|
110
108
|
$stderr << "Leaving startup\n" if $debug # My addition
|
111
109
|
end
|
112
110
|
|
@@ -873,7 +871,9 @@ module FeedParserMixin
|
|
873
871
|
|
874
872
|
def _end_published
|
875
873
|
value = pop('published')
|
876
|
-
|
874
|
+
d = parse_date(value)
|
875
|
+
_save('published_parsed', extract_tuple(d))
|
876
|
+
_save('published_time', d)
|
877
877
|
end
|
878
878
|
alias :_end_dcterms_issued :_end_published
|
879
879
|
alias :_end_issued :_end_published
|
@@ -888,7 +888,9 @@ module FeedParserMixin
|
|
888
888
|
|
889
889
|
def _end_updated
|
890
890
|
value = pop('updated')
|
891
|
-
|
891
|
+
d = parse_date(value)
|
892
|
+
_save('updated_parsed', extract_tuple(d))
|
893
|
+
_save('updated_time', d)
|
892
894
|
end
|
893
895
|
alias :_end_modified :_end_updated
|
894
896
|
alias :_end_dcterms_modified :_end_updated
|
@@ -902,7 +904,9 @@ module FeedParserMixin
|
|
902
904
|
|
903
905
|
def _end_created
|
904
906
|
value = pop('created')
|
905
|
-
|
907
|
+
d = parse_date(value)
|
908
|
+
_save('created_parsed', extract_tuple(d))
|
909
|
+
_save('created_time', d)
|
906
910
|
end
|
907
911
|
alias :_end_dcterms_created :_end_created
|
908
912
|
|
@@ -910,7 +914,9 @@ module FeedParserMixin
|
|
910
914
|
push('expired', true)
|
911
915
|
end
|
912
916
|
def _end_expirationdate
|
913
|
-
|
917
|
+
d = parse_date(pop('expired'))
|
918
|
+
_save('expired_parsed', extract_tuple(d))
|
919
|
+
_save('expired_time', d)
|
914
920
|
end
|
915
921
|
|
916
922
|
def _start_cc_license(attrsD)
|
@@ -1234,5 +1240,4 @@ module FeedParserMixin
|
|
1234
1240
|
end
|
1235
1241
|
|
1236
1242
|
end # End FeedParserMixin
|
1237
|
-
|
1238
|
-
|
1243
|
+
end
|
data/lib/rfeedparser/parsers.rb
CHANGED
data/lib/rfeedparser/scrub.rb
CHANGED
@@ -1,47 +1,45 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
gem 'hpricot', "=0.5"
|
3
|
-
require 'hpricot'
|
1
|
+
#!/usr/bin/env ruby
|
4
2
|
# This used to be based on Michael Moen's Hpricot#scrub, but that seems to
|
5
3
|
# have only been part of its evolution. Hpricot#scrub is cool code, though.
|
6
4
|
# http://underpantsgnome.com/2007/01/20/hpricot-scrub
|
7
5
|
module Hpricot
|
8
6
|
Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
7
|
+
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
8
|
+
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
9
|
+
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
10
|
+
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
11
|
+
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
12
|
+
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
13
|
+
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
14
|
+
'ul', 'var'
|
17
15
|
]
|
18
16
|
|
19
17
|
Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
18
|
+
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
19
|
+
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
20
|
+
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
21
|
+
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
22
|
+
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
23
|
+
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
24
|
+
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
25
|
+
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
26
|
+
'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
|
27
|
+
'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
|
30
28
|
]
|
31
29
|
|
32
30
|
Unacceptable_Elements_With_End_Tag = ['script', 'applet']
|
33
31
|
|
34
32
|
Acceptable_Css_Properties = ['azimuth', 'background-color',
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
33
|
+
'border-bottom-color', 'border-collapse', 'border-color',
|
34
|
+
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
35
|
+
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
36
|
+
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
37
|
+
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
38
|
+
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
39
|
+
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
40
|
+
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
41
|
+
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
42
|
+
'white-space', 'width'
|
45
43
|
]
|
46
44
|
|
47
45
|
# survey of common keywords found in feeds
|
@@ -82,38 +80,38 @@ module Hpricot
|
|
82
80
|
|
83
81
|
# svgtiny + class + opacity + offset + xmlns + xmlns:xlink
|
84
82
|
Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
83
|
+
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
84
|
+
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
85
|
+
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
|
86
|
+
'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
|
87
|
+
'font-size', 'font-stretch', 'font-style', 'font-variant',
|
88
|
+
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
|
89
|
+
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
|
90
|
+
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
|
91
|
+
'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
|
92
|
+
'origin', 'overline-position', 'overline-thickness', 'panose-1',
|
93
|
+
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
|
94
|
+
'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
|
95
|
+
'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
|
96
|
+
'stop-color', 'stop-opacity', 'strikethrough-position',
|
97
|
+
'strikethrough-thickness', 'stroke', 'stroke-dasharray',
|
98
|
+
'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
|
99
|
+
'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
|
100
|
+
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
|
101
|
+
'underline-position', 'underline-thickness', 'unicode',
|
102
|
+
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
|
103
|
+
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
|
104
|
+
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
|
105
|
+
'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
|
106
|
+
'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
|
109
107
|
]
|
110
108
|
|
111
109
|
Svg_Attr_Map = nil
|
112
110
|
Svg_Elem_Map = nil
|
113
111
|
|
114
112
|
Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
115
|
-
|
116
|
-
|
113
|
+
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
114
|
+
'stroke-opacity'
|
117
115
|
]
|
118
116
|
|
119
117
|
unless $compatible
|
@@ -148,11 +146,9 @@ module Hpricot
|
|
148
146
|
class Elem
|
149
147
|
def strip_attributes
|
150
148
|
unless attributes.nil?
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
end
|
155
|
-
end
|
149
|
+
ra = {}
|
150
|
+
raw_attributes.keys.each{|atr| ra[atr] = raw_attributes[atr] if Acceptable_Attributes.include?(atr) }
|
151
|
+
self.raw_attributes = ra
|
156
152
|
end
|
157
153
|
end
|
158
154
|
end
|
@@ -160,35 +156,44 @@ end
|
|
160
156
|
|
161
157
|
module FeedParserUtilities
|
162
158
|
class SanitizerDoc < Hpricot::Doc
|
163
|
-
|
159
|
+
|
164
160
|
def scrub
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
161
|
+
others = children.map do |e|
|
162
|
+
if e.elem?
|
163
|
+
if Acceptable_Elements.include?e.name
|
164
|
+
e.strip_attributes
|
165
|
+
e.inner_html = SanitizerDoc.new(e.children).scrub
|
166
|
+
result = e
|
167
|
+
else
|
168
|
+
result = e
|
169
|
+
|
170
|
+
if Unacceptable_Elements_With_End_Tag.include?e.name
|
171
|
+
result = nil
|
172
|
+
end
|
173
|
+
|
174
|
+
if result
|
175
|
+
result = SanitizerDoc.new(result.children).scrub # The important part
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
elsif e.doctype?
|
180
|
+
result = nil
|
181
|
+
|
182
|
+
elsif e.text?
|
183
|
+
ets = e.to_html
|
184
|
+
ets.gsub!(/'/, "'")
|
185
|
+
ets.gsub!(/"/, '"')
|
186
|
+
ets.gsub!(/\r/,'')
|
187
|
+
result = ets
|
188
|
+
end
|
189
|
+
result
|
185
190
|
end
|
186
|
-
|
191
|
+
|
187
192
|
unless $compatible # FIXME nonworking
|
188
|
-
|
189
|
-
|
193
|
+
# yes, that '/' should be there. It's a search method. See the Hpricot docs.
|
194
|
+
(self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
|
190
195
|
end
|
191
|
-
return
|
196
|
+
return others.compact.join
|
192
197
|
end
|
193
198
|
end
|
194
199
|
|
@@ -200,8 +205,8 @@ module FeedParserUtilities
|
|
200
205
|
def sanitizeHTML(html,encoding)
|
201
206
|
# FIXME Tidy not yet supported
|
202
207
|
html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '<!\1')
|
203
|
-
|
204
|
-
|
205
|
-
|
208
|
+
h = SanitizerDoc(html)
|
209
|
+
h = h.scrub
|
210
|
+
return h.strip
|
206
211
|
end
|
207
212
|
end
|