rfeedparser 0.9.931 → 0.9.940

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/lib/rfeedparser.rb +143 -58
  2. data/lib/rfeedparser/aliases.rb +1 -1
  3. data/lib/rfeedparser/better_attributelist.rb +11 -11
  4. data/lib/rfeedparser/better_sgmlparser.rb +1 -1
  5. data/lib/rfeedparser/encoding_helpers.rb +120 -127
  6. data/lib/rfeedparser/feedparserdict.rb +30 -20
  7. data/lib/rfeedparser/forgiving_uri.rb +9 -7
  8. data/lib/rfeedparser/markup_helpers.rb +11 -14
  9. data/lib/rfeedparser/parser_mixin.rb +16 -11
  10. data/lib/rfeedparser/parsers.rb +1 -2
  11. data/lib/rfeedparser/scrub.rb +95 -90
  12. data/lib/rfeedparser/time_helpers.rb +379 -379
  13. data/lib/rfeedparser/utilities.rb +23 -0
  14. data/tests/rfeedparser_test_helper.rb +262 -0
  15. data/tests/rfeedparserserver.rb +3 -109
  16. data/tests/rfeedparsertest.rb +6 -165
  17. data/tests/rfponly/http/200.xml +30 -0
  18. data/tests/rfponly/http/220.xml +28 -0
  19. data/tests/rfponly/http/300.xml +8 -0
  20. data/tests/rfponly/http/300.xml_redirect +25 -0
  21. data/tests/rfponly/http/301.xml +8 -0
  22. data/tests/rfponly/http/301.xml_redirect +25 -0
  23. data/tests/rfponly/http/302.xml +8 -0
  24. data/tests/rfponly/http/302.xml_redirect +25 -0
  25. data/tests/rfponly/http/307.xml +8 -0
  26. data/tests/rfponly/http/307.xml_redirect +25 -0
  27. data/tests/rfponly/http/320.xml +8 -0
  28. data/tests/rfponly/http/320.xml_redirect +25 -0
  29. data/tests/rfponly/http/400.xml +7 -0
  30. data/tests/rfponly/http/404.xml +7 -0
  31. data/tests/rfponly/http/410.xml +7 -0
  32. data/tests/rfponly/http/420.xml +7 -0
  33. data/tests/rfponly/http/500.xml +7 -0
  34. data/tests/rfponly/http/520.xml +7 -0
  35. data/tests/rfponly/http/etag.xml +28 -0
  36. data/tests/rfponly/http/lastmodified.xml +29 -0
  37. data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
  38. data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
  39. data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
  40. metadata +31 -3
@@ -1,5 +1,5 @@
1
- #!/usr/bin/ruby
2
- module FeedParserUtilities
1
+ #!/usr/bin/env ruby
2
+ module FeedParser
3
3
  class FeedParserDict < Hash
4
4
  =begin
5
5
  The naming of a certain common attribute (such as, "When was the last
@@ -16,26 +16,36 @@ module FeedParserUtilities
16
16
  methods check with keymaps to see what attribute the developer "really
17
17
  means" if they've asked for one which happens to be in @@keymap's keys.
18
18
  =end
19
- @@keymap = {'channel' => 'feed',
20
- 'items' => 'entries',
21
- 'guid' => 'id',
22
- 'date' => 'updated',
23
- 'date_parsed' => 'updated_parsed',
24
- 'description' => ['subtitle', 'summary'],
25
- 'url' => ['href'],
26
- 'modified' => 'updated',
27
- 'modified_parsed' => 'updated_parsed',
28
- 'issued' => 'published',
29
- 'issued_parsed' => 'published_parsed',
30
- 'copyright' => 'rights',
31
- 'copyright_detail' => 'rights_detail',
32
- 'tagline' => 'subtitle',
33
- 'tagline_detail' => 'subtitle_detail'}
34
-
35
- def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
36
- return self['entries']
19
+ @@keymap = {
20
+ 'channel' => 'feed',
21
+ 'items' => 'entries',
22
+ 'guid' => 'id',
23
+ 'date' => 'updated',
24
+ 'date_parsed' => 'updated_parsed',
25
+ 'description' => ['subtitle', 'summary'],
26
+ 'url' => ['href'],
27
+ 'modified' => 'updated',
28
+ 'modified_parsed' => 'updated_parsed',
29
+ 'issued' => 'published',
30
+ 'issued_parsed' => 'published_parsed',
31
+ 'copyright' => 'rights',
32
+ 'copyright_detail' => 'rights_detail',
33
+ 'tagline' => 'subtitle',
34
+ 'tagline_detail' => 'subtitle_detail'
35
+ }
36
+
37
+ # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
38
+ alias :hash_entries :entries
39
+ def entries
40
+ self['entries']
37
41
  end
38
42
 
43
+ # Added to avoid deprecated method warnings
44
+ alias :object_type :type
45
+ def type
46
+ self['type']
47
+ end
48
+
39
49
  # We could include the [] rewrite in new using Hash.new's fancy pants block thing
40
50
  # but we'd still have to overwrite []= and such.
41
51
  # I'm going to make it easy to turn lists of pairs into FeedParserDicts though.
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
  # From Robert Aman's GentleCMS URI.
3
3
  # GentleCMS, Copyright (c) 2006 Robert Aman
4
4
  #
@@ -535,7 +535,7 @@ class ForgivingURI
535
535
 
536
536
  # Merges two URIs together.
537
537
  def merge(uri)
538
- return self + uri
538
+ return (self + uri)
539
539
  end
540
540
 
541
541
  # Destructive form of merge.
@@ -940,12 +940,14 @@ class ForgivingURI
940
940
  def urljoin(base, uri)
941
941
  urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
942
942
  uri = uri.sub(urifixer, '\1\3')
943
- begin
944
- return ForgivingURI.join(base, uri).to_s
945
- rescue URI::BadURIError => e
946
- if ForgivingURI.parse(base).relative?
947
- return ForgivingURI.parse(uri).to_s
943
+ pbase = ForgivingURI.parse(base) rescue nil
944
+ if pbase && pbase.absolute?
945
+ puri = ForgivingURI.parse(uri) rescue nil
946
+ if puri && puri.relative?
947
+ # ForgivingURI.join does the wrong thing. What the hell.
948
+ return ForgivingURI.join(base, uri).to_s.gsub(/[^:]\/{2,}/, '')
948
949
  end
949
950
  end
951
+ return uri
950
952
  end
951
953
 
@@ -1,14 +1,11 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
  module FeedParserUtilities
3
- #FIXME we need to find a better place for this method
4
3
  def stripDoctype(data)
5
- =begin
6
- Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
7
-
8
- rss_version may be 'rss091n' or None
9
- stripped_data is the same XML document, minus the DOCTYPE
10
- =end
4
+ #Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
5
+ #rss_version may be 'rss091n' or None
6
+ #stripped_data is the same XML document, minus the DOCTYPE
11
7
  entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
8
+
12
9
  data = data.gsub(entity_pattern,'')
13
10
 
14
11
  doctype_pattern = /<!DOCTYPE(.*?)>/m
@@ -27,7 +24,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
27
24
  data = data.sub(doctype_pattern, '')
28
25
  return version, data
29
26
  end
30
-
27
+
31
28
  def resolveRelativeURIs(htmlSource, baseURI, encoding)
32
29
  $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
33
30
  relative_uris = [ ['a','href'],
@@ -60,11 +57,11 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
60
57
  relative_uris.each do |l|
61
58
  ename, eattr = l
62
59
  h.search(ename).each do |elem|
63
- euri = elem.attributes[eattr]
64
- # FIXME uses the URI.encode method. should it?
65
- if euri and not euri.empty? and ForgivingURI.parse(URI.encode(euri)).relative?
66
- elem.attributes[eattr] = urljoin(baseURI, euri)
67
- end
60
+ euri = elem.attributes[eattr]
61
+ uri = ForgivingURI.parse(URI.encode(euri)) rescue nil
62
+ if euri and not euri.empty? and uri and uri.relative?
63
+ elem.raw_attributes[eattr] = urljoin(baseURI, euri)
64
+ end
68
65
  end
69
66
  end
70
67
  return h.to_html
@@ -1,5 +1,7 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
+ module FeedParser
2
3
  module FeedParserMixin
4
+ include FeedParserUtilities
3
5
  attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
4
6
 
5
7
  def startup(baseuri=nil, baselang=nil, encoding='utf-8')
@@ -103,10 +105,6 @@ module FeedParserMixin
103
105
  if baselang
104
106
  @feeddata['language'] = baselang.gsub('_','-')
105
107
  end
106
- @date_handlers = [:_parse_date_rfc822,
107
- :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
108
- :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
109
- ]
110
108
  $stderr << "Leaving startup\n" if $debug # My addition
111
109
  end
112
110
 
@@ -873,7 +871,9 @@ module FeedParserMixin
873
871
 
874
872
  def _end_published
875
873
  value = pop('published')
876
- _save('published_parsed', parse_date(value))
874
+ d = parse_date(value)
875
+ _save('published_parsed', extract_tuple(d))
876
+ _save('published_time', d)
877
877
  end
878
878
  alias :_end_dcterms_issued :_end_published
879
879
  alias :_end_issued :_end_published
@@ -888,7 +888,9 @@ module FeedParserMixin
888
888
 
889
889
  def _end_updated
890
890
  value = pop('updated')
891
- _save('updated_parsed', parse_date(value))
891
+ d = parse_date(value)
892
+ _save('updated_parsed', extract_tuple(d))
893
+ _save('updated_time', d)
892
894
  end
893
895
  alias :_end_modified :_end_updated
894
896
  alias :_end_dcterms_modified :_end_updated
@@ -902,7 +904,9 @@ module FeedParserMixin
902
904
 
903
905
  def _end_created
904
906
  value = pop('created')
905
- _save('created_parsed', parse_date(value))
907
+ d = parse_date(value)
908
+ _save('created_parsed', extract_tuple(d))
909
+ _save('created_time', d)
906
910
  end
907
911
  alias :_end_dcterms_created :_end_created
908
912
 
@@ -910,7 +914,9 @@ module FeedParserMixin
910
914
  push('expired', true)
911
915
  end
912
916
  def _end_expirationdate
913
- _save('expired_parsed', parse_date(pop('expired')))
917
+ d = parse_date(pop('expired'))
918
+ _save('expired_parsed', extract_tuple(d))
919
+ _save('expired_time', d)
914
920
  end
915
921
 
916
922
  def _start_cc_license(attrsD)
@@ -1234,5 +1240,4 @@ module FeedParserMixin
1234
1240
  end
1235
1241
 
1236
1242
  end # End FeedParserMixin
1237
-
1238
-
1243
+ end
@@ -1,5 +1,4 @@
1
- #!/usr/bin/ruby
2
-
1
+ #!/usr/bin/env ruby
3
2
 
4
3
  module FeedParser
5
4
  class StrictFeedParser < XML::SAX::HandlerBase # expat
@@ -1,47 +1,45 @@
1
- #!/usr/bin/ruby
2
- gem 'hpricot', "=0.5"
3
- require 'hpricot'
1
+ #!/usr/bin/env ruby
4
2
  # This used to be based on Michael Moen's Hpricot#scrub, but that seems to
5
3
  # have only been part of its evolution. Hpricot#scrub is cool code, though.
6
4
  # http://underpantsgnome.com/2007/01/20/hpricot-scrub
7
5
  module Hpricot
8
6
  Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
9
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
10
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
11
- 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
12
- 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
13
- 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
14
- 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
15
- 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
16
- 'ul', 'var'
7
+ 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
8
+ 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
9
+ 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
10
+ 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
11
+ 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
12
+ 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
13
+ 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
14
+ 'ul', 'var'
17
15
  ]
18
16
 
19
17
  Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
20
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
21
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
22
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
23
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
24
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
25
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
26
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
27
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
28
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
29
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
18
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
19
+ 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
20
+ 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
21
+ 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
22
+ 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
23
+ 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
24
+ 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
25
+ 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
26
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
27
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
30
28
  ]
31
29
 
32
30
  Unacceptable_Elements_With_End_Tag = ['script', 'applet']
33
31
 
34
32
  Acceptable_Css_Properties = ['azimuth', 'background-color',
35
- 'border-bottom-color', 'border-collapse', 'border-color',
36
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
37
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
38
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
39
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
40
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
41
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
42
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
43
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
44
- 'white-space', 'width'
33
+ 'border-bottom-color', 'border-collapse', 'border-color',
34
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
35
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
36
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
37
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
38
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
39
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
40
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
41
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
42
+ 'white-space', 'width'
45
43
  ]
46
44
 
47
45
  # survey of common keywords found in feeds
@@ -82,38 +80,38 @@ module Hpricot
82
80
 
83
81
  # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
84
82
  Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
85
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
86
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
87
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
88
- 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
89
- 'font-size', 'font-stretch', 'font-style', 'font-variant',
90
- 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
91
- 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
92
- 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
93
- 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
94
- 'origin', 'overline-position', 'overline-thickness', 'panose-1',
95
- 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
96
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
97
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
98
- 'stop-color', 'stop-opacity', 'strikethrough-position',
99
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
100
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
101
- 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
102
- 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
103
- 'underline-position', 'underline-thickness', 'unicode',
104
- 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
105
- 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
106
- 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
107
- 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
108
- 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
83
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
84
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
85
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
86
+ 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
87
+ 'font-size', 'font-stretch', 'font-style', 'font-variant',
88
+ 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
89
+ 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
90
+ 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
91
+ 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
92
+ 'origin', 'overline-position', 'overline-thickness', 'panose-1',
93
+ 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
94
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
95
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
96
+ 'stop-color', 'stop-opacity', 'strikethrough-position',
97
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
98
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
99
+ 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
100
+ 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
101
+ 'underline-position', 'underline-thickness', 'unicode',
102
+ 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
103
+ 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
104
+ 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
105
+ 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
106
+ 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
109
107
  ]
110
108
 
111
109
  Svg_Attr_Map = nil
112
110
  Svg_Elem_Map = nil
113
111
 
114
112
  Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
115
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
116
- 'stroke-opacity'
113
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
114
+ 'stroke-opacity'
117
115
  ]
118
116
 
119
117
  unless $compatible
@@ -148,11 +146,9 @@ module Hpricot
148
146
  class Elem
149
147
  def strip_attributes
150
148
  unless attributes.nil?
151
- attributes.each do |atr|
152
- unless Acceptable_Attributes.include?atr[0]
153
- remove_attribute(atr[0])
154
- end
155
- end
149
+ ra = {}
150
+ raw_attributes.keys.each{|atr| ra[atr] = raw_attributes[atr] if Acceptable_Attributes.include?(atr) }
151
+ self.raw_attributes = ra
156
152
  end
157
153
  end
158
154
  end
@@ -160,35 +156,44 @@ end
160
156
 
161
157
  module FeedParserUtilities
162
158
  class SanitizerDoc < Hpricot::Doc
163
-
159
+
164
160
  def scrub
165
- traverse_all_element do |e|
166
- if e.elem?
167
- if Acceptable_Elements.include?e.name
168
- e.strip_attributes
169
- else
170
- if Unacceptable_Elements_With_End_Tag.include?e.name
171
- e.inner_html = ''
172
- end
173
- e.swap(SanitizerDoc.new(e.children).scrub.to_html) # The important part
174
- end
175
- elsif e.doctype?
176
- e.parent.children.delete(e)
177
- elsif e.text?
178
- ets = e.to_s
179
- ets.gsub!(/&#39;/, "'")
180
- ets.gsub!(/&#34;/, '"')
181
- ets.gsub!(/\r/,'')
182
- e.swap(ets)
183
- else
184
- end
161
+ others = children.map do |e|
162
+ if e.elem?
163
+ if Acceptable_Elements.include?e.name
164
+ e.strip_attributes
165
+ e.inner_html = SanitizerDoc.new(e.children).scrub
166
+ result = e
167
+ else
168
+ result = e
169
+
170
+ if Unacceptable_Elements_With_End_Tag.include?e.name
171
+ result = nil
172
+ end
173
+
174
+ if result
175
+ result = SanitizerDoc.new(result.children).scrub # The important part
176
+ end
177
+ end
178
+
179
+ elsif e.doctype?
180
+ result = nil
181
+
182
+ elsif e.text?
183
+ ets = e.to_html
184
+ ets.gsub!(/&#39;/, "'")
185
+ ets.gsub!(/&#34;/, '"')
186
+ ets.gsub!(/\r/,'')
187
+ result = ets
188
+ end
189
+ result
185
190
  end
186
-
191
+
187
192
  unless $compatible # FIXME nonworking
188
- # yes, that '/' should be there. It's a search method. See the Hpricot docs.
189
- (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
193
+ # yes, that '/' should be there. It's a search method. See the Hpricot docs.
194
+ (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
190
195
  end
191
- return self
196
+ return others.compact.join
192
197
  end
193
198
  end
194
199
 
@@ -200,8 +205,8 @@ module FeedParserUtilities
200
205
  def sanitizeHTML(html,encoding)
201
206
  # FIXME Tidy not yet supported
202
207
  html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
203
- h = SanitizerDoc(html)
204
- h = h.scrub
205
- return h.to_html.strip
208
+ h = SanitizerDoc(html)
209
+ h = h.scrub
210
+ return h.strip
206
211
  end
207
212
  end