rfeedparser 0.9.931 → 0.9.940

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. data/lib/rfeedparser.rb +143 -58
  2. data/lib/rfeedparser/aliases.rb +1 -1
  3. data/lib/rfeedparser/better_attributelist.rb +11 -11
  4. data/lib/rfeedparser/better_sgmlparser.rb +1 -1
  5. data/lib/rfeedparser/encoding_helpers.rb +120 -127
  6. data/lib/rfeedparser/feedparserdict.rb +30 -20
  7. data/lib/rfeedparser/forgiving_uri.rb +9 -7
  8. data/lib/rfeedparser/markup_helpers.rb +11 -14
  9. data/lib/rfeedparser/parser_mixin.rb +16 -11
  10. data/lib/rfeedparser/parsers.rb +1 -2
  11. data/lib/rfeedparser/scrub.rb +95 -90
  12. data/lib/rfeedparser/time_helpers.rb +379 -379
  13. data/lib/rfeedparser/utilities.rb +23 -0
  14. data/tests/rfeedparser_test_helper.rb +262 -0
  15. data/tests/rfeedparserserver.rb +3 -109
  16. data/tests/rfeedparsertest.rb +6 -165
  17. data/tests/rfponly/http/200.xml +30 -0
  18. data/tests/rfponly/http/220.xml +28 -0
  19. data/tests/rfponly/http/300.xml +8 -0
  20. data/tests/rfponly/http/300.xml_redirect +25 -0
  21. data/tests/rfponly/http/301.xml +8 -0
  22. data/tests/rfponly/http/301.xml_redirect +25 -0
  23. data/tests/rfponly/http/302.xml +8 -0
  24. data/tests/rfponly/http/302.xml_redirect +25 -0
  25. data/tests/rfponly/http/307.xml +8 -0
  26. data/tests/rfponly/http/307.xml_redirect +25 -0
  27. data/tests/rfponly/http/320.xml +8 -0
  28. data/tests/rfponly/http/320.xml_redirect +25 -0
  29. data/tests/rfponly/http/400.xml +7 -0
  30. data/tests/rfponly/http/404.xml +7 -0
  31. data/tests/rfponly/http/410.xml +7 -0
  32. data/tests/rfponly/http/420.xml +7 -0
  33. data/tests/rfponly/http/500.xml +7 -0
  34. data/tests/rfponly/http/520.xml +7 -0
  35. data/tests/rfponly/http/etag.xml +28 -0
  36. data/tests/rfponly/http/lastmodified.xml +29 -0
  37. data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
  38. data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
  39. data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
  40. metadata +31 -3
@@ -1,5 +1,5 @@
1
- #!/usr/bin/ruby
2
- module FeedParserUtilities
1
+ #!/usr/bin/env ruby
2
+ module FeedParser
3
3
  class FeedParserDict < Hash
4
4
  =begin
5
5
  The naming of a certain common attribute (such as, "When was the last
@@ -16,26 +16,36 @@ module FeedParserUtilities
16
16
  methods check with keymaps to see what attribute the developer "really
17
17
  means" if they've asked for one which happens to be in @@keymap's keys.
18
18
  =end
19
- @@keymap = {'channel' => 'feed',
20
- 'items' => 'entries',
21
- 'guid' => 'id',
22
- 'date' => 'updated',
23
- 'date_parsed' => 'updated_parsed',
24
- 'description' => ['subtitle', 'summary'],
25
- 'url' => ['href'],
26
- 'modified' => 'updated',
27
- 'modified_parsed' => 'updated_parsed',
28
- 'issued' => 'published',
29
- 'issued_parsed' => 'published_parsed',
30
- 'copyright' => 'rights',
31
- 'copyright_detail' => 'rights_detail',
32
- 'tagline' => 'subtitle',
33
- 'tagline_detail' => 'subtitle_detail'}
34
-
35
- def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
36
- return self['entries']
19
+ @@keymap = {
20
+ 'channel' => 'feed',
21
+ 'items' => 'entries',
22
+ 'guid' => 'id',
23
+ 'date' => 'updated',
24
+ 'date_parsed' => 'updated_parsed',
25
+ 'description' => ['subtitle', 'summary'],
26
+ 'url' => ['href'],
27
+ 'modified' => 'updated',
28
+ 'modified_parsed' => 'updated_parsed',
29
+ 'issued' => 'published',
30
+ 'issued_parsed' => 'published_parsed',
31
+ 'copyright' => 'rights',
32
+ 'copyright_detail' => 'rights_detail',
33
+ 'tagline' => 'subtitle',
34
+ 'tagline_detail' => 'subtitle_detail'
35
+ }
36
+
37
+ # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
38
+ alias :hash_entries :entries
39
+ def entries
40
+ self['entries']
37
41
  end
38
42
 
43
+ # Added to avoid deprecated method warnings
44
+ alias :object_type :type
45
+ def type
46
+ self['type']
47
+ end
48
+
39
49
  # We could include the [] rewrite in new using Hash.new's fancy pants block thing
40
50
  # but we'd still have to overwrite []= and such.
41
51
  # I'm going to make it easy to turn lists of pairs into FeedParserDicts though.
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
  # From Robert Aman's GentleCMS URI.
3
3
  # GentleCMS, Copyright (c) 2006 Robert Aman
4
4
  #
@@ -535,7 +535,7 @@ class ForgivingURI
535
535
 
536
536
  # Merges two URIs together.
537
537
  def merge(uri)
538
- return self + uri
538
+ return (self + uri)
539
539
  end
540
540
 
541
541
  # Destructive form of merge.
@@ -940,12 +940,14 @@ class ForgivingURI
940
940
  def urljoin(base, uri)
941
941
  urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
942
942
  uri = uri.sub(urifixer, '\1\3')
943
- begin
944
- return ForgivingURI.join(base, uri).to_s
945
- rescue URI::BadURIError => e
946
- if ForgivingURI.parse(base).relative?
947
- return ForgivingURI.parse(uri).to_s
943
+ pbase = ForgivingURI.parse(base) rescue nil
944
+ if pbase && pbase.absolute?
945
+ puri = ForgivingURI.parse(uri) rescue nil
946
+ if puri && puri.relative?
947
+ # ForgivingURI.join does the wrong thing. What the hell.
948
+ return ForgivingURI.join(base, uri).to_s.gsub(/[^:]\/{2,}/, '')
948
949
  end
949
950
  end
951
+ return uri
950
952
  end
951
953
 
@@ -1,14 +1,11 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
  module FeedParserUtilities
3
- #FIXME we need to find a better place for this method
4
3
  def stripDoctype(data)
5
- =begin
6
- Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
7
-
8
- rss_version may be 'rss091n' or None
9
- stripped_data is the same XML document, minus the DOCTYPE
10
- =end
4
+ #Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
5
+ #rss_version may be 'rss091n' or None
6
+ #stripped_data is the same XML document, minus the DOCTYPE
11
7
  entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
8
+
12
9
  data = data.gsub(entity_pattern,'')
13
10
 
14
11
  doctype_pattern = /<!DOCTYPE(.*?)>/m
@@ -27,7 +24,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
27
24
  data = data.sub(doctype_pattern, '')
28
25
  return version, data
29
26
  end
30
-
27
+
31
28
  def resolveRelativeURIs(htmlSource, baseURI, encoding)
32
29
  $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
33
30
  relative_uris = [ ['a','href'],
@@ -60,11 +57,11 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
60
57
  relative_uris.each do |l|
61
58
  ename, eattr = l
62
59
  h.search(ename).each do |elem|
63
- euri = elem.attributes[eattr]
64
- # FIXME uses the URI.encode method. should it?
65
- if euri and not euri.empty? and ForgivingURI.parse(URI.encode(euri)).relative?
66
- elem.attributes[eattr] = urljoin(baseURI, euri)
67
- end
60
+ euri = elem.attributes[eattr]
61
+ uri = ForgivingURI.parse(URI.encode(euri)) rescue nil
62
+ if euri and not euri.empty? and uri and uri.relative?
63
+ elem.raw_attributes[eattr] = urljoin(baseURI, euri)
64
+ end
68
65
  end
69
66
  end
70
67
  return h.to_html
@@ -1,5 +1,7 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
+ module FeedParser
2
3
  module FeedParserMixin
4
+ include FeedParserUtilities
3
5
  attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
4
6
 
5
7
  def startup(baseuri=nil, baselang=nil, encoding='utf-8')
@@ -103,10 +105,6 @@ module FeedParserMixin
103
105
  if baselang
104
106
  @feeddata['language'] = baselang.gsub('_','-')
105
107
  end
106
- @date_handlers = [:_parse_date_rfc822,
107
- :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
108
- :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
109
- ]
110
108
  $stderr << "Leaving startup\n" if $debug # My addition
111
109
  end
112
110
 
@@ -873,7 +871,9 @@ module FeedParserMixin
873
871
 
874
872
  def _end_published
875
873
  value = pop('published')
876
- _save('published_parsed', parse_date(value))
874
+ d = parse_date(value)
875
+ _save('published_parsed', extract_tuple(d))
876
+ _save('published_time', d)
877
877
  end
878
878
  alias :_end_dcterms_issued :_end_published
879
879
  alias :_end_issued :_end_published
@@ -888,7 +888,9 @@ module FeedParserMixin
888
888
 
889
889
  def _end_updated
890
890
  value = pop('updated')
891
- _save('updated_parsed', parse_date(value))
891
+ d = parse_date(value)
892
+ _save('updated_parsed', extract_tuple(d))
893
+ _save('updated_time', d)
892
894
  end
893
895
  alias :_end_modified :_end_updated
894
896
  alias :_end_dcterms_modified :_end_updated
@@ -902,7 +904,9 @@ module FeedParserMixin
902
904
 
903
905
  def _end_created
904
906
  value = pop('created')
905
- _save('created_parsed', parse_date(value))
907
+ d = parse_date(value)
908
+ _save('created_parsed', extract_tuple(d))
909
+ _save('created_time', d)
906
910
  end
907
911
  alias :_end_dcterms_created :_end_created
908
912
 
@@ -910,7 +914,9 @@ module FeedParserMixin
910
914
  push('expired', true)
911
915
  end
912
916
  def _end_expirationdate
913
- _save('expired_parsed', parse_date(pop('expired')))
917
+ d = parse_date(pop('expired'))
918
+ _save('expired_parsed', extract_tuple(d))
919
+ _save('expired_time', d)
914
920
  end
915
921
 
916
922
  def _start_cc_license(attrsD)
@@ -1234,5 +1240,4 @@ module FeedParserMixin
1234
1240
  end
1235
1241
 
1236
1242
  end # End FeedParserMixin
1237
-
1238
-
1243
+ end
@@ -1,5 +1,4 @@
1
- #!/usr/bin/ruby
2
-
1
+ #!/usr/bin/env ruby
3
2
 
4
3
  module FeedParser
5
4
  class StrictFeedParser < XML::SAX::HandlerBase # expat
@@ -1,47 +1,45 @@
1
- #!/usr/bin/ruby
2
- gem 'hpricot', "=0.5"
3
- require 'hpricot'
1
+ #!/usr/bin/env ruby
4
2
  # This used to be based on Michael Moen's Hpricot#scrub, but that seems to
5
3
  # have only been part of its evolution. Hpricot#scrub is cool code, though.
6
4
  # http://underpantsgnome.com/2007/01/20/hpricot-scrub
7
5
  module Hpricot
8
6
  Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
9
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
10
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
11
- 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
12
- 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
13
- 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
14
- 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
15
- 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
16
- 'ul', 'var'
7
+ 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
8
+ 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
9
+ 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
10
+ 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
11
+ 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
12
+ 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
13
+ 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
14
+ 'ul', 'var'
17
15
  ]
18
16
 
19
17
  Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
20
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
21
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
22
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
23
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
24
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
25
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
26
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
27
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
28
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
29
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
18
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
19
+ 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
20
+ 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
21
+ 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
22
+ 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
23
+ 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
24
+ 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
25
+ 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
26
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
27
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
30
28
  ]
31
29
 
32
30
  Unacceptable_Elements_With_End_Tag = ['script', 'applet']
33
31
 
34
32
  Acceptable_Css_Properties = ['azimuth', 'background-color',
35
- 'border-bottom-color', 'border-collapse', 'border-color',
36
- 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
37
- 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
38
- 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
39
- 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
40
- 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
41
- 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
42
- 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
43
- 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
44
- 'white-space', 'width'
33
+ 'border-bottom-color', 'border-collapse', 'border-color',
34
+ 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
35
+ 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
36
+ 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
37
+ 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
38
+ 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
39
+ 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
40
+ 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
41
+ 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
42
+ 'white-space', 'width'
45
43
  ]
46
44
 
47
45
  # survey of common keywords found in feeds
@@ -82,38 +80,38 @@ module Hpricot
82
80
 
83
81
  # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
84
82
  Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
85
- 'arabic-form', 'ascent', 'attributeName', 'attributeType',
86
- 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
87
- 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
88
- 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
89
- 'font-size', 'font-stretch', 'font-style', 'font-variant',
90
- 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
91
- 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
92
- 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
93
- 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
94
- 'origin', 'overline-position', 'overline-thickness', 'panose-1',
95
- 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
96
- 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
97
- 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
98
- 'stop-color', 'stop-opacity', 'strikethrough-position',
99
- 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
100
- 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
101
- 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
102
- 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
103
- 'underline-position', 'underline-thickness', 'unicode',
104
- 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
105
- 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
106
- 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
107
- 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
108
- 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
83
+ 'arabic-form', 'ascent', 'attributeName', 'attributeType',
84
+ 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
85
+ 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
86
+ 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
87
+ 'font-size', 'font-stretch', 'font-style', 'font-variant',
88
+ 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
89
+ 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
90
+ 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
91
+ 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
92
+ 'origin', 'overline-position', 'overline-thickness', 'panose-1',
93
+ 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
94
+ 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
95
+ 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
96
+ 'stop-color', 'stop-opacity', 'strikethrough-position',
97
+ 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
98
+ 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
99
+ 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
100
+ 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
101
+ 'underline-position', 'underline-thickness', 'unicode',
102
+ 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
103
+ 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
104
+ 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
105
+ 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
106
+ 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
109
107
  ]
110
108
 
111
109
  Svg_Attr_Map = nil
112
110
  Svg_Elem_Map = nil
113
111
 
114
112
  Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
115
- 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
116
- 'stroke-opacity'
113
+ 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
114
+ 'stroke-opacity'
117
115
  ]
118
116
 
119
117
  unless $compatible
@@ -148,11 +146,9 @@ module Hpricot
148
146
  class Elem
149
147
  def strip_attributes
150
148
  unless attributes.nil?
151
- attributes.each do |atr|
152
- unless Acceptable_Attributes.include?atr[0]
153
- remove_attribute(atr[0])
154
- end
155
- end
149
+ ra = {}
150
+ raw_attributes.keys.each{|atr| ra[atr] = raw_attributes[atr] if Acceptable_Attributes.include?(atr) }
151
+ self.raw_attributes = ra
156
152
  end
157
153
  end
158
154
  end
@@ -160,35 +156,44 @@ end
160
156
 
161
157
  module FeedParserUtilities
162
158
  class SanitizerDoc < Hpricot::Doc
163
-
159
+
164
160
  def scrub
165
- traverse_all_element do |e|
166
- if e.elem?
167
- if Acceptable_Elements.include?e.name
168
- e.strip_attributes
169
- else
170
- if Unacceptable_Elements_With_End_Tag.include?e.name
171
- e.inner_html = ''
172
- end
173
- e.swap(SanitizerDoc.new(e.children).scrub.to_html) # The important part
174
- end
175
- elsif e.doctype?
176
- e.parent.children.delete(e)
177
- elsif e.text?
178
- ets = e.to_s
179
- ets.gsub!(/&#39;/, "'")
180
- ets.gsub!(/&#34;/, '"')
181
- ets.gsub!(/\r/,'')
182
- e.swap(ets)
183
- else
184
- end
161
+ others = children.map do |e|
162
+ if e.elem?
163
+ if Acceptable_Elements.include?e.name
164
+ e.strip_attributes
165
+ e.inner_html = SanitizerDoc.new(e.children).scrub
166
+ result = e
167
+ else
168
+ result = e
169
+
170
+ if Unacceptable_Elements_With_End_Tag.include?e.name
171
+ result = nil
172
+ end
173
+
174
+ if result
175
+ result = SanitizerDoc.new(result.children).scrub # The important part
176
+ end
177
+ end
178
+
179
+ elsif e.doctype?
180
+ result = nil
181
+
182
+ elsif e.text?
183
+ ets = e.to_html
184
+ ets.gsub!(/&#39;/, "'")
185
+ ets.gsub!(/&#34;/, '"')
186
+ ets.gsub!(/\r/,'')
187
+ result = ets
188
+ end
189
+ result
185
190
  end
186
-
191
+
187
192
  unless $compatible # FIXME nonworking
188
- # yes, that '/' should be there. It's a search method. See the Hpricot docs.
189
- (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
193
+ # yes, that '/' should be there. It's a search method. See the Hpricot docs.
194
+ (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
190
195
  end
191
- return self
196
+ return others.compact.join
192
197
  end
193
198
  end
194
199
 
@@ -200,8 +205,8 @@ module FeedParserUtilities
200
205
  def sanitizeHTML(html,encoding)
201
206
  # FIXME Tidy not yet supported
202
207
  html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
203
- h = SanitizerDoc(html)
204
- h = h.scrub
205
- return h.to_html.strip
208
+ h = SanitizerDoc(html)
209
+ h = h.scrub
210
+ return h.strip
206
211
  end
207
212
  end