rfeedparser 0.9.931 → 0.9.940
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +143 -58
- data/lib/rfeedparser/aliases.rb +1 -1
- data/lib/rfeedparser/better_attributelist.rb +11 -11
- data/lib/rfeedparser/better_sgmlparser.rb +1 -1
- data/lib/rfeedparser/encoding_helpers.rb +120 -127
- data/lib/rfeedparser/feedparserdict.rb +30 -20
- data/lib/rfeedparser/forgiving_uri.rb +9 -7
- data/lib/rfeedparser/markup_helpers.rb +11 -14
- data/lib/rfeedparser/parser_mixin.rb +16 -11
- data/lib/rfeedparser/parsers.rb +1 -2
- data/lib/rfeedparser/scrub.rb +95 -90
- data/lib/rfeedparser/time_helpers.rb +379 -379
- data/lib/rfeedparser/utilities.rb +23 -0
- data/tests/rfeedparser_test_helper.rb +262 -0
- data/tests/rfeedparserserver.rb +3 -109
- data/tests/rfeedparsertest.rb +6 -165
- data/tests/rfponly/http/200.xml +30 -0
- data/tests/rfponly/http/220.xml +28 -0
- data/tests/rfponly/http/300.xml +8 -0
- data/tests/rfponly/http/300.xml_redirect +25 -0
- data/tests/rfponly/http/301.xml +8 -0
- data/tests/rfponly/http/301.xml_redirect +25 -0
- data/tests/rfponly/http/302.xml +8 -0
- data/tests/rfponly/http/302.xml_redirect +25 -0
- data/tests/rfponly/http/307.xml +8 -0
- data/tests/rfponly/http/307.xml_redirect +25 -0
- data/tests/rfponly/http/320.xml +8 -0
- data/tests/rfponly/http/320.xml_redirect +25 -0
- data/tests/rfponly/http/400.xml +7 -0
- data/tests/rfponly/http/404.xml +7 -0
- data/tests/rfponly/http/410.xml +7 -0
- data/tests/rfponly/http/420.xml +7 -0
- data/tests/rfponly/http/500.xml +7 -0
- data/tests/rfponly/http/520.xml +7 -0
- data/tests/rfponly/http/etag.xml +28 -0
- data/tests/rfponly/http/lastmodified.xml +29 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
- metadata +31 -3
| @@ -1,5 +1,5 @@ | |
| 1 | 
            -
            #!/usr/bin/ruby
         | 
| 2 | 
            -
            module  | 
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            module FeedParser
         | 
| 3 3 | 
             
              class FeedParserDict < Hash 
         | 
| 4 4 | 
             
            =begin
         | 
| 5 5 | 
             
                 The naming of a certain common attribute (such as, "When was the last
         | 
| @@ -16,26 +16,36 @@ module FeedParserUtilities | |
| 16 16 | 
             
                 methods check with keymaps to see what attribute the developer "really
         | 
| 17 17 | 
             
                 means" if they've asked for one which happens to be in @@keymap's keys.
         | 
| 18 18 | 
             
            =end
         | 
| 19 | 
            -
                @@keymap = { | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
                 | 
| 36 | 
            -
             | 
| 19 | 
            +
                @@keymap = {
         | 
| 20 | 
            +
                  'channel' => 'feed',
         | 
| 21 | 
            +
                  'items' => 'entries',
         | 
| 22 | 
            +
            	    'guid' => 'id',
         | 
| 23 | 
            +
            	    'date' => 'updated',
         | 
| 24 | 
            +
            	    'date_parsed' => 'updated_parsed',
         | 
| 25 | 
            +
            	    'description' => ['subtitle', 'summary'],
         | 
| 26 | 
            +
            	    'url' => ['href'],
         | 
| 27 | 
            +
            	    'modified' => 'updated',
         | 
| 28 | 
            +
            	    'modified_parsed' => 'updated_parsed',
         | 
| 29 | 
            +
            	    'issued' => 'published',
         | 
| 30 | 
            +
            	    'issued_parsed' => 'published_parsed',
         | 
| 31 | 
            +
            	    'copyright' => 'rights',
         | 
| 32 | 
            +
            	    'copyright_detail' => 'rights_detail',
         | 
| 33 | 
            +
            	    'tagline' => 'subtitle',
         | 
| 34 | 
            +
            	    'tagline_detail' => 'subtitle_detail'
         | 
| 35 | 
            +
                }
         | 
| 36 | 
            +
                
         | 
| 37 | 
            +
                # Apparently, Hash has an entries method!  That blew a good 3 hours or more of my time
         | 
| 38 | 
            +
                alias :hash_entries :entries
         | 
| 39 | 
            +
                def entries 
         | 
| 40 | 
            +
                  self['entries']
         | 
| 37 41 | 
             
                end
         | 
| 38 42 |  | 
| 43 | 
            +
                # Added to avoid deprecated method warnings
         | 
| 44 | 
            +
                alias :object_type :type
         | 
| 45 | 
            +
                def type
         | 
| 46 | 
            +
                  self['type']
         | 
| 47 | 
            +
                end
         | 
| 48 | 
            +
                
         | 
| 39 49 | 
             
                # We could include the [] rewrite in new using Hash.new's fancy pants block thing
         | 
| 40 50 | 
             
                # but we'd still have to overwrite []= and such. 
         | 
| 41 51 | 
             
                # I'm going to make it easy to turn lists of pairs into FeedParserDicts's though.
         | 
| @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            #!/usr/bin/ruby
         | 
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 2 | 
             
            # From Robert Aman's GentleCMS URI.
         | 
| 3 3 | 
             
            # GentleCMS, Copyright (c) 2006 Robert Aman
         | 
| 4 4 | 
             
            #
         | 
| @@ -535,7 +535,7 @@ class ForgivingURI | |
| 535 535 |  | 
| 536 536 | 
             
                # Merges two URIs together.
         | 
| 537 537 | 
             
                def merge(uri)
         | 
| 538 | 
            -
                  return self + uri
         | 
| 538 | 
            +
                  return (self + uri)
         | 
| 539 539 | 
             
                end
         | 
| 540 540 |  | 
| 541 541 | 
             
                # Destructive form of merge.
         | 
| @@ -940,12 +940,14 @@ class ForgivingURI | |
| 940 940 | 
             
            def urljoin(base, uri)
         | 
| 941 941 | 
             
              urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
         | 
| 942 942 | 
             
              uri = uri.sub(urifixer, '\1\3') 
         | 
| 943 | 
            -
               | 
| 944 | 
            -
             | 
| 945 | 
            -
             | 
| 946 | 
            -
                if  | 
| 947 | 
            -
                   | 
| 943 | 
            +
              pbase = ForgivingURI.parse(base) rescue nil
         | 
| 944 | 
            +
              if pbase && pbase.absolute?
         | 
| 945 | 
            +
                puri = ForgivingURI.parse(uri) rescue nil
         | 
| 946 | 
            +
                if puri && puri.relative?
         | 
| 947 | 
            +
                  # ForgivingURI.join does the wrong thing.  What the hell.
         | 
| 948 | 
            +
                  return ForgivingURI.join(base, uri).to_s.gsub(/[^:]\/{2,}/, '')
         | 
| 948 949 | 
             
                end
         | 
| 949 950 | 
             
              end
         | 
| 951 | 
            +
              return uri
         | 
| 950 952 | 
             
            end
         | 
| 951 953 |  | 
| @@ -1,14 +1,11 @@ | |
| 1 | 
            -
            #!/usr/bin/ruby
         | 
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 2 | 
             
            module FeedParserUtilities
         | 
| 3 | 
            -
              #FIXME we need to find a better place for this method
         | 
| 4 3 | 
             
              def stripDoctype(data)
         | 
| 5 | 
            -
             | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
                rss_version may be 'rss091n' or None
         | 
| 9 | 
            -
                stripped_data is the same XML document, minus the DOCTYPE
         | 
| 10 | 
            -
            =end
         | 
| 4 | 
            +
                #Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
         | 
| 5 | 
            +
                #rss_version may be 'rss091n' or None
         | 
| 6 | 
            +
                #stripped_data is the same XML document, minus the DOCTYPE
         | 
| 11 7 | 
             
                entity_pattern = /<!ENTITY(.*?)>/m # m is for Regexp::MULTILINE
         | 
| 8 | 
            +
                
         | 
| 12 9 | 
             
                data = data.gsub(entity_pattern,'')
         | 
| 13 10 |  | 
| 14 11 | 
             
                doctype_pattern = /<!DOCTYPE(.*?)>/m
         | 
| @@ -27,7 +24,7 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data) | |
| 27 24 | 
             
                data = data.sub(doctype_pattern, '')
         | 
| 28 25 | 
             
                return version, data
         | 
| 29 26 | 
             
              end
         | 
| 30 | 
            -
             | 
| 27 | 
            +
              
         | 
| 31 28 | 
             
              def resolveRelativeURIs(htmlSource, baseURI, encoding)
         | 
| 32 29 | 
             
                $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
         | 
| 33 30 | 
             
                relative_uris = [ ['a','href'],
         | 
| @@ -60,11 +57,11 @@ Strips DOCTYPE from XML document, returns (rss_version, stripped_data) | |
| 60 57 | 
             
                relative_uris.each do |l|
         | 
| 61 58 | 
             
                  ename, eattr = l
         | 
| 62 59 | 
             
                  h.search(ename).each do |elem|
         | 
| 63 | 
            -
             | 
| 64 | 
            -
                     | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 60 | 
            +
                    euri = elem.attributes[eattr]
         | 
| 61 | 
            +
                    uri = ForgivingURI.parse(URI.encode(euri)) rescue nil
         | 
| 62 | 
            +
                    if euri and not euri.empty? and uri and uri.relative?
         | 
| 63 | 
            +
                      elem.raw_attributes[eattr] = urljoin(baseURI, euri)
         | 
| 64 | 
            +
                    end
         | 
| 68 65 | 
             
                  end
         | 
| 69 66 | 
             
                end
         | 
| 70 67 | 
             
                return h.to_html
         | 
| @@ -1,5 +1,7 @@ | |
| 1 | 
            -
            #!/usr/bin/ruby
         | 
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            module FeedParser
         | 
| 2 3 | 
             
            module FeedParserMixin
         | 
| 4 | 
            +
              include FeedParserUtilities
         | 
| 3 5 | 
             
              attr_accessor :feeddata, :version, :namespacesInUse, :date_handlers
         | 
| 4 6 |  | 
| 5 7 | 
             
              def startup(baseuri=nil, baselang=nil, encoding='utf-8')
         | 
| @@ -103,10 +105,6 @@ module FeedParserMixin | |
| 103 105 | 
             
                if baselang 
         | 
| 104 106 | 
             
                  @feeddata['language'] = baselang.gsub('_','-')
         | 
| 105 107 | 
             
                end
         | 
| 106 | 
            -
                @date_handlers = [:_parse_date_rfc822,
         | 
| 107 | 
            -
                  :_parse_date_hungarian, :_parse_date_greek,:_parse_date_mssql,
         | 
| 108 | 
            -
                  :_parse_date_nate,:_parse_date_onblog,:_parse_date_w3dtf,:_parse_date_iso8601
         | 
| 109 | 
            -
                ]
         | 
| 110 108 | 
             
                $stderr << "Leaving startup\n" if $debug # My addition
         | 
| 111 109 | 
             
              end
         | 
| 112 110 |  | 
| @@ -873,7 +871,9 @@ module FeedParserMixin | |
| 873 871 |  | 
| 874 872 | 
             
              def _end_published
         | 
| 875 873 | 
             
                value = pop('published')
         | 
| 876 | 
            -
                 | 
| 874 | 
            +
                d = parse_date(value)
         | 
| 875 | 
            +
                _save('published_parsed', extract_tuple(d))
         | 
| 876 | 
            +
                _save('published_time', d)
         | 
| 877 877 | 
             
              end
         | 
| 878 878 | 
             
              alias :_end_dcterms_issued :_end_published
         | 
| 879 879 | 
             
              alias :_end_issued :_end_published
         | 
| @@ -888,7 +888,9 @@ module FeedParserMixin | |
| 888 888 |  | 
| 889 889 | 
             
              def _end_updated
         | 
| 890 890 | 
             
                value = pop('updated')
         | 
| 891 | 
            -
                 | 
| 891 | 
            +
                d = parse_date(value)
         | 
| 892 | 
            +
                _save('updated_parsed', extract_tuple(d))
         | 
| 893 | 
            +
                _save('updated_time', d)
         | 
| 892 894 | 
             
              end
         | 
| 893 895 | 
             
              alias :_end_modified :_end_updated
         | 
| 894 896 | 
             
              alias :_end_dcterms_modified :_end_updated
         | 
| @@ -902,7 +904,9 @@ module FeedParserMixin | |
| 902 904 |  | 
| 903 905 | 
             
              def _end_created
         | 
| 904 906 | 
             
                value = pop('created')
         | 
| 905 | 
            -
                 | 
| 907 | 
            +
                d = parse_date(value)
         | 
| 908 | 
            +
                _save('created_parsed', extract_tuple(d))
         | 
| 909 | 
            +
                _save('created_time', d)
         | 
| 906 910 | 
             
              end
         | 
| 907 911 | 
             
              alias :_end_dcterms_created :_end_created
         | 
| 908 912 |  | 
| @@ -910,7 +914,9 @@ module FeedParserMixin | |
| 910 914 | 
             
                push('expired', true)
         | 
| 911 915 | 
             
              end
         | 
| 912 916 | 
             
              def _end_expirationdate
         | 
| 913 | 
            -
                 | 
| 917 | 
            +
                d = parse_date(pop('expired'))
         | 
| 918 | 
            +
                _save('expired_parsed', extract_tuple(d))
         | 
| 919 | 
            +
                _save('expired_time', d)
         | 
| 914 920 | 
             
              end
         | 
| 915 921 |  | 
| 916 922 | 
             
              def _start_cc_license(attrsD)
         | 
| @@ -1234,5 +1240,4 @@ module FeedParserMixin | |
| 1234 1240 | 
             
              end
         | 
| 1235 1241 |  | 
| 1236 1242 | 
             
            end # End FeedParserMixin
         | 
| 1237 | 
            -
             | 
| 1238 | 
            -
             | 
| 1243 | 
            +
            end
         | 
    
        data/lib/rfeedparser/parsers.rb
    CHANGED
    
    
    
        data/lib/rfeedparser/scrub.rb
    CHANGED
    
    | @@ -1,47 +1,45 @@ | |
| 1 | 
            -
            #!/usr/bin/ruby
         | 
| 2 | 
            -
            gem 'hpricot', "=0.5"
         | 
| 3 | 
            -
            require 'hpricot'
         | 
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 4 2 | 
             
            # This used to be based on Michael Moen's Hpricot#scrub, but that seems to 
         | 
| 5 3 | 
             
            # have only been part of its evolution. Hpricot#scrub is cool code, though.
         | 
| 6 4 | 
             
            # http://underpantsgnome.com/2007/01/20/hpricot-scrub
         | 
| 7 5 | 
             
            module Hpricot
         | 
| 8 6 | 
             
              Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
         | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 7 | 
            +
                'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
         | 
| 8 | 
            +
                'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
         | 
| 9 | 
            +
                'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
         | 
| 10 | 
            +
                'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
         | 
| 11 | 
            +
                'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
         | 
| 12 | 
            +
                'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
         | 
| 13 | 
            +
                'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
         | 
| 14 | 
            +
                'ul', 'var'
         | 
| 17 15 | 
             
              ]
         | 
| 18 16 |  | 
| 19 17 | 
             
              Acceptable_Attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
         | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 18 | 
            +
                'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
         | 
| 19 | 
            +
                'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
         | 
| 20 | 
            +
                'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
         | 
| 21 | 
            +
                'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
         | 
| 22 | 
            +
                'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
         | 
| 23 | 
            +
                'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
         | 
| 24 | 
            +
                'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
         | 
| 25 | 
            +
                'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
         | 
| 26 | 
            +
                'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 
         | 
| 27 | 
            +
                'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'
         | 
| 30 28 | 
             
              ]
         | 
| 31 29 |  | 
| 32 30 | 
             
              Unacceptable_Elements_With_End_Tag = ['script', 'applet']
         | 
| 33 31 |  | 
| 34 32 | 
             
              Acceptable_Css_Properties = ['azimuth', 'background-color',
         | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 33 | 
            +
                'border-bottom-color', 'border-collapse', 'border-color',
         | 
| 34 | 
            +
                'border-left-color', 'border-right-color', 'border-top-color', 'clear',
         | 
| 35 | 
            +
                'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
         | 
| 36 | 
            +
                'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
         | 
| 37 | 
            +
                'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
         | 
| 38 | 
            +
                'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
         | 
| 39 | 
            +
                'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
         | 
| 40 | 
            +
                'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
         | 
| 41 | 
            +
                'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
         | 
| 42 | 
            +
                'white-space', 'width'
         | 
| 45 43 | 
             
              ]
         | 
| 46 44 |  | 
| 47 45 | 
             
              # survey of common keywords found in feeds
         | 
| @@ -82,38 +80,38 @@ module Hpricot | |
| 82 80 |  | 
| 83 81 | 
             
              # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
         | 
| 84 82 | 
             
              Svg_Attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
         | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 94 | 
            -
             | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
| 106 | 
            -
             | 
| 107 | 
            -
             | 
| 108 | 
            -
             | 
| 83 | 
            +
                'arabic-form', 'ascent', 'attributeName', 'attributeType',
         | 
| 84 | 
            +
                'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
         | 
| 85 | 
            +
                'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
         | 
| 86 | 
            +
                'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
         | 
| 87 | 
            +
                'font-size', 'font-stretch', 'font-style', 'font-variant',
         | 
| 88 | 
            +
                'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 
         | 
| 89 | 
            +
                'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
         | 
| 90 | 
            +
                'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
         | 
| 91 | 
            +
                'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
         | 
| 92 | 
            +
                'origin', 'overline-position', 'overline-thickness', 'panose-1',
         | 
| 93 | 
            +
                'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
         | 
| 94 | 
            +
                'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
         | 
| 95 | 
            +
                'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 
         | 
| 96 | 
            +
                'stop-color', 'stop-opacity', 'strikethrough-position',
         | 
| 97 | 
            +
                'strikethrough-thickness', 'stroke', 'stroke-dasharray',
         | 
| 98 | 
            +
                'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
         | 
| 99 | 
            +
                'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
         | 
| 100 | 
            +
                'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
         | 
| 101 | 
            +
                'underline-position', 'underline-thickness', 'unicode',
         | 
| 102 | 
            +
                'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
         | 
| 103 | 
            +
                'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
         | 
| 104 | 
            +
                'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
         | 
| 105 | 
            +
                'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang',
         | 
| 106 | 
            +
                'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'
         | 
| 109 107 | 
             
              ]
         | 
| 110 108 |  | 
| 111 109 | 
             
              Svg_Attr_Map = nil
         | 
| 112 110 | 
             
              Svg_Elem_Map = nil
         | 
| 113 111 |  | 
| 114 112 | 
             
              Acceptable_Svg_Properties = [ 'fill', 'fill-opacity', 'fill-rule',
         | 
| 115 | 
            -
             | 
| 116 | 
            -
             | 
| 113 | 
            +
                'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
         | 
| 114 | 
            +
                'stroke-opacity'
         | 
| 117 115 | 
             
              ]
         | 
| 118 116 |  | 
| 119 117 | 
             
              unless $compatible 
         | 
| @@ -148,11 +146,9 @@ module Hpricot | |
| 148 146 | 
             
              class Elem
         | 
| 149 147 | 
             
                def strip_attributes
         | 
| 150 148 | 
             
                  unless attributes.nil?
         | 
| 151 | 
            -
             | 
| 152 | 
            -
             | 
| 153 | 
            -
             | 
| 154 | 
            -
            	  end
         | 
| 155 | 
            -
            	end
         | 
| 149 | 
            +
                    ra = {}
         | 
| 150 | 
            +
                    raw_attributes.keys.each{|atr| ra[atr] = raw_attributes[atr] if Acceptable_Attributes.include?(atr) }
         | 
| 151 | 
            +
                    self.raw_attributes = ra
         | 
| 156 152 | 
             
                  end
         | 
| 157 153 | 
             
                end
         | 
| 158 154 | 
             
              end
         | 
| @@ -160,35 +156,44 @@ end | |
| 160 156 |  | 
| 161 157 | 
             
            module FeedParserUtilities
         | 
| 162 158 | 
             
              class SanitizerDoc < Hpricot::Doc
         | 
| 163 | 
            -
             | 
| 159 | 
            +
                
         | 
| 164 160 | 
             
                def scrub
         | 
| 165 | 
            -
                   | 
| 166 | 
            -
             | 
| 167 | 
            -
             | 
| 168 | 
            -
             | 
| 169 | 
            -
             | 
| 170 | 
            -
             | 
| 171 | 
            -
             | 
| 172 | 
            -
             | 
| 173 | 
            -
             | 
| 174 | 
            -
             | 
| 175 | 
            -
             | 
| 176 | 
            -
             | 
| 177 | 
            -
             | 
| 178 | 
            -
             | 
| 179 | 
            -
             | 
| 180 | 
            -
             | 
| 181 | 
            -
             | 
| 182 | 
            -
             | 
| 183 | 
            -
             | 
| 184 | 
            -
             | 
| 161 | 
            +
                  others = children.map do |e|
         | 
| 162 | 
            +
                    if e.elem?
         | 
| 163 | 
            +
                      if Acceptable_Elements.include?e.name
         | 
| 164 | 
            +
                        e.strip_attributes
         | 
| 165 | 
            +
                        e.inner_html = SanitizerDoc.new(e.children).scrub
         | 
| 166 | 
            +
                        result = e
         | 
| 167 | 
            +
                      else
         | 
| 168 | 
            +
                        result = e
         | 
| 169 | 
            +
                        
         | 
| 170 | 
            +
                        if Unacceptable_Elements_With_End_Tag.include?e.name
         | 
| 171 | 
            +
                          result = nil
         | 
| 172 | 
            +
                        end
         | 
| 173 | 
            +
                        
         | 
| 174 | 
            +
                        if result 
         | 
| 175 | 
            +
                          result = SanitizerDoc.new(result.children).scrub   # The important part
         | 
| 176 | 
            +
                        end            
         | 
| 177 | 
            +
                      end
         | 
| 178 | 
            +
                      
         | 
| 179 | 
            +
                    elsif e.doctype?
         | 
| 180 | 
            +
                      result = nil
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                    elsif e.text?
         | 
| 183 | 
            +
                      ets = e.to_html
         | 
| 184 | 
            +
                      ets.gsub!(/&apos;/, "'") 
         | 
| 185 | 
            +
                      ets.gsub!(/&quot;/, '"')
         | 
| 186 | 
            +
                      ets.gsub!(/\r/,'')
         | 
| 187 | 
            +
                      result = ets
         | 
| 188 | 
            +
                    end
         | 
| 189 | 
            +
                    result
         | 
| 185 190 | 
             
                  end
         | 
| 186 | 
            -
             | 
| 191 | 
            +
                  
         | 
| 187 192 | 
             
                  unless $compatible # FIXME nonworking
         | 
| 188 | 
            -
             | 
| 189 | 
            -
             | 
| 193 | 
            +
                    # yes, that '/' should be there. It's a search method. See the Hpricot docs.
         | 
| 194 | 
            +
                    (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
         | 
| 190 195 | 
             
                  end
         | 
| 191 | 
            -
                  return  | 
| 196 | 
            +
                  return others.compact.join
         | 
| 192 197 | 
             
                end
         | 
| 193 198 | 
             
              end
         | 
| 194 199 |  | 
| @@ -200,8 +205,8 @@ module FeedParserUtilities | |
| 200 205 | 
             
              def sanitizeHTML(html,encoding)
         | 
| 201 206 | 
             
                # FIXME Tidy not yet supported
         | 
| 202 207 | 
             
                html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
         | 
| 203 | 
            -
             | 
| 204 | 
            -
             | 
| 205 | 
            -
             | 
| 208 | 
            +
                h = SanitizerDoc(html)
         | 
| 209 | 
            +
                h = h.scrub
         | 
| 210 | 
            +
                return h.strip
         | 
| 206 211 | 
             
              end
         | 
| 207 212 | 
             
            end
         |