rfeedparser 0.9.86 → 0.9.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/rfeedparser.rb +23 -54
  2. metadata +2 -2
@@ -41,7 +41,8 @@ $debug = false
41
41
  $compatible = true
42
42
 
43
43
  Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
44
- 'unicode' => 'utf-16',
44
+ 'unicode' => 'utf-16',
45
+
45
46
  # MacOSX does not have Unicode as a separate encoding nor even
46
47
  # aliased. My Ubuntu box has it as a separate encoding but I cannot
47
48
  # for the life of me figure out where the source code for UNICODE.so
@@ -483,8 +484,6 @@ def unichr(i)
483
484
  end
484
485
 
485
486
  def index_match(stri,regexp, offset)
486
- if offset == 241
487
- end
488
487
  i = stri.index(regexp, offset)
489
488
 
490
489
  return nil, nil unless i
@@ -500,6 +499,7 @@ end
500
499
  def urljoin(base, uri)
501
500
  urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
502
501
  uri = uri.sub(urifixer, '\1\3')
502
+
503
503
  begin
504
504
  return URI.join(base, uri).to_s
505
505
  rescue URI::BadURIError => e
@@ -872,9 +872,10 @@ module XML
872
872
  end
873
873
  end
874
874
  end
875
- # This adds a nice scrub method to Hpricot, so we don't need a _HTMLSanitizer class
875
+
876
+ # This used to be based on Michael Moen's Hpricot#scrub, but that seems to
877
+ # have only been part of its evolution. Hpricot#scrub is cool code, though.
876
878
  # http://underpantsgnome.com/2007/01/20/hpricot-scrub
877
- # I have modified it to check for attributes that are only allowed if they are in a certain tag
878
879
  module Hpricot
879
880
  Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
880
881
  'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
@@ -994,55 +995,29 @@ module Hpricot
994
995
  end
995
996
 
996
997
  class Elements
997
- def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
998
- each { |x| x.strip(allowed_tags) }
999
- end
1000
-
1001
998
  def strip_attributes(safe=[])
1002
999
  each { |x| x.strip_attributes(safe) }
1003
1000
  end
1004
1001
 
1005
- def strip_style(ok_props=[], ok_keywords=[])
1002
+ def strip_style(ok_props=[], ok_keywords=[]) # NOTE unused so far.
1006
1003
  each { |x| x.strip_style(ok_props, ok_keywords) }
1007
1004
  end
1008
1005
  end
1009
1006
 
1010
1007
  class Text
1011
- def strip(foo)
1012
- end
1013
1008
  def strip_attributes(foo)
1014
1009
  end
1015
1010
  end
1016
1011
  class Comment
1017
- def strip(foo)
1018
- end
1019
1012
  def strip_attributes(foo)
1020
1013
  end
1021
1014
  end
1022
1015
  class BogusETag
1023
- def strip(foo)
1024
- end
1025
1016
  def strip_attributes(foo)
1026
1017
  end
1027
1018
  end
1028
1019
 
1029
1020
  class Elem
1030
- def decode_entities
1031
- children.each{ |x| x.decode_entities }
1032
- end
1033
-
1034
- def cull
1035
- if children
1036
- swap(children.to_s)
1037
- end
1038
- end
1039
-
1040
- def strip
1041
- if strip_removes?
1042
- cull
1043
- end
1044
- end
1045
-
1046
1021
  def strip_attributes
1047
1022
  unless attributes.nil?
1048
1023
  attributes.each do |atr|
@@ -1052,16 +1027,11 @@ module Hpricot
1052
1027
  end
1053
1028
  end
1054
1029
  end
1055
-
1056
- def strip_removes?
1057
- # I'm sure there are others that shuould be ripped instead of stripped
1058
- attributes && attributes['type'] =~ /script|css/
1059
- end
1060
1030
  end
1061
1031
  end
1062
1032
 
1063
1033
  module FeedParser
1064
- Version = "0.9.86"
1034
+ Version = "0.9.87"
1065
1035
 
1066
1036
  License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
1067
1037
 
@@ -1107,15 +1077,11 @@ POSSIBILITY OF SUCH DAMAGE."""
1107
1077
  # If you want feedparser to automatically run HTML markup through HTML Tidy, set
1108
1078
  # this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
1109
1079
  # or utidylib <http://utidylib.berlios.de/>.
1110
- TIDY_MARKUP = false #FIXME untranslated
1080
+ #TIDY_MARKUP = false #FIXME untranslated
1111
1081
 
1112
1082
  # List of Python interfaces for HTML Tidy, in order of preference. Only useful
1113
1083
  # if TIDY_MARKUP = true
1114
- PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
1115
-
1116
- # The original Python import. I'm using it to help translate
1117
- #import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
1118
-
1084
+ #PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
1119
1085
 
1120
1086
 
1121
1087
  # ---------- don't touch these ----------
@@ -1153,13 +1119,14 @@ POSSIBILITY OF SUCH DAMAGE."""
1153
1119
  =begin
1154
1120
  The naming of a certain common attribute (such as, "When was the last
1155
1121
  time this feed was updated?") can have many different names depending
1156
- on the type of feed we are handling. This class allows us to use
1157
- both the attribute name a person, who has knowledge of the kind of
1158
- feed being parsed, expects, as well as allowing a developer to rely
1159
- on one name to contain the proper attribute no matter what kind of
1160
- feed is being parsed. @@keymaps is a Hash that contains information
1161
- on what certain attributes "really is" in each feed type. It does so
1162
- by providing a common name that will map to any feed type in the keys,
1122
+ on the type of feed we are handling. This class allows us to satisfy
1123
+ the expectations of both the developer who has prior knowledge of the
1124
+ feed type as well as the developer who wants a consistent application
1125
+ interface.
1126
+
1127
+ @@keymap is a Hash that contains information on what certain
1128
+ attribute names "really are" in each kind of feed. It does this by
1129
+ providing a common name that will map to any feed type in the keys,
1163
1130
  with possible "correct" attributes in its values. The #[] and #[]=
1164
1131
  methods check with @@keymap to see what attribute the developer "really
1165
1132
  means" if they've asked for one which happens to be in @@keymap's keys.
@@ -1183,6 +1150,7 @@ POSSIBILITY OF SUCH DAMAGE."""
1183
1150
  def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
1184
1151
  return self['entries']
1185
1152
  end
1153
+
1186
1154
  # We could include the [] rewrite in new using Hash.new's fancy pants block thing
1187
1155
  # but we'd still have to overwrite []= and such.
1188
1156
  # I'm going to make it easy to turn lists of pairs into FeedParserDicts's though.
@@ -1209,7 +1177,7 @@ POSSIBILITY OF SUCH DAMAGE."""
1209
1177
  realkey.each{ |key| return self[key] if has_key?key }
1210
1178
  end
1211
1179
  # Note that the original key is preferred over the realkey we (might
1212
- # have) found in @@keymaps
1180
+ # have) found in @@keymap
1213
1181
  if has_key?(key)
1214
1182
  return super(key)
1215
1183
  end
@@ -3079,7 +3047,7 @@ POSSIBILITY OF SUCH DAMAGE."""
3079
3047
  ename, eattr = l
3080
3048
  h.search(ename).each do |elem|
3081
3049
  euri = elem.attributes[eattr]
3082
- if euri and not euri.empty? and URI.parse(euri).relative?
3050
+ if euri and not euri.empty? and URI.parse(URI.encode(euri)).relative?
3083
3051
  elem.attributes[eattr] = urljoin(baseURI, euri)
3084
3052
  end
3085
3053
  end
@@ -3114,7 +3082,7 @@ POSSIBILITY OF SUCH DAMAGE."""
3114
3082
  end
3115
3083
  # yes, that '/' should be there. It's a search method. See the Hpricot docs.
3116
3084
 
3117
- unless $compatible # FIXME not properly recursive, see comment in recursive_strip
3085
+ unless $compatible # FIXME nonworking
3118
3086
  (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
3119
3087
  end
3120
3088
  return self
@@ -3125,6 +3093,7 @@ POSSIBILITY OF SUCH DAMAGE."""
3125
3093
  FeedParser::SanitizerDoc.new(Hpricot.make(html))
3126
3094
  end
3127
3095
  module_function(:SanitizerDoc)
3096
+
3128
3097
  def self.sanitizeHTML(html,encoding)
3129
3098
  # FIXME Tidy not yet supported
3130
3099
  html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: rfeedparser
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.9.86
7
- date: 2007-04-05 00:00:00 -04:00
6
+ version: 0.9.87
7
+ date: 2007-04-07 00:00:00 -04:00
8
8
  summary: Parse RSS and Atom feeds in Ruby
9
9
  require_paths:
10
10
  - lib