rfeedparser 0.9.86 → 0.9.87

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/rfeedparser.rb +23 -54
  2. metadata +2 -2
@@ -41,7 +41,8 @@ $debug = false
41
41
  $compatible = true
42
42
 
43
43
  Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
44
- 'unicode' => 'utf-16',
44
+ 'unicode' => 'utf-16',
45
+
45
46
  # MacOSX does not have Unicode as a separate encoding nor even
46
47
  # aliased. My Ubuntu box has it as a separate encoding but I cannot
47
48
  # for the life of me figure out where the source code for UNICODE.so
@@ -483,8 +484,6 @@ def unichr(i)
483
484
  end
484
485
 
485
486
  def index_match(stri,regexp, offset)
486
- if offset == 241
487
- end
488
487
  i = stri.index(regexp, offset)
489
488
 
490
489
  return nil, nil unless i
@@ -500,6 +499,7 @@ end
500
499
  def urljoin(base, uri)
501
500
  urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
502
501
  uri = uri.sub(urifixer, '\1\3')
502
+
503
503
  begin
504
504
  return URI.join(base, uri).to_s
505
505
  rescue URI::BadURIError => e
@@ -872,9 +872,10 @@ module XML
872
872
  end
873
873
  end
874
874
  end
875
- # This adds a nice scrub method to Hpricot, so we don't need a _HTMLSanitizer class
875
+
876
+ # This used to be based on Michael Moen's Hpricot#scrub, but that seems to
877
+ # have only been part of its evolution. Hpricot#scrub is cool code, though.
876
878
  # http://underpantsgnome.com/2007/01/20/hpricot-scrub
877
- # I have modified it to check for attributes that are only allowed if they are in a certain tag
878
879
  module Hpricot
879
880
  Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
880
881
  'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
@@ -994,55 +995,29 @@ module Hpricot
994
995
  end
995
996
 
996
997
  class Elements
997
- def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
998
- each { |x| x.strip(allowed_tags) }
999
- end
1000
-
1001
998
  def strip_attributes(safe=[])
1002
999
  each { |x| x.strip_attributes(safe) }
1003
1000
  end
1004
1001
 
1005
- def strip_style(ok_props=[], ok_keywords=[])
1002
+ def strip_style(ok_props=[], ok_keywords=[]) # NOTE unused so far.
1006
1003
  each { |x| x.strip_style(ok_props, ok_keywords) }
1007
1004
  end
1008
1005
  end
1009
1006
 
1010
1007
  class Text
1011
- def strip(foo)
1012
- end
1013
1008
  def strip_attributes(foo)
1014
1009
  end
1015
1010
  end
1016
1011
  class Comment
1017
- def strip(foo)
1018
- end
1019
1012
  def strip_attributes(foo)
1020
1013
  end
1021
1014
  end
1022
1015
  class BogusETag
1023
- def strip(foo)
1024
- end
1025
1016
  def strip_attributes(foo)
1026
1017
  end
1027
1018
  end
1028
1019
 
1029
1020
  class Elem
1030
- def decode_entities
1031
- children.each{ |x| x.decode_entities }
1032
- end
1033
-
1034
- def cull
1035
- if children
1036
- swap(children.to_s)
1037
- end
1038
- end
1039
-
1040
- def strip
1041
- if strip_removes?
1042
- cull
1043
- end
1044
- end
1045
-
1046
1021
  def strip_attributes
1047
1022
  unless attributes.nil?
1048
1023
  attributes.each do |atr|
@@ -1052,16 +1027,11 @@ module Hpricot
1052
1027
  end
1053
1028
  end
1054
1029
  end
1055
-
1056
- def strip_removes?
1057
- # I'm sure there are others that should be ripped instead of stripped
1058
- attributes && attributes['type'] =~ /script|css/
1059
- end
1060
1030
  end
1061
1031
  end
1062
1032
 
1063
1033
  module FeedParser
1064
- Version = "0.9.86"
1034
+ Version = "0.9.87"
1065
1035
 
1066
1036
  License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
1067
1037
 
@@ -1107,15 +1077,11 @@ POSSIBILITY OF SUCH DAMAGE."""
1107
1077
  # If you want feedparser to automatically run HTML markup through HTML Tidy, set
1108
1078
  # this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
1109
1079
  # or utidylib <http://utidylib.berlios.de/>.
1110
- TIDY_MARKUP = false #FIXME untranslated
1080
+ #TIDY_MARKUP = false #FIXME untranslated
1111
1081
 
1112
1082
  # List of Python interfaces for HTML Tidy, in order of preference. Only useful
1113
1083
  # if TIDY_MARKUP = true
1114
- PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
1115
-
1116
- # The original Python import. I'm using it to help translate
1117
- #import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
1118
-
1084
+ #PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
1119
1085
 
1120
1086
 
1121
1087
  # ---------- don't touch these ----------
@@ -1153,13 +1119,14 @@ POSSIBILITY OF SUCH DAMAGE."""
1153
1119
  =begin
1154
1120
  The naming of a certain common attribute (such as, "When was the last
1155
1121
  time this feed was updated?") can have many different names depending
1156
- on the type of feed we are handling. This class allows us to use
1157
- both the attribute name a person, who has knowledge of the kind of
1158
- feed being parsed, expects, as well as allowing a developer to rely
1159
- on one name to contain the proper attribute no matter what kind of
1160
- feed is being parsed. @@keymaps is a Hash that contains information
1161
- on what certain attributes "really is" in each feed type. It does so
1162
- by providing a common name that will map to any feed type in the keys,
1122
+ on the type of feed we are handling. This class allows us to satisfy
1123
+ the expectations of both the developer who has prior knowledge of the
1124
+ feed type as well as the developer who wants a consistent application
1125
+ interface.
1126
+
1127
+ @@keymap is a Hash that contains information on what certain
1128
+ attribute names "really are" in each kind of feed. It does this by
1129
+ providing a common name that will map to any feed type in the keys,
1163
1130
  with possible "correct" attributes in its values. The #[] and #[]=
1164
1131
  methods check with @@keymap to see what attribute the developer "really
1165
1132
  means" if they've asked for one which happens to be in @@keymap's keys.
@@ -1183,6 +1150,7 @@ POSSIBILITY OF SUCH DAMAGE."""
1183
1150
  def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
1184
1151
  return self['entries']
1185
1152
  end
1153
+
1186
1154
  # We could include the [] rewrite in new using Hash.new's fancy pants block thing
1187
1155
  # but we'd still have to overwrite []= and such.
1188
1156
  # I'm going to make it easy to turn lists of pairs into FeedParserDicts, though.
@@ -1209,7 +1177,7 @@ POSSIBILITY OF SUCH DAMAGE."""
1209
1177
  realkey.each{ |key| return self[key] if has_key?key }
1210
1178
  end
1211
1179
  # Note that the original key is preferred over the realkey we (might
1212
- # have) found in @@keymaps
1180
+ # have) found in @@keymap
1213
1181
  if has_key?(key)
1214
1182
  return super(key)
1215
1183
  end
@@ -3079,7 +3047,7 @@ POSSIBILITY OF SUCH DAMAGE."""
3079
3047
  ename, eattr = l
3080
3048
  h.search(ename).each do |elem|
3081
3049
  euri = elem.attributes[eattr]
3082
- if euri and not euri.empty? and URI.parse(euri).relative?
3050
+ if euri and not euri.empty? and URI.parse(URI.encode(euri)).relative?
3083
3051
  elem.attributes[eattr] = urljoin(baseURI, euri)
3084
3052
  end
3085
3053
  end
@@ -3114,7 +3082,7 @@ POSSIBILITY OF SUCH DAMAGE."""
3114
3082
  end
3115
3083
  # yes, that '/' should be there. It's a search method. See the Hpricot docs.
3116
3084
 
3117
- unless $compatible # FIXME not properly recursive, see comment in recursive_strip
3085
+ unless $compatible # FIXME nonworking
3118
3086
  (self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
3119
3087
  end
3120
3088
  return self
@@ -3125,6 +3093,7 @@ POSSIBILITY OF SUCH DAMAGE."""
3125
3093
  FeedParser::SanitizerDoc.new(Hpricot.make(html))
3126
3094
  end
3127
3095
  module_function(:SanitizerDoc)
3096
+
3128
3097
  def self.sanitizeHTML(html,encoding)
3129
3098
  # FIXME Tidy not yet supported
3130
3099
  html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: rfeedparser
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.9.86
7
- date: 2007-04-05 00:00:00 -04:00
6
+ version: 0.9.87
7
+ date: 2007-04-07 00:00:00 -04:00
8
8
  summary: Parse RSS and Atom feeds in Ruby
9
9
  require_paths:
10
10
  - lib