rfeedparser 0.9.86 → 0.9.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +23 -54
- metadata +2 -2
data/lib/rfeedparser.rb
CHANGED
@@ -41,7 +41,8 @@ $debug = false
|
|
41
41
|
$compatible = true
|
42
42
|
|
43
43
|
Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
|
44
|
-
'unicode' => 'utf-16',
|
44
|
+
'unicode' => 'utf-16',
|
45
|
+
|
45
46
|
# MacOSX does not have Unicode as a separate encoding nor even
|
46
47
|
# aliased. My Ubuntu box has it as a separate encoding but I cannot
|
47
48
|
# for the life of me figure out where the source code for UNICODE.so
|
@@ -483,8 +484,6 @@ def unichr(i)
|
|
483
484
|
end
|
484
485
|
|
485
486
|
def index_match(stri,regexp, offset)
|
486
|
-
if offset == 241
|
487
|
-
end
|
488
487
|
i = stri.index(regexp, offset)
|
489
488
|
|
490
489
|
return nil, nil unless i
|
@@ -500,6 +499,7 @@ end
|
|
500
499
|
def urljoin(base, uri)
|
501
500
|
urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
|
502
501
|
uri = uri.sub(urifixer, '\1\3')
|
502
|
+
|
503
503
|
begin
|
504
504
|
return URI.join(base, uri).to_s
|
505
505
|
rescue URI::BadURIError => e
|
@@ -872,9 +872,10 @@ module XML
|
|
872
872
|
end
|
873
873
|
end
|
874
874
|
end
|
875
|
-
|
875
|
+
|
876
|
+
# This used to be based on Michael Moen's Hpricot#scrub, but that seems to
|
877
|
+
# have only been part of its evolution. Hpricot#scrub is cool code, though.
|
876
878
|
# http://underpantsgnome.com/2007/01/20/hpricot-scrub
|
877
|
-
# I have modified it to check for attributes that are only allowed if they are in a certain tag
|
878
879
|
module Hpricot
|
879
880
|
Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
880
881
|
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
@@ -994,55 +995,29 @@ module Hpricot
|
|
994
995
|
end
|
995
996
|
|
996
997
|
class Elements
|
997
|
-
def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
|
998
|
-
each { |x| x.strip(allowed_tags) }
|
999
|
-
end
|
1000
|
-
|
1001
998
|
def strip_attributes(safe=[])
|
1002
999
|
each { |x| x.strip_attributes(safe) }
|
1003
1000
|
end
|
1004
1001
|
|
1005
|
-
def strip_style(ok_props=[], ok_keywords=[])
|
1002
|
+
def strip_style(ok_props=[], ok_keywords=[]) # NOTE unused so far.
|
1006
1003
|
each { |x| x.strip_style(ok_props, ok_keywords) }
|
1007
1004
|
end
|
1008
1005
|
end
|
1009
1006
|
|
1010
1007
|
class Text
|
1011
|
-
def strip(foo)
|
1012
|
-
end
|
1013
1008
|
def strip_attributes(foo)
|
1014
1009
|
end
|
1015
1010
|
end
|
1016
1011
|
class Comment
|
1017
|
-
def strip(foo)
|
1018
|
-
end
|
1019
1012
|
def strip_attributes(foo)
|
1020
1013
|
end
|
1021
1014
|
end
|
1022
1015
|
class BogusETag
|
1023
|
-
def strip(foo)
|
1024
|
-
end
|
1025
1016
|
def strip_attributes(foo)
|
1026
1017
|
end
|
1027
1018
|
end
|
1028
1019
|
|
1029
1020
|
class Elem
|
1030
|
-
def decode_entities
|
1031
|
-
children.each{ |x| x.decode_entities }
|
1032
|
-
end
|
1033
|
-
|
1034
|
-
def cull
|
1035
|
-
if children
|
1036
|
-
swap(children.to_s)
|
1037
|
-
end
|
1038
|
-
end
|
1039
|
-
|
1040
|
-
def strip
|
1041
|
-
if strip_removes?
|
1042
|
-
cull
|
1043
|
-
end
|
1044
|
-
end
|
1045
|
-
|
1046
1021
|
def strip_attributes
|
1047
1022
|
unless attributes.nil?
|
1048
1023
|
attributes.each do |atr|
|
@@ -1052,16 +1027,11 @@ module Hpricot
|
|
1052
1027
|
end
|
1053
1028
|
end
|
1054
1029
|
end
|
1055
|
-
|
1056
|
-
def strip_removes?
|
1057
|
-
# I'm sure there are others that should be ripped instead of stripped
|
1058
|
-
attributes && attributes['type'] =~ /script|css/
|
1059
|
-
end
|
1060
1030
|
end
|
1061
1031
|
end
|
1062
1032
|
|
1063
1033
|
module FeedParser
|
1064
|
-
Version = "0.9.86"
|
1034
|
+
Version = "0.9.87"
|
1065
1035
|
|
1066
1036
|
License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
1067
1037
|
|
@@ -1107,15 +1077,11 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
1107
1077
|
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
|
1108
1078
|
# this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
|
1109
1079
|
# or utidylib <http://utidylib.berlios.de/>.
|
1110
|
-
TIDY_MARKUP = false #FIXME untranslated
|
1080
|
+
#TIDY_MARKUP = false #FIXME untranslated
|
1111
1081
|
|
1112
1082
|
# List of Python interfaces for HTML Tidy, in order of preference. Only useful
|
1113
1083
|
# if TIDY_MARKUP = true
|
1114
|
-
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
|
1115
|
-
|
1116
|
-
# The original Python import. I'm using it to help translate
|
1117
|
-
#import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
|
1118
|
-
|
1084
|
+
#PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
|
1119
1085
|
|
1120
1086
|
|
1121
1087
|
# ---------- don't touch these ----------
|
@@ -1153,13 +1119,14 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
1153
1119
|
=begin
|
1154
1120
|
The naming of a certain common attribute (such as, "When was the last
|
1155
1121
|
time this feed was updated?") can have many different names depending
|
1156
|
-
on the type of feed we are handling. This class allows us to
|
1157
|
-
|
1158
|
-
feed
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1122
|
+
on the type of feed we are handling. This class allows us to satisfy
|
1123
|
+
the expectations of both the developer who has prior knowledge of the
|
1124
|
+
feed type as well as the developer who wants a consistent application
|
1125
|
+
interface.
|
1126
|
+
|
1127
|
+
@@keymap is a Hash that contains information on what certain
|
1128
|
+
attribute names "really are" in each kind of feed. It does this by
|
1129
|
+
providing a common name that will map to any feed type in the keys,
|
1163
1130
|
with possible "correct" attributes in its values. The #[] and #[]=
|
1164
1131
|
methods check with keymaps to see what attribute the developer "really
|
1165
1132
|
means" if they've asked for one which happens to be in @@keymap's keys.
|
@@ -1183,6 +1150,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
1183
1150
|
def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
|
1184
1151
|
return self['entries']
|
1185
1152
|
end
|
1153
|
+
|
1186
1154
|
# We could include the [] rewrite in new using Hash.new's fancy pants block thing
|
1187
1155
|
# but we'd still have to overwrite []= and such.
|
1188
1156
|
# I'm going to make it easy to turn lists of pairs into FeedParserDicts, though.
|
@@ -1209,7 +1177,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
1209
1177
|
realkey.each{ |key| return self[key] if has_key?key }
|
1210
1178
|
end
|
1211
1179
|
# Note that the original key is preferred over the realkey we (might
|
1212
|
-
# have) found in @@
|
1180
|
+
# have) found in @@keymap
|
1213
1181
|
if has_key?(key)
|
1214
1182
|
return super(key)
|
1215
1183
|
end
|
@@ -3079,7 +3047,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
3079
3047
|
ename, eattr = l
|
3080
3048
|
h.search(ename).each do |elem|
|
3081
3049
|
euri = elem.attributes[eattr]
|
3082
|
-
if euri and not euri.empty? and URI.parse(euri).relative?
|
3050
|
+
if euri and not euri.empty? and URI.parse(URI.encode(euri)).relative?
|
3083
3051
|
elem.attributes[eattr] = urljoin(baseURI, euri)
|
3084
3052
|
end
|
3085
3053
|
end
|
@@ -3114,7 +3082,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
3114
3082
|
end
|
3115
3083
|
# yes, that '/' should be there. It's a search method. See the Hpricot docs.
|
3116
3084
|
|
3117
|
-
unless $compatible # FIXME
|
3085
|
+
unless $compatible # FIXME nonworking
|
3118
3086
|
(self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
|
3119
3087
|
end
|
3120
3088
|
return self
|
@@ -3125,6 +3093,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
3125
3093
|
FeedParser::SanitizerDoc.new(Hpricot.make(html))
|
3126
3094
|
end
|
3127
3095
|
module_function(:SanitizerDoc)
|
3096
|
+
|
3128
3097
|
def self.sanitizeHTML(html,encoding)
|
3129
3098
|
# FIXME Tidy not yet supported
|
3130
3099
|
html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '<!\1')
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rfeedparser
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.86
|
7
|
-
date: 2007-04-
|
6
|
+
version: 0.9.87
|
7
|
+
date: 2007-04-07 00:00:00 -04:00
|
8
8
|
summary: Parse RSS and Atom feeds in Ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|