rfeedparser 0.9.86 → 0.9.87
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rfeedparser.rb +23 -54
- metadata +2 -2
data/lib/rfeedparser.rb
CHANGED
@@ -41,7 +41,8 @@ $debug = false
|
|
41
41
|
$compatible = true
|
42
42
|
|
43
43
|
Encoding_Aliases = { # Adapted from python2.4's encodings/aliases.py
|
44
|
-
'unicode' => 'utf-16',
|
44
|
+
'unicode' => 'utf-16',
|
45
|
+
|
45
46
|
# MacOSX does not have Unicode as a separate encoding nor even
|
46
47
|
# aliased. My Ubuntu box has it as a separate encoding but I cannot
|
47
48
|
# for the life of me figure out where the source code for UNICODE.so
|
@@ -483,8 +484,6 @@ def unichr(i)
|
|
483
484
|
end
|
484
485
|
|
485
486
|
def index_match(stri,regexp, offset)
|
486
|
-
if offset == 241
|
487
|
-
end
|
488
487
|
i = stri.index(regexp, offset)
|
489
488
|
|
490
489
|
return nil, nil unless i
|
@@ -500,6 +499,7 @@ end
|
|
500
499
|
def urljoin(base, uri)
|
501
500
|
urifixer = /^([A-Za-z][A-Za-z0-9+-.]*:\/\/)(\/*)(.*?)/u
|
502
501
|
uri = uri.sub(urifixer, '\1\3')
|
502
|
+
|
503
503
|
begin
|
504
504
|
return URI.join(base, uri).to_s
|
505
505
|
rescue URI::BadURIError => e
|
@@ -872,9 +872,10 @@ module XML
|
|
872
872
|
end
|
873
873
|
end
|
874
874
|
end
|
875
|
-
|
875
|
+
|
876
|
+
# This used to be based on Michael Moen's Hpricot#scrub, but that seems to
|
877
|
+
# have only been part of its evolution. Hpricot#scrub is cool code, though.
|
876
878
|
# http://underpantsgnome.com/2007/01/20/hpricot-scrub
|
877
|
-
# I have modified it to check for attributes that are only allowed if they are in a certain tag
|
878
879
|
module Hpricot
|
879
880
|
Acceptable_Elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
880
881
|
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
@@ -994,55 +995,29 @@ module Hpricot
|
|
994
995
|
end
|
995
996
|
|
996
997
|
class Elements
|
997
|
-
def strip(allowed_tags=[]) # I completely route around this with the recursive_strip in Doc
|
998
|
-
each { |x| x.strip(allowed_tags) }
|
999
|
-
end
|
1000
|
-
|
1001
998
|
def strip_attributes(safe=[])
|
1002
999
|
each { |x| x.strip_attributes(safe) }
|
1003
1000
|
end
|
1004
1001
|
|
1005
|
-
def strip_style(ok_props=[], ok_keywords=[])
|
1002
|
+
def strip_style(ok_props=[], ok_keywords=[]) # NOTE unused so far.
|
1006
1003
|
each { |x| x.strip_style(ok_props, ok_keywords) }
|
1007
1004
|
end
|
1008
1005
|
end
|
1009
1006
|
|
1010
1007
|
class Text
|
1011
|
-
def strip(foo)
|
1012
|
-
end
|
1013
1008
|
def strip_attributes(foo)
|
1014
1009
|
end
|
1015
1010
|
end
|
1016
1011
|
class Comment
|
1017
|
-
def strip(foo)
|
1018
|
-
end
|
1019
1012
|
def strip_attributes(foo)
|
1020
1013
|
end
|
1021
1014
|
end
|
1022
1015
|
class BogusETag
|
1023
|
-
def strip(foo)
|
1024
|
-
end
|
1025
1016
|
def strip_attributes(foo)
|
1026
1017
|
end
|
1027
1018
|
end
|
1028
1019
|
|
1029
1020
|
class Elem
|
1030
|
-
def decode_entities
|
1031
|
-
children.each{ |x| x.decode_entities }
|
1032
|
-
end
|
1033
|
-
|
1034
|
-
def cull
|
1035
|
-
if children
|
1036
|
-
swap(children.to_s)
|
1037
|
-
end
|
1038
|
-
end
|
1039
|
-
|
1040
|
-
def strip
|
1041
|
-
if strip_removes?
|
1042
|
-
cull
|
1043
|
-
end
|
1044
|
-
end
|
1045
|
-
|
1046
1021
|
def strip_attributes
|
1047
1022
|
unless attributes.nil?
|
1048
1023
|
attributes.each do |atr|
|
@@ -1052,16 +1027,11 @@ module Hpricot
|
|
1052
1027
|
end
|
1053
1028
|
end
|
1054
1029
|
end
|
1055
|
-
|
1056
|
-
def strip_removes?
|
1057
|
-
# I'm sure there are others that shuould be ripped instead of stripped
|
1058
|
-
attributes && attributes['type'] =~ /script|css/
|
1059
|
-
end
|
1060
1030
|
end
|
1061
1031
|
end
|
1062
1032
|
|
1063
1033
|
module FeedParser
|
1064
|
-
Version = "0.9.
|
1034
|
+
Version = "0.9.87"
|
1065
1035
|
|
1066
1036
|
License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
1067
1037
|
|
@@ -1107,15 +1077,11 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
1107
1077
|
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
|
1108
1078
|
# this to true. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
|
1109
1079
|
# or utidylib <http://utidylib.berlios.de/>.
|
1110
|
-
TIDY_MARKUP = false #FIXME untranslated
|
1080
|
+
#TIDY_MARKUP = false #FIXME untranslated
|
1111
1081
|
|
1112
1082
|
# List of Python interfaces for HTML Tidy, in order of preference. Only useful
|
1113
1083
|
# if TIDY_MARKUP = true
|
1114
|
-
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
|
1115
|
-
|
1116
|
-
# The original Python import. I'm using it to help translate
|
1117
|
-
#import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
|
1118
|
-
|
1084
|
+
#PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] #FIXME untranslated
|
1119
1085
|
|
1120
1086
|
|
1121
1087
|
# ---------- don't touch these ----------
|
@@ -1153,13 +1119,14 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
1153
1119
|
=begin
|
1154
1120
|
The naming of a certain common attribute (such as, "When was the last
|
1155
1121
|
time this feed was updated?") can have many different names depending
|
1156
|
-
on the type of feed we are handling. This class allows us to
|
1157
|
-
|
1158
|
-
feed
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1122
|
+
on the type of feed we are handling. This class allows us to satisfy
|
1123
|
+
the expectations of both the developer who has prior knowledge of the
|
1124
|
+
feed type as well as the developer who wants a consistent application
|
1125
|
+
interface.
|
1126
|
+
|
1127
|
+
@@keymap is a Hash that contains information on what certain
|
1128
|
+
attribute names "really are" in each kind of feed. It does this by
|
1129
|
+
providing a common name that will map to any feed type in the keys,
|
1163
1130
|
with possible "correct" attributes in its values. The #[] and #[]=
|
1164
1131
|
methods check with keymaps to see what attribute the developer "really
|
1165
1132
|
means" if they've asked for one which happens to be in @@keymap's keys.
|
@@ -1183,6 +1150,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
1183
1150
|
def entries # Apparently, Hash has an entries method! That blew a good 3 hours or more of my time
|
1184
1151
|
return self['entries']
|
1185
1152
|
end
|
1153
|
+
|
1186
1154
|
# We could include the [] rewrite in new using Hash.new's fancy pants block thing
|
1187
1155
|
# but we'd still have to overwrite []= and such.
|
1188
1156
|
# I'm going to make it easy to turn lists of pairs into FeedParserDicts's though.
|
@@ -1209,7 +1177,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
1209
1177
|
realkey.each{ |key| return self[key] if has_key?key }
|
1210
1178
|
end
|
1211
1179
|
# Note that the original key is preferred over the realkey we (might
|
1212
|
-
# have) found in @@
|
1180
|
+
# have) found in @@keymap
|
1213
1181
|
if has_key?(key)
|
1214
1182
|
return super(key)
|
1215
1183
|
end
|
@@ -3079,7 +3047,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
3079
3047
|
ename, eattr = l
|
3080
3048
|
h.search(ename).each do |elem|
|
3081
3049
|
euri = elem.attributes[eattr]
|
3082
|
-
if euri and not euri.empty? and URI.parse(euri).relative?
|
3050
|
+
if euri and not euri.empty? and URI.parse(URI.encode(euri)).relative?
|
3083
3051
|
elem.attributes[eattr] = urljoin(baseURI, euri)
|
3084
3052
|
end
|
3085
3053
|
end
|
@@ -3114,7 +3082,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
3114
3082
|
end
|
3115
3083
|
# yes, that '/' should be there. It's a search method. See the Hpricot docs.
|
3116
3084
|
|
3117
|
-
unless $compatible # FIXME
|
3085
|
+
unless $compatible # FIXME nonworking
|
3118
3086
|
(self/tag).strip_style(@config[:allow_css_properties], @config[:allow_css_keywords])
|
3119
3087
|
end
|
3120
3088
|
return self
|
@@ -3125,6 +3093,7 @@ POSSIBILITY OF SUCH DAMAGE."""
|
|
3125
3093
|
FeedParser::SanitizerDoc.new(Hpricot.make(html))
|
3126
3094
|
end
|
3127
3095
|
module_function(:SanitizerDoc)
|
3096
|
+
|
3128
3097
|
def self.sanitizeHTML(html,encoding)
|
3129
3098
|
# FIXME Tidy not yet supported
|
3130
3099
|
html = html.gsub(/<!((?!DOCTYPE|--|\[))/, '<!\1')
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rfeedparser
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.9.
|
7
|
-
date: 2007-04-
|
6
|
+
version: 0.9.87
|
7
|
+
date: 2007-04-07 00:00:00 -04:00
|
8
8
|
summary: Parse RSS and Atom feeds in Ruby
|
9
9
|
require_paths:
|
10
10
|
- lib
|