feedme 0.8.0 → 0.8.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/feedme.rb +9 -5
- data/lib/hpricot-util.rb +1 -1
- data/lib/html-cleaner.rb +5 -5
- metadata +2 -2
data/lib/feedme.rb
CHANGED
@@ -153,6 +153,10 @@ module FeedMe
|
|
153
153
|
match = Regexp.new(regexp).match(str)
|
154
154
|
match.nil? ? nil : match[1]
|
155
155
|
end,
|
156
|
+
|
157
|
+
# this shouldn't be necessary since all text is automatically
|
158
|
+
# unescaped, but some feeds double-escape HTML
|
159
|
+
:esc => proc {|str| CGI.unescapeHTML(str) }
|
156
160
|
}
|
157
161
|
end
|
158
162
|
|
@@ -528,7 +532,7 @@ module FeedMe
|
|
528
532
|
end
|
529
533
|
|
530
534
|
class Parser < FeedData
|
531
|
-
attr_reader :fm_source, :fm_options, :fm_type, :fm_tags, :fm_unparsed
|
535
|
+
attr_reader :fm_source, :fm_options, :fm_type, :fm_tags, :fm_parsed, :fm_unparsed
|
532
536
|
|
533
537
|
def initialize(builder, source, options={})
|
534
538
|
super(nil, nil, builder)
|
@@ -672,13 +676,13 @@ module FeedMe
|
|
672
676
|
end
|
673
677
|
|
674
678
|
def clean_content(tag, attrs, content, parent)
|
675
|
-
|
676
|
-
|
679
|
+
content = content.to_s
|
680
|
+
if fm_builder.date_tags.include? tag
|
677
681
|
content = Time.parse(content) rescue unescape(content)
|
678
682
|
else
|
679
683
|
content = unescape(content)
|
680
684
|
end
|
681
|
-
|
685
|
+
|
682
686
|
unless attrs.empty?
|
683
687
|
hash = FeedData.new(tag, parent, fm_builder)
|
684
688
|
attrs.each_pair {|key, value| hash[key] = unescape(value) }
|
@@ -706,7 +710,7 @@ module FeedMe
|
|
706
710
|
|
707
711
|
def unescape(content)
|
708
712
|
content = CGI.unescapeHTML(content)
|
709
|
-
|
713
|
+
|
710
714
|
query = content.match(/^(http:.*\?)(.*)$/)
|
711
715
|
content = query[1] + CGI.unescape(query[2]) if query
|
712
716
|
|
data/lib/hpricot-util.rb
CHANGED
data/lib/html-cleaner.rb
CHANGED
@@ -59,17 +59,17 @@ module FeedMe
|
|
59
59
|
# Extra (i.e. unmatched) ending tags and comments are removed.
|
60
60
|
def clean(str)
|
61
61
|
str = unescapeHTML(str)
|
62
|
-
|
63
62
|
doc = Hpricot(str, :fixup_tags => true)
|
64
63
|
doc = subtree(doc, :body)
|
65
|
-
|
64
|
+
|
66
65
|
# get all the tags in the document
|
67
66
|
# Somewhere near hpricot 0.4.92 "*" starting to return all elements,
|
68
67
|
# including text nodes instead of just tagged elements.
|
69
68
|
tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
|
70
69
|
|
71
70
|
# Remove tags that aren't whitelisted.
|
72
|
-
|
71
|
+
diff = tags - HTML_ELEMENTS
|
72
|
+
remove_tags!(doc, diff)
|
73
73
|
remaining_tags = tags & HTML_ELEMENTS
|
74
74
|
|
75
75
|
# Remove attributes that aren't on the whitelist, or are suspicious URLs.
|
@@ -80,9 +80,9 @@ module FeedMe
|
|
80
80
|
end
|
81
81
|
element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
|
82
82
|
end unless remaining_tags.empty?
|
83
|
-
|
83
|
+
|
84
84
|
doc.traverse_text {|t| t.set(add_entities(t.to_html))}
|
85
|
-
|
85
|
+
|
86
86
|
# Return the tree, without comments. Ugly way of removing comments,
|
87
87
|
# but can't see a way to do this in Hpricot yet.
|
88
88
|
doc.to_s.gsub(/<\!--.*?-->/mi, '')
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedme
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Didion
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-02 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|