feedme 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/feedme.rb +9 -5
- data/lib/hpricot-util.rb +1 -1
- data/lib/html-cleaner.rb +5 -5
- metadata +2 -2
data/lib/feedme.rb
CHANGED
@@ -153,6 +153,10 @@ module FeedMe
|
|
153
153
|
match = Regexp.new(regexp).match(str)
|
154
154
|
match.nil? ? nil : match[1]
|
155
155
|
end,
|
156
|
+
|
157
|
+
# this shouldn't be necessary since all text is automatically
|
158
|
+
# unescaped, but some feeds double-escape HTML
|
159
|
+
:esc => proc {|str| CGI.unescapeHTML(str) }
|
156
160
|
}
|
157
161
|
end
|
158
162
|
|
@@ -528,7 +532,7 @@ module FeedMe
|
|
528
532
|
end
|
529
533
|
|
530
534
|
class Parser < FeedData
|
531
|
-
attr_reader :fm_source, :fm_options, :fm_type, :fm_tags, :fm_unparsed
|
535
|
+
attr_reader :fm_source, :fm_options, :fm_type, :fm_tags, :fm_parsed, :fm_unparsed
|
532
536
|
|
533
537
|
def initialize(builder, source, options={})
|
534
538
|
super(nil, nil, builder)
|
@@ -672,13 +676,13 @@ module FeedMe
|
|
672
676
|
end
|
673
677
|
|
674
678
|
def clean_content(tag, attrs, content, parent)
|
675
|
-
|
676
|
-
|
679
|
+
content = content.to_s
|
680
|
+
if fm_builder.date_tags.include? tag
|
677
681
|
content = Time.parse(content) rescue unescape(content)
|
678
682
|
else
|
679
683
|
content = unescape(content)
|
680
684
|
end
|
681
|
-
|
685
|
+
|
682
686
|
unless attrs.empty?
|
683
687
|
hash = FeedData.new(tag, parent, fm_builder)
|
684
688
|
attrs.each_pair {|key, value| hash[key] = unescape(value) }
|
@@ -706,7 +710,7 @@ module FeedMe
|
|
706
710
|
|
707
711
|
def unescape(content)
|
708
712
|
content = CGI.unescapeHTML(content)
|
709
|
-
|
713
|
+
|
710
714
|
query = content.match(/^(http:.*\?)(.*)$/)
|
711
715
|
content = query[1] + CGI.unescape(query[2]) if query
|
712
716
|
|
data/lib/hpricot-util.rb
CHANGED
data/lib/html-cleaner.rb
CHANGED
@@ -59,17 +59,17 @@ module FeedMe
|
|
59
59
|
# Extra (i.e. unmatched) ending tags and comments are removed.
|
60
60
|
def clean(str)
|
61
61
|
str = unescapeHTML(str)
|
62
|
-
|
63
62
|
doc = Hpricot(str, :fixup_tags => true)
|
64
63
|
doc = subtree(doc, :body)
|
65
|
-
|
64
|
+
|
66
65
|
# get all the tags in the document
|
67
66
|
# Somewhere near hpricot 0.4.92 "*" starting to return all elements,
|
68
67
|
# including text nodes instead of just tagged elements.
|
69
68
|
tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
|
70
69
|
|
71
70
|
# Remove tags that aren't whitelisted.
|
72
|
-
|
71
|
+
diff = tags - HTML_ELEMENTS
|
72
|
+
remove_tags!(doc, diff)
|
73
73
|
remaining_tags = tags & HTML_ELEMENTS
|
74
74
|
|
75
75
|
# Remove attributes that aren't on the whitelist, or are suspicious URLs.
|
@@ -80,9 +80,9 @@ module FeedMe
|
|
80
80
|
end
|
81
81
|
element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
|
82
82
|
end unless remaining_tags.empty?
|
83
|
-
|
83
|
+
|
84
84
|
doc.traverse_text {|t| t.set(add_entities(t.to_html))}
|
85
|
-
|
85
|
+
|
86
86
|
# Return the tree, without comments. Ugly way of removing comments,
|
87
87
|
# but can't see a way to do this in Hpricot yet.
|
88
88
|
doc.to_s.gsub(/<\!--.*?-->/mi, '')
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedme
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Didion
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-02 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|