feedme 0.8.0 → 0.8.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -153,6 +153,10 @@ module FeedMe
153
153
  match = Regexp.new(regexp).match(str)
154
154
  match.nil? ? nil : match[1]
155
155
  end,
156
+
157
+ # this shouldn't be necessary since all text is automatically
158
+ # unescaped, but some feeds double-escape HTML
159
+ :esc => proc {|str| CGI.unescapeHTML(str) }
156
160
  }
157
161
  end
158
162
 
@@ -528,7 +532,7 @@ module FeedMe
528
532
  end
529
533
 
530
534
  class Parser < FeedData
531
- attr_reader :fm_source, :fm_options, :fm_type, :fm_tags, :fm_unparsed
535
+ attr_reader :fm_source, :fm_options, :fm_type, :fm_tags, :fm_parsed, :fm_unparsed
532
536
 
533
537
  def initialize(builder, source, options={})
534
538
  super(nil, nil, builder)
@@ -672,13 +676,13 @@ module FeedMe
672
676
  end
673
677
 
674
678
  def clean_content(tag, attrs, content, parent)
675
- content = content.to_s
676
- if fm_builder.date_tags.include? tag
679
+ content = content.to_s
680
+ if fm_builder.date_tags.include? tag
677
681
  content = Time.parse(content) rescue unescape(content)
678
682
  else
679
683
  content = unescape(content)
680
684
  end
681
-
685
+
682
686
  unless attrs.empty?
683
687
  hash = FeedData.new(tag, parent, fm_builder)
684
688
  attrs.each_pair {|key, value| hash[key] = unescape(value) }
@@ -706,7 +710,7 @@ module FeedMe
706
710
 
707
711
  def unescape(content)
708
712
  content = CGI.unescapeHTML(content)
709
-
713
+
710
714
  query = content.match(/^(http:.*\?)(.*)$/)
711
715
  content = query[1] + CGI.unescape(query[2]) if query
712
716
 
@@ -30,7 +30,7 @@ module FeedMe
30
30
  # sanitize HTML
31
31
  # todo: dup code to fix bugs
32
32
  def clean_html(html)
33
- FeedMe::HtmlCleaner.clean(html)
33
+ FeedMe::HtmlCleaner.clean(html)
34
34
  end
35
35
  end
36
36
 
@@ -59,17 +59,17 @@ module FeedMe
59
59
  # Extra (i.e. unmatched) ending tags and comments are removed.
60
60
  def clean(str)
61
61
  str = unescapeHTML(str)
62
-
63
62
  doc = Hpricot(str, :fixup_tags => true)
64
63
  doc = subtree(doc, :body)
65
-
64
+
66
65
  # get all the tags in the document
67
66
  # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
68
67
  # including text nodes instead of just tagged elements.
69
68
  tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
70
69
 
71
70
  # Remove tags that aren't whitelisted.
72
- remove_tags!(doc, tags - HTML_ELEMENTS)
71
+ diff = tags - HTML_ELEMENTS
72
+ remove_tags!(doc, diff)
73
73
  remaining_tags = tags & HTML_ELEMENTS
74
74
 
75
75
  # Remove attributes that aren't on the whitelist, or are suspicious URLs.
@@ -80,9 +80,9 @@ module FeedMe
80
80
  end
81
81
  element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
82
82
  end unless remaining_tags.empty?
83
-
83
+
84
84
  doc.traverse_text {|t| t.set(add_entities(t.to_html))}
85
-
85
+
86
86
  # Return the tree, without comments. Ugly way of removing comments,
87
87
  # but can't see a way to do this in Hpricot yet.
88
88
  doc.to_s.gsub(/<\!--.*?-->/mi, '')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedme
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Didion
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-28 00:00:00 -05:00
12
+ date: 2010-01-02 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies: []
15
15