feedme 0.8.0 → 0.8.1

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -153,6 +153,10 @@ module FeedMe
153
153
  match = Regexp.new(regexp).match(str)
154
154
  match.nil? ? nil : match[1]
155
155
  end,
156
+
157
+ # this shouldn't be necessary since all text is automatically
158
+ # unescaped, but some feeds double-escape HTML
159
+ :esc => proc {|str| CGI.unescapeHTML(str) }
156
160
  }
157
161
  end
158
162
 
@@ -528,7 +532,7 @@ module FeedMe
528
532
  end
529
533
 
530
534
  class Parser < FeedData
531
- attr_reader :fm_source, :fm_options, :fm_type, :fm_tags, :fm_unparsed
535
+ attr_reader :fm_source, :fm_options, :fm_type, :fm_tags, :fm_parsed, :fm_unparsed
532
536
 
533
537
  def initialize(builder, source, options={})
534
538
  super(nil, nil, builder)
@@ -672,13 +676,13 @@ module FeedMe
672
676
  end
673
677
 
674
678
  def clean_content(tag, attrs, content, parent)
675
- content = content.to_s
676
- if fm_builder.date_tags.include? tag
679
+ content = content.to_s
680
+ if fm_builder.date_tags.include? tag
677
681
  content = Time.parse(content) rescue unescape(content)
678
682
  else
679
683
  content = unescape(content)
680
684
  end
681
-
685
+
682
686
  unless attrs.empty?
683
687
  hash = FeedData.new(tag, parent, fm_builder)
684
688
  attrs.each_pair {|key, value| hash[key] = unescape(value) }
@@ -706,7 +710,7 @@ module FeedMe
706
710
 
707
711
  def unescape(content)
708
712
  content = CGI.unescapeHTML(content)
709
-
713
+
710
714
  query = content.match(/^(http:.*\?)(.*)$/)
711
715
  content = query[1] + CGI.unescape(query[2]) if query
712
716
 
@@ -30,7 +30,7 @@ module FeedMe
30
30
  # sanitize HTML
31
31
  # todo: dup code to fix bugs
32
32
  def clean_html(html)
33
- FeedMe::HtmlCleaner.clean(html)
33
+ FeedMe::HtmlCleaner.clean(html)
34
34
  end
35
35
  end
36
36
 
@@ -59,17 +59,17 @@ module FeedMe
59
59
  # Extra (i.e. unmatched) ending tags and comments are removed.
60
60
  def clean(str)
61
61
  str = unescapeHTML(str)
62
-
63
62
  doc = Hpricot(str, :fixup_tags => true)
64
63
  doc = subtree(doc, :body)
65
-
64
+
66
65
  # get all the tags in the document
67
66
  # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
68
67
  # including text nodes instead of just tagged elements.
69
68
  tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq
70
69
 
71
70
  # Remove tags that aren't whitelisted.
72
- remove_tags!(doc, tags - HTML_ELEMENTS)
71
+ diff = tags - HTML_ELEMENTS
72
+ remove_tags!(doc, diff)
73
73
  remaining_tags = tags & HTML_ELEMENTS
74
74
 
75
75
  # Remove attributes that aren't on the whitelist, or are suspicious URLs.
@@ -80,9 +80,9 @@ module FeedMe
80
80
  end
81
81
  element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
82
82
  end unless remaining_tags.empty?
83
-
83
+
84
84
  doc.traverse_text {|t| t.set(add_entities(t.to_html))}
85
-
85
+
86
86
  # Return the tree, without comments. Ugly way of removing comments,
87
87
  # but can't see a way to do this in Hpricot yet.
88
88
  doc.to_s.gsub(/<\!--.*?-->/mi, '')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedme
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Didion
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-28 00:00:00 -05:00
12
+ date: 2010-01-02 00:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies: []
15
15