feedme 0.1 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,82 @@
1
+ # HTML utils that use hpricot
2
+ # Adapted from code by Henrik Nyh (http://henrik.nyh.se), Les Hill
3
+ # (http://blog.leshill.org)
4
+
5
+ require 'rubygems'
6
+ require 'html-cleaner'
7
+ require 'hpricot'
8
+ require 'active_support'
9
+
10
module FeedMe
  # HTML helper methods implemented on top of Hpricot.
  class HpricotUtil
    # Like the Rails _truncate_ helper but doesn't break HTML tags or entities.
    #
    # html            - markup to truncate (converted with to_s); nil returns nil.
    # words           - word limit.
    # truncate_string - marker handed to the node-level truncate.
    #
    # Returns the truncated markup, or html.to_s when the text holds fewer
    # than +words+ words.
    def truncate_html(html, words=15, truncate_string= "...")
      return if html.nil?

      doc = Hpricot(html.to_s)
      if doc.inner_text.mb_chars.split.size >= words
        doc.truncate(words, truncate_string).inner_html
      else
        html.to_s
      end
    end

    # strip all tags from HTML
    def strip_html(html)
      (Hpricot.parse(html)/:"text()").to_s
    end

    # strip tags from HTML and truncate to a certain number of words
    #
    # Keeps at most +words+ words (negative limits are treated as zero) and
    # appends +truncate_string+ only when words were actually dropped.
    def strip_truncate_html(input, words=15, truncate_string='...')
      all_words = strip_html(input).split
      kept = all_words.first([words, 0].max).join(' ')
      all_words.size > words ? kept + truncate_string : kept
    end

    # sanitize HTML
    # todo: dup code to fix bugs
    def clean_html(html)
      FeedMe::HtmlCleaner.clean(html)
    end
  end

  # Single shared helper instance handed out by FeedMe.html_helper.
  @@instance = HpricotUtil.new

  # Returns the shared HTML helper instance.
  def FeedMe.html_helper
    @@instance
  end
end
43
+
44
# Word-based truncation mixins for Hpricot node classes.
module HpricotTruncator
  # Truncation for nodes that contain other nodes.
  module NodeWithChildren
    # Returns self when the node's text holds at most +words+ words.
    # Otherwise returns a duplicate whose children are rebuilt one by one
    # until the word budget is used up. Each kept child is truncated against
    # the remaining budget.
    def truncate(words, truncate_string)
      return self if inner_text.mb_chars.split.size <= words

      truncated_node = dup
      truncated_node.name = name
      truncated_node.raw_attributes = raw_attributes
      truncated_node.children = []

      remaining = words
      each_child do |node|
        break if remaining <= 0
        # Measure before truncating: truncating a child may change its text.
        node_length = node.inner_text.mb_chars.split.size
        truncated_node.children << node.truncate(remaining, truncate_string)
        remaining -= node_length
      end
      truncated_node
    end
  end

  # Truncation for text nodes.
  module TextNode
    # Cuts the content down to +num_words+ words in place and appends
    # +truncate_string+ when words were dropped. Limits below zero are
    # treated as zero. Returns self.
    def truncate(num_words, truncate_string)
      words = content.split
      limit = [num_words, 0].max
      shortened =
        if words.size <= num_words
          content
        else
          # first(limit) keeps nothing for a zero limit, unlike [0..limit-1]
          words.first(limit).join(' ') + truncate_string
        end
      self.content = shortened.to_s
      self
    end
  end

  # Nodes that are passed through untouched by truncation.
  module IgnoredTag
    # Returns self unchanged; both arguments are ignored.
    def truncate(max_length, ellipsis)
      self
    end
  end
end
77
+
78
# Mix the truncation behaviour into the Hpricot node classes: container nodes
# get NodeWithChildren, text nodes get TextNode, and bogus end tags and
# comments get the pass-through IgnoredTag.
[
  [Hpricot::Doc,       HpricotTruncator::NodeWithChildren],
  [Hpricot::Elem,      HpricotTruncator::NodeWithChildren],
  [Hpricot::Text,      HpricotTruncator::TextNode],
  [Hpricot::BogusETag, HpricotTruncator::IgnoredTag],
  [Hpricot::Comment,   HpricotTruncator::IgnoredTag]
].each do |node_class, truncator|
  node_class.send(:include, truncator)
end
@@ -0,0 +1,188 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'cgi'
4
+
5
module FeedMe

  # Various methods for cleaning up HTML and preparing it for safe public
  # consumption.
  #
  # Documents used for reference:
  # - http://www.w3.org/TR/html4/index/attributes.html
  # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
  # - http://feedparser.org/docs/html-sanitization.html
  # - http://code.whytheluckystiff.net/hpricot/wiki
  class HtmlCleaner

    # allowed html elements.
    HTML_ELEMENTS = %w(
      a abbr acronym address area b bdo big blockquote br button caption center
      cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
      h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
      samp small span strike strong sub sup table tbody td tfoot th thead tr tt
      u ul var
    )

    # allowed attributes.
    HTML_ATTRS = %w(
      abbr accept accept-charset accesskey align alt axis border cellpadding
      cellspacing char charoff charset checked cite class clear cols colspan
      color compact coords datetime dir disabled for frame headers height href
      hreflang hspace id ismap label lang longdesc maxlength media method
      multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
      scope selected shape size span src start summary tabindex target title
      type usemap valign value vspace width
    )

    # allowed attributes, but they can contain URIs, extra caution required.
    # NOTE: That means this doesn't list *all* URI attrs, just the ones that are allowed.
    HTML_URI_ATTRS = %w(
      href src cite usemap longdesc
    )

    # URI schemes that are rejected outright by dodgy_uri?.
    DODGY_URI_SCHEMES = %w(
      javascript vbscript mocha livescript data
    )

    class << self

      # Does this:
      # - Unescape HTML
      # - Parse HTML into tree
      # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
      # - Each tag:
      #   - remove tag if not whitelisted
      #   - escape HTML tag contents
      #   - remove all attributes not on whitelist
      #   - extra-scrub URI attrs; see dodgy_uri?
      #
      # Extra (i.e. unmatched) ending tags and comments are removed.
      def clean(str)
        str = unescapeHTML(str)

        doc = Hpricot(str, :fixup_tags => true)
        doc = subtree(doc, :body)

        # get all the tags in the document
        # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
        # including text nodes instead of just tagged elements.
        # NOTE(review): names that do not match /^\w+$/ (e.g. containing ':'
        # or '-') never enter +tags+, so they are neither removed nor
        # attribute-scrubbed below -- confirm this is intended.
        tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq

        # Remove tags that aren't whitelisted.
        remove_tags!(doc, tags - HTML_ELEMENTS)
        remaining_tags = tags & HTML_ELEMENTS

        # Remove attributes that aren't on the whitelist, or are suspicious URLs.
        (doc/remaining_tags.join(",")).each do |element|
          next if element.raw_attributes.nil?
          element.raw_attributes.reject! do |attr,val|
            !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
          end
          element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
        end unless remaining_tags.empty?

        doc.traverse_text {|t| t.set(add_entities(t.to_html))}

        # Return the tree, without comments. Ugly way of removing comments,
        # but can't see a way to do this in Hpricot yet.
        doc.to_s.gsub(/<\!--.*?-->/mi, '')
      end

      # For all other feed elements:
      # - Unescape HTML.
      # - Parse HTML into tree (taking 'body' as root, if present)
      # - Takes text out of each tag, and escapes HTML.
      # - Returns all text concatenated.
      #
      # The argument is left untouched; newlines are replaced on a copy, so
      # frozen strings are accepted.
      def flatten(str)
        str = unescapeHTML(str.gsub("\n", " "))

        doc = Hpricot(str, :xhtml_strict => true)
        doc = subtree(doc, :body)

        out = []
        doc.traverse_text {|t| out << add_entities(t.to_html)}

        out.join
      end

      # Returns true if the given string contains a suspicious URL,
      # i.e. a javascript link. Returns nil otherwise.
      #
      # This method rejects javascript, vbscript, livescript, mocha and data URLs.
      # It *could* be refined to only deny dangerous data URLs, however.
      def dodgy_uri?(uri)
        uri = uri.to_s

        # special case for poorly-formed entities (missing ';')
        # if these occur *anywhere* within the string, then throw it out.
        return true if uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi

        # The patterns do not depend on the candidate, so build them once.
        patterns = DODGY_URI_SCHEMES.map { |scheme| scheme_pattern(scheme) }

        # Try unescaping as both HTML and URI encodings, and then try
        # each scheme regexp on each.
        candidates = [unescapeHTML(uri), CGI.unescape(uri)]
        return true if candidates.any? { |candidate| patterns.any? { |pattern| candidate =~ pattern } }

        nil
      end

      # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
      def unescapeHTML(str, xml = true)
        CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
      end

      # Adds entities where possible.
      # Works like CGI.escapeHTML, but will not escape existing entities;
      # i.e. &#123; will NOT become &amp;#123;
      #
      # This method could be improved by adding a whitelist of html entities.
      def add_entities(str)
        str.to_s.
          gsub('"', '&quot;').
          gsub('>', '&gt;').
          gsub('<', '&lt;').
          gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/mi, '&amp;')
      end

      private

      # Everything below element, or just return the doc if element not present.
      def subtree(doc, element)
        doc.at("//#{element}/*") || doc
      end

      # Removes every element matching one of the given tag names from doc.
      def remove_tags!(doc, tags)
        (doc/tags.join(",")).remove unless tags.empty?
      end

      # Builds the case-insensitive regexp for "<scheme>:" anchored at the
      # start of the string, allowing any run of control characters, spaces
      # or DEL before each character.
      # e.g. /\A[\x00-\x20\x7f]*j[\x00-\x20\x7f]*a ... [\x00-\x20\x7f]*:/mi
      def scheme_pattern(scheme)
        body = "#{scheme}:".each_char.map { |char| '[\x00-\x20\x7f]*' + Regexp.escape(char) }.join
        Regexp.new('\A' + body, Regexp::MULTILINE | Regexp::IGNORECASE)
      end

    end
  end
end
167
+
168
+
169
module Enumerable #:nodoc:
  # Builds a Hash from the receiver: the block is called with each element
  # and must return a [key, value] pair, which is stored in the result.
  # Later pairs overwrite earlier ones that share a key.
  def build_hash
    {}.tap do |table|
      each do |item|
        k, v = yield(item)
        table.store(k, v)
      end
    end
  end
end
179
+
180
+ # http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/207625
181
+ # Subject: A simple Hpricot text setter
182
+ # From: Chris Gehlker <canyonrat mac.com>
183
+ # Date: Fri, 11 Aug 2006 03:19:13 +0900
184
class Hpricot::Text #:nodoc:
  # Overwrites this text node's @content with +string+ and returns +string+.
  def set(string)
    instance_variable_set(:@content, string)
  end
end
@@ -0,0 +1,117 @@
1
+ # HTML utils that use nokogiri
2
+ # Adapted from code by Eleo (http://gist.github.com/101410)
3
+
4
+ require 'rubygems'
5
+ require 'nokogiri'
6
+ require 'sanitize'
7
+
8
module FeedMe
  # HTML helper methods implemented on top of Nokogiri and Sanitize.
  class NokogiriUtil
    # Truncate HTML while preserving tags
    #
    # text            - markup to truncate; nil returns nil.
    # num_words       - word limit.
    # truncate_string - marker appended to the text node where the cut happens.
    def truncate_html(text, num_words=15, truncate_string="...")
      return if text.nil?

      doc = Nokogiri::HTML(text)
      current = doc.children.first
      count = 0
      previous = nil

      while true
        # we found a text node
        if current.is_a?(Nokogiri::XML::Text)
          count += current.text.split.length
          # we reached our limit, let's get outta here!
          break if count > num_words
          previous = current
        end

        if current.children.length > 0
          # this node has children, can't be a text node,
          # lets descend and look for text nodes
          current = current.children.first
        elsif !current.next.nil?
          # this has no children, but has a sibling, let's check it out
          current = current.next
        else
          # we are the last child, we need to ascend until we are
          # either done or find a sibling to continue on to
          n = current
          while !n.is_a?(Nokogiri::HTML::Document) and n.parent.next.nil?
            n = n.parent
          end

          # we've reached the top and found no more text nodes, break
          if n.is_a?(Nokogiri::HTML::Document)
            break
          else
            current = n.parent.next
          end
        end
      end

      if count >= num_words
        unless count == num_words
          new_content = current.text.split

          # If we're here, the last text node we counted eclipsed the number of words
          # that we want, so we need to cut down on words. The easiest way to think about
          # this is that without this node we'd have fewer words than the limit, so all
          # the previous words plus a limited number of words from this node are needed.
          # We simply need to figure out how many words are needed and grab that many.
          # Then we need to -subtract- an index, because the first word would be index zero.

          # For example, given:
          # <p>Testing this HTML truncater.</p><p>To see if its working.</p>
          # Let's say I want 6 words. The correct returned string would be:
          # <p>Testing this HTML truncater.</p><p>To see...</p>
          # All the words in both paragraphs = 9
          # The last paragraph is the one that breaks the limit. How many words would we
          # have without it? 4. But we want up to 6, so we might as well get that many.
          # 6 - 4 = 2, so we get 2 words from this node, but words #1-2 are indices #0-1, so
          # we subtract 1. If this gives us -1, we want nothing from this node. So go back to
          # the previous node instead.
          index = num_words-(count-new_content.length)-1
          if index >= 0
            new_content = new_content[0..index]
            current.content = new_content.join(' ') + truncate_string
          else
            current = previous
            current.content = current.content + truncate_string
          end
        end

        # remove everything else
        while !current.is_a?(Nokogiri::HTML::Document)
          while !current.next.nil?
            current.next.remove
          end
          current = current.parent
        end
      end

      # now we grab the html and not the text.
      # we do first because nokogiri adds html and body tags
      # which we don't want
      doc.root.children.first.inner_html
    end

    # strip all tags from HTML
    def strip_html(html)
      Nokogiri::HTML(html).inner_text
    end

    # strip tags from HTML and truncate to a certain number of words
    #
    # Keeps at most +words+ words (negative limits are treated as zero) and
    # appends +truncate_string+ only when words were actually dropped.
    def strip_truncate_html(html, words=15, truncate_string='...')
      all_words = strip_html(html).split
      kept = all_words.first([words, 0].max).join(' ')
      all_words.size > words ? kept + truncate_string : kept
    end

    # sanitize HTML
    # todo: dup code to fix bugs
    def clean_html(html)
      Sanitize.clean(html)
    end
  end

  # Single shared helper instance handed out by FeedMe.html_helper.
  @@instance = NokogiriUtil.new

  # Returns the shared HTML helper instance.
  def FeedMe.html_helper
    @@instance
  end
end
data/lib/util.rb ADDED
@@ -0,0 +1,45 @@
1
module FeedMe
  # Pretty-print an object, with special formatting for hashes
  # and arrays.
  #
  # obj         - value to render. FeedData and Hash objects render as
  #               "{ key => value, ... }", Arrays as "[ ... ]", Symbols via
  #               inspect, and anything else as its stripped to_s, inspected.
  # indent_step - number of spaces added per nesting level.
  # indent      - indentation of the current level.
  # code        - optional callable invoked with each (key, value) pair of a
  #               hash-like object; it must return the [key, value] to print.
  #               Without it the pair is printed as-is.
  #
  # Returns the formatted String.
  def FeedMe.pretty_to_s(obj, indent_step=2, indent=0, code=nil)
    new_indent = indent + indent_step
    space = ' ' * indent
    new_space = ' ' * new_indent
    str = ''.dup
    if obj.is_a?(FeedData) || obj.is_a?(Hash)
      str << "#{obj.fm_tag_name} " if obj.is_a?(FeedData)
      str << "{"
      obj.each_with_index do |item, index|
        # Fall back to the raw pair when no transformer was supplied;
        # otherwise key and value would both stay nil.
        key, value = code ? code.call(*item) : item
        str << "\n#{new_space}"
        str << FeedMe.pretty_to_s(key, indent_step, new_indent, code)
        str << " => "
        str << FeedMe.pretty_to_s(value, indent_step, new_indent, code)
        str << ',' unless index == obj.size-1
      end
      str << "\n#{space}}"
    elsif obj.is_a?(Array)
      str << "["
      obj.each_with_index do |value, index|
        str << "\n#{new_space}"
        str << FeedMe.pretty_to_s(value, indent_step, new_indent, code)
        str << ',' unless index == obj.size-1
      end
      str << "\n#{space}]"
    elsif obj.is_a? Symbol
      str << obj.inspect
    else
      str << obj.to_s.strip.inspect
    end
    str
  end
end
37
+
38
class String
  # Returns the first +wordcount+ whitespace-separated words joined by single
  # spaces, followed by +tail+ when the string held more words than that.
  # A +wordcount+ below zero is treated as zero.
  def trunc(wordcount, tail='...')
    words = split
    # first(n) keeps nothing for n == 0, unlike words[0..(n - 1)]
    truncated = words.first([wordcount, 0].max).join(' ')
    truncated += tail if words.size > wordcount
    truncated
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedme
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.1"
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Didion
@@ -9,22 +9,13 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-09-03 00:00:00 -04:00
12
+ date: 2009-12-28 00:00:00 -05:00
13
13
  default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: hoe
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 2.3.3
24
- version:
14
+ dependencies: []
15
+
25
16
  description: A simple, flexible, and extensible RSS and Atom parser for Ruby. Based on the popular SimpleRSS library, but with many nice extra features.
26
17
  email:
27
- - jdidion@rubyforge.org
18
+ - code@didion.net
28
19
  executables: []
29
20
 
30
21
  extensions: []
@@ -38,11 +29,16 @@ files:
38
29
  - Manifest.txt
39
30
  - README.txt
40
31
  - Rakefile
41
- - lib/feedme.rb
42
32
  - examples/rocketboom.rb
43
33
  - examples/rocketboom.rss
34
+ - lib/feedme.rb
35
+ - lib/hpricot-util.rb
36
+ - lib/html-cleaner.rb
37
+ - lib/nokogiri-util.rb
38
+ - lib/util.rb
39
+ - test/test_helper.rb
44
40
  has_rdoc: true
45
- homepage: http://feedme.rubyforge.org
41
+ homepage: http://wiki.github.com/jdidion/feedme
46
42
  licenses: []
47
43
 
48
44
  post_install_message: