feedme 0.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
1
+ # HTML utils that use hpricot
2
+ # Adapted from code by Henrik Nyh (http://henrik.nyh.se), Les Hill
3
+ # (http://blog.leshill.org)
4
+
5
+ require 'rubygems'
6
+ require 'html-cleaner'
7
+ require 'hpricot'
8
+ require 'active_support'
9
+
10
module FeedMe
  # HTML helper backed by Hpricot: truncation, tag stripping and sanitizing.
  class HpricotUtil
    # Like the Rails _truncate_ helper but doesn't break HTML tags or entities.
    #
    # html::            markup to shorten; converted with +to_s+
    # words::           word budget
    # truncate_string:: marker handed to the node-level +truncate+ methods
    #
    # Returns nil when +html+ is nil. Otherwise returns the truncated inner
    # HTML when the text holds at least +words+ words, or +html.to_s+ as-is.
    def truncate_html(html, words=15, truncate_string= "...")
      return if html.nil?
      doc = Hpricot(html.to_s)
      if doc.inner_text.mb_chars.split.size >= words
        doc.truncate(words, truncate_string).inner_html
      else
        html.to_s
      end
    end

    # strip all tags from HTML
    def strip_html(html)
      (Hpricot.parse(html)/:"text()").to_s
    end

    # strip tags from HTML and truncate to a certain number of words
    #
    # Keeps at most +words+ whitespace-separated words of the stripped text,
    # joined by single spaces. +truncate_string+ is appended only when words
    # were actually dropped, matching String#trunc in util.rb.
    def strip_truncate_html(input, words=15, truncate_string='...')
      tokens = strip_html(input).split
      # A range such as 0..words would keep words+1 entries (and everything
      # for words == 0), so take an explicit, non-negative count instead.
      limit = [words, 0].max
      kept = tokens.first(limit).join(' ')
      tokens.size > limit ? kept + truncate_string : kept
    end

    # sanitize HTML
    # todo: dup code to fix bugs
    def clean_html(html)
      FeedMe::HtmlCleaner.clean(html)
    end
  end

  # Single shared helper instance handed out by FeedMe.html_helper.
  @@instance = HpricotUtil.new

  # Returns the shared HpricotUtil instance.
  def FeedMe.html_helper
    @@instance
  end
end
43
+
44
# Word-based +truncate+ implementations, one mixin per kind of node.
module HpricotTruncator
  # For nodes that own children.
  module NodeWithChildren
    # Returns self when the node's text has at most +words+ words. Otherwise
    # returns a dup whose children list is rebuilt from truncated originals,
    # stopping once the word budget is used up.
    def truncate(words, truncate_string)
      return self unless inner_text.mb_chars.split.size > words

      copy = dup
      copy.name = name
      copy.raw_attributes = raw_attributes
      copy.children = []

      budget = words
      each_child do |child|
        break unless budget > 0
        # Measure before truncating: truncating a text child rewrites it.
        used = child.inner_text.mb_chars.split.size
        copy.children << child.truncate(budget, truncate_string)
        budget -= used
      end

      copy
    end
  end

  # For text nodes.
  module TextNode
    # Rewrites +content+ in place and returns self. When the text has more
    # than +num_words+ words, only the leading words are kept (joined by
    # single spaces) and +truncate_string+ is appended.
    def truncate(num_words, truncate_string)
      tokens = content.split
      shortened =
        if tokens.size <= num_words
          content
        else
          tokens[0..num_words-1].join(' ') + truncate_string
        end
      self.content = shortened.to_s
      self
    end
  end

  # For nodes that are passed through untouched.
  module IgnoredTag
    # Both arguments are ignored; returns self.
    def truncate(max_length, ellipsis)
      self
    end
  end
end
77
+
78
# Install the matching truncate mixin on each Hpricot node class.
[
  [Hpricot::Doc,       HpricotTruncator::NodeWithChildren],
  [Hpricot::Elem,      HpricotTruncator::NodeWithChildren],
  [Hpricot::Text,      HpricotTruncator::TextNode],
  [Hpricot::BogusETag, HpricotTruncator::IgnoredTag],
  [Hpricot::Comment,   HpricotTruncator::IgnoredTag]
].each do |node_class, mixin|
  node_class.send(:include, mixin)
end
@@ -0,0 +1,188 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+ require 'cgi'
4
+
5
module FeedMe

  # Various methods for cleaning up HTML and preparing it for safe public
  # consumption.
  #
  # Documents used for reference:
  # - http://www.w3.org/TR/html4/index/attributes.html
  # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
  # - http://feedparser.org/docs/html-sanitization.html
  # - http://code.whytheluckystiff.net/hpricot/wiki
  class HtmlCleaner

    # allowed html elements.
    HTML_ELEMENTS = %w(
      a abbr acronym address area b bdo big blockquote br button caption center
      cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
      h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
      samp small span strike strong sub sup table tbody td tfoot th thead tr tt
      u ul var
    ).freeze

    # allowed attributes.
    HTML_ATTRS = %w(
      abbr accept accept-charset accesskey align alt axis border cellpadding
      cellspacing char charoff charset checked cite class clear cols colspan
      color compact coords datetime dir disabled for frame headers height href
      hreflang hspace id ismap label lang longdesc maxlength media method
      multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
      scope selected shape size span src start summary tabindex target title
      type usemap valign value vspace width
    ).freeze

    # allowed attributes, but they can contain URIs, extra caution required.
    # NOTE: That means this doesn't list *all* URI attrs, just the ones that are allowed.
    HTML_URI_ATTRS = %w(
      href src cite usemap longdesc
    ).freeze

    # URI schemes that are never allowed in a URI attribute.
    DODGY_URI_SCHEMES = %w(
      javascript vbscript mocha livescript data
    ).freeze

    # One anchored, case-insensitive regexp per dodgy scheme, built once.
    # Each character of "scheme:" may be preceded by any run of control
    # characters (0x00-0x1f, 0x7f) or whitespace, so "jav\tascript:" still
    # matches. For "javascript" it looks something like
    #   /\A[\x00-\x1f\x7f\s]*j[\x00-\x1f\x7f\s]*a ... [\x00-\x1f\x7f\s]*:/mi
    DODGY_URI_PATTERNS = DODGY_URI_SCHEMES.map do |scheme|
      body = "#{scheme}:".each_char.map do |char|
        '[\x00-\x1f\x7f\s]*' + Regexp.escape(char)
      end.join
      Regexp.new('\A' + body, Regexp::IGNORECASE | Regexp::MULTILINE)
    end.freeze

    class << self

      # Does this:
      # - Unescape HTML
      # - Parse HTML into tree
      # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
      # - Each tag:
      #   - remove tag if not whitelisted
      #   - escape HTML tag contents
      #   - remove all attributes not on whitelist
      #   - extra-scrub URI attrs; see dodgy_uri?
      #
      # Extra (i.e. unmatched) ending tags and comments are removed.
      #
      # Returns the cleaned markup as a String.
      def clean(str)
        str = unescapeHTML(str)

        doc = Hpricot(str, :fixup_tags => true)
        doc = subtree(doc, :body)

        # get all the tags in the document
        # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
        # including text nodes instead of just tagged elements.
        named = (doc/"*").select { |e| e.respond_to?(:name) && e.name =~ /^\w+$/ }
        tags = named.map { |e| e.name }.uniq

        # Remove tags that aren't whitelisted.
        remove_tags!(doc, tags - HTML_ELEMENTS)
        remaining_tags = tags & HTML_ELEMENTS

        # Remove attributes that aren't on the whitelist, or are suspicious URLs.
        (doc/remaining_tags.join(",")).each do |element|
          next if element.raw_attributes.nil?
          element.raw_attributes.reject! do |attr,val|
            !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
          end
          # build_hash is the Enumerable extension defined at the end of this file.
          element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
        end unless remaining_tags.empty?

        doc.traverse_text {|t| t.set(add_entities(t.to_html))}

        # Return the tree, without comments. Ugly way of removing comments,
        # but can't see a way to do this in Hpricot yet.
        doc.to_s.gsub(/<\!--.*?-->/mi, '')
      end

      # For all other feed elements:
      # - Unescape HTML.
      # - Parse HTML into tree (taking 'body' as root, if present)
      # - Takes text out of each tag, and escapes HTML.
      # - Returns all text concatenated.
      #
      # The argument is not modified: newlines are replaced on a copy, so
      # frozen strings are accepted.
      def flatten(str)
        str = unescapeHTML(str.gsub("\n", " "))

        doc = Hpricot(str, :xhtml_strict => true)
        doc = subtree(doc, :body)

        out = []
        doc.traverse_text {|t| out << add_entities(t.to_html)}

        out.join
      end

      # Returns true if the given string contains a suspicious URL,
      # i.e. a javascript link. Returns nil otherwise.
      #
      # This method rejects javascript, vbscript, livescript, mocha and data URLs.
      # It *could* be refined to only deny dangerous data URLs, however.
      def dodgy_uri?(uri)
        uri = uri.to_s

        # special case for poorly-formed entities (missing ';')
        # if these occur *anywhere* within the string, then throw it out.
        return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)

        # Try unescaping as both HTML and URI encodings, and then try
        # each scheme regexp on each result.
        candidates = [unescapeHTML(uri), CGI.unescape(uri)]
        hit = candidates.any? do |unesc_uri|
          DODGY_URI_PATTERNS.any? { |pattern| unesc_uri =~ pattern }
        end
        return true if hit

        nil
      end

      # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
      def unescapeHTML(str, xml = true)
        CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
      end

      # Adds entities where possible.
      # Works like CGI.escapeHTML, but will not escape existing entities;
      # i.e. &#123; will NOT become &amp;#123;
      #
      # This method could be improved by adding a whitelist of html entities.
      def add_entities(str)
        # Plain regexps (no legacy /n flag): the patterns are ASCII-only, so they
        # match the same text without binary-regexp warnings on UTF-8 input.
        str.to_s.gsub(/\"/, '&quot;').gsub(/>/, '&gt;').gsub(/</, '&lt;').gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/mi, '&amp;')
      end

      private

      # Everything below element, or just return the doc if element not present.
      def subtree(doc, element)
        doc.at("//#{element}/*") || doc
      end

      # Removes every element matching one of +tags+ from +doc+ (no-op for an
      # empty list).
      def remove_tags!(doc, tags)
        (doc/tags.join(",")).remove unless tags.empty?
      end

    end
  end
end
167
+
168
+
169
module Enumerable #:nodoc:
  # Builds a Hash by yielding every element to the block and storing the
  # [key, value] pair the block returns. Later pairs overwrite earlier
  # ones that share a key.
  def build_hash
    inject({}) do |table, element|
      pair_key, pair_value = yield element
      table[pair_key] = pair_value
      table
    end
  end
end
179
+
180
+ # http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/207625
181
+ # Subject: A simple Hpricot text setter
182
+ # From: Chris Gehlker <canyonrat mac.com>
183
+ # Date: Fri, 11 Aug 2006 03:19:13 +0900
184
class Hpricot::Text #:nodoc:
  # Overwrites this text node's @content with +string+ and returns +string+.
  def set(string)
    instance_variable_set(:@content, string)
  end
end
@@ -0,0 +1,117 @@
1
+ # HTML utils that use nokogiri
2
+ # Adapted from code by Eleo (http://gist.github.com/101410)
3
+
4
+ require 'rubygems'
5
+ require 'nokogiri'
6
+ require 'sanitize'
7
+
8
module FeedMe
  # HTML helper backed by Nokogiri (truncation, tag stripping) and Sanitize.
  class NokogiriUtil
    # Truncate HTML while preserving tags
    #
    # text::            markup to shorten
    # num_words::       word budget
    # truncate_string:: marker appended to the last kept text node
    #
    # Returns nil when +text+ is nil (same as the Hpricot helper); otherwise
    # the inner HTML of the first child of the parsed document's root.
    def truncate_html(text, num_words=15, truncate_string="...")
      return if text.nil?

      doc = Nokogiri::HTML(text)
      current = doc.children.first
      count = 0
      # Last text node that still fit inside the budget. Declared before the
      # loop so the block below assigns this variable instead of a block-local.
      previous = nil

      loop do
        # we found a text node
        if current.is_a?(Nokogiri::XML::Text)
          count += current.text.split.length
          # we reached our limit, let's get outta here!
          break if count > num_words
          previous = current
        end

        if current.children.length > 0
          # this node has children, can't be a text node,
          # lets descend and look for text nodes
          current = current.children.first
        elsif !current.next.nil?
          # this has no children, but has a sibling, let's check it out
          current = current.next
        else
          # we are the last child, we need to ascend until we are
          # either done or find a sibling to continue on to
          n = current
          while !n.is_a?(Nokogiri::HTML::Document) && n.parent.next.nil?
            n = n.parent
          end

          # we've reached the top and found no more text nodes, break
          break if n.is_a?(Nokogiri::HTML::Document)
          current = n.parent.next
        end
      end

      if count >= num_words
        unless count == num_words
          new_content = current.text.split

          # If we're here, the last text node we counted eclipsed the number of words
          # that we want, so we need to cut down on words. Without this node we'd have
          # (count - new_content.length) words, so we still need
          # num_words - (count - new_content.length) words from it, and subtract 1
          # to turn that quantity into the index of the last word to keep.
          #
          # For example, given:
          #   <p>Testing this HTML truncater.</p><p>To see if its working.</p>
          # and a limit of 6 words, the correct returned string would be:
          #   <p>Testing this HTML truncater.</p><p>To see...</p>
          # The first paragraph holds 4 words, so 6 - 4 = 2 words are taken from
          # the second one, i.e. indices 0..1. An index of -1 means nothing is
          # wanted from this node, so fall back to the previous text node.
          index = num_words-(count-new_content.length)-1
          if index >= 0
            new_content = new_content[0..index]
            current.content = new_content.join(' ') + truncate_string
          elsif previous
            current = previous
            current.content = current.content + truncate_string
          else
            # No earlier text node exists to carry the marker (e.g. a zero
            # word budget): keep zero words of this node.
            current.content = truncate_string
          end
        end

        # remove everything else
        until current.is_a?(Nokogiri::HTML::Document)
          current.next.remove until current.next.nil?
          current = current.parent
        end
      end

      # now we grab the html and not the text.
      # we do first because nokogiri adds html and body tags
      # which we don't want
      doc.root.children.first.inner_html
    end

    # strip all tags from HTML
    def strip_html(html)
      Nokogiri::HTML(html).inner_text
    end

    # strip tags from HTML and truncate to a certain number of words
    #
    # Keeps at most +words+ whitespace-separated words of the stripped text,
    # joined by single spaces. +truncate_string+ is appended only when words
    # were actually dropped, matching String#trunc in util.rb.
    def strip_truncate_html(html, words=15, truncate_string='...')
      tokens = strip_html(html).split
      # A range such as 0..words would keep words+1 entries (and everything
      # for words == 0), so take an explicit, non-negative count instead.
      limit = [words, 0].max
      kept = tokens.first(limit).join(' ')
      tokens.size > limit ? kept + truncate_string : kept
    end

    # sanitize HTML
    # todo: dup code to fix bugs
    def clean_html(html)
      Sanitize.clean(html)
    end
  end

  # Single shared helper instance handed out by FeedMe.html_helper.
  @@instance = NokogiriUtil.new

  # Returns the shared NokogiriUtil instance.
  def FeedMe.html_helper
    @@instance
  end
end
data/lib/util.rb ADDED
@@ -0,0 +1,45 @@
1
module FeedMe
  # Pretty-print an object, with special formatting for hashes
  # and arrays.
  #
  # obj::         value to render
  # indent_step:: number of spaces added per nesting level
  # indent::      indentation of the current level
  # code::        optional callable invoked with each hash entry (splatted);
  #               it must return the [key, value] pair to print. When nil,
  #               entries are printed unchanged.
  #
  # Hashes (and FeedData objects, when that class is loaded) render as
  # "{ key => value, ... }" and arrays as "[ ... ]", one entry per line.
  # Symbols render via +inspect+; anything else via +to_s.strip.inspect+.
  # Returns the rendered String.
  def FeedMe.pretty_to_s(obj, indent_step=2, indent=0, code=nil)
    new_indent = indent + indent_step
    space = ' ' * indent
    new_space = ' ' * new_indent
    # FeedData lives in another file; only consult it when it is loaded so
    # this helper also works on plain hashes and arrays by itself.
    feed_data = defined?(FeedData) ? obj.is_a?(FeedData) : false
    str = ''
    if feed_data || obj.is_a?(Hash)
      str << "#{obj.fm_tag_name} " if feed_data
      str << "{"
      obj.each_with_index do |item, index|
        # Without a transformer the entry is used as-is; a bare
        # "... if code" would leave both key and value nil.
        key, value = code ? code.call(*item) : item
        str << "\n#{new_space}"
        str << FeedMe.pretty_to_s(key, indent_step, new_indent, code)
        str << " => "
        str << FeedMe.pretty_to_s(value, indent_step, new_indent, code)
        str << ',' unless index == obj.size-1
      end
      str << "\n#{space}}"
    elsif obj.is_a?(Array)
      str << "["
      obj.each_with_index do |value, index|
        str << "\n#{new_space}"
        str << FeedMe.pretty_to_s(value, indent_step, new_indent, code)
        str << ',' unless index == obj.size-1
      end
      str << "\n#{space}]"
    elsif obj.is_a? Symbol
      str << obj.inspect
    else
      str << obj.to_s.strip.inspect
    end
    str
  end
end
37
+
38
class String
  # Returns the first +wordcount+ whitespace-separated words joined by single
  # spaces, followed by +tail+ when words were left out.
  #
  # A +wordcount+ of zero or less keeps no words. A range ending at
  # wordcount-1 would instead wrap around and keep words counted from the
  # end of the string.
  def trunc(wordcount, tail='...')
    words = split
    limit = [wordcount, 0].max
    truncated = words.first(limit).join(' ')
    words.size > limit ? truncated + tail : truncated
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedme
3
3
  version: !ruby/object:Gem::Version
4
- version: "0.1"
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Didion
@@ -9,22 +9,13 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-09-03 00:00:00 -04:00
12
+ date: 2009-12-28 00:00:00 -05:00
13
13
  default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: hoe
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 2.3.3
24
- version:
14
+ dependencies: []
15
+
25
16
  description: A simple, flexible, and extensible RSS and Atom parser for Ruby. Based on the popular SimpleRSS library, but with many nice extra features.
26
17
  email:
27
- - jdidion@rubyforge.org
18
+ - code@didion.net
28
19
  executables: []
29
20
 
30
21
  extensions: []
@@ -38,11 +29,16 @@ files:
38
29
  - Manifest.txt
39
30
  - README.txt
40
31
  - Rakefile
41
- - lib/feedme.rb
42
32
  - examples/rocketboom.rb
43
33
  - examples/rocketboom.rss
34
+ - lib/feedme.rb
35
+ - lib/hpricot-util.rb
36
+ - lib/html-cleaner.rb
37
+ - lib/nokogiri-util.rb
38
+ - lib/util.rb
39
+ - test/test_helper.rb
44
40
  has_rdoc: true
45
- homepage: http://feedme.rubyforge.org
41
+ homepage: http://wiki.github.com/jdidion/feedme
46
42
  licenses: []
47
43
 
48
44
  post_install_message: