feedme 0.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +100 -0
- data/Manifest.txt +2 -0
- data/README.txt +124 -46
- data/Rakefile +21 -4
- data/examples/rocketboom.rb +5 -4
- data/lib/feedme.rb +399 -191
- data/lib/hpricot-util.rb +82 -0
- data/lib/html-cleaner.rb +188 -0
- data/lib/nokogiri-util.rb +117 -0
- data/lib/util.rb +45 -0
- metadata +12 -16
data/lib/hpricot-util.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# HTML utils that use hpricot
|
2
|
+
# Adapted from code by Henrik Nyh (http://henrik.nyh.se), Les Hill
|
3
|
+
# (http://blog.leshill.org)
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
require 'html-cleaner'
|
7
|
+
require 'hpricot'
|
8
|
+
require 'active_support'
|
9
|
+
|
10
|
+
module FeedMe
  # HTML helper implemented on top of Hpricot. A single shared instance is
  # exposed through FeedMe.html_helper (defined at the bottom of this module).
  class HpricotUtil
    # Like the Rails _truncate_ helper but doesn't break HTML tags or entities.
    #
    # html            - markup to shorten; converted with to_s. nil returns nil.
    # words           - word count at which truncation kicks in.
    # truncate_string - suffix appended to the text node that gets cut.
    #
    # Returns the truncated markup, or html.to_s unchanged when the document
    # text has fewer than +words+ words.
    def truncate_html(html, words=15, truncate_string= "...")
      return if html.nil?
      doc = Hpricot(html.to_s)
      # mb_chars is not core Ruby; presumably supplied by the active_support
      # require at the top of this file.
      doc.inner_text.mb_chars.split.size >= words ?
        doc.truncate(words, truncate_string).inner_html : html.to_s
    end

    # strip all tags from HTML, returning only the text nodes concatenated.
    def strip_html(html)
      (Hpricot.parse(html)/:"text()").to_s
    end

    # strip tags from HTML and truncate to a certain number of words
    #
    # input           - markup to strip and shorten.
    # words           - maximum number of words to keep.
    # truncate_string - suffix appended only when words were actually dropped.
    #
    # The previous implementation sliced with 0..words (keeping words + 1
    # words) and appended the suffix even to untruncated text; both are
    # corrected here so the result agrees with String#trunc and truncate_html.
    def strip_truncate_html(input, words=15, truncate_string='...')
      all_words = strip_html(input).split
      kept = all_words.first([words, 0].max).join(' ')
      all_words.size > words ? kept + truncate_string : kept
    end

    # sanitize HTML
    # todo: dup code to fix bugs
    def clean_html(html)
      FeedMe::HtmlCleaner.clean(html)
    end
  end

  # Shared helper instance returned by FeedMe.html_helper.
  @@instance = HpricotUtil.new

  # Returns the shared HpricotUtil instance.
  def FeedMe.html_helper
    @@instance
  end
end
|
43
|
+
|
44
|
+
# Mixins that add a word-based #truncate to node objects. Each variant takes
# a word budget and an ellipsis string and returns a node.
module HpricotTruncator
  # For nodes that hold children. Returns self when the node's text already
  # fits in the budget; otherwise returns a duplicate whose children are the
  # (recursively truncated) leading children that fit.
  module NodeWithChildren
    def truncate(words, truncate_string)
      if inner_text.mb_chars.split.size > words
        budget = words
        copy = dup
        copy.name = name
        copy.raw_attributes = raw_attributes
        copy.children = []
        each_child do |child|
          # Stop as soon as the budget has been used up.
          break unless budget > 0
          # Measure before truncating: text children are modified in place.
          child_words = child.inner_text.mb_chars.split.size
          copy.children << child.truncate(budget, truncate_string)
          budget -= child_words
        end
        copy
      else
        self
      end
    end
  end

  # For text nodes. Rewrites the node's content in place, cutting it to
  # num_words words plus the ellipsis when it is longer, and returns self.
  module TextNode
    def truncate(num_words, truncate_string)
      tokens = content.split
      if tokens.size > num_words
        self.content = (tokens[0..(num_words - 1)].join(' ') + truncate_string).to_s
      else
        self.content = content.to_s
      end
      self
    end
  end

  # For nodes that take no part in truncation: always returns self untouched.
  module IgnoredTag
    def truncate(max_length, ellipsis)
      self
    end
  end
end
|
77
|
+
|
78
|
+
# Mix the truncation behaviours into Hpricot's node classes: Doc and Elem get
# the child-walking variant, Text gets the in-place text variant, and
# BogusETag and Comment get the variant that returns the node unchanged.
[Hpricot::Doc, Hpricot::Elem].each do |container_class|
  container_class.send(:include, HpricotTruncator::NodeWithChildren)
end
Hpricot::Text.send(:include, HpricotTruncator::TextNode)
[Hpricot::BogusETag, Hpricot::Comment].each do |ignored_class|
  ignored_class.send(:include, HpricotTruncator::IgnoredTag)
end
|
data/lib/html-cleaner.rb
ADDED
@@ -0,0 +1,188 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
module FeedMe

  # Various methods for cleaning up HTML and preparing it for safe public
  # consumption.
  #
  # Documents used for reference:
  # - http://www.w3.org/TR/html4/index/attributes.html
  # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
  # - http://feedparser.org/docs/html-sanitization.html
  # - http://code.whytheluckystiff.net/hpricot/wiki
  class HtmlCleaner

    # allowed html elements.
    HTML_ELEMENTS = %w(
      a abbr acronym address area b bdo big blockquote br button caption center
      cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
      h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
      samp small span strike strong sub sup table tbody td tfoot th thead tr tt
      u ul var
    )

    # allowed attributes.
    HTML_ATTRS = %w(
      abbr accept accept-charset accesskey align alt axis border cellpadding
      cellspacing char charoff charset checked cite class clear cols colspan
      color compact coords datetime dir disabled for frame headers height href
      hreflang hspace id ismap label lang longdesc maxlength media method
      multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
      scope selected shape size span src start summary tabindex target title
      type usemap valign value vspace width
    )

    # allowed attributes, but they can contain URIs, extra caution required.
    # NOTE: That means this doesn't list *all* URI attrs, just the ones that are allowed.
    HTML_URI_ATTRS = %w(
      href src cite usemap longdesc
    )

    # URI schemes that dodgy_uri? rejects.
    DODGY_URI_SCHEMES = %w(
      javascript vbscript mocha livescript data
    )

    class << self

      # Does this:
      # - Unescape HTML
      # - Parse HTML into tree
      # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
      # - Each tag:
      #   - remove tag if not whitelisted
      #   - escape HTML tag contents
      #   - remove all attributes not on whitelist
      #   - extra-scrub URI attrs; see dodgy_uri?
      #
      # Extra (i.e. unmatched) ending tags and comments are removed.
      #
      # Returns the cleaned markup as a String.
      def clean(str)
        str = unescapeHTML(str)

        doc = Hpricot(str, :fixup_tags => true)
        doc = subtree(doc, :body)

        # get all the tags in the document
        # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
        # including text nodes instead of just tagged elements.
        tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq

        # Remove tags that aren't whitelisted.
        remove_tags!(doc, tags - HTML_ELEMENTS)
        remaining_tags = tags & HTML_ELEMENTS

        # Remove attributes that aren't on the whitelist, or are suspicious URLs.
        (doc/remaining_tags.join(",")).each do |element|
          next if element.raw_attributes.nil?
          element.raw_attributes.reject! do |attr,val|
            !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
          end
          # build_hash is the Enumerable extension defined at the bottom of this file.
          element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
        end unless remaining_tags.empty?

        # Hpricot::Text#set is the setter added at the bottom of this file.
        doc.traverse_text {|t| t.set(add_entities(t.to_html))}

        # Return the tree, without comments. Ugly way of removing comments,
        # but can't see a way to do this in Hpricot yet.
        doc.to_s.gsub(/<\!--.*?-->/mi, '')
      end

      # For all other feed elements:
      # - Unescape HTML.
      # - Parse HTML into tree (taking 'body' as root, if present)
      # - Takes text out of each tag, and escapes HTML.
      # - Returns all text concatenated.
      #
      # The argument is no longer modified in place: newlines are replaced on
      # a copy, so frozen strings work and the caller's string is left alone.
      def flatten(str)
        str = unescapeHTML(str.gsub("\n", " "))

        doc = Hpricot(str, :xhtml_strict => true)
        doc = subtree(doc, :body)

        out = []
        doc.traverse_text {|t| out << add_entities(t.to_html)}

        return out.join
      end

      # Returns true if the given string contains a suspicious URL,
      # i.e. a javascript link. Returns nil otherwise.
      #
      # This method rejects javascript, vbscript, livescript, mocha and data URLs.
      # It *could* be refined to only deny dangerous data URLs, however.
      def dodgy_uri?(uri)
        uri = uri.to_s

        # special case for poorly-formed entities (missing ';')
        # if these occur *anywhere* within the string, then throw it out.
        return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)

        # Try escaping as both HTML or URI encodings, and then trying
        # each scheme regexp on each
        [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
          DODGY_URI_SCHEMES.each do |scheme|

            # Allow any run of control characters or spaces before every
            # character of "scheme:", so padded schemes are still caught.
            regexp = "#{scheme}:".gsub(/./) do |char|
              "([\000-\037\177\s]*)#{char}"
            end

            # regexp looks something like
            # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
            return true if (unesc_uri =~ %r{\A#{regexp}}mi)
          end
        end

        nil
      end

      # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
      #
      # The only XML-only entity handled is &apos;, which is rewritten to the
      # numeric &#39; before CGI.unescapeHTML runs.
      def unescapeHTML(str, xml = true)
        CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
      end

      # Adds entities where possible.
      # Works like CGI.escapeHTML, but will not escape existing entities;
      # i.e. &#123; will NOT become &amp;#123;
      #
      # This method could be improved by adding a whitelist of html entities.
      #
      # The historical /n (binary) regexp flag is not used: every pattern is
      # pure ASCII, so it matches the same text without it.
      def add_entities(str)
        str.to_s.
          gsub(/\"/, '&quot;').
          gsub(/>/, '&gt;').
          gsub(/</, '&lt;').
          # An ampersand is escaped only when it does not start an entity.
          gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/mi, '&amp;')
      end

      private

      # Everything below element, or just return the doc if element is not present.
      def subtree(doc, element)
        doc.at("//#{element}/*") || doc
      end

      # Removes every node matching one of the given tag names from doc.
      def remove_tags!(doc, tags)
        (doc/tags.join(",")).remove unless tags.empty?
      end

    end
  end
end
|
167
|
+
|
168
|
+
|
169
|
+
module Enumerable #:nodoc:
  # Builds a Hash from the receiver. Each element is yielded to the block,
  # which returns a [key, value] pair to store. When the same key comes back
  # more than once, the later value replaces the earlier one.
  def build_hash
    inject({}) do |table, item|
      pair_key, pair_value = yield item
      table[pair_key] = pair_value
      table
    end
  end
end
|
179
|
+
|
180
|
+
# http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/207625
|
181
|
+
# Subject: A simple Hpricot text setter
|
182
|
+
# From: Chris Gehlker <canyonrat mac.com>
|
183
|
+
# Date: Fri, 11 Aug 2006 03:19:13 +0900
|
184
|
+
class Hpricot::Text #:nodoc:
  # Replaces this text node's content by assigning the @content instance
  # variable directly. Returns the string that was stored.
  def set(string)
    instance_variable_set(:@content, string)
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# HTML utils that use nokogiri
|
2
|
+
# Adapted from code by Eleo (http://gist.github.com/101410)
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'sanitize'
|
7
|
+
|
8
|
+
module FeedMe
  # HTML helper implemented on top of Nokogiri (and Sanitize for cleaning).
  # A single shared instance is exposed through FeedMe.html_helper.
  class NokogiriUtil
    # Truncate HTML while preserving tags
    #
    # text            - markup to shorten. nil returns nil, matching HpricotUtil.
    # num_words       - maximum number of words of text to keep.
    # truncate_string - suffix appended to the text node where the cut happens.
    #
    # Returns the inner HTML of the first child of the document root after
    # everything past the cut has been removed.
    def truncate_html(text, num_words=15, truncate_string="...")
      return if text.nil?

      # The parameter is +text+; this used to reference an undefined +html+,
      # which raised NameError on every call.
      doc = Nokogiri::HTML(text)
      current = doc.children.first
      count = 0
      # Last text node that still fit within the limit (nil until one is seen).
      previous = nil

      while true
        # we found a text node
        if current.is_a?(Nokogiri::XML::Text)
          count += current.text.split.length
          # we reached our limit, let's get outta here!
          break if count > num_words
          previous = current
        end

        if current.children.length > 0
          # this node has children, can't be a text node,
          # lets descend and look for text nodes
          current = current.children.first
        elsif !current.next.nil?
          #this has no children, but has a sibling, let's check it out
          current = current.next
        else
          # we are the last child, we need to ascend until we are
          # either done or find a sibling to continue on to
          n = current
          while !n.is_a?(Nokogiri::HTML::Document) && n.parent.next.nil?
            n = n.parent
          end

          # we've reached the top and found no more text nodes, break
          if n.is_a?(Nokogiri::HTML::Document)
            break
          else
            current = n.parent.next
          end
        end
      end

      if count >= num_words
        unless count == num_words
          new_content = current.text.split

          # If we're here, the last text node we counted eclipsed the number of words
          # that we want, so we need to cut down on words. The easiest way to think about
          # this is that without this node we'd have fewer words than the limit, so all
          # the previous words plus a limited number of words from this node are needed.
          # We simply need to figure out how many words are needed and grab that many.
          # Then we need to -subtract- an index, because the first word would be index zero.

          # For example, given:
          # <p>Testing this HTML truncater.</p><p>To see if its working.</p>
          # Let's say I want 6 words. The correct returned string would be:
          # <p>Testing this HTML truncater.</p><p>To see...</p>
          # All the words in both paragraphs = 9
          # The last paragraph is the one that breaks the limit. How many words would we
          # have without it? 4. But we want up to 6, so we might as well get that many.
          # 6 - 4 = 2, so we get 2 words from this node, but words #1-2 are indices #0-1, so
          # we subtract 1. If this gives us -1, we want nothing from this node. So go back to
          # the previous node instead.
          index = num_words-(count-new_content.length)-1
          if index >= 0
            new_content = new_content[0..index]
            current.content = new_content.join(' ') + truncate_string
          elsif previous
            current = previous
            current.content = current.content + truncate_string
          else
            # No earlier text node exists (e.g. num_words is 0), so keep no
            # words from this node instead of calling #content on nil.
            current.content = truncate_string
          end
        end

        # remove everything else
        while !current.is_a?(Nokogiri::HTML::Document)
          while !current.next.nil?
            current.next.remove
          end
          current = current.parent
        end
      end

      # now we grab the html and not the text.
      # we take the root's first child because nokogiri adds html and body
      # tags which we don't want
      root = doc.root
      container = root && root.children.first
      # An input that yields no root element produces an empty string rather
      # than a NoMethodError.
      container ? container.inner_html : ''
    end

    # strip all tags from HTML
    def strip_html(html)
      Nokogiri::HTML(html).inner_text
    end

    # strip tags from HTML and truncate to a certain number of words
    #
    # html            - markup to strip and shorten.
    # words           - maximum number of words to keep.
    # truncate_string - suffix appended only when words were actually dropped.
    #
    # The previous implementation sliced with 0..words (keeping words + 1
    # words) and appended the suffix even to untruncated text; both are
    # corrected here so the result agrees with String#trunc.
    def strip_truncate_html(html, words=15, truncate_string='...')
      all_words = strip_html(html).split
      kept = all_words.first([words, 0].max).join(' ')
      all_words.size > words ? kept + truncate_string : kept
    end

    # sanitize HTML
    # todo: dup code to fix bugs
    def clean_html(html)
      Sanitize.clean(html)
    end
  end

  # Shared helper instance returned by FeedMe.html_helper.
  @@instance = NokogiriUtil.new

  # Returns the shared NokogiriUtil instance.
  def FeedMe.html_helper
    @@instance
  end
end
|
data/lib/util.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module FeedMe
  # Pretty-print an object, with special formatting for hashes
  # and arrays.
  #
  # obj         - value to render. FeedData and Hash objects render as
  #               "{ key => value, ... }", Arrays as "[ ... ]", Symbols via
  #               inspect, and anything else as its stripped to_s, inspected.
  # indent_step - number of spaces added per nesting level.
  # indent      - current indentation, in spaces (used by the recursion).
  # code        - optional callable invoked with each entry of a FeedData or
  #               Hash; it must return the [key, value] pair to print. When it
  #               is nil the entry itself is used as the pair.
  #
  # Returns the formatted String.
  def FeedMe.pretty_to_s(obj, indent_step=2, indent=0, code=nil)
    new_indent = indent + indent_step
    space = ' ' * indent
    new_space = ' ' * new_indent
    # dup keeps the buffer mutable even when string literals are frozen.
    str = ''.dup
    if (obj.is_a?(FeedData) || obj.is_a?(Hash))
      str << "#{obj.fm_tag_name} " if obj.is_a?(FeedData)
      str << "{"
      obj.each_with_index do |item, index|
        # Without a transformer fall back to the raw pair. The old
        # "key, value = code.call(*item) if code" left both nil in that
        # case, so every entry printed as "" => "".
        key, value = code ? code.call(*item) : item
        str << "\n#{new_space}"
        str << FeedMe.pretty_to_s(key, indent_step, new_indent, code)
        str << " => "
        str << FeedMe.pretty_to_s(value, indent_step, new_indent, code)
        str << ',' unless index == obj.size-1
      end
      str << "\n#{space}}"
    elsif obj.is_a?(Array)
      str << "["
      obj.each_with_index do |value, index|
        str << "\n#{new_space}"
        str << FeedMe.pretty_to_s(value, indent_step, new_indent, code)
        str << ',' unless index == obj.size-1
      end
      str << "\n#{space}]"
    elsif obj.is_a? Symbol
      str << obj.inspect
    else
      str << obj.to_s.strip.inspect
    end
    return str
  end
end
|
37
|
+
|
38
|
+
class String
  # Returns the first +wordcount+ whitespace-separated words of the string
  # joined by single spaces, with +tail+ appended when words were dropped.
  #
  # wordcount - maximum number of words to keep; zero or a negative value
  #             keeps none.
  # tail      - suffix marking the truncation.
  def trunc(wordcount, tail='...')
    words = self.split
    # first(n) keeps zero words for n == 0; the old 0..(wordcount-1) range
    # became 0..-1 there and returned every word.
    truncated = words.first([wordcount, 0].max).join(' ')
    truncated += tail if words.size > wordcount
    truncated
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedme
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Didion
|
@@ -9,22 +9,13 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-12-28 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
16
|
-
name: hoe
|
17
|
-
type: :development
|
18
|
-
version_requirement:
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 2.3.3
|
24
|
-
version:
|
14
|
+
dependencies: []
|
15
|
+
|
25
16
|
description: A simple, flexible, and extensible RSS and Atom parser for Ruby. Based on the popular SimpleRSS library, but with many nice extra features.
|
26
17
|
email:
|
27
|
-
-
|
18
|
+
- code@didion.net
|
28
19
|
executables: []
|
29
20
|
|
30
21
|
extensions: []
|
@@ -38,11 +29,16 @@ files:
|
|
38
29
|
- Manifest.txt
|
39
30
|
- README.txt
|
40
31
|
- Rakefile
|
41
|
-
- lib/feedme.rb
|
42
32
|
- examples/rocketboom.rb
|
43
33
|
- examples/rocketboom.rss
|
34
|
+
- lib/feedme.rb
|
35
|
+
- lib/hpricot-util.rb
|
36
|
+
- lib/html-cleaner.rb
|
37
|
+
- lib/nokogiri-util.rb
|
38
|
+
- lib/util.rb
|
39
|
+
- test/test_helper.rb
|
44
40
|
has_rdoc: true
|
45
|
-
homepage: http://
|
41
|
+
homepage: http://wiki.github.com/jdidion/feedme
|
46
42
|
licenses: []
|
47
43
|
|
48
44
|
post_install_message:
|