feedme 0.1 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +100 -0
- data/Manifest.txt +2 -0
- data/README.txt +124 -46
- data/Rakefile +21 -4
- data/examples/rocketboom.rb +5 -4
- data/lib/feedme.rb +399 -191
- data/lib/hpricot-util.rb +82 -0
- data/lib/html-cleaner.rb +188 -0
- data/lib/nokogiri-util.rb +117 -0
- data/lib/util.rb +45 -0
- metadata +12 -16
data/lib/hpricot-util.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# HTML utils that use hpricot
|
2
|
+
# Adapted from code by Henrik Nyh (http://henrik.nyh.se), Les Hill
|
3
|
+
# (http://blog.leshill.org)
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
require 'html-cleaner'
|
7
|
+
require 'hpricot'
|
8
|
+
require 'active_support'
|
9
|
+
|
10
|
+
module FeedMe
  # HTML helper whose parsing is delegated to Hpricot and whose sanitizing is
  # delegated to FeedMe::HtmlCleaner.
  class HpricotUtil
    # Like the Rails _truncate_ helper but doesn't break HTML tags or entities.
    #
    # html::            markup to shorten; nil yields nil
    # words::           word budget
    # truncate_string:: marker appended where text was cut
    #
    # Returns the original markup (as a String) when its text has fewer than
    # +words+ words, otherwise the re-serialized, truncated document.
    def truncate_html(html, words=15, truncate_string= "...")
      return if html.nil?
      doc = Hpricot(html.to_s)
      if doc.inner_text.mb_chars.split.size >= words
        doc.truncate(words, truncate_string).inner_html
      else
        html.to_s
      end
    end

    # strip all tags from HTML, keeping only the text nodes
    def strip_html(html)
      (Hpricot.parse(html)/:"text()").to_s
    end

    # strip tags from HTML and truncate to a certain number of words
    #
    # Keeps at most +words+ whitespace-separated words (joined by single
    # spaces) and appends +truncate_string+ only when words were actually
    # dropped, mirroring String#trunc.
    def strip_truncate_html(input, words=15, truncate_string='...')
      tokens = strip_html(input).split
      return tokens.join(' ') if tokens.size <= words
      # Clamp so a zero or negative budget keeps no words instead of wrapping
      # around to the end of the array.
      tokens.first([words, 0].max).join(' ') + truncate_string
    end

    # sanitize HTML
    # todo: dup code to fix bugs
    def clean_html(html)
      FeedMe::HtmlCleaner.clean(html)
    end
  end

  # Single shared helper instance handed out by FeedMe.html_helper.
  @@instance = HpricotUtil.new

  # Returns the shared HpricotUtil instance.
  def FeedMe.html_helper
    @@instance
  end
end
|
43
|
+
|
44
|
+
# Mixins that give Hpricot nodes a word-based +truncate+ method.
module HpricotTruncator
  # For nodes that contain other nodes: copies the node and refills it with
  # truncated children until the word budget is spent.
  module NodeWithChildren
    # words::           remaining word budget
    # truncate_string:: marker appended where text is cut
    #
    # Returns self when the node's text already fits, otherwise a truncated copy.
    def truncate(words, truncate_string)
      return self if inner_text.mb_chars.split.size <= words
      truncated_node = dup
      truncated_node.name = name
      truncated_node.raw_attributes = raw_attributes
      truncated_node.children = []
      each_child do |node|
        break if words <= 0
        # Measure before truncating: truncating a text node rewrites its content.
        node_length = node.inner_text.mb_chars.split.size
        truncated_node.children << node.truncate(words, truncate_string)
        words -= node_length
      end
      truncated_node
    end
  end

  # For text nodes: shortens the node's own content in place.
  module TextNode
    # Keeps at most +num_words+ words and appends +truncate_string+ when any
    # were dropped. Content that already fits is left untouched. Returns self.
    def truncate(num_words, truncate_string)
      words = content.split
      shortened =
        if words.size <= num_words
          content
        else
          # first(n) with a clamped n keeps nothing for a zero/negative budget,
          # where words[0..num_words-1] would wrap around and keep everything.
          words.first([num_words, 0].max).join(' ') + truncate_string
        end
      self.content = shortened.to_s
      self
    end
  end

  # For nodes that are passed through unchanged.
  module IgnoredTag
    def truncate(max_length, ellipsis)
      self
    end
  end
end
|
77
|
+
|
78
|
+
# Wire the truncation mixins into the Hpricot node classes.
# Container nodes rebuild themselves from truncated children.
[Hpricot::Doc, Hpricot::Elem].each do |container_class|
  container_class.send(:include, HpricotTruncator::NodeWithChildren)
end

# Text nodes shorten their own content.
Hpricot::Text.send(:include, HpricotTruncator::TextNode)

# Stray end tags and comments are passed through unchanged.
[Hpricot::BogusETag, Hpricot::Comment].each do |ignored_class|
  ignored_class.send(:include, HpricotTruncator::IgnoredTag)
end
|
data/lib/html-cleaner.rb
ADDED
@@ -0,0 +1,188 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hpricot'
|
3
|
+
require 'cgi'
|
4
|
+
|
5
|
+
module FeedMe

  # Various methods for cleaning up HTML and preparing it for safe public
  # consumption.
  #
  # Documents used for reference:
  # - http://www.w3.org/TR/html4/index/attributes.html
  # - http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
  # - http://feedparser.org/docs/html-sanitization.html
  # - http://code.whytheluckystiff.net/hpricot/wiki
  class HtmlCleaner

    # allowed html elements.
    HTML_ELEMENTS = %w(
      a abbr acronym address area b bdo big blockquote br button caption center
      cite code col colgroup dd del dfn dir div dl dt em fieldset font h1 h2 h3
      h4 h5 h6 hr i img ins kbd label legend li map menu ol optgroup p pre q s
      samp small span strike strong sub sup table tbody td tfoot th thead tr tt
      u ul var
    )

    # allowed attributes.
    HTML_ATTRS = %w(
      abbr accept accept-charset accesskey align alt axis border cellpadding
      cellspacing char charoff charset checked cite class clear cols colspan
      color compact coords datetime dir disabled for frame headers height href
      hreflang hspace id ismap label lang longdesc maxlength media method
      multiple name nohref noshade nowrap readonly rel rev rows rowspan rules
      scope selected shape size span src start summary tabindex target title
      type usemap valign value vspace width
    )

    # allowed attributes, but they can contain URIs, extra caution required.
    # NOTE: That means this doesn't list *all* URI attrs, just the ones that are allowed.
    HTML_URI_ATTRS = %w(
      href src cite usemap longdesc
    )

    # URI schemes that are never allowed in a URI attribute; see dodgy_uri?
    DODGY_URI_SCHEMES = %w(
      javascript vbscript mocha livescript data
    )

    class << self

      # Does this:
      # - Unescape HTML
      # - Parse HTML into tree
      # - Find 'body' if present, and extract tree inside that tag, otherwise parse whole tree
      # - Each tag:
      #   - remove tag if not whitelisted
      #   - escape HTML tag contents
      #   - remove all attributes not on whitelist
      #   - extra-scrub URI attrs; see dodgy_uri?
      #
      # Extra (i.e. unmatched) ending tags and comments are removed.
      #
      # Returns the cleaned markup as a String.
      def clean(str)
        str = unescapeHTML(str)

        doc = Hpricot(str, :fixup_tags => true)
        doc = subtree(doc, :body)

        # get all the tags in the document
        # Somewhere near hpricot 0.4.92 "*" starting to return all elements,
        # including text nodes instead of just tagged elements.
        tags = (doc/"*").inject([]) { |m,e| m << e.name if(e.respond_to?(:name) && e.name =~ /^\w+$/) ; m }.uniq

        # Remove tags that aren't whitelisted.
        remove_tags!(doc, tags - HTML_ELEMENTS)
        remaining_tags = tags & HTML_ELEMENTS

        # Remove attributes that aren't on the whitelist, or are suspicious URLs.
        (doc/remaining_tags.join(",")).each do |element|
          next if element.raw_attributes.nil?
          element.raw_attributes.reject! do |attr,val|
            !HTML_ATTRS.include?(attr) || (HTML_URI_ATTRS.include?(attr) && dodgy_uri?(val))
          end
          element.raw_attributes = element.raw_attributes.build_hash {|a,v| [a, add_entities(v)]}
        end unless remaining_tags.empty?

        doc.traverse_text {|t| t.set(add_entities(t.to_html))}

        # Return the tree, without comments. Ugly way of removing comments,
        # but can't see a way to do this in Hpricot yet.
        doc.to_s.gsub(/<\!--.*?-->/mi, '')
      end

      # For all other feed elements:
      # - Unescape HTML.
      # - Parse HTML into tree (taking 'body' as root, if present)
      # - Takes text out of each tag, and escapes HTML.
      # - Returns all text concatenated.
      def flatten(str)
        # Non-destructive gsub: the caller's string must not be modified (and
        # may be frozen).
        str = unescapeHTML(str.gsub("\n", " "))

        doc = Hpricot(str, :xhtml_strict => true)
        doc = subtree(doc, :body)

        out = []
        doc.traverse_text {|t| out << add_entities(t.to_html)}

        return out.join
      end

      # Returns true if the given string contains a suspicious URL,
      # i.e. a javascript link; returns nil otherwise.
      #
      # This method rejects javascript, vbscript, livescript, mocha and data URLs.
      # It *could* be refined to only deny dangerous data URLs, however.
      def dodgy_uri?(uri)
        uri = uri.to_s

        # special case for poorly-formed entities (missing ';')
        # if these occur *anywhere* within the string, then throw it out.
        return true if (uri =~ /&\#(\d+|x[0-9a-f]+)[^;\d]/mi)

        # Try escaping as both HTML or URI encodings, and then trying
        # each scheme regexp on each
        [unescapeHTML(uri), CGI.unescape(uri)].each do |unesc_uri|
          DODGY_URI_SCHEMES.each do |scheme|

            # Allow any run of control characters / whitespace before every
            # character of "scheme:".
            regexp = "#{scheme}:".gsub(/./) do |char|
              "([\000-\037\177\s]*)#{char}"
            end

            # regexp looks something like
            # /\A([\000-\037\177\s]*)j([\000-\037\177\s]*)a([\000-\037\177\s]*)v([\000-\037\177\s]*)a([\000-\037\177\s]*)s([\000-\037\177\s]*)c([\000-\037\177\s]*)r([\000-\037\177\s]*)i([\000-\037\177\s]*)p([\000-\037\177\s]*)t([\000-\037\177\s]*):/mi
            return true if (unesc_uri =~ %r{\A#{regexp}}mi)
          end
        end

        nil
      end

      # unescapes HTML. If xml is true, also converts XML-only named entities to HTML.
      def unescapeHTML(str, xml = true)
        # &apos; is defined by XML but not by HTML 4, so it is rewritten to its
        # numeric form before unescaping.
        CGI.unescapeHTML(xml ? str.gsub("&apos;", "&#39;") : str)
      end

      # Adds entities where possible.
      # Works like CGI.escapeHTML, but will not escape existing entities;
      # i.e. &#123; will NOT become &amp;#123;
      #
      # This method could be improved by adding a whitelist of html entities.
      def add_entities(str)
        # The patterns are plain ASCII, so the legacy /n (binary) flag is not
        # needed.
        str.to_s.
          gsub(/\"/, '&quot;').
          gsub(/>/, '&gt;').
          gsub(/</, '&lt;').
          gsub(/&(?!(\#\d+|\#x([0-9a-f]+)|\w{2,8});)/mi, '&amp;')
      end

      private

      # Everything below element, or just return the doc if element not present.
      def subtree(doc, element)
        doc.at("//#{element}/*") || doc
      end

      # Removes every element named in +tags+ from +doc+.
      def remove_tags!(doc, tags)
        (doc/tags.join(",")).remove unless tags.empty?
      end

    end
  end
end
|
167
|
+
|
168
|
+
|
169
|
+
module Enumerable #:nodoc:
  # Builds a Hash from the collection: the block is called with each element
  # and returns a [key, value] pair to store. Later pairs overwrite earlier
  # ones that share a key.
  def build_hash
    mapping = {}
    each do |item|
      pair = yield(item)
      k, v = pair
      mapping[k] = v
    end
    mapping
  end
end
|
179
|
+
|
180
|
+
# A simple Hpricot text setter, taken from:
#   http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/207625
#   Subject: A simple Hpricot text setter
#   From: Chris Gehlker <canyonrat mac.com>
#   Date: Fri, 11 Aug 2006 03:19:13 +0900
class Hpricot::Text #:nodoc:
  # Overwrites the node's stored content with +string+.
  def set(string)
    @content = string
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# HTML utils that use nokogiri
|
2
|
+
# Adapted from code by Eleo (http://gist.github.com/101410)
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'sanitize'
|
7
|
+
|
8
|
+
module FeedMe
  # HTML helper whose parsing is delegated to Nokogiri and whose sanitizing is
  # delegated to the Sanitize gem.
  class NokogiriUtil
    # Truncate HTML while preserving tags
    #
    # text::            markup to shorten; nil yields nil
    # num_words::       word budget
    # truncate_string:: marker appended where text was cut
    #
    # Walks the parsed document in document order, counting the words of each
    # text node until the budget is exceeded, cuts that text node, removes
    # everything after it, and returns the inner HTML of the first child of
    # the document root.
    def truncate_html(text, num_words=15, truncate_string="...")
      return if text.nil?

      doc = Nokogiri::HTML(text.to_s)
      current = doc.children.first
      previous = nil # last text node that fit entirely within the budget
      count = 0

      loop do
        # we found a text node
        if current.is_a?(Nokogiri::XML::Text)
          count += current.text.split.length
          # we reached our limit, let's get outta here!
          break if count > num_words
          previous = current
        end

        if current.children.length > 0
          # this node has children, can't be a text node,
          # lets descend and look for text nodes
          current = current.children.first
        elsif !current.next.nil?
          # this has no children, but has a sibling, let's check it out
          current = current.next
        else
          # we are the last child, we need to ascend until we are
          # either done or find a sibling to continue on to
          n = current
          while !n.is_a?(Nokogiri::HTML::Document) && n.parent.next.nil?
            n = n.parent
          end

          # we've reached the top and found no more text nodes, break
          break if n.is_a?(Nokogiri::HTML::Document)
          current = n.parent.next
        end
      end

      if count >= num_words
        unless count == num_words
          new_content = current.text.split

          # If we're here, the last text node we counted eclipsed the number of words
          # that we want, so we need to cut down on words. Without this node we'd have
          # fewer words than the limit, so all the previous words plus a limited number
          # of words from this node are needed. We figure out how many words are needed
          # and grab that many, subtracting one because the first word is index zero.
          #
          # For example, given:
          #   <p>Testing this HTML truncater.</p><p>To see if its working.</p>
          # Let's say I want 6 words. The correct returned string would be:
          #   <p>Testing this HTML truncater.</p><p>To see...</p>
          # All the words in both paragraphs = 9
          # The last paragraph is the one that breaks the limit. Without it we would
          # have 4 words, and we want up to 6: 6 - 4 = 2, so we take 2 words from this
          # node, i.e. indices 0-1. If the index comes out as -1, we want nothing from
          # this node, so we go back to the previous text node instead.
          index = num_words-(count-new_content.length)-1
          if index >= 0
            new_content = new_content[0..index]
            current.content = new_content.join(' ') + truncate_string
          elsif previous
            current = previous
            current.content = current.content + truncate_string
          else
            # No earlier text node to fall back to (the budget is zero or
            # negative): keep no words from this node.
            current.content = truncate_string
          end
        end

        # remove everything else
        while !current.is_a?(Nokogiri::HTML::Document)
          while !current.next.nil?
            current.next.remove
          end
          current = current.parent
        end
      end

      # now we grab the html and not the text.
      # we do first because nokogiri adds html and body tags
      # which we don't want
      doc.root.children.first.inner_html
    end

    # strip all tags from HTML
    def strip_html(html)
      Nokogiri::HTML(html).inner_text
    end

    # strip tags from HTML and truncate to a certain number of words
    #
    # Keeps at most +words+ whitespace-separated words (joined by single
    # spaces) and appends +truncate_string+ only when words were actually
    # dropped, mirroring String#trunc.
    def strip_truncate_html(html, words=15, truncate_string='...')
      tokens = strip_html(html).split
      return tokens.join(' ') if tokens.size <= words
      # Clamp so a zero or negative budget keeps no words instead of wrapping
      # around to the end of the array.
      tokens.first([words, 0].max).join(' ') + truncate_string
    end

    # sanitize HTML
    # todo: dup code to fix bugs
    def clean_html(html)
      Sanitize.clean(html)
    end
  end

  # Single shared helper instance handed out by FeedMe.html_helper.
  @@instance = NokogiriUtil.new

  # Returns the shared NokogiriUtil instance.
  def FeedMe.html_helper
    @@instance
  end
end
|
data/lib/util.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module FeedMe
  # Pretty-print an object, with special formatting for hashes
  # and arrays.
  #
  # obj::         object to render; FeedData and Hash render as "{ key => value }"
  #               blocks, Array as "[ ... ]", Symbol via inspect, and anything
  #               else as its stripped to_s, inspected
  # indent_step:: number of spaces added per nesting level
  # indent::      indentation of the current level
  # code::        optional callable invoked with each (key, value) pair of a
  #               FeedData/Hash; it must return the [key, value] pair to print.
  #               When omitted, pairs are printed as they are.
  #
  # Returns the formatted String.
  def FeedMe.pretty_to_s(obj, indent_step=2, indent=0, code=nil)
    new_indent = indent + indent_step
    space = ' ' * indent
    new_space = ' ' * new_indent
    str = ''
    if (obj.is_a?(FeedData) || obj.is_a?(Hash))
      str << "#{obj.fm_tag_name} " if obj.is_a?(FeedData)
      str << "{"
      obj.each_with_index do |item, index|
        # Without a transformer the pair itself is used; previously key and
        # value were left nil in that case.
        key, value = code ? code.call(*item) : item
        str << "\n#{new_space}"
        str << FeedMe.pretty_to_s(key, indent_step, new_indent, code)
        str << " => "
        str << FeedMe.pretty_to_s(value, indent_step, new_indent, code)
        str << ',' unless index == obj.size-1
      end
      str << "\n#{space}}"
    elsif obj.is_a?(Array)
      str << "["
      obj.each_with_index do |value, index|
        str << "\n#{new_space}"
        str << FeedMe.pretty_to_s(value, indent_step, new_indent, code)
        str << ',' unless index == obj.size-1
      end
      str << "\n#{space}]"
    elsif obj.is_a? Symbol
      str << obj.inspect
    else
      str << obj.to_s.strip.inspect
    end
    return str
  end
end
|
37
|
+
|
38
|
+
class String
  # Truncates the string to at most +wordcount+ whitespace-separated words.
  #
  # The kept words are joined with single spaces; +tail+ is appended only when
  # the string has more than +wordcount+ words.
  def trunc(wordcount, tail='...')
    words = self.split
    # first(n) with a clamped n keeps nothing for a zero/negative count, where
    # words[0..(wordcount-1)] would wrap around and keep every word.
    truncated = words.first([wordcount, 0].max).join(' ')
    truncated += tail if words.size > wordcount
    truncated
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedme
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Didion
|
@@ -9,22 +9,13 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-12-28 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
|
-
dependencies:
|
15
|
-
|
16
|
-
name: hoe
|
17
|
-
type: :development
|
18
|
-
version_requirement:
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 2.3.3
|
24
|
-
version:
|
14
|
+
dependencies: []
|
15
|
+
|
25
16
|
description: A simple, flexible, and extensible RSS and Atom parser for Ruby. Based on the popular SimpleRSS library, but with many nice extra features.
|
26
17
|
email:
|
27
|
-
-
|
18
|
+
- code@didion.net
|
28
19
|
executables: []
|
29
20
|
|
30
21
|
extensions: []
|
@@ -38,11 +29,16 @@ files:
|
|
38
29
|
- Manifest.txt
|
39
30
|
- README.txt
|
40
31
|
- Rakefile
|
41
|
-
- lib/feedme.rb
|
42
32
|
- examples/rocketboom.rb
|
43
33
|
- examples/rocketboom.rss
|
34
|
+
- lib/feedme.rb
|
35
|
+
- lib/hpricot-util.rb
|
36
|
+
- lib/html-cleaner.rb
|
37
|
+
- lib/nokogiri-util.rb
|
38
|
+
- lib/util.rb
|
39
|
+
- test/test_helper.rb
|
44
40
|
has_rdoc: true
|
45
|
-
homepage: http://
|
41
|
+
homepage: http://wiki.github.com/jdidion/feedme
|
46
42
|
licenses: []
|
47
43
|
|
48
44
|
post_install_message:
|