truffle-hog 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +66 -0
- data/lib/truffle-hog.rb +27 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/truffle-hog_spec.rb +44 -0
- metadata +59 -0
data/README.textile
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
h1. TruffleHog
|
2
|
+
|
3
|
+
"http://github.com/pauldix/truffle-hog":http://github.com/pauldix/truffle-hog
|
4
|
+
|
5
|
+
If you have issues please use the "Feedzirra group":http://groups.google.com/group/feedzirra
|
6
|
+
|
7
|
+
h2. Summary
|
8
|
+
|
9
|
+
Finds RSS and Atom feed urls in html like a hog finds truffles. Tasty, delicious feeds... er, truffles.
|
10
|
+
|
11
|
+
h2. Description
|
12
|
+
|
13
|
+
This is a simple library. It doesn't GET the html for you. It doesn't use an HTML or XML parser. It just uses a little regex to find feed urls. If you want to GET the web pages, use "Typhoeus":http://github.com/pauldix/typhoeus/. If you want to fetch and parse the feeds use "Feedzirra":http://github.com/pauldix/feedzirra.
|
14
|
+
|
15
|
+
h2. Installation
|
16
|
+
|
17
|
+
<pre>
|
18
|
+
gem install truffle-hog --source http://gemcutter.org
|
19
|
+
</pre>
|
20
|
+
|
21
|
+
h2. Use
|
22
|
+
|
23
|
+
<pre>
|
24
|
+
require 'rubygems'
|
25
|
+
require 'truffle-hog'
|
26
|
+
|
27
|
+
# get atom and rss
|
28
|
+
feed_urls = TruffleHog.parse_feed_urls(some_html)
|
29
|
+
|
30
|
+
# get atom if available, otherwise rss
|
31
|
+
feed_urls = TruffleHog.parse_feed_urls(some_html, :atom)
|
32
|
+
|
33
|
+
# get rss if available, otherwise atom
|
34
|
+
feed_urls = TruffleHog.parse_feed_urls(some_html, :rss)
|
35
|
+
</pre>
|
36
|
+
|
37
|
+
h2. Next
|
38
|
+
|
39
|
+
I may want to make a fun Nokogiri backed version and test speed and stuff. This thing hasn't been benchmarked yet.
|
40
|
+
|
41
|
+
h2. LICENSE
|
42
|
+
|
43
|
+
(The MIT License)
|
44
|
+
|
45
|
+
Copyright (c) 2009:
|
46
|
+
|
47
|
+
"Paul Dix":http://pauldix.net
|
48
|
+
|
49
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
50
|
+
a copy of this software and associated documentation files (the
|
51
|
+
'Software'), to deal in the Software without restriction, including
|
52
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
53
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
54
|
+
permit persons to whom the Software is furnished to do so, subject to
|
55
|
+
the following conditions:
|
56
|
+
|
57
|
+
The above copyright notice and this permission notice shall be
|
58
|
+
included in all copies or substantial portions of the Software.
|
59
|
+
|
60
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
61
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
62
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
63
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
64
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
65
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
66
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/truffle-hog.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# TruffleHog finds RSS and Atom feed URLs in a string of HTML.
#
# It works purely with regular expressions: it does not fetch pages and does
# not use an HTML or XML parser.
module TruffleHog
  VERSION = "0.0.1"

  # Returns the feed URLs advertised by +html+.
  #
  # html  - String of HTML to search (nil is treated as an empty document).
  # favor - :all  returns RSS URLs followed by Atom URLs, de-duplicated;
  #         :rss  returns the RSS URLs, or the Atom URLs if no RSS was found;
  #         :atom returns the Atom URLs, or the RSS URLs if no Atom was found.
  #
  # Within each feed type, URLs from <a> tags come before URLs from <link>
  # tags.
  #
  # Returns an Array of URL Strings (possibly empty).
  # Raises ArgumentError if +favor+ is not one of the values above, instead of
  # silently returning nil.
  def self.parse_feed_urls(html, favor = :all)
    rss_links  = links_for(html, "rss")
    atom_links = links_for(html, "atom")

    case favor
    when :all
      (rss_links + atom_links).uniq
    when :rss
      rss_links.empty? ? atom_links : rss_links
    when :atom
      atom_links.empty? ? rss_links : atom_links
    else
      raise ArgumentError, "favor must be :all, :rss or :atom (got #{favor.inspect})"
    end
  end

  # Finds every +tag+ element in +html+ whose type attribute is
  # "application/<type>+xml" and collects its href.
  #
  # html - String of HTML to search (nil is treated as an empty document).
  # tag  - tag name to look for, e.g. "a" or "link".
  # type - feed flavour used to build the MIME type, e.g. "rss" or "atom".
  #
  # Each element is examined on its own, so the href and type attributes may
  # appear in either order, a match can never span two tags, and tags that are
  # split over several lines are still found.
  #
  # Returns an Array of one-element Arrays ([[url], ...]) in document order,
  # the same shape String#scan produces for a single capture group.
  def self.scan_for_tag(html, tag, type)
    # \b keeps "a" from matching <abbr>, <area>, ...; [^>]* keeps the match
    # inside a single tag (and, unlike ".", also matches newlines).
    element_pattern = /<#{tag}\b[^>]*>/i
    type_pattern    = /[\s"']type\s*=\s*['"]application\/#{type}\+xml['"]/i
    # Separate alternatives per quote style so a URL may contain the other
    # kind of quote character.
    href_pattern    = /[\s"']href\s*=\s*(?:"([^"]*)"|'([^']*)')/i

    html.to_s.scan(element_pattern).each_with_object([]) do |element, found|
      next unless element =~ type_pattern

      url = element[href_pattern, 1] || element[href_pattern, 2]
      found << [url] if url
    end
  end

  # Collects the unique URLs of one feed type from <a> tags, then <link> tags.
  def self.links_for(html, type)
    (scan_for_tag(html, "a", type) + scan_for_tag(html, "link", type)).flatten.uniq
  end
  private_class_method :links_for
end
|
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# Shared setup for the TruffleHog specs: loads RSpec, optionally loads
# redgreen, and puts the project's lib/ directory on the load path before
# requiring the library.
require "rubygems"
require "spec"

# gem install redgreen for colored test output
begin
  # Skipped when TM_CURRENT_LINE is set (presumably by TextMate - confirm).
  require "redgreen" unless ENV['TM_CURRENT_LINE']
rescue LoadError
  # redgreen is optional; carry on without it when it is not installed.
end

path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
$LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)

# Required relative to lib/, which was just put on the load path, so the specs
# load regardless of the current working directory.
require "truffle-hog"
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'

# Specs for TruffleHog.parse_feed_urls. Most examples run against the
# pauldix_net.html fixture that sits next to this file; the two fallback
# examples use small inline documents so that only one feed type is present.
describe "parsing html" do
  before(:all) do
    @html = File.read(File.dirname(__FILE__) + "/pauldix_net.html")
  end

  it "parses all feed urls" do
    TruffleHog.parse_feed_urls(@html).should == ["http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss",
      "http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml",
      "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom",
      "http://www.pauldix.net/in_head/atom.xml"]
  end

  it "parses rss feeds from the link tags in head" do
    feed_urls = TruffleHog.parse_feed_urls(@html, :rss)
    feed_urls.should include("http://www.pauldix.net/in_head/index.rdf")
    feed_urls.should include("http://www.pauldix.net/in_head/rss.xml")
    feed_urls.should_not include("http://www.pauldix.net/in_head/atom.xml")
    feed_urls.should_not include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom")
  end

  it "parses atom feeds from the link tags in head" do
    feed_urls = TruffleHog.parse_feed_urls(@html, :atom)
    feed_urls.should include("http://www.pauldix.net/in_head/atom.xml")
    feed_urls.should_not include("http://www.pauldix.net/in_head/index.rdf")
    feed_urls.should_not include("http://www.pauldix.net/in_head/rss.xml")
  end

  it "parses rss feeds from the body" do
    feed_urls = TruffleHog.parse_feed_urls(@html, :rss)
    feed_urls.should include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss")
    feed_urls.should_not include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom")
  end

  it "parses atom feeds from the body" do
    feed_urls = TruffleHog.parse_feed_urls(@html, :atom)
    feed_urls.should include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom")
    feed_urls.should_not include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss")
  end

  it "returns atom feeds if rss is favored, but none are found" do
    # Inline document that advertises an Atom feed only.
    atom_only = '<link rel="alternate" type="application/atom+xml" href="http://example.com/atom.xml" />'
    TruffleHog.parse_feed_urls(atom_only, :rss).should == ["http://example.com/atom.xml"]
  end

  it "returns rss feeds if atom is favored, but none are found" do
    # Inline document that advertises an RSS feed only.
    rss_only = '<link rel="alternate" type="application/rss+xml" href="http://example.com/rss.xml" />'
    TruffleHog.parse_feed_urls(rss_only, :atom).should == ["http://example.com/rss.xml"]
  end
end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: truffle-hog
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Paul Dix
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-09-21 00:00:00 -04:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email: paul@pauldix.net
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- lib/truffle-hog.rb
|
26
|
+
- README.textile
|
27
|
+
- spec/spec.opts
|
28
|
+
- spec/spec_helper.rb
|
29
|
+
- spec/truffle-hog_spec.rb
|
30
|
+
has_rdoc: true
|
31
|
+
homepage: http://github.com/pauldix/truffle-hog
|
32
|
+
licenses: []
|
33
|
+
|
34
|
+
post_install_message:
|
35
|
+
rdoc_options: []
|
36
|
+
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
version:
|
51
|
+
requirements: []
|
52
|
+
|
53
|
+
rubyforge_project:
|
54
|
+
rubygems_version: 1.3.5
|
55
|
+
signing_key:
|
56
|
+
specification_version: 2
|
57
|
+
summary: Finds RSS and Atom feed urls in html like a hog finds truffles. Tasty, delicious feeds... er, truffles.
|
58
|
+
test_files: []
|
59
|
+
|