truffle-hog 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,66 @@
1
+ h1. TruffleHog
2
+
3
+ "http://github.com/pauldix/truffle-hog":http://github.com/pauldix/truffle-hog
4
+
5
+ If you have issues, please use the "Feedzirra group":http://groups.google.com/group/feedzirra
6
+
7
+ h2. Summary
8
+
9
+ Finds RSS and Atom feed urls in html like a hog finds truffles. Tasty, delicious feeds... er, truffles.
10
+
11
+ h2. Description
12
+
13
+ This is a simple library. It doesn't GET the HTML for you. It doesn't use an HTML or XML parser. It just uses a little regex to find feed URLs. If you want to GET the web pages, use "Typhoeus":http://github.com/pauldix/typhoeus/. If you want to fetch and parse the feeds, use "Feedzirra":http://github.com/pauldix/feedzirra.
14
+
15
+ h2. Installation
16
+
17
+ <pre>
18
+ gem install truffle-hog --source http://gemcutter.org
19
+ </pre>
20
+
21
+ h2. Use
22
+
23
+ <pre>
24
+ require 'rubygems'
25
+ require 'truffle-hog'
26
+
27
+ # get atom and rss
28
+ feed_urls = TruffleHog.parse_feed_urls(some_html)
29
+
30
+ # get atom if available, otherwise rss
31
+ feed_urls = TruffleHog.parse_feed_urls(some_html, :atom)
32
+
33
+ # get rss if available, otherwise atom
34
+ feed_urls = TruffleHog.parse_feed_urls(some_html, :rss)
35
+ </pre>
36
+
37
+ h2. Next
38
+
39
+ I may want to make a fun Nokogiri-backed version and compare its speed. This thing hasn't been benchmarked yet.
40
+
41
+ h2. LICENSE
42
+
43
+ (The MIT License)
44
+
45
+ Copyright (c) 2009:
46
+
47
+ "Paul Dix":http://pauldix.net
48
+
49
+ Permission is hereby granted, free of charge, to any person obtaining
50
+ a copy of this software and associated documentation files (the
51
+ 'Software'), to deal in the Software without restriction, including
52
+ without limitation the rights to use, copy, modify, merge, publish,
53
+ distribute, sublicense, and/or sell copies of the Software, and to
54
+ permit persons to whom the Software is furnished to do so, subject to
55
+ the following conditions:
56
+
57
+ The above copyright notice and this permission notice shall be
58
+ included in all copies or substantial portions of the Software.
59
+
60
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
61
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
62
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
63
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
64
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
65
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
66
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,27 @@
1
# Finds RSS and Atom feed URLs in an HTML string using regular expressions.
# No HTML/XML parser is involved and nothing is fetched over the network.
module TruffleHog
  VERSION = "0.0.1"

  # Returns the feed URLs advertised by <a> and <link> tags in +html+.
  #
  # html  - String of HTML to search.
  # favor - :all  (default) returns RSS URLs followed by Atom URLs;
  #         :rss  returns RSS URLs, or Atom URLs when no RSS URL is found;
  #         :atom returns Atom URLs, or RSS URLs when no Atom URL is found.
  #
  # Within each feed type, URLs from <a> tags come before URLs from <link>
  # tags, and duplicates are removed. Returns an Array of Strings, or nil
  # when +favor+ is none of the three symbols above.
  def self.parse_feed_urls(html, favor = :all)
    rss_links  = (scan_for_tag(html, "a", "rss") + scan_for_tag(html, "link", "rss")).flatten.uniq
    atom_links = (scan_for_tag(html, "a", "atom") + scan_for_tag(html, "link", "atom")).flatten.uniq

    case favor
    when :all
      (rss_links + atom_links).uniq
    when :rss
      rss_links.empty? ? atom_links : rss_links
    when :atom
      atom_links.empty? ? rss_links : atom_links
    end
  end

  # Collects the href of every +tag+ element whose type attribute is
  # "application/<type>+xml".
  #
  # html - String of HTML to search.
  # tag  - tag name to look for, e.g. "a" or "link" (matched literally,
  #        case-insensitively).
  # type - feed flavour, e.g. "rss" or "atom" (matched literally,
  #        case-insensitively).
  #
  # Returns an Array of one-element Arrays ([[url], ...]) in document order,
  # the same shape String#scan yields for a single capture group.
  def self.scan_for_tag(html, tag, type)
    # Each opening tag is examined on its own. [^>]* keeps a match inside one
    # tag (and lets it span newlines), and the \s after the tag name stops
    # "a" from matching <abbr>, <area>, etc.
    opening_tag = /<#{Regexp.escape(tag)}\s[^>]*>/i
    type_attr   = /\stype\s*=\s*['"]application\/#{Regexp.escape(type)}\+xml['"]/i
    href_attr   = /\shref\s*=\s*(?:"([^"]*)"|'([^']*)')/i

    # Attribute order inside the tag does not matter: both attributes are
    # looked up independently within the same element.
    html.scan(opening_tag).map do |element|
      next unless element =~ type_attr

      href = href_attr.match(element)
      [href[1] || href[2]] if href
    end.compact
  end
end
@@ -0,0 +1,2 @@
1
+ --diff
2
+ --color
@@ -0,0 +1,10 @@
1
# Shared setup for the specs: loads the spec framework and makes the
# library under ../lib requirable.
require "rubygems"
require "spec"

# gem install redgreen for colored test output
# (skipped when TM_CURRENT_LINE is set -- presumably when run from TextMate)
begin
  require "redgreen" unless ENV['TM_CURRENT_LINE']
rescue LoadError
  # redgreen is optional; carry on without colored output.
end

path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
$LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)

# lib/ itself is on the load path, so require the file by its bare name.
# The previous "lib/truffle-hog" only resolved when the current directory
# happened to be the project root and was on the load path.
require "truffle-hog"
@@ -0,0 +1,44 @@
1
require File.dirname(__FILE__) + '/spec_helper'

# Exercises TruffleHog.parse_feed_urls against the pauldix_net.html fixture,
# plus small inline documents for the favor-fallback cases.
describe "parsing html" do
  before(:all) do
    @html = File.read(File.dirname(__FILE__) + "/pauldix_net.html")
  end

  it "parses all feed urls" do
    TruffleHog.parse_feed_urls(@html).should == ["http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss",
      "http://www.pauldix.net/in_head/index.rdf", "http://www.pauldix.net/in_head/rss.xml",
      "http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom",
      "http://www.pauldix.net/in_head/atom.xml"]
  end

  it "parses rss feeds from the link tags in head" do
    feed_urls = TruffleHog.parse_feed_urls(@html, :rss)
    feed_urls.should include("http://www.pauldix.net/in_head/index.rdf")
    feed_urls.should include("http://www.pauldix.net/in_head/rss.xml")
    feed_urls.should_not include("http://www.pauldix.net/in_head/atom.xml")
    feed_urls.should_not include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom")
  end

  it "parses atom feeds from the link tags in head" do
    feed_urls = TruffleHog.parse_feed_urls(@html, :atom)
    feed_urls.should include("http://www.pauldix.net/in_head/atom.xml")
    feed_urls.should_not include("http://www.pauldix.net/in_head/index.rdf")
    feed_urls.should_not include("http://www.pauldix.net/in_head/rss.xml")
  end

  it "parses rss feeds from the body" do
    feed_urls = TruffleHog.parse_feed_urls(@html, :rss)
    feed_urls.should include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss")
    feed_urls.should_not include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom")
  end

  it "parses atom feeds from the body" do
    feed_urls = TruffleHog.parse_feed_urls(@html, :atom)
    feed_urls.should include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/atom")
    feed_urls.should_not include("http://feeds.feedburner.com/PaulDixExplainsNothing/in_body/rss")
  end

  # The fallback cases use inline HTML because the fixture contains both
  # feed types, so neither fallback branch is reachable with it.
  it "returns atom feeds if rss is favored, but none are found" do
    atom_only = '<link rel="alternate" type="application/atom+xml" href="http://example.com/atom.xml" />'
    TruffleHog.parse_feed_urls(atom_only, :rss).should == ["http://example.com/atom.xml"]
  end

  it "returns rss feeds if atom is favored, but none are found" do
    rss_only = '<link rel="alternate" type="application/rss+xml" href="http://example.com/rss.xml" />'
    TruffleHog.parse_feed_urls(rss_only, :atom).should == ["http://example.com/rss.xml"]
  end
end
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: truffle-hog
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Paul Dix
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-09-21 00:00:00 -04:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: paul@pauldix.net
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/truffle-hog.rb
26
+ - README.textile
27
+ - spec/spec.opts
28
+ - spec/spec_helper.rb
29
+ - spec/truffle-hog_spec.rb
30
+ has_rdoc: true
31
+ homepage: http://github.com/pauldix/truffle-hog
32
+ licenses: []
33
+
34
+ post_install_message:
35
+ rdoc_options: []
36
+
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: "0"
50
+ version:
51
+ requirements: []
52
+
53
+ rubyforge_project:
54
+ rubygems_version: 1.3.5
55
+ signing_key:
56
+ specification_version: 2
57
+ summary: Finds RSS and Atom feed urls in html like a hog finds truffles. Tasty, delicious feeds... er, truffles.
58
+ test_files: []
59
+