feedalizer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ This changelog only mentions what you, as a user of Feedalizer,
2
+ might want to know. See the Subversion log for more.
3
+
4
+ [ 2006-08-23 20:23 ]
5
+
6
+ Added optional limit parameter to scrape_items.
7
+ Thanks to Thanh Vinh Tang for the hint that it is needed.
data/COPYING ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2006 Christoffer Sawicki <christoffer.sawicki@gmail.com>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16
+ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,37 @@
1
+ ___ _ _ _
2
+ | __|__ ___ __| |__ _| (_)______ _ _
3
+ | _/ -_) -_) _` / _` | | |_ / -_) '_|
4
+ |_|\___\___\__,_\__,_|_|_/__\___|_|
5
+
6
+ Project Website:
7
+ http://termos.vemod.net/feedalizer
8
+
9
+ = Dependencies
10
+
11
+ * Hpricot
12
+ http://code.whytheluckystiff.net/hpricot/
13
+ % gem install hpricot
14
+
15
+ = Documentation
16
+
17
+ First of all, see the included examples. Then read some Hpricot
18
+ documentation (http://code.whytheluckystiff.net/hpricot/).
19
+
20
+ (Yes, a tutorial would be nice.)
21
+
22
+ = Copyright
23
+
24
+ Copyright (c) Christoffer Sawicki <christoffer.sawicki@gmail.com> 2006
25
+
26
+ This program is free software; you can redistribute it and/or modify
27
+ it under the terms of the GNU General Public License version 2 as
28
+ published by the Free Software Foundation.
29
+
30
+ This program is distributed in the hope that it will be useful,
31
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
32
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33
+ GNU General Public License for more details.
34
+
35
+ You should have received a copy of the GNU General Public License
36
+ along with this program; if not, write to the Free Software
37
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
data/TODO ADDED
@@ -0,0 +1,7 @@
1
+ * Add more examples
2
+ * Write some documentation (tutorial?)
3
+ * Add note about CGI
4
+ * Cache the retrieved HTML during script development?
5
+ * Write unit test(s) for debug mode.
6
+ * Add support for RSS 2.0
7
+ * Add timeout control (a bit tricky to do nicely)
@@ -0,0 +1,6 @@
1
#!/usr/bin/env ruby
# Thin launcher: makes Feedalizer available, then runs the Ruby script that
# ARGF supplies (files named on the command line, or standard input).
#
# NOTE(review): the script text is handed to eval verbatim, so only run
# scripts you trust.

require "feedalizer"

eval ARGF.read
@@ -0,0 +1,45 @@
1
#!/usr/bin/env ruby

# This script was contributed by Chu Yeow Cheah, thanks!
#
# Turns the Mongrel news page into a feed: every <h2> becomes one item whose
# description is the markup following the heading, up to the next <h2>.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')

require 'feedalizer'
require 'time'

url = 'http://mongrel.rubyforge.org/news.html'

feedalize(url) do

  feed.title = 'Mongrel News'
  feed.about = '...'
  feed.description = 'Latest Mongrel news from the official site'

  scrape_items('h2') do |rss_item, html_element|

    siblings = html_element.parent.containers
    html_element_index = siblings.index(html_element)

    # The date is everything before the first ':'. Limiting the split to two
    # parts keeps any further colons as part of the title instead of
    # silently dropping the text after them.
    date, title = html_element.innerHTML.strip.split(':', 2)
    date = Time.parse(date.sub(/-/, ' '))
    title = title.to_s.strip

    # Grab siblings after the <h2>, stopping at the next <h2>, and put them
    # into the description.
    parts = []
    (html_element_index + 1...siblings.size).each do |i|
      next_sibling = siblings[i]
      break if 'h2' == next_sibling.stag.name

      parts << next_sibling.to_s
    end
    description = parts.join

    rss_item.link = url
    rss_item.title = title
    rss_item.date = date
    rss_item.description = description
  end

  output!
end
@@ -0,0 +1,26 @@
1
#!/usr/bin/env ruby

# This script was contributed by Christian Neukirchen, thanks!
#
# Builds a feed from the <li> entries of the Pearl Jam Vault news page. The
# first <a> inside each entry supplies the item's link and title.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")

require "feedalizer"
require "time"

feedalize("http://pjvault.com/news.html") do
  feed.title = "Pearl Jam Vault"
  feed.about = "..."
  feed.description = "Pearl Jam News and Reviews"

  scrape_items("li") do |entry, list_item|
    anchor = list_item.search("a").first

    entry.link = anchor.attributes["href"]
    # Time.parse is given the entry's full markup and picks the date out of it.
    entry.date = Time.parse(list_item.to_s)
    entry.title = anchor.children.last.to_s

    entry.description = list_item.inner_html
  end

  output!
end
@@ -0,0 +1,25 @@
1
#!/usr/bin/env ruby

# Builds a feed of Nemi strips: each <option> on the index page carries a
# strip's URL in its "value" attribute and a date as its text.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")

require "feedalizer"
require "time"

url = "http://sydsvenskan.se/serier/nemi/article101047.ece?context=serie"

feedalize(url) do
  feed.title = "Nemi"
  feed.about = "..."
  feed.description = "Daily Nemi strip scraped from Sydsvenskan"

  scrape_items("option") do |entry, option|
    entry.link = option.attributes["value"]
    entry.date = Time.parse(option.inner_html)
    entry.title = entry.date.strftime("%Y-%m-%d")

    # Fetch the page for this particular strip and keep only the relevant
    # img element as the item's description.
    strip_page = grab_page(entry.link)
    entry.description = strip_page.search("//img[@width=748]")
  end

  output!
end
@@ -0,0 +1,60 @@
1
+ require "rss/maker"
2
+ require "open-uri"
3
+ require "hpricot"
4
+
5
# Scrapes an HTML page with Hpricot and assembles an RSS 1.0 feed from the
# scraped elements using RSS::Maker.
class Feedalizer
  VERSION = "0.1.0"

  # Matches strings that begin with a URI scheme followed by "://".
  URL_PATTERN = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}

  # The parsed page, as returned by grab_page for the URL given to new.
  attr_reader :source

  # Fetches and parses +url+, sets up the RSS 1.0 maker with the channel's
  # generator and link pre-filled, and then evaluates the optional block in
  # the context of the new instance (so +feed+, +scrape_items+, +output!+
  # etc. can be called without a receiver).
  #
  # When $DEBUG is set, the scraped items are dumped via debug! afterwards.
  def initialize(url, &block)
    @source = grab_page(url)
    @rss = RSS::Maker::RSS10.new

    feed.generator = "Feedalizer (http://termos.vemod.net/feedalizer)"
    feed.link = url

    instance_eval(&block) if block_given?

    debug! if $DEBUG
  end

  # The channel of the feed being built; set title, about, description
  # and so on here.
  def feed
    @rss.channel
  end

  # Searches the source page with +hpricot_query+ and yields a freshly
  # created feed item together with each matching element.
  #
  # At most +limit+ elements are processed; pass nil to process all matches.
  def scrape_items(hpricot_query, limit = 15)
    elements = @source.search(hpricot_query)
    elements = elements.first(limit) if limit

    elements.each do |element|
      yield @rss.items.new_item, element
    end
  end

  # Opens +url+ and returns it parsed by Hpricot.
  #
  # Strings that look like URLs and whose parsed URI can be opened (open-uri
  # adds #open to HTTP(S)/FTP URIs) are fetched through URI#open; anything
  # else is treated as a local file path. Kernel#open is deliberately
  # avoided: it spawns a subprocess for strings starting with "|" and no
  # longer opens URLs on Ruby 3.0+.
  def grab_page(url)
    location = url.to_s
    uri = URI.parse(location) if location =~ URL_PATTERN

    if uri.respond_to?(:open)
      uri.open { |io| Hpricot(io) }
    else
      File.open(location) { |io| Hpricot(io) }
    end
  end

  # For backwards-compatibility
  alias_method :parse_source, :grab_page

  # Returns the feed serialized as a String, or nil when $DEBUG is set.
  def output
    @rss.to_rss.to_s unless $DEBUG
  end

  # Appends the serialized feed to +target+ (STDOUT by default) using <<.
  def output!(target = STDOUT)
    target << output
  end

  # Prints title, date and link of every item to STDERR, one per line, with
  # a blank line after each item.
  def debug!
    @rss.items.each do |item|
      STDERR.puts [ item.title, item.date, item.link ].join("\n")
      STDERR.puts
    end
  end
end

# A handy wrapper for Feedalizer.new :-)
def feedalize(url, &block)
  Feedalizer.new(url, &block)
end
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
4
+
5
+ require "test/unit"
6
+ require "feedalizer"
7
+
8
# Unit tests for Feedalizer, run against the local test.html fixture.
class TestFeedalizer < Test::Unit::TestCase
  TEST_FILE = File.join(File.dirname(__FILE__), "test.html")

  # Every test starts from a Feedalizer built on the fixture page.
  def setup
    @feedalizer = Feedalizer.new(TEST_FILE)
  end

  def test_construction
    assert_kind_of RSS::Maker::RSS10::Channel, @feedalizer.feed
    assert_kind_of Hpricot::Doc, @feedalizer.source
  end

  # The block given to new must be evaluated with the instance as self.
  def test_block
    captured = nil

    begin
      Feedalizer.new(TEST_FILE) { captured = self }
    rescue StandardError
      nil
    end

    assert_kind_of Feedalizer, captured
  end

  def test_default_feed_link
    assert_equal TEST_FILE, @feedalizer.feed.link
  end

  def test_source
    title = @feedalizer.source.search("html/head/title")

    assert_equal "Test", title.text
  end

  # The fixture contains two div.item elements.
  def test_scrape_items
    matched = []

    @feedalizer.scrape_items("div.item") do |item, element|
      matched.push(element)

      assert_kind_of RSS::Maker::RSS10::Items::Item, item
      assert_kind_of Hpricot::Elem, element
    end

    assert_equal 2, matched.length
  end

  def test_scrape_items_limit
    matched = []

    @feedalizer.scrape_items("div.item", 1) do |item, element|
      matched.push(element)
    end

    assert_equal 1, matched.length
  end

  def test_grab_page
    assert_kind_of Hpricot::Doc, @feedalizer.grab_page(TEST_FILE)
  end

  # Fills in the channel fields, then checks the serialized feed.
  def test_output
    channel = @feedalizer.feed
    channel.description = "..."
    channel.title = channel.description
    channel.about = channel.title

    xml = @feedalizer.output

    assert xml.include?('<rdf:RDF xmlns="http://purl.org/rss/1.0/"')
    assert xml.size > 600
  end
end
@@ -0,0 +1,19 @@
1
+ <html>
2
+ <head>
3
+ <title>Test</title>
4
+ </head>
5
+
6
+ <body>
7
+ <h1>News</h1>
8
+
9
+ <div class="item">
10
+ <h2>Foo</h2>
11
+ <p>Foo</p>
12
+ </div>
13
+
14
+ <div class="item">
15
+ <h2>Bar</h2>
16
+ <p>Bar</p>
17
+ </div>
18
+ </body>
19
+ </html>
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: feedalizer
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-10-11 00:00:00 +02:00
8
+ summary: Transforms web pages into RSS feeds
9
+ require_paths:
10
+ - lib
11
+ email: christoffer.sawicki@gmail.com
12
+ homepage: http://termos.vemod.net/feedalizer
13
+ rubyforge_project: feedalizer
14
+ description: Feedalizer glues together Hpricot with Ruby's RSS library in a way that makes it dead easy to transform web pages into RSS feeds.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - Christoffer Sawicki
30
+ files:
31
+ - CHANGELOG
32
+ - lib/feedalizer.rb
33
+ - tests/test.html
34
+ - tests/tc_feedalizer.rb
35
+ - TODO
36
+ - COPYING
37
+ - README
38
+ - examples
39
+ - examples/mongrel-news.rb
40
+ - examples/pjvault.rb
41
+ - examples/sydsvenskan-nemi.rb
42
+ - bin/feedalizer
43
+ test_files: []
44
+
45
+ rdoc_options: []
46
+
47
+ extra_rdoc_files: []
48
+
49
+ executables:
50
+ - feedalizer
51
+ extensions: []
52
+
53
+ requirements: []
54
+
55
+ dependencies:
56
+ - !ruby/object:Gem::Dependency
57
+ name: hpricot
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Version::Requirement
60
+ requirements:
61
+ - - ">"
62
+ - !ruby/object:Gem::Version
63
+ version: 0.0.0
64
+ version: