feedalizer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ This changelog only mentions what you, as a user of Feedalizer,
2
+ might want to know. See the Subversion log for more.
3
+
4
+ [ 2006-08-23 20:23 ]
5
+
6
+ Added optional limit parameter to scrape_items.
7
+ Thanks to Thanh Vinh Tang for the hint that it is needed.
data/COPYING ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2006 Christoffer Sawicki <christoffer.sawicki@gmail.com>
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to
5
+ deal in the Software without restriction, including without limitation the
6
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7
+ sell copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
16
+ THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,37 @@
1
+ ___ _ _ _
2
+ | __|__ ___ __| |__ _| (_)______ _ _
3
+ | _/ -_) -_) _` / _` | | |_ / -_) '_|
4
+ |_|\___\___\__,_\__,_|_|_/__\___|_|
5
+
6
+ Project Website:
7
+ http://termos.vemod.net/feedalizer
8
+
9
+ = Dependencies
10
+
11
+ * Hpricot
12
+ http://code.whytheluckystiff.net/hpricot/
13
+ % gem install hpricot
14
+
15
+ = Documentation
16
+
17
+ First of all, see the included examples. Then read some Hpricot
18
+ documentation (http://code.whytheluckystiff.net/hpricot/).
19
+
20
+ (Yes, a tutorial would be nice.)
21
+
22
+ = Copyright
23
+
24
+ Copyright (c) Christoffer Sawicki <christoffer.sawicki@gmail.com> 2006
25
+
26
+ This program is free software; you can redistribute it and/or modify
27
+ it under the terms of the GNU General Public License version 2 as
28
+ published by the Free Software Foundation.
29
+
30
+ This program is distributed in the hope that it will be useful,
31
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
32
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33
+ GNU General Public License for more details.
34
+
35
+ You should have received a copy of the GNU General Public License
36
+ along with this program; if not, write to the Free Software
37
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
data/TODO ADDED
@@ -0,0 +1,7 @@
1
+ * Add more examples
2
+ * Write some documentation (tutorial?)
3
+ * Add note about CGI
4
+ * Cache the retrieved HTML during script development?
5
+ * Write unit test(s) for debug mode.
6
+ * Add support for RSS 2.0
7
+ * Add timeout control (a bit tricky to do nicely)
@@ -0,0 +1,6 @@
1
#!/usr/bin/env ruby
# Simple wrapper around Ruby: loads Feedalizer and then runs the script given
# as file argument(s) or on standard input, so that script does not need its
# own require line.

require "feedalizer"

# NOTE(review): eval executes the script with this process's full privileges;
# only run scripts you trust. The binding and file name are passed explicitly
# so that __FILE__ and error backtraces refer to the script being run instead
# of "(eval)".
eval(ARGF.read, binding, ARGF.filename)
@@ -0,0 +1,45 @@
1
#!/usr/bin/env ruby

# This script was contributed by Chu Yeow Cheah, thanks!
#
# Turns the Mongrel news page into a feed: every <h2> becomes one item whose
# description is made of the elements that follow it, up to the next <h2>.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')

require 'feedalizer'
require 'time'

url = 'http://mongrel.rubyforge.org/news.html'

feedalize(url) do
  feed.title = 'Mongrel News'
  feed.about = '...'
  feed.description = 'Latest Mongrel news from the official site'

  scrape_items('h2') do |rss_item, html_element|
    siblings = html_element.parent.containers
    html_element_index = siblings.index(html_element)

    # The date is before the first ':'. Split only once so that a title which
    # itself contains a ':' is kept whole instead of being cut short.
    date, title = html_element.innerHTML.strip.split(':', 2)
    date = Time.parse(date.sub(/-/, ' '))
    title = title.strip

    # Grab siblings after the <h2>, stopping at the next <h2>, and put them
    # into the description.
    following = siblings[(html_element_index + 1)..-1]
    body_elements = following.take_while { |sibling| sibling.stag.name != 'h2' }
    description = body_elements.map { |sibling| sibling.to_s }.join

    rss_item.link = url
    rss_item.title = title
    rss_item.date = date
    rss_item.description = description
  end

  output!
end
@@ -0,0 +1,26 @@
1
#!/usr/bin/env ruby

# This script was contributed by Christian Neukirchen, thanks!
#
# Turns the list items of the Pearl Jam Vault news page into feed items.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")

require "feedalizer"
require "time"

feedalize("http://pjvault.com/news.html") do
  feed.title = "Pearl Jam Vault"
  feed.about = "..."
  feed.description = "Pearl Jam News and Reviews"

  scrape_items("li") do |rss_item, html_element|
    # The first anchor inside the <li> supplies both the link and the title.
    anchor = html_element.search("a").first
    target = anchor.attributes["href"]

    rss_item.link = target
    # The date is parsed out of the whole <li> markup by Time.parse.
    rss_item.date = Time.parse(html_element.to_s)
    rss_item.title = anchor.children.last.to_s

    rss_item.description = html_element.inner_html
  end

  output!
end
@@ -0,0 +1,25 @@
1
#!/usr/bin/env ruby

# Builds a feed of Nemi comic strips from the <option> elements found on the
# Sydsvenskan page below.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")

require "feedalizer"
require "time"

url = "http://sydsvenskan.se/serier/nemi/article101047.ece?context=serie"

feedalize(url) do
  feed.title = "Nemi"
  feed.about = "..."
  feed.description = "Daily Nemi strip scraped from Sydsvenskan"

  scrape_items("option") do |rss_item, html_element|
    # Each <option> carries the strip's page address in its value attribute
    # and a date as its text.
    rss_item.link = html_element.attributes["value"]
    rss_item.date = Time.parse(html_element.inner_html)
    rss_item.title = rss_item.date.strftime("%Y-%m-%d")

    # This grabs the page for a particular strip and extracts the relevant
    # img element.
    # NOTE(review): the description is assigned the result of #search rather
    # than a String; presumably it is stringified when the feed is rendered -
    # confirm.
    strip_page = grab_page(rss_item.link)
    rss_item.description = strip_page.search("//img[@width=748]")
  end

  output!
end
@@ -0,0 +1,60 @@
1
+ require "rss/maker"
2
+ require "open-uri"
3
+ require "hpricot"
4
+
5
+ class Feedalizer
6
+ VERSION = "0.1.0"
7
+
8
+ attr_reader :source
9
+
10
+ def initialize(url, &block)
11
+ @source = grab_page(url)
12
+ @rss = RSS::Maker::RSS10.new
13
+
14
+ feed.generator = "Feedalizer (http://termos.vemod.net/feedalizer)"
15
+ feed.link = url
16
+
17
+ instance_eval(&block) if block_given?
18
+
19
+ debug! if $DEBUG
20
+ end
21
+
22
+ def feed
23
+ @rss.channel
24
+ end
25
+
26
+ def scrape_items(hpricot_query, limit = 15)
27
+ elements = @source.search(hpricot_query)
28
+
29
+ elements.first(limit).each do |element|
30
+ yield @rss.items.new_item, element
31
+ end
32
+ end
33
+
34
+ def grab_page(url)
35
+ open(url) { |io| Hpricot(io) }
36
+ end
37
+
38
+ # For backwards-compatibility
39
+ alias_method :parse_source, :grab_page
40
+
41
+ def output
42
+ @rss.to_rss.to_s unless $DEBUG
43
+ end
44
+
45
+ def output!(target = STDOUT)
46
+ target << output
47
+ end
48
+
49
+ def debug!
50
+ @rss.items.each do |item|
51
+ STDERR.puts [ item.title, item.date, item.link ].join("\n")
52
+ STDERR.puts
53
+ end
54
+ end
55
+ end
56
+
57
# Shortcut for Feedalizer.new: hands +url+ and the given block straight to
# the constructor and returns the new Feedalizer.
def feedalize(url, &block)
  Feedalizer.new(url, &block)
end
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
4
+
5
+ require "test/unit"
6
+ require "feedalizer"
7
+
8
# Unit tests for Feedalizer. Every test works on a Feedalizer built from the
# local fixture test.html that sits next to this file.
class TestFeedalizer < Test::Unit::TestCase
  TEST_FILE = File.join(File.dirname(__FILE__), "test.html")

  def setup
    @feedalizer = Feedalizer.new(TEST_FILE)
  end

  # A new instance exposes an RSS 1.0 channel and the parsed document.
  def test_construction
    assert_kind_of RSS::Maker::RSS10::Channel, @feedalizer.feed
    assert_kind_of Hpricot::Doc, @feedalizer.source
  end

  # The constructor block runs with the new Feedalizer as self.
  def test_block
    receiver = nil

    begin
      Feedalizer.new(TEST_FILE) { receiver = self }
    rescue StandardError
      nil
    end

    assert_kind_of Feedalizer, receiver
  end

  # The feed link defaults to the address the page was loaded from.
  def test_default_feed_link
    assert_equal TEST_FILE, @feedalizer.feed.link
  end

  def test_source
    title = @feedalizer.source.search("html/head/title").text

    assert_equal "Test", title
  end

  # Each match is yielded together with a fresh feed item.
  def test_scrape_items
    yielded = []

    @feedalizer.scrape_items("div.item") do |item, element|
      yielded << element

      assert_kind_of RSS::Maker::RSS10::Items::Item, item
      assert_kind_of Hpricot::Elem, element
    end

    assert_equal 2, yielded.size
  end

  # The optional second argument caps the number of yielded matches.
  def test_scrape_items_limit
    yielded = []

    @feedalizer.scrape_items("div.item", 1) do |item, element|
      yielded << element
    end

    assert_equal 1, yielded.size
  end

  def test_grab_page
    page = @feedalizer.grab_page(TEST_FILE)

    assert_kind_of Hpricot::Doc, page
  end

  # With the channel fields filled in, the feed renders as RSS 1.0 markup.
  def test_output
    channel = @feedalizer.feed
    placeholder = "..."
    channel.description = placeholder
    channel.title = placeholder
    channel.about = placeholder

    rendered = @feedalizer.output

    assert rendered.include?('<rdf:RDF xmlns="http://purl.org/rss/1.0/"')
    assert rendered.size > 600
  end
end
@@ -0,0 +1,19 @@
1
+ <html>
2
+ <head>
3
+ <title>Test</title>
4
+ </head>
5
+
6
+ <body>
7
+ <h1>News</h1>
8
+
9
+ <div class="item">
10
+ <h2>Foo</h2>
11
+ <p>Foo</p>
12
+ </div>
13
+
14
+ <div class="item">
15
+ <h2>Bar</h2>
16
+ <p>Bar</p>
17
+ </div>
18
+ </body>
19
+ </html>
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.11
3
+ specification_version: 1
4
+ name: feedalizer
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-10-11 00:00:00 +02:00
8
+ summary: Transforms web pages into RSS feeds
9
+ require_paths:
10
+ - lib
11
+ email: christoffer.sawicki@gmail.com
12
+ homepage: http://termos.vemod.net/feedalizer
13
+ rubyforge_project: feedalizer
14
+ description: Feedalizer glues together Hpricot with Ruby's RSS library in a way that makes it dead easy to transform web pages into RSS feeds.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ authors:
29
+ - Christoffer Sawicki
30
+ files:
31
+ - CHANGELOG
32
+ - lib/feedalizer.rb
33
+ - tests/test.html
34
+ - tests/tc_feedalizer.rb
35
+ - TODO
36
+ - COPYING
37
+ - README
38
+ - examples
39
+ - examples/mongrel-news.rb
40
+ - examples/pjvault.rb
41
+ - examples/sydsvenskan-nemi.rb
42
+ - bin/feedalizer
43
+ test_files: []
44
+
45
+ rdoc_options: []
46
+
47
+ extra_rdoc_files: []
48
+
49
+ executables:
50
+ - feedalizer
51
+ extensions: []
52
+
53
+ requirements: []
54
+
55
+ dependencies:
56
+ - !ruby/object:Gem::Dependency
57
+ name: hpricot
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Version::Requirement
60
+ requirements:
61
+ - - ">"
62
+ - !ruby/object:Gem::Version
63
+ version: 0.0.0
64
+ version: