feedalizer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -0
- data/COPYING +18 -0
- data/README +37 -0
- data/TODO +7 -0
- data/bin/feedalizer +6 -0
- data/examples/mongrel-news.rb +45 -0
- data/examples/pjvault.rb +26 -0
- data/examples/sydsvenskan-nemi.rb +25 -0
- data/lib/feedalizer.rb +60 -0
- data/tests/tc_feedalizer.rb +70 -0
- data/tests/test.html +19 -0
- metadata +64 -0
data/CHANGELOG
ADDED
data/COPYING
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2006 Christoffer Sawicki <christoffer.sawicki@gmail.com>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
16
|
+
THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
___ _ _ _
|
2
|
+
| __|__ ___ __| |__ _| (_)______ _ _
|
3
|
+
| _/ -_) -_) _` / _` | | |_ / -_) '_|
|
4
|
+
|_|\___\___\__,_\__,_|_|_/__\___|_|
|
5
|
+
|
6
|
+
Project Website:
|
7
|
+
http://termos.vemod.net/feedalizer
|
8
|
+
|
9
|
+
= Dependencies
|
10
|
+
|
11
|
+
* Hpricot
|
12
|
+
http://code.whytheluckystiff.net/hpricot/
|
13
|
+
% gem install hpricot
|
14
|
+
|
15
|
+
= Documentation
|
16
|
+
|
17
|
+
First of all, see the included examples. Then read some Hpricot
|
18
|
+
documentation (http://code.whytheluckystiff.net/hpricot/).
|
19
|
+
|
20
|
+
(Yes, a tutorial would be nice.)
|
21
|
+
|
22
|
+
= Copyright
|
23
|
+
|
24
|
+
Copyright (c) Christoffer Sawicki <christoffer.sawicki@gmail.com> 2006
|
25
|
+
|
26
|
+
This program is free software; you can redistribute it and/or modify
|
27
|
+
it under the terms of the GNU General Public License version 2 as
|
28
|
+
published by the Free Software Foundation.
|
29
|
+
|
30
|
+
This program is distributed in the hope that it will be useful,
|
31
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33
|
+
GNU General Public License for more details.
|
34
|
+
|
35
|
+
You should have received a copy of the GNU General Public License
|
36
|
+
along with this program; if not, write to the Free Software
|
37
|
+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
data/TODO
ADDED
data/bin/feedalizer
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby

# This script was contributed by Chu Yeow Cheah, thanks!
#
# Builds an RSS feed from the Mongrel news page: every <h2> heading becomes
# one item, and the elements that follow it (up to the next <h2>) become the
# item's description.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')

require 'feedalizer'
require 'time'

url = 'http://mongrel.rubyforge.org/news.html'

feedalize(url) do
  feed.title = 'Mongrel News'
  feed.about = '...'
  feed.description = 'Latest Mongrel news from the official site'

  scrape_items('h2') do |rss_item, html_element|
    siblings = html_element.parent.containers
    html_element_index = siblings.index(html_element)

    # The date is everything before the first ':'. Split at most once so a
    # title that itself contains a colon is kept whole instead of truncated.
    date, title = html_element.innerHTML.strip.split(':', 2)
    date = Time.parse(date.sub(/-/, ' '))
    # A heading without any ':' yields no title; fall back to an empty string
    # rather than failing on nil.
    title = title.to_s.strip

    # Grab siblings after the <h2> and put into description, stopping at the
    # next <h2> (which starts the following news entry).
    parts = []
    siblings[(html_element_index + 1)..-1].each do |next_sibling|
      break if next_sibling.stag.name == 'h2'

      parts << next_sibling.to_s
    end
    description = parts.join

    rss_item.link = url
    rss_item.title = title
    rss_item.date = date
    rss_item.description = description
  end

  output!
end
|
data/examples/pjvault.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby

# This script was contributed by Christian Neukirchen, thanks!
#
# Builds an RSS feed from the Pearl Jam Vault news page, one item per
# scraped <li> element.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")

require "feedalizer"
require "time"

feedalize("http://pjvault.com/news.html") do
  feed.title = "Pearl Jam Vault"
  feed.about = "..."
  feed.description = "Pearl Jam News and Reviews"

  scrape_items("li") do |item, entry|
    # The first anchor inside the list entry supplies both link and title.
    anchor = entry.search("a").first

    item.link = anchor.attributes["href"]
    # Time.parse is given the whole serialized element and extracts the date
    # from it.
    item.date = Time.parse(entry.to_s)
    item.title = anchor.children.last.to_s

    item.description = entry.inner_html
  end

  output!
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Builds an RSS feed of the Nemi comic strip from Sydsvenskan. Each <option>
# element on the start page yields one item: its value is the strip's link
# and its text is parsed as the strip's date.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")

require "feedalizer"
require "time"

url = "http://sydsvenskan.se/serier/nemi/article101047.ece?context=serie"

feedalize(url) do
  feed.title = "Nemi"
  feed.about = "..."
  feed.description = "Daily Nemi strip scraped from Sydsvenskan"

  scrape_items("option") do |item, option|
    item.link = option.attributes["value"]
    item.date = Time.parse(option.inner_html)
    item.title = item.date.strftime("%Y-%m-%d")

    # Fetch the page for this particular strip and keep only the relevant
    # img element as the item's description.
    strip_page = grab_page(item.link)
    item.description = strip_page.search("//img[@width=748]")
  end

  output!
end
|
data/lib/feedalizer.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require "rss/maker"
|
2
|
+
require "open-uri"
|
3
|
+
require "hpricot"
|
4
|
+
|
5
|
+
# Feedalizer glues Hpricot together with Ruby's RSS maker: it fetches a page,
# lets the caller scrape elements out of it, and serializes the result as an
# RSS 1.0 feed.
#
# The block handed to the constructor is evaluated in the context of the new
# instance, so +feed+, +scrape_items+, +grab_page+ and +output!+ can be called
# inside it without an explicit receiver.
class Feedalizer
  VERSION = "0.1.0"

  # The parsed source page, as returned by #grab_page.
  attr_reader :source

  # Fetches and parses +url+, sets up an RSS 1.0 maker whose channel link and
  # generator are pre-filled, then evaluates the optional block against the
  # new instance. When $DEBUG is set the scraped items are dumped to STDERR.
  def initialize(url, &block)
    @source = grab_page(url)
    @rss = RSS::Maker::RSS10.new

    feed.generator = "Feedalizer (http://termos.vemod.net/feedalizer)"
    feed.link = url

    instance_eval(&block) if block_given?

    debug! if $DEBUG
  end

  # Returns the channel of the feed being built (title, about, description,
  # link, ... are set on this object).
  def feed
    @rss.channel
  end

  # Searches the source page with +hpricot_query+ and yields a freshly created
  # feed item together with each matching element, for at most +limit+
  # elements (15 by default).
  def scrape_items(hpricot_query, limit = 15)
    elements = @source.search(hpricot_query)

    elements.first(limit).each do |element|
      yield @rss.items.new_item, element
    end
  end

  # Opens +url+ (a URL or a local file path) and returns it parsed by Hpricot.
  #
  # URI.open is preferred when it exists: since Ruby 3.0 open-uri no longer
  # teaches Kernel#open to fetch URLs, so a bare +open+ would treat the URL as
  # a file name. URI.open still falls back to Kernel#open for plain paths.
  # On older Rubies without URI.open, the open-uri patched Kernel#open is used
  # exactly as before.
  def grab_page(url)
    parse = lambda { |io| Hpricot(io) }

    if URI.respond_to?(:open)
      URI.open(url, &parse)
    else
      open(url, &parse)
    end
  end

  # For backwards-compatibility
  alias_method :parse_source, :grab_page

  # Returns the feed serialized as a string, or nil when $DEBUG is set (in
  # which case the items were already printed by #debug!).
  def output
    @rss.to_rss.to_s unless $DEBUG
  end

  # Appends the serialized feed to +target+ (STDOUT by default) using <<.
  def output!(target = STDOUT)
    target << output
  end

  # Prints title, date and link of every item to STDERR, one per line, with a
  # blank line after each item.
  def debug!
    @rss.items.each do |item|
      STDERR.puts [ item.title, item.date, item.link ].join("\n")
      STDERR.puts
    end
  end
end
|
56
|
+
|
57
|
+
# Shortcut for Feedalizer.new: builds a Feedalizer for +url+, forwards the
# given block to the constructor and returns the new instance.
def feedalize(url, &block)
  feedalizer = Feedalizer.new(url, &block)
  feedalizer
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
|
4
|
+
|
5
|
+
require "test/unit"
|
6
|
+
require "feedalizer"
|
7
|
+
|
8
|
+
# Unit tests for Feedalizer, run against the local fixture test.html that
# lives next to this file.
class TestFeedalizer < Test::Unit::TestCase
  TEST_FILE = File.join(File.dirname(__FILE__), "test.html")

  # Every test starts from a fresh Feedalizer built on the fixture.
  def setup
    @feedalizer = Feedalizer.new(TEST_FILE)
  end

  # The constructor exposes an RSS 1.0 channel and a parsed Hpricot document.
  def test_construction
    assert_kind_of(RSS::Maker::RSS10::Channel, @feedalizer.feed)
    assert_kind_of(Hpricot::Doc, @feedalizer.source)
  end

  # The constructor block runs with the Feedalizer instance as self.
  def test_block
    inside = nil

    begin
      Feedalizer.new(TEST_FILE) { inside = self }
    rescue StandardError
      nil
    end

    assert_kind_of(Feedalizer, inside)
  end

  # The channel link defaults to the location the feed was built from.
  def test_default_feed_link
    assert_equal(TEST_FILE, @feedalizer.feed.link)
  end

  def test_source
    page_title = @feedalizer.source.search("html/head/title").text

    assert_equal("Test", page_title)
  end

  # scrape_items yields an RSS item and an Hpricot element per match.
  def test_scrape_items
    scraped = []

    @feedalizer.scrape_items("div.item") do |item, element|
      scraped << element

      assert_kind_of(RSS::Maker::RSS10::Items::Item, item)
      assert_kind_of(Hpricot::Elem, element)
    end

    assert_equal(2, scraped.size)
  end

  # The optional second argument caps how many elements are yielded.
  def test_scrape_items_limit
    scraped = []

    @feedalizer.scrape_items("div.item", 1) do |item, element|
      scraped << element
    end

    assert_equal(1, scraped.size)
  end

  def test_grab_page
    assert_kind_of(Hpricot::Doc, @feedalizer.grab_page(TEST_FILE))
  end

  # With the channel fields filled in, output is a sizeable RSS 1.0 document.
  def test_output
    channel = @feedalizer.feed
    channel.description = "..."
    channel.title = "..."
    channel.about = "..."

    xml = @feedalizer.output

    assert(xml.include?('<rdf:RDF xmlns="http://purl.org/rss/1.0/"'))
    assert(xml.size > 600)
  end
end
|
data/tests/test.html
ADDED
metadata
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: feedalizer
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-10-11 00:00:00 +02:00
|
8
|
+
summary: Transforms web pages into RSS feeds
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: christoffer.sawicki@gmail.com
|
12
|
+
homepage: http://termos.vemod.net/feedalizer
|
13
|
+
rubyforge_project: feedalizer
|
14
|
+
description: Feedalizer glues together Hpricot with Ruby's RSS library in a way that makes it dead easy to transform web pages into RSS feeds.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- Christoffer Sawicki
|
30
|
+
files:
|
31
|
+
- CHANGELOG
|
32
|
+
- lib/feedalizer.rb
|
33
|
+
- tests/test.html
|
34
|
+
- tests/tc_feedalizer.rb
|
35
|
+
- TODO
|
36
|
+
- COPYING
|
37
|
+
- README
|
38
|
+
- examples
|
39
|
+
- examples/mongrel-news.rb
|
40
|
+
- examples/pjvault.rb
|
41
|
+
- examples/sydsvenskan-nemi.rb
|
42
|
+
- bin/feedalizer
|
43
|
+
test_files: []
|
44
|
+
|
45
|
+
rdoc_options: []
|
46
|
+
|
47
|
+
extra_rdoc_files: []
|
48
|
+
|
49
|
+
executables:
|
50
|
+
- feedalizer
|
51
|
+
extensions: []
|
52
|
+
|
53
|
+
requirements: []
|
54
|
+
|
55
|
+
dependencies:
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: hpricot
|
58
|
+
version_requirement:
|
59
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: 0.0.0
|
64
|
+
version:
|