feedalizer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +7 -0
- data/COPYING +18 -0
- data/README +37 -0
- data/TODO +7 -0
- data/bin/feedalizer +6 -0
- data/examples/mongrel-news.rb +45 -0
- data/examples/pjvault.rb +26 -0
- data/examples/sydsvenskan-nemi.rb +25 -0
- data/lib/feedalizer.rb +60 -0
- data/tests/tc_feedalizer.rb +70 -0
- data/tests/test.html +19 -0
- metadata +64 -0
data/CHANGELOG
ADDED
data/COPYING
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2006 Christoffer Sawicki <christoffer.sawicki@gmail.com>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to
|
5
|
+
deal in the Software without restriction, including without limitation the
|
6
|
+
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
7
|
+
sell copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
16
|
+
THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
___ _ _ _
|
2
|
+
| __|__ ___ __| |__ _| (_)______ _ _
|
3
|
+
| _/ -_) -_) _` / _` | | |_ / -_) '_|
|
4
|
+
|_|\___\___\__,_\__,_|_|_/__\___|_|
|
5
|
+
|
6
|
+
Project Website:
|
7
|
+
http://termos.vemod.net/feedalizer
|
8
|
+
|
9
|
+
= Dependencies
|
10
|
+
|
11
|
+
* Hpricot
|
12
|
+
http://code.whytheluckystiff.net/hpricot/
|
13
|
+
% gem install hpricot
|
14
|
+
|
15
|
+
= Documentation
|
16
|
+
|
17
|
+
First of all, see the included examples. Then read some Hpricot
|
18
|
+
documentation (http://code.whytheluckystiff.net/hpricot/).
|
19
|
+
|
20
|
+
(Yes, a tutorial would be nice.)
|
21
|
+
|
22
|
+
= Copyright
|
23
|
+
|
24
|
+
Copyright (c) Christoffer Sawicki <christoffer.sawicki@gmail.com> 2006
|
25
|
+
|
26
|
+
This program is free software; you can redistribute it and/or modify
|
27
|
+
it under the terms of the GNU General Public License version 2 as
|
28
|
+
published by the Free Software Foundation.
|
29
|
+
|
30
|
+
This program is distributed in the hope that it will be useful,
|
31
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33
|
+
GNU General Public License for more details.
|
34
|
+
|
35
|
+
You should have received a copy of the GNU General Public License
|
36
|
+
along with this program; if not, write to the Free Software
|
37
|
+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
data/TODO
ADDED
data/bin/feedalizer
ADDED
data/examples/mongrel-news.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This script was contributed by Chu Yeow Cheah, thanks!
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
+
|
7
|
+
require 'feedalizer'
|
8
|
+
require 'time'
|
9
|
+
|
10
|
+
url = 'http://mongrel.rubyforge.org/news.html'
|
11
|
+
|
12
|
+
feedalize(url) do
|
13
|
+
|
14
|
+
feed.title = 'Mongrel News'
|
15
|
+
feed.about = '...'
|
16
|
+
feed.description = 'Latest Mongrel news from the official site'
|
17
|
+
|
18
|
+
scrape_items('h2') do |rss_item, html_element|
|
19
|
+
|
20
|
+
siblings = html_element.parent.containers
|
21
|
+
html_element_index = siblings.index(html_element)
|
22
|
+
|
23
|
+
date, title = html_element.innerHTML.strip.split(':')
|
24
|
+
date = Time.parse(date.sub(/-/, ' ')) # date is before the first ':'
|
25
|
+
title.strip!
|
26
|
+
|
27
|
+
# Grab siblings after the <h2> and put into description.
|
28
|
+
description = ''
|
29
|
+
(html_element_index+1...siblings.size).each do |i|
|
30
|
+
next_sibling = siblings[i]
|
31
|
+
unless 'h2' == next_sibling.stag.name
|
32
|
+
description = description + next_sibling.to_s
|
33
|
+
else
|
34
|
+
break
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
rss_item.link = url
|
39
|
+
rss_item.title = title
|
40
|
+
rss_item.date = date
|
41
|
+
rss_item.description = description
|
42
|
+
end
|
43
|
+
|
44
|
+
output!
|
45
|
+
end
|
data/examples/pjvault.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This script was contributed by Christian Neukirchen, thanks!
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
6
|
+
|
7
|
+
require "feedalizer"
|
8
|
+
require "time"
|
9
|
+
|
10
|
+
feedalize("http://pjvault.com/news.html") do
|
11
|
+
feed.title = "Pearl Jam Vault"
|
12
|
+
feed.about = "..."
|
13
|
+
feed.description = "Pearl Jam News and Reviews"
|
14
|
+
|
15
|
+
scrape_items("li") do |rss_item, html_element|
|
16
|
+
link = html_element.search("a").first
|
17
|
+
|
18
|
+
rss_item.link = link.attributes["href"]
|
19
|
+
rss_item.date = Time.parse(html_element.to_s)
|
20
|
+
rss_item.title = link.children.last.to_s
|
21
|
+
|
22
|
+
rss_item.description = html_element.inner_html
|
23
|
+
end
|
24
|
+
|
25
|
+
output!
|
26
|
+
end
|
data/examples/sydsvenskan-nemi.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
4
|
+
|
5
|
+
require "feedalizer"
|
6
|
+
require "time"
|
7
|
+
|
8
|
+
url = "http://sydsvenskan.se/serier/nemi/article101047.ece?context=serie"
|
9
|
+
|
10
|
+
feedalize(url) do
|
11
|
+
feed.title = "Nemi"
|
12
|
+
feed.about = "..."
|
13
|
+
feed.description = "Daily Nemi strip scraped from Sydsvenskan"
|
14
|
+
|
15
|
+
scrape_items("option") do |rss_item, html_element|
|
16
|
+
rss_item.link = html_element.attributes["value"]
|
17
|
+
rss_item.date = Time.parse(html_element.inner_html)
|
18
|
+
rss_item.title = rss_item.date.strftime("%Y-%m-%d")
|
19
|
+
|
20
|
+
# This grabs the page for a particular strip and extracts the relevant img element
|
21
|
+
rss_item.description = grab_page(rss_item.link).search("//img[@width=748]")
|
22
|
+
end
|
23
|
+
|
24
|
+
output!
|
25
|
+
end
|
data/lib/feedalizer.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require "rss/maker"
|
2
|
+
require "open-uri"
|
3
|
+
require "hpricot"
|
4
|
+
|
5
|
+
class Feedalizer
|
6
|
+
VERSION = "0.1.0"
|
7
|
+
|
8
|
+
attr_reader :source
|
9
|
+
|
10
|
+
def initialize(url, &block)
|
11
|
+
@source = grab_page(url)
|
12
|
+
@rss = RSS::Maker::RSS10.new
|
13
|
+
|
14
|
+
feed.generator = "Feedalizer (http://termos.vemod.net/feedalizer)"
|
15
|
+
feed.link = url
|
16
|
+
|
17
|
+
instance_eval(&block) if block_given?
|
18
|
+
|
19
|
+
debug! if $DEBUG
|
20
|
+
end
|
21
|
+
|
22
|
+
def feed
|
23
|
+
@rss.channel
|
24
|
+
end
|
25
|
+
|
26
|
+
def scrape_items(hpricot_query, limit = 15)
|
27
|
+
elements = @source.search(hpricot_query)
|
28
|
+
|
29
|
+
elements.first(limit).each do |element|
|
30
|
+
yield @rss.items.new_item, element
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def grab_page(url)
|
35
|
+
open(url) { |io| Hpricot(io) }
|
36
|
+
end
|
37
|
+
|
38
|
+
# For backwards-compatibility
|
39
|
+
alias_method :parse_source, :grab_page
|
40
|
+
|
41
|
+
def output
|
42
|
+
@rss.to_rss.to_s unless $DEBUG
|
43
|
+
end
|
44
|
+
|
45
|
+
def output!(target = STDOUT)
|
46
|
+
target << output
|
47
|
+
end
|
48
|
+
|
49
|
+
def debug!
|
50
|
+
@rss.items.each do |item|
|
51
|
+
STDERR.puts [ item.title, item.date, item.link ].join("\n")
|
52
|
+
STDERR.puts
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# A handy wrapper for Feedalizer.new :-)
|
58
|
+
def feedalize(url, &block)
|
59
|
+
Feedalizer.new(url, &block)
|
60
|
+
end
|
data/tests/tc_feedalizer.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
|
4
|
+
|
5
|
+
require "test/unit"
|
6
|
+
require "feedalizer"
|
7
|
+
|
8
|
+
class TestFeedalizer < Test::Unit::TestCase
|
9
|
+
TEST_FILE = File.join(File.dirname(__FILE__), "test.html")
|
10
|
+
|
11
|
+
def setup
|
12
|
+
@feedalizer = Feedalizer.new(TEST_FILE)
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_construction
|
16
|
+
assert_kind_of RSS::Maker::RSS10::Channel, @feedalizer.feed
|
17
|
+
assert_kind_of Hpricot::Doc, @feedalizer.source
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_block
|
21
|
+
inside = nil
|
22
|
+
Feedalizer.new(TEST_FILE) { inside = self } rescue nil
|
23
|
+
assert_kind_of Feedalizer, inside
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_default_feed_link
|
27
|
+
assert_equal TEST_FILE, @feedalizer.feed.link
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_source
|
31
|
+
assert_equal "Test", @feedalizer.source.search("html/head/title").text
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_scrape_items
|
35
|
+
elements = []
|
36
|
+
|
37
|
+
@feedalizer.scrape_items("div.item") do |item, element|
|
38
|
+
elements << element
|
39
|
+
|
40
|
+
assert_kind_of RSS::Maker::RSS10::Items::Item, item
|
41
|
+
assert_kind_of Hpricot::Elem, element
|
42
|
+
end
|
43
|
+
|
44
|
+
assert_equal 2, elements.size
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_scrape_items_limit
|
48
|
+
elements = []
|
49
|
+
|
50
|
+
@feedalizer.scrape_items("div.item", 1) do |item, element|
|
51
|
+
elements << element
|
52
|
+
end
|
53
|
+
|
54
|
+
assert_equal 1, elements.size
|
55
|
+
end
|
56
|
+
|
57
|
+
def test_grab_page
|
58
|
+
assert_kind_of Hpricot::Doc, @feedalizer.grab_page(TEST_FILE)
|
59
|
+
end
|
60
|
+
|
61
|
+
def test_output
|
62
|
+
f = @feedalizer.feed
|
63
|
+
f.about = f.title = f.description = "..."
|
64
|
+
|
65
|
+
output = @feedalizer.output
|
66
|
+
|
67
|
+
assert output.include?('<rdf:RDF xmlns="http://purl.org/rss/1.0/"')
|
68
|
+
assert output.size > 600
|
69
|
+
end
|
70
|
+
end
|
data/tests/test.html
ADDED
metadata
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.11
|
3
|
+
specification_version: 1
|
4
|
+
name: feedalizer
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-10-11 00:00:00 +02:00
|
8
|
+
summary: Transforms web pages into RSS feeds
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: christoffer.sawicki@gmail.com
|
12
|
+
homepage: http://termos.vemod.net/feedalizer
|
13
|
+
rubyforge_project: feedalizer
|
14
|
+
description: Feedalizer glues together Hpricot with Ruby's RSS library in a way that makes it dead easy to transform web pages into RSS feeds.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
authors:
|
29
|
+
- Christoffer Sawicki
|
30
|
+
files:
|
31
|
+
- CHANGELOG
|
32
|
+
- lib/feedalizer.rb
|
33
|
+
- tests/test.html
|
34
|
+
- tests/tc_feedalizer.rb
|
35
|
+
- TODO
|
36
|
+
- COPYING
|
37
|
+
- README
|
38
|
+
- examples
|
39
|
+
- examples/mongrel-news.rb
|
40
|
+
- examples/pjvault.rb
|
41
|
+
- examples/sydsvenskan-nemi.rb
|
42
|
+
- bin/feedalizer
|
43
|
+
test_files: []
|
44
|
+
|
45
|
+
rdoc_options: []
|
46
|
+
|
47
|
+
extra_rdoc_files: []
|
48
|
+
|
49
|
+
executables:
|
50
|
+
- feedalizer
|
51
|
+
extensions: []
|
52
|
+
|
53
|
+
requirements: []
|
54
|
+
|
55
|
+
dependencies:
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: hpricot
|
58
|
+
version_requirement:
|
59
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: 0.0.0
|
64
|
+
version:
|