feed_ninja 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +12 -0
- data/lib/feed_ninja/atomish.rb +55 -0
- data/lib/feed_ninja/extractor.rb +34 -0
- data/lib/feed_ninja/feed_ninja.rb +95 -0
- data/lib/feed_ninja.rb +12 -0
- data/spec/atomish_spec.rb +33 -0
- data/spec/extractor_spec.rb +39 -0
- data/spec/feed_ninja_spec.rb +28 -0
- data/spec/feeds/atom.xml +28 -0
- data/spec/feeds/rss.xml +19 -0
- data/spec/pages/one.html +18 -0
- data/spec/spec_helper.rb +17 -0
- metadata +85 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e232c6d96bcbdfda50a9985ab6430c2275dc1608
|
4
|
+
data.tar.gz: 2a0d5f1d965290ba077c0bb7eceb176d47a53f55
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0d3c349beea7ef6835ff680fbc5974f569f2e260fffbc42c55a6b61874a5a5236bc4a1bc6a14311341a2a80982048901daac31e42e0f24a87c25bfeeeb0bc9ab
|
7
|
+
data.tar.gz: 75e0af1da786f34fcd13c7969c7500c49b00299dcc5d76a1a9ed995bf716d4f5fa858aee035bf584d6b30dbdab1baab5369e7039f52edaa268ca25090593c731
|
data/README.md
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# FeedNinja
|
2
|
+
This gem takes an RSS or Atom feed, follows the links it provides, and extracts images and/or text with XPath. The data is then reformatted into a new Atom feed.
|
3
|
+
It is intended to be used with feeds that only provide a sneak peek of the content, to rip all the interesting bits out for displaying in your feed reader immediately.
|
4
|
+
|
5
|
+
## Example Usage
|
6
|
+
require 'feed_ninja'
|
7
|
+
|
8
|
+
get 'http://example.com/rss' do
|
9
|
+
picture_at '//foo/img/@src'
|
10
|
+
text_at '//bar/span'
|
11
|
+
title_matches /^News/
|
12
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Collects feed-level metadata and a list of entries, and renders them as an
# Atom document via #to_s.
class AtomIshWriter
  attr_accessor :title, :link, :updated

  def initialize
    @entries = []
  end

  # Creates a new Entry, yields it to the block for population and appends it
  # to the feed.
  #
  # The block may return the entry (or another Entry to store instead), but it
  # does not have to: when the block's value is not an Entry (for example
  # because its last statement was an assignment), the yielded entry is kept.
  #
  # Returns the internal list of entries.
  def new_entry
    item = Entry.new
    returned = yield item
    @entries << (returned.is_a?(Entry) ? returned : item)
  end

  # Renders the whole feed as an XML string.
  #
  # Title and link are escaped so that characters such as "&" or "<" in the
  # source feed cannot produce malformed XML. `:xml => :attr` already wraps
  # the value in double quotes, hence no literal quotes around the href.
  def to_s
    feed_title = @title.to_s.encode(:xml => :text)
    feed_link = @link.to_s
    %{<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

  <title>#{feed_title}</title>
  <id>#{feed_link.encode(:xml => :text)}</id>
  <link href=#{feed_link.encode(:xml => :attr)}/>
  <updated>#{@updated}</updated>
  <author>
    <name>FeedNinja</name>
    <uri>http://github.com/Tourniquet/feedninja</uri>
    <email>latzer.daniel@gmail.com</email>
  </author>
#{@entries.map(&:to_s).join}</feed>}
  end
end
|
30
|
+
|
31
|
+
# A single feed entry: metadata plus the extracted images and text that make
# up its HTML content.
class Entry
  attr_accessor :title, :link, :images, :updated, :summary, :id

  # Renders the entry as an Atom <entry> element.
  #
  # Title, link and id are XML-escaped; the HTML content is escaped as a whole
  # because it is embedded as text in a content element of type "html".
  # `:xml => :attr` adds the surrounding double quotes itself.
  def to_s
    %{  <entry>
    <title>#{@title.to_s.encode(:xml => :text)}</title>
    <link rel="alternate" type="text/html" href=#{@link.to_s.encode(:xml => :attr)} />
    <id>#{@id.to_s.encode(:xml => :text)}</id>
    <updated>#{@updated}</updated>
    <content type="html">#{content.encode(:xml => :text)}</content>
  </entry>
}
  end

  # Builds the HTML body of the entry: every image wrapped in a link to
  # itself, followed by the summary.
  #
  # Both `images` and `summary` may be nil, a single value or an array;
  # Array() normalises all three cases. Returns an empty string when neither
  # is set.
  def content
    gallery = Array(@images).map do |src|
      url = src.to_s.encode(:xml => :attr)
      %{<a href=#{url}><img src=#{url}/></a>}
    end
    (gallery + Array(@summary).map(&:to_s)).join
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# Loads an HTML page and pulls image URLs and markup fragments out of it with
# XPath expressions.
class Extractor
  attr_accessor :doc

  # Opens +uri+ and parses it into @doc with Nokogiri::HTML.
  #
  # On Ruby 3.0+ a bare `open` no longer goes through open-uri, so URI.open is
  # used where it exists (it still falls back to Kernel#open for plain
  # paths); older Rubies keep using Kernel.open.
  def fetch(uri)
    opener = URI.respond_to?(:open) ? URI : Kernel
    opener.open(uri) do |site|
      @doc = Nokogiri::HTML(site)
    end
  end

  # Returns the absolute URLs of all images matched by any of +xpaths+.
  #
  # base_url - URI (or String) that relative image sources are resolved
  #            against.
  # xpaths   - XPath expressions selecting the image sources; may be given
  #            as separate arguments or as one array, nil values are ignored.
  def extract_images(base_url, *xpaths)
    xpaths.flatten.compact.flat_map { |xpath| extract_image(base_url, xpath) }
  end

  # Returns the absolute URLs of all images matched by a single +xpath+.
  #
  # Sources are resolved with URI.join, so absolute sources are returned as
  # they are while path-relative, root-relative and protocol-relative ones
  # are resolved against +base_url+. A source that cannot be resolved is
  # returned unchanged instead of raising.
  def extract_image(base_url, xpath)
    @doc.xpath(xpath).map do |picture_src|
      src = picture_src.to_s.strip
      begin
        URI.join(base_url.to_s, src).to_s
      rescue URI::Error
        src
      end
    end
  end

  # Returns the string form of every node matched by any of +xpaths+.
  # Accepts the same argument shapes as #extract_images.
  def extract_xml(*xpaths)
    xpaths.flatten.compact.flat_map do |xpath|
      @doc.xpath(xpath).map(&:to_s)
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'rss'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'time'
|
5
|
+
|
6
|
+
# Reads an RSS or Atom feed, follows the link of each item, extracts images
# and text from the linked page and collects the result in an AtomIshWriter.
class FeedNinja
  attr_accessor :uri, :picture_xpath, :text_xpath, :title_regex, :limit
  attr_accessor :extractor

  def initialize
    @limit = 2
    @extractor = Extractor.new
    @writer = AtomIshWriter.new
    @ninja_prefix = "N! "
  end

  # Copies title and link of the parsed feed +doc+ to the writer and stamps
  # it with the current time.
  #
  # Raises RuntimeError when +doc+ is neither an Atom nor an RSS feed.
  def initialize_writer(doc)
    @writer.updated = Time.now.xmlschema

    case doc.feed_type
    when "atom"
      @writer.title = "#{@ninja_prefix}#{doc.title.content}"
      @writer.link = doc.link.href
    when "rss"
      @writer.title = "#{@ninja_prefix}#{doc.channel.title}"
      @writer.link = doc.channel.link
    else
      raise "Invalid feed format"
    end
  end

  # Gets the feed at +url+ and processes its entries.
  #
  # On Ruby 3.0+ a bare `open` no longer goes through open-uri, so URI.open is
  # used where it exists; older Rubies keep using Kernel.open.
  #
  # Raises RuntimeError when the document cannot be parsed as a feed.
  def fetch(url)
    opener = URI.respond_to?(:open) ? URI : Kernel
    opener.open(url) do |feed|
      doc = RSS::Parser.parse(feed)
      # Without this guard a nil parse result fails later with NoMethodError.
      raise "Invalid feed format" if doc.nil?
      initialize_writer(doc)
      process_items(doc)
    end
  end

  # Processes the first `limit` items of +doc+, restricted to the items whose
  # title matches `title_regex` when one is set.
  def process_items(doc)
    items = doc.items
    if title_regex
      # Atom titles are element objects, RSS titles are strings; text_of
      # gives the regex a comparable value in both cases.
      items = items.select { |item| title_regex =~ text_of(item.title) }
    end
    items.first(@limit).each do |item|
      #TODO add multithreading here; be sure to use multiple extractor instances
      process_item item, doc.feed_type
    end
  end

  # Adds one entry to the writer for the feed item +original+ and fills it
  # with the images and text extracted from the page the item links to.
  def process_item(original, feed_type)
    @writer.new_entry do |entry|
      case feed_type
      when "atom"
        entry.title = text_of(original.title)
        entry.link = original.link.href
        entry.updated = timestamp_of(text_of(original.updated))
        entry.id = text_of(original.id)
        @extractor.fetch original.link.href
      when "rss"
        entry.title = original.title
        entry.link = original.link
        entry.updated = timestamp_of(original.pubDate)
        entry.id = entry.link
        @extractor.fetch original.link
      end

      # extract_images takes the base URL first; relative image sources are
      # resolved against the entry's own link. The configured xpaths are
      # stored as arrays and therefore splatted.
      entry.images = @extractor.extract_images(URI(entry.link.to_s), *Array(@picture_xpath))
      entry.summary = @extractor.extract_xml(*Array(@text_xpath))

      entry # the writer stores the block's return value
    end
  end

  def to_s
    @writer.to_s
  end

  ## DSL convenience setters

  # Sets the XPath expression(s) selecting image sources.
  def picture_at(*xpath)
    @picture_xpath = xpath
  end

  # Sets the XPath expression(s) selecting text nodes.
  def text_at(*xpath)
    @text_xpath = xpath
  end

  # Restricts processing to items whose title matches +regex+.
  def title_matches(regex)
    @title_regex = regex
  end

  private

  # Returns node.content for objects that have it, the value itself otherwise.
  def text_of(node)
    node.respond_to?(:content) ? node.content : node
  end

  # Formats +time+ via xmlschema; falls back to the current time when the
  # value has no xmlschema method (for example nil).
  def timestamp_of(time)
    time.respond_to?(:xmlschema) ? time.xmlschema : Time.now.xmlschema
  end
end
|
95
|
+
|
data/lib/feed_ninja.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'feed_ninja/feed_ninja'
|
2
|
+
require 'feed_ninja/atomish'
|
3
|
+
require 'feed_ninja/extractor'
|
4
|
+
|
5
|
+
# Entry point of the DSL: configures a FeedNinja with the given block, fetches
# the feed at +url+ and prints the resulting feed to standard output, preceded
# by a Content-type header.
#
# url   - location of the RSS or Atom feed to process.
# block - evaluated in the context of the FeedNinja instance, so DSL setters
#         such as picture_at, text_at and title_matches can be called
#         directly. Optional.
def get(url, &block)
  ninja = FeedNinja.new
  ninja.instance_eval(&block) if block
  ninja.fetch(url)

  puts "Content-type: application/atom+xml"
  # puts does not add a newline to a string that already ends in one, so the
  # blank line that ends the header block has to be printed separately.
  puts
  puts ninja.to_s
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'feed_ninja'
|
3
|
+
|
4
|
+
describe AtomIshWriter do
  it 'should output a valid atom feed' do
    writer = AtomIshWriter.new
    writer.title = 'test'
    writer.link = 'http://example.com/atom'
    writer.updated = DateTime.now.to_s

    # Populate the entry yielded by the writer (instead of shadowing it with
    # a new one) and return it explicitly, so the entry itself is what ends
    # up in the feed.
    writer.new_entry do |entry|
      entry.title = "title"
      entry.link = "http://example.com/one"
      entry.id = entry.link
      entry.images = ["http://example.com/one.jpg", "http://example.com/two.jpg"]
      entry.summary = "First part of the story"
      entry.updated = DateTime.now.to_s
      entry
    end

    writer.new_entry do |entry|
      entry.title = "title"
      entry.link = "http://example.com/two"
      entry.id = entry.link
      entry.images = ["http://example.com/one.jpg", "http://example.com/two.jpg"]
      entry.summary = "Second part of the story"
      entry.updated = (DateTime.now - 60).to_s
      entry
    end

    # Parsing alone only proves well-formedness; also check both entries
    # made it into the feed.
    feed = RSS::Parser.parse(writer.to_s)
    feed.should_not be_nil
    feed.items.size.should == 2
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'feed_ninja'
|
3
|
+
|
4
|
+
describe Extractor do
  # Default base URI that relative image sources are resolved against.
  let(:base) { URI('http://example.com') }

  # Extractor with the fixture page already loaded.
  let(:extractor) do
    Extractor.new.tap { |loaded| loaded.fetch 'spec/pages/one.html' }
  end

  # Extracts the image sources found in the div with the given id.
  def images_in(div_id, base_uri)
    extractor.extract_images(base_uri, "//div[@id='#{div_id}']/img/@src")
  end

  it 'should extract one image with relative url' do
    expect(images_in('one_image_relative', base)).to eq(["http://example.com/one.jpg"])
  end

  it 'should extract one image with absolute url' do
    # the base URI shouldn't be applied to a source that is already absolute
    unrelated_base = URI('http://wrong.com')

    expect(images_in('one_image_absolute', unrelated_base)).to eq(["http://example.com/one.jpg"])
  end

  it 'should extract several images' do
    found = images_in('several_images', base)

    expect(found.size).to eq(2)
    expect(found).to eq(["http://example.com/one.jpg", "http://example.com/two.jpg"])
  end

  it 'should extract some paragraphs' do
    found = extractor.extract_xml "//div[@id='paragraphs']/p"

    expect(found).to eq(["<p>one</p>", "<p>two</p>", "<p>three</p>"])
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'feed_ninja'
|
3
|
+
|
4
|
+
describe FeedNinja do
  # Stand-in for the real Extractor so that no linked page is ever fetched.
  let(:extractor) { double() }

  # Ninja under test, wired to the stand-in extractor.
  let(:ninja) do
    FeedNinja.new.tap { |fresh| fresh.extractor = extractor }
  end

  before(:each) do
    allow(extractor).to receive(:extract_images)
    allow(extractor).to receive(:extract_xml)
  end

  it 'should read an atom feed' do
    expect(extractor).to receive(:fetch).twice
    ninja.fetch 'spec/feeds/atom.xml'
  end

  it 'should read an RSS feed' do
    expect(extractor).to receive(:fetch).twice
    ninja.fetch 'spec/feeds/rss.xml'
  end

  it 'should not read more than the given limit' do
    ninja.limit = 1
    expect(extractor).to receive(:fetch).once
    ninja.fetch 'spec/feeds/rss.xml'
  end
end
|
data/spec/feeds/atom.xml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<feed xmlns="http://www.w3.org/2005/Atom">
|
3
|
+
|
4
|
+
<title>atom</title>
|
5
|
+
<link rel="alternate" type="text/html" href="http://example.com/atom"/>
|
6
|
+
<updated>2007-07-13T18:30:02Z</updated>
|
7
|
+
<author>
|
8
|
+
<name>feedninja</name>
|
9
|
+
</author>
|
10
|
+
<id>http://example.com/atom</id>
|
11
|
+
|
12
|
+
<entry>
|
13
|
+
<title>one</title>
|
14
|
+
<link href="http://example.com/one"/>
|
15
|
+
<id>1</id>
|
16
|
+
<updated>2007-07-13T18:30:02Z</updated>
|
17
|
+
<summary>summary_one</summary>
|
18
|
+
</entry>
|
19
|
+
|
20
|
+
<entry>
|
21
|
+
<title>two</title>
|
22
|
+
<link href="http://example.com/two"/>
|
23
|
+
<id>2</id>
|
24
|
+
<updated>2007-07-13T18:30:02Z</updated>
|
25
|
+
<summary>summary_two</summary>
|
26
|
+
</entry>
|
27
|
+
|
28
|
+
</feed>
|
data/spec/feeds/rss.xml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8" ?>
|
2
|
+
<rss version="2.0">
|
3
|
+
|
4
|
+
<channel>
|
5
|
+
<title>rss_feed</title>
|
6
|
+
<link>http://www.example.com</link>
|
7
|
+
<description>descri </description>
|
8
|
+
<item>
|
9
|
+
<title>one</title>
|
10
|
+
<link>http://example.com/one</link>
|
11
|
+
<description>description_one</description>
|
12
|
+
</item>
|
13
|
+
<item>
|
14
|
+
<title>two</title>
|
15
|
+
<link>http://example.com/two</link>
|
16
|
+
<description>description_two</description>
|
17
|
+
</item>
|
18
|
+
</channel>
|
19
|
+
</rss>
|
data/spec/pages/one.html
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
<html>
|
2
|
+
<div id="one_image_relative">
|
3
|
+
<img src="one.jpg"/>
|
4
|
+
</div>
|
5
|
+
<div id="one_image_absolute">
|
6
|
+
<img src="http://example.com/one.jpg"/>
|
7
|
+
</div>
|
8
|
+
<div id="several_images">
|
9
|
+
<img src="one.jpg"/>
|
10
|
+
<img src="two.jpg"/>
|
11
|
+
</div>
|
12
|
+
<div id="paragraphs">
|
13
|
+
<p>one</p>
|
14
|
+
<p>two</p>
|
15
|
+
<span>combo_breaker</span>
|
16
|
+
<p>three</p>
|
17
|
+
</div>
|
18
|
+
</html>
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
config.run_all_when_everything_filtered = true
|
10
|
+
config.filter_run :focus
|
11
|
+
|
12
|
+
# Run specs in random order to surface order dependencies. If you find an
|
13
|
+
# order dependency and want to debug it, you can fix the order by providing
|
14
|
+
# the seed, which is printed after each run.
|
15
|
+
# --seed 1234
|
16
|
+
config.order = 'random'
|
17
|
+
end
|
metadata
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feed_ninja
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Daniel Latzer
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-02-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 2.14.1
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 2.14.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.6.1
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.6.1
|
41
|
+
description: |-
|
42
|
+
This gem can be used to take an RSS or Atom feed, follow the links they provide and extract images and/or text with xpath. The data is then reformatted into a new Atom feed.
|
43
|
+
It is intended to be used with feeds that only provide a sneak peek of the content, to rip all the interesting bits out for displaying in your feed reader immediately.
|
44
|
+
email: latzer.daniel@gmail.com
|
45
|
+
executables: []
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- README.md
|
50
|
+
- lib/feed_ninja.rb
|
51
|
+
- lib/feed_ninja/atomish.rb
|
52
|
+
- lib/feed_ninja/extractor.rb
|
53
|
+
- lib/feed_ninja/feed_ninja.rb
|
54
|
+
- spec/atomish_spec.rb
|
55
|
+
- spec/extractor_spec.rb
|
56
|
+
- spec/feed_ninja_spec.rb
|
57
|
+
- spec/feeds/atom.xml
|
58
|
+
- spec/feeds/rss.xml
|
59
|
+
- spec/pages/one.html
|
60
|
+
- spec/spec_helper.rb
|
61
|
+
homepage: http://github.com/tourniquet/feedninja
|
62
|
+
licenses:
|
63
|
+
- MIT
|
64
|
+
metadata: {}
|
65
|
+
post_install_message:
|
66
|
+
rdoc_options: []
|
67
|
+
require_paths:
|
68
|
+
- lib
|
69
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: '0'
|
74
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">="
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: '0'
|
79
|
+
requirements: []
|
80
|
+
rubyforge_project:
|
81
|
+
rubygems_version: 2.2.2
|
82
|
+
signing_key:
|
83
|
+
specification_version: 4
|
84
|
+
summary: A tiny helper to rip the interesting bits out of RSS and Atom feeds
|
85
|
+
test_files: []
|