feed_ninja 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e232c6d96bcbdfda50a9985ab6430c2275dc1608
4
+ data.tar.gz: 2a0d5f1d965290ba077c0bb7eceb176d47a53f55
5
+ SHA512:
6
+ metadata.gz: 0d3c349beea7ef6835ff680fbc5974f569f2e260fffbc42c55a6b61874a5a5236bc4a1bc6a14311341a2a80982048901daac31e42e0f24a87c25bfeeeb0bc9ab
7
+ data.tar.gz: 75e0af1da786f34fcd13c7969c7500c49b00299dcc5d76a1a9ed995bf716d4f5fa858aee035bf584d6b30dbdab1baab5369e7039f52edaa268ca25090593c731
data/README.md ADDED
@@ -0,0 +1,12 @@
1
+ # FeedNinja
2
+ This gem can be used to take an RSS or Atom feed, follow the links they provide and extract images and/or text with xpath. The data is then reformatted into a new Atom feed.
3
+ It is intended to be used with feeds that only provide a sneak peek of the content, to rip all the interesting bits out for displaying in your feed reader immediately.
4
+
5
+ ## Example Usage
6
+ require 'feed_ninja'
7
+
8
+ get 'http://example.com/rss' do
9
+ picture_at '//foo/img/@src'
10
+ text_at '//bar/span'
11
+ title_matches /^News/
12
+ end
@@ -0,0 +1,55 @@
1
+ class AtomIshWriter
2
+ attr_accessor :title, :link, :updated
3
+ def initialize
4
+ @entries = []
5
+ end
6
+
7
+ def new_entry
8
+ item = Entry.new
9
+ item = yield item
10
+ @entries << item;
11
+ end
12
+
13
+ def to_s
14
+ %{<?xml version="1.0" encoding="utf-8"?>
15
+ <feed xmlns="http://www.w3.org/2005/Atom">
16
+
17
+ <title>#{@title}</title>
18
+ <id>#{@link}</id>
19
+ <link href="#{@link}"/>
20
+ <updated>#{@updated}</updated>
21
+ <author>
22
+ <name>FeedNinja</name>
23
+ <uri>http://github.com/Tourniquet/feedninja</uri>
24
+ <email>latzer.daniel@gmail.com</email>
25
+ </author>
26
+ #{@entries.inject { |memo, entry| memo.to_s + entry.to_s }.to_s}</feed>}
27
+
28
+ end
29
+ end
30
+
31
+ class Entry
32
+ attr_accessor :title, :link, :images, :updated, :summary, :id
33
+
34
+ def to_s
35
+ %{ <entry>
36
+ <title>#{@title}</title>
37
+ <link rel="alternate" type="text/html" href="#{@link}" />
38
+ <id>#{@id}</id>
39
+ <updated>#{@updated}</updated>
40
+ <content type="html">#{self.content.encode(:xml => :text)}</content>
41
+ </entry>
42
+ }
43
+ end
44
+
45
+ def content
46
+ Array(@images).inject("") do |memo, src|
47
+ memo += %{
48
+ <a href="#{src}">
49
+ <img src="#{src}"/>
50
+ </a>
51
+ }
52
+ #end + summary || ""
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,34 @@
1
+ class Extractor
2
+ attr_accessor :doc
3
+
4
+ def fetch uri
5
+ open(uri) do |site|
6
+ @doc = Nokogiri::HTML(site)
7
+ #return extract_image(doc, site.base_uri), extract_xml(doc)
8
+ end
9
+ end
10
+
11
+ def extract_images(base_url, *xpaths)
12
+ Array(xpaths).collect_concat do |xpath|
13
+ extract_image(base_url, xpath)
14
+ end
15
+ end
16
+
17
+ def extract_image(base_url, xpath)
18
+ @doc.xpath(xpath).collect do | picture_src |
19
+ if(picture_src.to_s.start_with? 'http') then
20
+ picture_src.to_s
21
+ else
22
+ "#{base_url.scheme}://#{base_url.host}/#{base_url.path}#{picture_src}"
23
+ end
24
+ end
25
+ end
26
+
27
+ def extract_xml *xpaths
28
+ Array(xpaths).collect_concat do |xpath|
29
+ @doc.xpath(xpath).collect do |result|
30
+ result.to_s
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,95 @@
1
+ require 'rss'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'time'
5
+
6
+ class FeedNinja
7
+ attr_accessor :uri, :picture_xpath, :text_xpath, :title_regex, :limit
8
+ attr_accessor :extractor
9
+
10
+ def initialize
11
+ @limit = 2
12
+ @extractor = Extractor.new
13
+ @writer = AtomIshWriter.new
14
+ @ninja_prefix = "N! "
15
+ end
16
+
17
+ def initialize_writer doc
18
+ @writer.updated = DateTime.now.to_s
19
+
20
+ case doc.feed_type
21
+ when "atom"
22
+ @writer.title = @ninja_prefix + doc.title.content
23
+ @writer.link = doc.link.href
24
+ when "rss"
25
+ @writer.title = @ninja_prefix + doc.channel.title
26
+ @writer.link = doc.channel.link
27
+ else
28
+ raise "Invalid feed format"
29
+ end
30
+ end
31
+
32
+ # get the feed and iterate over the entries
33
+ def fetch url
34
+ open(url) do |feed|
35
+ doc = RSS::Parser.parse(feed)
36
+ initialize_writer(doc)
37
+ process_items(doc)
38
+ end
39
+ end
40
+
41
+ def process_items doc
42
+ items = doc.items
43
+ if title_regex
44
+ items = items.select { |item| title_regex =~ item.title }
45
+ end
46
+ items.first(@limit).each do |item|
47
+
48
+ #TODO add multithreading here; be sure to use multiple extractor instances
49
+ process_item item, doc.feed_type
50
+ end
51
+ end
52
+
53
+ def process_item original, feed_type
54
+ @writer.new_entry do |entry|
55
+ case feed_type
56
+ when "atom"
57
+ entry.title = original.title.content
58
+ entry.link = original.link.href
59
+ entry.updated = original.updated
60
+ entry.id = original.id
61
+ @extractor.fetch original.link.href
62
+ when "rss"
63
+ entry.title = original.title
64
+ entry.link = original.link
65
+ entry.updated = original.pubDate ? original.pubDate.xmlschema : DateTime.now.to_s
66
+ entry.id = entry.link
67
+ @extractor.fetch original.link
68
+ end
69
+
70
+ entry.images = @extractor.extract_images @picture_xpath
71
+ entry.summary = @extractor.extract_xml @text_xpath
72
+
73
+ entry #it's kind of fishy to explicitly have to return the entry here...
74
+ end
75
+ end
76
+
77
+ def to_s
78
+ @writer.to_s
79
+ end
80
+
81
+ ## DSL convenience setters
82
+
83
+ def picture_at *xpath
84
+ @picture_xpath = xpath
85
+ end
86
+
87
+ def text_at *xpath
88
+ @text_xpath = xpath
89
+ end
90
+
91
+ def title_matches regex
92
+ @title_regex = regex
93
+ end
94
+ end
95
+
data/lib/feed_ninja.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'feed_ninja/feed_ninja'
2
+ require 'feed_ninja/atomish'
3
+ require 'feed_ninja/extractor'
4
+
5
+ def get (url, &block)
6
+ ninja = FeedNinja.new
7
+ ninja.instance_eval(&block)
8
+ ninja.fetch(url)
9
+
10
+ puts "Content-type: application/atom+xml\n"
11
+ puts ninja.to_s
12
+ end
@@ -0,0 +1,33 @@
1
+ require 'spec_helper'
2
+ require 'feed_ninja'
3
+
4
+ describe AtomIshWriter do
5
+ it 'should output a valid atom feed' do
6
+ writer = AtomIshWriter.new
7
+ writer.title = 'test'
8
+ writer.link = 'http://example.com/atom'
9
+ writer.updated = DateTime.now.to_s
10
+
11
+ writer.new_entry do |entry|
12
+ entry = Entry.new
13
+ entry.title = "title"
14
+ entry.link = "http://example.com/one"
15
+ entry.id = entry.link
16
+ entry.images = ["http://example.com/one.jpg", "http://example.com/two.jpg"]
17
+ entry.summary = "First part of the story"
18
+ entry.updated = DateTime.now.to_s
19
+ end
20
+
21
+ writer.new_entry do |entry|
22
+ entry = Entry.new
23
+ entry.title = "title"
24
+ entry.link = "http://example.com/two"
25
+ entry.id = entry.link
26
+ entry.images = ["http://example.com/one.jpg", "http://example.com/two.jpg"]
27
+ entry.summary = "Second part of the story"
28
+ entry.updated = (DateTime.now - 60).to_s
29
+ end
30
+
31
+ RSS::Parser.parse(writer.to_s)
32
+ end
33
+ end
@@ -0,0 +1,39 @@
1
+ require 'spec_helper'
2
+ require 'feed_ninja'
3
+
4
+ describe Extractor do
5
+ before :each do
6
+ @extractor = Extractor.new
7
+ @extractor.fetch 'spec/pages/one.html'
8
+ @base = URI('http://example.com')
9
+ end
10
+
11
+ it 'should extract one image with relative url' do
12
+ xpath = "//div[@id='one_image_relative']/img/@src"
13
+ picture = @extractor.extract_images(@base, xpath)
14
+
15
+ picture.should == ["http://example.com/one.jpg"]
16
+ end
17
+
18
+ it 'should extract one image with absolute url' do
19
+ xpath = "//div[@id='one_image_absolute']/img/@src"
20
+ base = URI('http://wrong.com') #base URI shouldn't be applied here
21
+ picture = @extractor.extract_images(base, xpath)
22
+
23
+ picture.should == ["http://example.com/one.jpg"]
24
+ end
25
+
26
+ it 'should extract several images' do
27
+ xpath = "//div[@id='several_images']/img/@src"
28
+ pictures = @extractor.extract_images(@base, xpath)
29
+
30
+ pictures.size.should == 2
31
+ pictures.should == ["http://example.com/one.jpg", "http://example.com/two.jpg"]
32
+ end
33
+
34
+ it 'should extract some paragraphs' do
35
+ paragraphs = @extractor.extract_xml "//div[@id='paragraphs']/p"
36
+
37
+ paragraphs.should == %w{<p>one</p> <p>two</p> <p>three</p>}
38
+ end
39
+ end
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
+ require 'feed_ninja'
3
+
4
+ describe FeedNinja do
5
+ before :each do
6
+ @ninja = FeedNinja.new
7
+ @extractor = double()
8
+ @ninja.extractor = @extractor
9
+ @extractor.stub(:extract_images)
10
+ @extractor.stub(:extract_xml)
11
+ end
12
+
13
+ it 'should read an atom feed' do
14
+ @extractor.should_receive(:fetch).twice
15
+ @ninja.fetch 'spec/feeds/atom.xml'
16
+ end
17
+
18
+ it 'should read an RSS feed' do
19
+ @extractor.should_receive(:fetch).twice
20
+ @ninja.fetch 'spec/feeds/rss.xml'
21
+ end
22
+
23
+ it 'should not read more than the given limit' do
24
+ @ninja.limit = 1
25
+ @extractor.should_receive(:fetch).once
26
+ @ninja.fetch 'spec/feeds/rss.xml'
27
+ end
28
+ end
@@ -0,0 +1,28 @@
1
+ <?xml version="1.0"?>
2
+ <feed xmlns="http://www.w3.org/2005/Atom">
3
+
4
+ <title>atom</title>
5
+ <link rel="alternate" type="text/html" href="http://example.com/atom"/>
6
+ <updated>2007-07-13T18:30:02Z</updated>
7
+ <author>
8
+ <name>feedninja</name>
9
+ </author>
10
+ <id>http://example.com/atom</id>
11
+
12
+ <entry>
13
+ <title>one</title>
14
+ <link href="http://example.com/one"/>
15
+ <id>1</id>
16
+ <updated>2007-07-13T18:30:02Z</updated>
17
+ <summary>summary_one</summary>
18
+ </entry>
19
+
20
+ <entry>
21
+ <title>two</title>
22
+ <link href="http://example.com/two"/>
23
+ <id>2</id>
24
+ <updated>2007-07-13T18:30:02Z</updated>
25
+ <summary>summary_two</summary>
26
+ </entry>
27
+
28
+ </feed>
@@ -0,0 +1,19 @@
1
+ <?xml version="1.0" encoding="UTF-8" ?>
2
+ <rss version="2.0">
3
+
4
+ <channel>
5
+ <title>rss_feed</title>
6
+ <link>http://www.example.com</link>
7
+ <description>descri </description>
8
+ <item>
9
+ <title>one</title>
10
+ <link>http://example.com/one</link>
11
+ <description>description_one</description>
12
+ </item>
13
+ <item>
14
+ <title>two</title>
15
+ <link>http://example.com/two</link>
16
+ <description>description_two</description>
17
+ </item>
18
+ </channel>
19
+ </rss>
@@ -0,0 +1,18 @@
1
+ <html>
2
+ <div id="one_image_relative">
3
+ <img src="one.jpg"/>
4
+ </div>
5
+ <div id="one_image_absolute">
6
+ <img src="http://example.com/one.jpg"/>
7
+ </div>
8
+ <div id="several_images">
9
+ <img src="one.jpg"/>
10
+ <img src="two.jpg"/>
11
+ </div>
12
+ <div id="paragraphs">
13
+ <p>one</p>
14
+ <p>two</p>
15
+ <span>combo_breaker</span>
16
+ <p>three</p>
17
+ </div>
18
+ </html>
@@ -0,0 +1,17 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+ RSpec.configure do |config|
8
+ config.treat_symbols_as_metadata_keys_with_true_values = true
9
+ config.run_all_when_everything_filtered = true
10
+ config.filter_run :focus
11
+
12
+ # Run specs in random order to surface order dependencies. If you find an
13
+ # order dependency and want to debug it, you can fix the order by providing
14
+ # the seed, which is printed after each run.
15
+ # --seed 1234
16
+ config.order = 'random'
17
+ end
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feed_ninja
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - Daniel Latzer
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-02-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 2.14.1
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 2.14.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 1.6.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 1.6.1
41
+ description: |-
42
+ This gem can be used to take an RSS or Atom feed, follow the links they provide and extract images and/or text with xpath. The data is then reformatted into a new Atom feed.
43
+ It is intended to be used with feeds that only provide a sneak peek of the content, to rip all the interesting bits out for displaying in your feed reader immediately.
44
+ email: latzer.daniel@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - README.md
50
+ - lib/feed_ninja.rb
51
+ - lib/feed_ninja/atomish.rb
52
+ - lib/feed_ninja/extractor.rb
53
+ - lib/feed_ninja/feed_ninja.rb
54
+ - spec/atomish_spec.rb
55
+ - spec/extractor_spec.rb
56
+ - spec/feed_ninja_spec.rb
57
+ - spec/feeds/atom.xml
58
+ - spec/feeds/rss.xml
59
+ - spec/pages/one.html
60
+ - spec/spec_helper.rb
61
+ homepage: http://github.com/tourniquet/feedninja
62
+ licenses:
63
+ - MIT
64
+ metadata: {}
65
+ post_install_message:
66
+ rdoc_options: []
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ requirements: []
80
+ rubyforge_project:
81
+ rubygems_version: 2.2.2
82
+ signing_key:
83
+ specification_version: 4
84
+ summary: A tiny helper to rip the interesting bits out of RSS and Atom feeds
85
+ test_files: []