torrent_crawler 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.md
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,4 @@
1
+ coverage
2
+ rdoc
3
+ pkg
4
+ autotest/
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 rspeicher
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,23 @@
1
+ # torrent_crawler
2
+
3
+ Crawl multiple torrent sites.
4
+
5
+ ## Default Crawlers
6
+
7
+ The included crawlers act as more of an example than anything else. Unless you
8
+ really want to know which Linux ISOs just got released.
9
+
10
+ * [LinuxTracker](http://linuxtracker.org)
11
+ * [Mininova](http://mininova.org)
12
+
13
+ ## Note on Patches/Pull Requests
14
+
15
+ * Fork
16
+ * Code
17
+ * Commit
18
+ * Push
19
+ * Pull Request
20
+
21
+ ## Copyright
22
+
23
+ Copyright (c) 2010 Robert Speicher. See LICENSE for details.
@@ -0,0 +1,31 @@
# Rakefile for the torrent_crawler gem.
#
# Provides gem packaging/release tasks (via Jeweler), the default RSpec
# test task, and an optional YARD documentation task.

require 'rubygems'
require 'rake'

begin
  require 'jeweler'
  # Jeweler generates gemspec/build/release tasks from the config below.
  Jeweler::Tasks.new do |gem|
    gem.name = "torrent_crawler"
    gem.summary = %Q{Crawl multiple torrent sites}
    gem.description = %Q{Crawl multiple torrent sites.}
    gem.email = "rspeicher@gmail.com"
    gem.homepage = "http://github.com/tsigo/torrent_crawler"
    gem.authors = ["rspeicher"]
    gem.add_development_dependency "rspec", "~> 2.0.0"
  end
  # Adds tasks for releasing the gem to Gemcutter (rubygems.org).
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# `rake` (with no arguments) runs the RSpec suite.
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec)
task :default => :spec

begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  # Fall back to a stub task that explains how to install YARD.
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,51 @@
module TorrentCrawler
  module Crawlers
    # Abstract base class for site-specific crawlers.
    #
    # Subclasses must implement +index_url+, +detail_url+, +index+ and
    # +detail+; the base implementations raise BadCrawlerError so a missing
    # override fails with a descriptive message instead of a bare
    # NoMethodError/ArgumentError.
    class Base
      # Array of Torrent objects accumulated by #index.
      attr_accessor :results

      def initialize
        @results = []
      end

      # Default HTTP request headers sent with every crawl request.
      # The Referer is set to the tracker's index page so requests look
      # like ordinary browsing.
      def headers
        {
          'User-Agent' => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3",
          'Accept-Language' => 'en-us,en;q=0.5',
          'Referer' => index_url
        }
      end

      # URL of the tracker's torrent listing page. Must be overridden.
      def index_url
        raise BadCrawlerError, "A method named index_url has not been implemented in this Crawler class"
      end

      # URL of a single torrent's detail page. Must be overridden.
      #
      # FIX: accepts the tracker_id argument that subclasses (and callers)
      # use, so calling +detail_url(id)+ on a crawler that forgot to
      # implement it raises BadCrawlerError rather than ArgumentError.
      def detail_url(tracker_id = nil)
        raise BadCrawlerError, "A method named detail_url has not been implemented in this Crawler class"
      end

      # Crawl the index page, returning an Array of Torrents. Must be
      # overridden. +last_seen+ is a tracker_id at which to stop crawling.
      def index(last_seen = nil)
        raise BadCrawlerError, "A method named index has not been implemented in this Crawler class"
      end

      # Crawl a single torrent's detail page. Must be overridden.
      def detail(tracker_id)
        raise BadCrawlerError, "A method named detail has not been implemented in this Crawler class"
      end

      # Build a new Torrent pre-populated with this crawler's tracker_key
      # and yield it to the block; returns the block's value.
      def result(&block)
        torrent = TorrentCrawler::Torrent.new
        torrent.tracker_key = self.tracker_key

        yield torrent
      end

      # Default tracker key derived from the class name: the demodulized
      # class name, underscored — e.g. LinuxTracker => "linux_tracker".
      def tracker_key
        self.class.to_s.gsub(/::/, '/').
          gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
          gsub(/([a-z\d])([A-Z])/,'\1_\2').
          tr("-", "_").
          downcase.
          gsub(/.*\/([^\/]+)$/, '\1')
      end
    end
  end
end
@@ -0,0 +1,60 @@
module TorrentCrawler::Crawlers
  # Crawler for linuxtracker.org.
  class LinuxTracker < Base
    def index_url
      "http://linuxtracker.org/"
    end

    def detail_url(tracker_id)
      "http://linuxtracker.org/index.php?page=torrent-details&id=#{tracker_id}"
    end

    # Crawl the front-page torrent table, returning an Array of Torrents.
    # Stops early (returning what has been collected so far) as soon as
    # +last_seen+ is encountered.
    def index(last_seen = nil)
      doc = Nokogiri::HTML(open(self.index_url, self.headers))
      doc.css('#rightcol :nth-child(4) table > tr').each do |tr|
        # Skip header/spacer rows that have no link in the second cell.
        next if tr.css(':nth-child(2) a').first.nil?

        result do |torrent|
          # The id= query parameter appears to be the torrent's info hash
          # (the specs assert hash == tracker_id).
          torrent.tracker_id = tr.css(':nth-child(2) a').first['href'].gsub(/.*id=([a-z0-9]+).*/, '\1')

          # NOTE: `return` inside the block exits #index itself.
          return results if torrent.tracker_id == last_seen

          torrent.hash = torrent.tracker_id
          torrent.title = tr.css(':nth-child(2) a').first.text.strip
          # torrent.uploader
          torrent.size = tr.css(':nth-child(5)').first.text.strip
          # torrent.files
          torrent.seeders = tr.css(':nth-child(6)').first.text.strip
          torrent.leechers = tr.css(':nth-child(7)').first.text.strip
          torrent.snatches = tr.css(':nth-child(8)').first.text.strip
          # The site shows '---' when a torrent has never been snatched.
          torrent.snatches = '0' if torrent.snatches == '---'
          torrent.uploaded_at = Time.now

          results << torrent
        end
      end

      results
    end

    # Crawl a single torrent's detail page and return a populated Torrent.
    # Selectors are tied to the site's table layout (row positions).
    def detail(tracker_id)
      doc = Nokogiri::HTML(open(self.detail_url(tracker_id), self.headers))

      result do |torrent|
        torrent.tracker_id = tracker_id
        torrent.hash = tracker_id
        torrent.title = doc.css('tr:nth-child(1) .row1:nth-child(2)').first.text.strip
        torrent.uploader = doc.css('tr:nth-child(16) a').first.text.strip
        torrent.size = doc.css('tr:nth-child(13) .row1').first.text.strip
        torrent.files = doc.css('tr:nth-child(14) .row1').first.text.gsub(/.*(\d+) files?.*/im, '\1')
        torrent.seeders = doc.css('tr:nth-child(19) .row1').first.text.gsub(/.*Seeds: (\d+).*/, '\1')
        torrent.leechers = doc.css('tr:nth-child(19) .row1').first.text.gsub(/.*Leechers: (\d+).*/, '\1')
        torrent.snatches = doc.css('tr:nth-child(18) .row1').first.text.gsub(/[^\d]+/, '')
        torrent.uploaded_at = Time.now # TODO: Parse site's value?

        # The uploader name doubles as a tag.
        torrent.tags << torrent.uploader

        torrent
      end
    end
  end
end
@@ -0,0 +1,60 @@
module TorrentCrawler::Crawlers
  # Crawler for mininova.org.
  class Mininova < Base
    def index_url
      "http://www.mininova.org/"
    end

    def detail_url(tracker_id)
      "http://www.mininova.org/det/#{tracker_id}"
    end

    # Crawl the front-page torrent table, returning an Array of Torrents.
    # Stops early (returning what has been collected so far) as soon as
    # +last_seen+ is encountered.
    def index(last_seen = nil)
      doc = Nokogiri::HTML(open(self.index_url, self.headers))
      doc.css('table.maintable:nth-child(2) tr').each do |tr|
        # Skip header/spacer rows that have no link in the second cell.
        next if tr.css(':nth-child(2) a').first.nil?

        result do |torrent|
          # Tracker id is the numeric portion of the /get/<id> link.
          torrent.tracker_id = tr.css('td:nth-child(2) a').first['href'].gsub(%r{^.*/get/(\d+)/?$}, '\1')

          # NOTE: `return` inside the block exits #index itself.
          return results if torrent.tracker_id == last_seen

          # torrent.hash
          torrent.title = tr.css('td:nth-child(2) a:nth-child(2)').first.text.strip
          # torrent.uploader
          torrent.size = tr.css('td:nth-child(3)').first.text.strip
          # torrent.files
          torrent.seeders = tr.css('td:nth-child(4)').first.text.strip
          torrent.leechers = tr.css('td:nth-child(5)').first.text.strip
          # torrent.snatches
          torrent.uploaded_at = Time.now

          # First cell and the <small><strong> text become tags
          # (category/sub-category per the specs).
          torrent.tags << tr.css('td:nth-child(1)').first.text.strip
          torrent.tags << tr.css('td:nth-child(2) small strong').first.text.strip

          results << torrent
        end
      end

      results
    end

    # Crawl a single torrent's detail page and return a populated Torrent.
    def detail(tracker_id)
      doc = Nokogiri::HTML(open(self.detail_url(tracker_id), headers))

      result do |torrent|
        torrent.tracker_id = tracker_id
        torrent.hash = doc.css('#torrentdetails p:nth-child(2)').first.text.strip.gsub(/Info hash:\s*(.*)/, '\1')
        torrent.title = doc.css('h1').first.text.gsub(/Details of (.*)/, '\1')
        # torrent.uploader
        torrent.size = doc.css('#torrentdetails p:nth-child(6)').first.text.strip.gsub(/(.*) in \d+ files?/, '\1')
        torrent.files = doc.css('#torrentdetails p:nth-child(6)').first.text.strip.gsub(/.*(\d+) files?$/m, '\1')
        # torrent.seeders
        # torrent.leechers
        # torrent.snatches
        torrent.uploaded_at = DateTime.parse(doc.css('#torrentdetails p:nth-child(3)').first.text.strip.gsub(/Added on:\s*(.*)/, '\1'))

        torrent
      end
    end
  end
end
@@ -0,0 +1,19 @@
# Entry point for the torrent_crawler library: loads dependencies, defines
# the crawler error classes, then requires the Torrent model and the
# bundled crawler implementations.

require 'date'
require 'open-uri'
require 'nokogiri'

module TorrentCrawler
  module Crawlers
    # Raised when a crawler class fails to implement a required method
    class BadCrawlerError < NoMethodError; end

    # Raised when a crawler fails to authenticate with the tracker
    class AuthenticationError < RuntimeError; end
  end
end

require 'torrent_crawler/torrent'

require 'crawlers/base'
require 'crawlers/linux_tracker'
require 'crawlers/mininova'
@@ -0,0 +1,61 @@
module TorrentCrawler
  # Plain value object describing a single crawled torrent. Crawlers fill
  # in whichever attributes the tracker exposes; the rest stay nil.
  class Torrent
    # Key identifying which tracker this torrent came from
    # (see Crawlers::Base#tracker_key).
    attr_accessor :tracker_key

    # The torrent's identifier on its tracker.
    attr_accessor :tracker_id

    # Convenience reader aliasing #tracker_id.
    def id
      tracker_id
    end

    # Convenience writer aliasing #tracker_id=.
    def id=(value)
      # BUG FIX: the original wrote `tracker_id = value`, which assigns a
      # *local* variable and silently discards the value. The explicit
      # receiver is required to invoke the attribute writer.
      self.tracker_id = value
    end

    # Torrent info hash. NOTE: this accessor shadows Object#hash, so
    # instances will misbehave as Hash keys / in Sets.
    attr_accessor :hash

    attr_accessor :title

    attr_accessor :uploader

    # Size in kilobytes; defaults to 0 when never assigned.
    def size
      @size ||= 0
    end

    # Accepts either a numeric value (stored via to_i) or a human-readable
    # string such as "1.5 GB", which is normalized to kilobytes.
    # A string that doesn't start with a number is silently ignored.
    def size=(value)
      if value.respond_to? :downcase
        if value =~ /^([0-9\.]+)(.*)$/
          value = $1.strip.to_f
          unit = $2.strip.gsub(/[^A-Za-z]/, '')

          # Multipliers convert the given unit down to kilobytes.
          case unit
          when /tb/i
            value *= 1024 * 1024 * 1024
          when /gb/i
            value *= 1024 * 1024
          when /mb/i, /megabytes?/i
            value *= 1024
          end

          @size = value.to_i
        end
      else
        @size = value.to_i
      end
    end

    attr_accessor :files

    attr_accessor :seeders

    attr_accessor :leechers

    attr_accessor :snatches

    attr_accessor :uploaded_at

    # Arbitrary tags (category, uploader, ...); lazily initialized so a
    # bare Torrent.new can still append with tags <<.
    def tags
      @tags ||= []
    end
  end
end
@@ -0,0 +1,19 @@
require 'spec_helper'

# Specs for the abstract Crawlers::Base class.

module TorrentCrawler::Crawlers
  # Minimal concrete subclass that overrides nothing, used to exercise
  # Base's default (error-raising / derived-key) behavior.
  class TestCrawler < Base
  end

  describe Base, "#index" do
    it "should raise BadCrawlerError when not implemented" do
      expect { TestCrawler.new.index }.to raise_error(BadCrawlerError, /index has not been implemented/)
    end
  end

  describe Base, "#tracker_key" do
    it "should provide a default tracker_key" do
      # "TestCrawler" should demodulize/underscore to "test_crawler".
      TestCrawler.new.tracker_key.should eql('test_crawler')
    end
  end
end
@@ -0,0 +1,68 @@
require 'spec_helper'

# Specs for the LinuxTracker crawler. FakeWeb intercepts HTTP requests and
# serves canned HTML fixtures instead of hitting linuxtracker.org.

module TorrentCrawler::Crawlers
  describe LinuxTracker do
    subject { LinuxTracker.new }

    it "should implement index_url" do
      expect { subject.index_url }.to_not raise_error
    end

    it "should implement detail_url" do
      expect { subject.detail_url('abcd') }.to_not raise_error
    end
  end

  describe LinuxTracker, "#index" do
    before do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('linux_tracker/index.html'))
      results = crawler.index
      # Examine the first torrent parsed from the fixture page.
      @subject = results[0]
    end
    subject { @subject }

    its(:tracker_key) { should eql('linux_tracker') }
    its(:tracker_id) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:hash) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:title) { should eql('parabola 2010 10 01 netinstall x86 64') }
    its(:uploader) { should be_nil }
    its(:size) { should eql(174_080) }
    its(:files) { should be_nil }
    its(:seeders) { should eql('1') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should eql('0') }
    its(:uploaded_at) { should_not be_nil }
  end

  describe LinuxTracker, "#index with last_seen" do
    it "should stop crawling when it hits a previously-seen id" do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('linux_tracker/index.html'))
      # The fixture appears to list this id third, so only the two newer
      # torrents should be returned.
      results = crawler.index('cd63c50078bae05b27195159508be0787f09d002')
      results.size.should eql(2)
    end
  end

  describe LinuxTracker, "#detail" do
    before do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.detail_url('c35157e2d773fcde76e0b3ae441752f01c82bcd8'), :body => file_fixture('linux_tracker/details.html'))
      @subject = crawler.detail('c35157e2d773fcde76e0b3ae441752f01c82bcd8')
    end
    subject { @subject }

    its(:tracker_key) { should eql('linux_tracker') }
    its(:tracker_id) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:hash) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:title) { should eql('parabola-2010.10.01-netinstall-x86_64') }
    its(:uploader) { should eql('youknowwho') }
    its(:size) { should eql(174_080) }
    its(:files) { should eql('1') }
    its(:seeders) { should eql('2') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should eql('0') }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should_not eql([]) }
  end
end
@@ -0,0 +1,70 @@
require 'spec_helper'

# Specs for the Mininova crawler. FakeWeb intercepts HTTP requests and
# serves canned HTML fixtures instead of hitting mininova.org.

module TorrentCrawler::Crawlers
  describe Mininova do
    subject { Mininova.new }

    it "should implement index_url" do
      expect { subject.index_url }.to_not raise_error
    end

    it "should implement detail_url" do
      expect { subject.detail_url('abcd') }.to_not raise_error
    end
  end

  describe Mininova, "#index" do
    before do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('mininova/index.html'))
      results = crawler.index
      # Examine the first torrent parsed from the fixture page.
      @subject = results[0]
    end
    subject { @subject }

    its(:tracker_key) { should eql('mininova') }
    its(:tracker_id) { should eql('13195739') }
    its(:hash) { should be_nil }
    its(:title) { should eql(%{Nikki McKnight aka Nix "Sky's the Limit"}) }
    its(:uploader) { should be_nil }
    its(:size) { should eql(16_404) }
    its(:files) { should be_nil }
    its(:seeders) { should eql('1') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should be_nil }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should include('Music') }
    its(:tags) { should include('Hip Hop') }
  end

  describe Mininova, "#index with last_seen" do
    it "should stop crawling when it hits a previously-seen id" do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('mininova/index.html'))
      results = crawler.index('13195736')
      results.size.should eql(2)
    end
  end

  describe Mininova, "#detail" do
    before do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.detail_url('13195739'), :body => file_fixture('mininova/details_det.html'))
      @subject = crawler.detail('13195739')
    end
    subject { @subject }

    its(:tracker_key) { should eql('mininova') }
    its(:tracker_id) { should eql('13195739') }
    its(:hash) { should eql('757c4ac2d2aee458aa53847a0bd24b7946efff3b') }
    its(:title) { should eql(%{Nikki McKnight aka Nix "Sky's the Limit"}) }
    its(:uploader) { should be_nil }
    its(:size) { should eql(16_404) }
    its(:files) { should eql('2') }
    its(:seeders) { should be_nil }
    its(:leechers) { should be_nil }
    its(:snatches) { should be_nil }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should eql([]) }
  end
end