torrent_crawler 0.0.0
- data/.document +5 -0
- data/.gitignore +4 -0
- data/LICENSE +20 -0
- data/README.md +23 -0
- data/Rakefile +31 -0
- data/VERSION +1 -0
- data/lib/crawlers/base.rb +51 -0
- data/lib/crawlers/linux_tracker.rb +60 -0
- data/lib/crawlers/mininova.rb +60 -0
- data/lib/torrent_crawler.rb +19 -0
- data/lib/torrent_crawler/torrent.rb +61 -0
- data/spec/crawlers/base_spec.rb +19 -0
- data/spec/crawlers/linux_tracker_spec.rb +68 -0
- data/spec/crawlers/mininova_spec.rb +70 -0
- data/spec/file_fixtures/linux_tracker/details.html +1296 -0
- data/spec/file_fixtures/linux_tracker/index.html +1896 -0
- data/spec/file_fixtures/mininova/details_det.html +239 -0
- data/spec/file_fixtures/mininova/details_tor.html +268 -0
- data/spec/file_fixtures/mininova/index.html +173 -0
- data/spec/meta_tracker/torrent_spec.rb +25 -0
- data/spec/meta_tracker_spec.rb +4 -0
- data/spec/spec_helper.rb +17 -0
- data/torrent_crawler.gemspec +71 -0
- metadata +110 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
+Copyright (c) 2009 rspeicher
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,23 @@
+# torrent_crawler
+
+Crawl multiple torrent sites.
+
+## Default Crawlers
+
+The included crawlers act as more of an example than anything else. Unless you
+really want to know which Linux ISOs just got released.
+
+* [LinuxTracker](http://linuxtracker.org)
+* [Mininova](http://mininova.org)
+
+## Note on Patches/Pull Requests
+
+* Fork
+* Code
+* Commit
+* Push
+* Pull Request
+
+## Copyright
+
+Copyright (c) 2010 Robert Speicher. See LICENSE for details.
data/Rakefile
ADDED
@@ -0,0 +1,31 @@
+require 'rubygems'
+require 'rake'
+
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "torrent_crawler"
+    gem.summary = %Q{Crawl multiple torrent sites}
+    gem.description = %Q{Crawl multiple torrent sites.}
+    gem.email = "rspeicher@gmail.com"
+    gem.homepage = "http://github.com/tsigo/torrent_crawler"
+    gem.authors = ["rspeicher"]
+    gem.add_development_dependency "rspec", "~> 2.0.0"
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
+end
+
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec
+
+begin
+  require 'yard'
+  YARD::Rake::YardocTask.new
+rescue LoadError
+  task :yardoc do
+    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
+  end
+end
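The Rakefile wires up jeweler for gem packaging, RSpec for the test suite, and YARD for documentation. A short usage sketch, assuming those development dependencies are installed:

# Common invocations for the tasks defined in the Rakefile above:
#
#   rake spec     # run the RSpec suite (also the default task)
#   rake yardoc   # generate YARD documentation
#
# Jeweler additionally provides its usual gem housekeeping tasks (build,
# install, version bumping); see the jeweler documentation for the full list.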
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.0.0
data/lib/crawlers/base.rb
ADDED
@@ -0,0 +1,51 @@
+module TorrentCrawler
+  module Crawlers
+    class Base
+      attr_accessor :results
+
+      def initialize
+        @results = []
+      end
+
+      def headers
+        {
+          'User-Agent' => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3",
+          'Accept-Language' => 'en-us,en;q=0.5',
+          'Referer' => index_url
+        }
+      end
+
+      def index_url
+        raise BadCrawlerError, "A method named index_url has not been implemented in this Crawler class"
+      end
+
+      def detail_url
+        raise BadCrawlerError, "A method named detail_url has not been implemented in this Crawler class"
+      end
+
+      def index(last_seen = nil)
+        raise BadCrawlerError, "A method named index has not been implemented in this Crawler class"
+      end
+
+      def detail(tracker_id)
+        raise BadCrawlerError, "A method named detail has not been implemented in this Crawler class"
+      end
+
+      def result(&block)
+        torrent = TorrentCrawler::Torrent.new
+        torrent.tracker_key = self.tracker_key
+
+        yield torrent
+      end
+
+      def tracker_key
+        self.class.to_s.gsub(/::/, '/').
+          gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
+          gsub(/([a-z\d])([A-Z])/, '\1_\2').
+          tr("-", "_").
+          downcase.
+          gsub(/.*\/([^\/]+)$/, '\1')
+      end
+    end
+  end
+end
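Crawlers::Base defines the contract for site-specific crawlers: subclasses override index_url, detail_url, index, and detail, build torrents through the result helper (which pre-fills tracker_key), and inherit a tracker_key derived by underscoring the class name. A minimal sketch of a custom crawler, where ExampleTracker and its URLs are hypothetical:

module TorrentCrawler::Crawlers
  class ExampleTracker < Base
    def index_url
      "http://tracker.example.com/browse"                  # hypothetical URL
    end

    def detail_url(tracker_id)
      "http://tracker.example.com/torrents/#{tracker_id}"  # hypothetical URL
    end

    def index(last_seen = nil)
      # A real crawler would fetch and parse index_url with Nokogiri, as the
      # bundled crawlers below do, pushing each parsed torrent onto `results`.
      results
    end

    def detail(tracker_id)
      result do |torrent|
        torrent.tracker_id = tracker_id
        torrent.title      = "Example torrent"
        torrent
      end
    end
  end
end

TorrentCrawler::Crawlers::ExampleTracker.new.tracker_key  # => "example_tracker"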
data/lib/crawlers/linux_tracker.rb
ADDED
@@ -0,0 +1,60 @@
+module TorrentCrawler::Crawlers
+  class LinuxTracker < Base
+    def index_url
+      "http://linuxtracker.org/"
+    end
+
+    def detail_url(tracker_id)
+      "http://linuxtracker.org/index.php?page=torrent-details&id=#{tracker_id}"
+    end
+
+    def index(last_seen = nil)
+      doc = Nokogiri::HTML(open(self.index_url, self.headers))
+      doc.css('#rightcol :nth-child(4) table > tr').each do |tr|
+        next if tr.css(':nth-child(2) a').first.nil?
+
+        result do |torrent|
+          torrent.tracker_id = tr.css(':nth-child(2) a').first['href'].gsub(/.*id=([a-z0-9]+).*/, '\1')
+
+          return results if torrent.tracker_id == last_seen
+
+          torrent.hash = torrent.tracker_id
+          torrent.title = tr.css(':nth-child(2) a').first.text.strip
+          # torrent.uploader
+          torrent.size = tr.css(':nth-child(5)').first.text.strip
+          # torrent.files
+          torrent.seeders = tr.css(':nth-child(6)').first.text.strip
+          torrent.leechers = tr.css(':nth-child(7)').first.text.strip
+          torrent.snatches = tr.css(':nth-child(8)').first.text.strip
+          torrent.snatches = '0' if torrent.snatches == '---'
+          torrent.uploaded_at = Time.now
+
+          results << torrent
+        end
+      end
+
+      results
+    end
+
+    def detail(tracker_id)
+      doc = Nokogiri::HTML(open(self.detail_url(tracker_id), self.headers))
+
+      result do |torrent|
+        torrent.tracker_id = tracker_id
+        torrent.hash = tracker_id
+        torrent.title = doc.css('tr:nth-child(1) .row1:nth-child(2)').first.text.strip
+        torrent.uploader = doc.css('tr:nth-child(16) a').first.text.strip
+        torrent.size = doc.css('tr:nth-child(13) .row1').first.text.strip
+        torrent.files = doc.css('tr:nth-child(14) .row1').first.text.gsub(/.*(\d+) files?.*/im, '\1')
+        torrent.seeders = doc.css('tr:nth-child(19) .row1').first.text.gsub(/.*Seeds: (\d+).*/, '\1')
+        torrent.leechers = doc.css('tr:nth-child(19) .row1').first.text.gsub(/.*Leechers: (\d+).*/, '\1')
+        torrent.snatches = doc.css('tr:nth-child(18) .row1').first.text.gsub(/[^\d]+/, '')
+        torrent.uploaded_at = Time.now # TODO: Parse site's value?
+
+        torrent.tags << torrent.uploader
+
+        torrent
+      end
+    end
+  end
+end
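A usage sketch for the LinuxTracker crawler (network access assumed; the id shown is a placeholder). Passing the most recently seen tracker_id back into index makes the next crawl stop early once it reaches already-known torrents:

require 'torrent_crawler'

crawler  = TorrentCrawler::Crawlers::LinuxTracker.new
torrents = crawler.index           # torrents listed on the front page
latest   = torrents.first.tracker_id

# Later: only torrents newer than `latest` are returned.
fresh = crawler.index(latest)

# Individual detail pages can be fetched by tracker id.
detail = crawler.detail(latest)
detail.seeders                     # => e.g. "2" (counts come back as strings)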
data/lib/crawlers/mininova.rb
ADDED
@@ -0,0 +1,60 @@
+module TorrentCrawler::Crawlers
+  class Mininova < Base
+    def index_url
+      "http://www.mininova.org/"
+    end
+
+    def detail_url(tracker_id)
+      "http://www.mininova.org/det/#{tracker_id}"
+    end
+
+    def index(last_seen = nil)
+      doc = Nokogiri::HTML(open(self.index_url, self.headers))
+      doc.css('table.maintable:nth-child(2) tr').each do |tr|
+        next if tr.css(':nth-child(2) a').first.nil?
+
+        result do |torrent|
+          torrent.tracker_id = tr.css('td:nth-child(2) a').first['href'].gsub(%r{^.*/get/(\d+)/?$}, '\1')
+
+          return results if torrent.tracker_id == last_seen
+
+          # torrent.hash
+          torrent.title = tr.css('td:nth-child(2) a:nth-child(2)').first.text.strip
+          # torrent.uploader
+          torrent.size = tr.css('td:nth-child(3)').first.text.strip
+          # torrent.files
+          torrent.seeders = tr.css('td:nth-child(4)').first.text.strip
+          torrent.leechers = tr.css('td:nth-child(5)').first.text.strip
+          # torrent.snatches
+          torrent.uploaded_at = Time.now
+
+          torrent.tags << tr.css('td:nth-child(1)').first.text.strip
+          torrent.tags << tr.css('td:nth-child(2) small strong').first.text.strip
+
+          results << torrent
+        end
+      end
+
+      results
+    end
+
+    def detail(tracker_id)
+      doc = Nokogiri::HTML(open(self.detail_url(tracker_id), headers))
+
+      result do |torrent|
+        torrent.tracker_id = tracker_id
+        torrent.hash = doc.css('#torrentdetails p:nth-child(2)').first.text.strip.gsub(/Info hash:\s*(.*)/, '\1')
+        torrent.title = doc.css('h1').first.text.gsub(/Details of (.*)/, '\1')
+        # torrent.uploader
+        torrent.size = doc.css('#torrentdetails p:nth-child(6)').first.text.strip.gsub(/(.*) in \d+ files?/, '\1')
+        torrent.files = doc.css('#torrentdetails p:nth-child(6)').first.text.strip.gsub(/.*(\d+) files?$/m, '\1')
+        # torrent.seeders
+        # torrent.leechers
+        # torrent.snatches
+        torrent.uploaded_at = DateTime.parse(doc.css('#torrentdetails p:nth-child(3)').first.text.strip.gsub(/Added on:\s*(.*)/, '\1'))
+
+        torrent
+      end
+    end
+  end
+end
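Mininova's index rows carry no info hash, uploader, or snatch count, so those fields are only filled in (where the site exposes them) by a follow-up detail request. A sketch, with illustrative output only:

crawler = TorrentCrawler::Crawlers::Mininova.new

crawler.index.each do |summary|
  full = crawler.detail(summary.tracker_id)
  # summary.tags holds the category and sub-category scraped from the index,
  # e.g. ["Music", "Hip Hop"]; full.hash holds the info hash from the detail page.
  puts "#{full.title} [#{full.hash}] - #{summary.tags.join(' / ')}"
end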
data/lib/torrent_crawler.rb
ADDED
@@ -0,0 +1,19 @@
+require 'date'
+require 'open-uri'
+require 'nokogiri'
+
+module TorrentCrawler
+  module Crawlers
+    # Raised when a crawler class fails to implement a required method
+    class BadCrawlerError < NoMethodError; end
+
+    # Raised when a crawler fails to authenticate with the tracker
+    class AuthenticationError < RuntimeError; end
+  end
+end
+
+require 'torrent_crawler/torrent'
+
+require 'crawlers/base'
+require 'crawlers/linux_tracker'
+require 'crawlers/mininova'
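Requiring 'torrent_crawler' pulls in open-uri, Nokogiri, the Torrent model, and the bundled crawlers, and defines the library's two exception classes. A sketch of handling them; note that AuthenticationError is provided for crawlers that must log in, although neither bundled crawler raises it:

require 'torrent_crawler'

begin
  TorrentCrawler::Crawlers::Base.new.index
rescue TorrentCrawler::Crawlers::BadCrawlerError => e
  warn "Crawler is missing a required method: #{e.message}"
rescue TorrentCrawler::Crawlers::AuthenticationError => e
  warn "Could not authenticate with the tracker: #{e.message}"
end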
data/lib/torrent_crawler/torrent.rb
ADDED
@@ -0,0 +1,61 @@
+module TorrentCrawler
+  class Torrent
+    attr_accessor :tracker_key
+
+    attr_accessor :tracker_id
+
+    def id
+      tracker_id
+    end
+
+    def id=(value)
+      self.tracker_id = value # assign the attribute; a bare `tracker_id = value` would only set a local variable
+    end
+
+    attr_accessor :hash
+
+    attr_accessor :title
+
+    attr_accessor :uploader
+
+    def size
+      @size ||= 0
+    end
+
+    def size=(value)
+      if value.respond_to? :downcase
+        if value =~ /^([0-9\.]+)(.*)$/
+          value = $1.strip.to_f
+          unit = $2.strip.gsub(/[^A-Za-z]/, '')
+
+          case unit
+          when /tb/i
+            value *= 1024 * 1024 * 1024
+          when /gb/i
+            value *= 1024 * 1024
+          when /mb/i, /megabytes?/i
+            value *= 1024
+          end
+
+          @size = value.to_i
+        end
+      else
+        @size = value.to_i
+      end
+    end
+
+    attr_accessor :files
+
+    attr_accessor :seeders
+
+    attr_accessor :leechers
+
+    attr_accessor :snatches
+
+    attr_accessor :uploaded_at
+
+    def tags
+      @tags ||= []
+    end
+  end
+end
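Torrent#size= normalizes human-readable size strings down to kilobyte counts (TB, GB, and MB values are multiplied accordingly), while non-string values are stored unchanged; this is consistent with the specs below expecting 174_080 for a roughly 170 MB listing. A small sketch of that behaviour:

t = TorrentCrawler::Torrent.new
t.size = "170 MB"   # string with a unit: 170 * 1024               => 174_080
t.size = "1.5 GB"   # fractional values work too: 1.5 * 1024 * 1024 => 1_572_864
t.size = 2048       # numeric values are stored as-is
t.size              # => 2048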
data/spec/crawlers/base_spec.rb
ADDED
@@ -0,0 +1,19 @@
+require 'spec_helper'
+
+
+module TorrentCrawler::Crawlers
+  class TestCrawler < Base
+  end
+
+  describe Base, "#index" do
+    it "should raise BadCrawlerError when not implemented" do
+      expect { TestCrawler.new.index }.to raise_error(BadCrawlerError, /index has not been implemented/)
+    end
+  end
+
+  describe Base, "#tracker_key" do
+    it "should provide a default tracker_key" do
+      TestCrawler.new.tracker_key.should eql('test_crawler')
+    end
+  end
+end
data/spec/crawlers/linux_tracker_spec.rb
ADDED
@@ -0,0 +1,68 @@
+require 'spec_helper'
+
+module TorrentCrawler::Crawlers
+  describe LinuxTracker do
+    subject { LinuxTracker.new }
+
+    it "should implement index_url" do
+      expect { subject.index_url }.to_not raise_error
+    end
+
+    it "should implement detail_url" do
+      expect { subject.detail_url('abcd') }.to_not raise_error
+    end
+  end
+
+  describe LinuxTracker, "#index" do
+    before do
+      crawler = LinuxTracker.new
+      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('linux_tracker/index.html'))
+      results = crawler.index
+      @subject = results[0]
+    end
+    subject { @subject }
+
+    its(:tracker_key) { should eql('linux_tracker') }
+    its(:tracker_id) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
+    its(:hash) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
+    its(:title) { should eql('parabola 2010 10 01 netinstall x86 64') }
+    its(:uploader) { should be_nil }
+    its(:size) { should eql(174_080) }
+    its(:files) { should be_nil }
+    its(:seeders) { should eql('1') }
+    its(:leechers) { should eql('0') }
+    its(:snatches) { should eql('0') }
+    its(:uploaded_at) { should_not be_nil }
+  end
+
+  describe LinuxTracker, "#index with last_seen" do
+    it "should stop crawling when it hits a previously-seen id" do
+      crawler = LinuxTracker.new
+      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('linux_tracker/index.html'))
+      results = crawler.index('cd63c50078bae05b27195159508be0787f09d002')
+      results.size.should eql(2)
+    end
+  end
+
+  describe LinuxTracker, "#detail" do
+    before do
+      crawler = LinuxTracker.new
+      FakeWeb.register_uri(:get, crawler.detail_url('c35157e2d773fcde76e0b3ae441752f01c82bcd8'), :body => file_fixture('linux_tracker/details.html'))
+      @subject = crawler.detail('c35157e2d773fcde76e0b3ae441752f01c82bcd8')
+    end
+    subject { @subject }
+
+    its(:tracker_key) { should eql('linux_tracker') }
+    its(:tracker_id) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
+    its(:hash) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
+    its(:title) { should eql('parabola-2010.10.01-netinstall-x86_64') }
+    its(:uploader) { should eql('youknowwho') }
+    its(:size) { should eql(174_080) }
+    its(:files) { should eql('1') }
+    its(:seeders) { should eql('2') }
+    its(:leechers) { should eql('0') }
+    its(:snatches) { should eql('0') }
+    its(:uploaded_at) { should_not be_nil }
+    its(:tags) { should_not eql([]) }
+  end
+end
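These specs stub HTTP with FakeWeb and feed saved pages in through a file_fixture helper. That helper lives in spec/spec_helper.rb, whose body is not shown in this diff; a helper with the behaviour the specs rely on would look roughly like this (a hypothetical reconstruction, not the gem's actual code):

# Hypothetical sketch of the fixture helper used above.
def file_fixture(path)
  File.read(File.join(File.dirname(__FILE__), 'file_fixtures', path))
end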
data/spec/crawlers/mininova_spec.rb
ADDED
@@ -0,0 +1,70 @@
+require 'spec_helper'
+
+module TorrentCrawler::Crawlers
+  describe Mininova do
+    subject { Mininova.new }
+
+    it "should implement index_url" do
+      expect { subject.index_url }.to_not raise_error
+    end
+
+    it "should implement detail_url" do
+      expect { subject.detail_url('abcd') }.to_not raise_error
+    end
+  end
+
+  describe Mininova, "#index" do
+    before do
+      crawler = Mininova.new
+      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('mininova/index.html'))
+      results = crawler.index
+      @subject = results[0]
+    end
+    subject { @subject }
+
+    its(:tracker_key) { should eql('mininova') }
+    its(:tracker_id) { should eql('13195739') }
+    its(:hash) { should be_nil }
+    its(:title) { should eql(%{Nikki McKnight aka Nix "Sky's the Limit"}) }
+    its(:uploader) { should be_nil }
+    its(:size) { should eql(16_404) }
+    its(:files) { should be_nil }
+    its(:seeders) { should eql('1') }
+    its(:leechers) { should eql('0') }
+    its(:snatches) { should be_nil }
+    its(:uploaded_at) { should_not be_nil }
+    its(:tags) { should include('Music') }
+    its(:tags) { should include('Hip Hop') }
+  end
+
+  describe Mininova, "#index with last_seen" do
+    it "should stop crawling when it hits a previously-seen id" do
+      crawler = Mininova.new
+      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('mininova/index.html'))
+      results = crawler.index('13195736')
+      results.size.should eql(2)
+    end
+  end
+
+  describe Mininova, "#detail" do
+    before do
+      crawler = Mininova.new
+      FakeWeb.register_uri(:get, crawler.detail_url('13195739'), :body => file_fixture('mininova/details_det.html'))
+      @subject = crawler.detail('13195739')
+    end
+    subject { @subject }
+
+    its(:tracker_key) { should eql('mininova') }
+    its(:tracker_id) { should eql('13195739') }
+    its(:hash) { should eql('757c4ac2d2aee458aa53847a0bd24b7946efff3b') }
+    its(:title) { should eql(%{Nikki McKnight aka Nix "Sky's the Limit"}) }
+    its(:uploader) { should be_nil }
+    its(:size) { should eql(16_404) }
+    its(:files) { should eql('2') }
+    its(:seeders) { should be_nil }
+    its(:leechers) { should be_nil }
+    its(:snatches) { should be_nil }
+    its(:uploaded_at) { should_not be_nil }
+    its(:tags) { should eql([]) }
+  end
+end