torrent_crawler 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.md
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,4 @@
1
+ coverage
2
+ rdoc
3
+ pkg
4
+ autotest/
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 rspeicher
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,23 @@
1
+ # torrent_crawler
2
+
3
+ Crawl multiple torrent sites.
4
+
5
+ ## Default Crawlers
6
+
7
+ The included crawlers act as more of an example than anything else. Unless you
8
+ really want to know which Linux ISOs just got released.
9
+
10
+ * [LinuxTracker](http://linuxtracker.org)
11
+ * [Mininova](http://mininova.org)
12
+
13
+ ## Note on Patches/Pull Requests
14
+
15
+ * Fork
16
+ * Code
17
+ * Commit
18
+ * Push
19
+ * Pull Request
20
+
21
+ ## Copyright
22
+
23
+ Copyright (c) 2010 Robert Speicher. See LICENSE for details.
@@ -0,0 +1,31 @@
1
require 'rubygems'
require 'rake'

# Gem packaging tasks via Jeweler. Jeweler is optional at development
# time: if it is not installed, the packaging tasks are simply skipped.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "torrent_crawler"
    gem.summary = %Q{Crawl multiple torrent sites}
    gem.description = %Q{Crawl multiple torrent sites.}
    gem.email = "rspeicher@gmail.com"
    gem.homepage = "http://github.com/tsigo/torrent_crawler"
    gem.authors = ["rspeicher"]
    gem.add_development_dependency "rspec", "~> 2.0.0"
  end
  # Adds rake tasks for releasing the gem to Gemcutter/RubyGems.org.
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# `rake spec` runs the RSpec suite; it is also the default task.
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec)
task :default => :spec

# `rake yard` generates API documentation when YARD is available;
# otherwise the task aborts with an installation hint.
begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,51 @@
1
module TorrentCrawler
  module Crawlers
    # Abstract superclass for site-specific crawlers.
    #
    # Subclasses are expected to implement #index_url, #detail_url,
    # #index and #detail; the defaults here raise BadCrawlerError so a
    # missing implementation fails loudly rather than silently.
    class Base
      # Accumulator of Torrent objects collected by #index.
      attr_accessor :results

      def initialize
        @results = []
      end

      # Browser-like HTTP request headers sent with every fetch, so the
      # crawler is less likely to be rejected as a bot. The Referer is
      # the subclass's own index page.
      def headers
        user_agent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3"

        {
          'User-Agent'      => user_agent,
          'Accept-Language' => 'en-us,en;q=0.5',
          'Referer'         => index_url
        }
      end

      # URL of the tracker's torrent listing page. Abstract.
      def index_url
        raise BadCrawlerError, "A method named index_url has not been implemented in this Crawler class"
      end

      # URL of a single torrent's detail page. Abstract.
      def detail_url
        raise BadCrawlerError, "A method named detail_url has not been implemented in this Crawler class"
      end

      # Crawl the listing page, stopping at +last_seen+. Abstract.
      def index(last_seen = nil)
        raise BadCrawlerError, "A method named index has not been implemented in this Crawler class"
      end

      # Crawl one torrent's detail page. Abstract.
      def detail(tracker_id)
        raise BadCrawlerError, "A method named detail has not been implemented in this Crawler class"
      end

      # Builds a Torrent pre-tagged with this crawler's tracker_key and
      # yields it to the block; returns whatever the block returns.
      def result
        torrent = TorrentCrawler::Torrent.new
        torrent.tracker_key = tracker_key

        yield torrent
      end

      # Default tracker key derived from the class name: the demodulized,
      # underscored class name (e.g. "TorrentCrawler::Crawlers::LinuxTracker"
      # becomes "linux_tracker").
      def tracker_key
        key = self.class.to_s.gsub(/::/, '/')
        key = key.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
        key = key.gsub(/([a-z\d])([A-Z])/, '\1_\2')
        key = key.tr("-", "_").downcase
        key.gsub(/.*\/([^\/]+)$/, '\1')
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
module TorrentCrawler::Crawlers
  # Crawler for linuxtracker.org (Linux distribution torrents).
  class LinuxTracker < Base
    def index_url
      "http://linuxtracker.org/"
    end

    # +tracker_id+ is the site's hex torrent id.
    def detail_url(tracker_id)
      "http://linuxtracker.org/index.php?page=torrent-details&id=#{tracker_id}"
    end

    # Scrapes the front-page torrent table into +results+. Crawling
    # stops (returning what was collected so far) as soon as the row's
    # id equals +last_seen+, so repeated runs only pick up new torrents.
    def index(last_seen = nil)
      doc = Nokogiri::HTML(open(self.index_url, self.headers))
      doc.css('#rightcol :nth-child(4) table > tr').each do |tr|
        # Rows without a link in the second cell are header/spacer rows.
        next if tr.css(':nth-child(2) a').first.nil?

        result do |torrent|
          # Pull the id out of the row link's query string.
          torrent.tracker_id = tr.css(':nth-child(2) a').first['href'].gsub(/.*id=([a-z0-9]+).*/, '\1')

          return results if torrent.tracker_id == last_seen

          # On this site the tracker id doubles as the info-hash.
          torrent.hash = torrent.tracker_id
          torrent.title = tr.css(':nth-child(2) a').first.text.strip
          # torrent.uploader
          torrent.size = tr.css(':nth-child(5)').first.text.strip
          # torrent.files
          torrent.seeders = tr.css(':nth-child(6)').first.text.strip
          torrent.leechers = tr.css(':nth-child(7)').first.text.strip
          torrent.snatches = tr.css(':nth-child(8)').first.text.strip
          # The site shows '---' for zero snatches; normalize it.
          torrent.snatches = '0' if torrent.snatches == '---'
          torrent.uploaded_at = Time.now

          results << torrent
        end
      end

      results
    end

    # Scrapes a single torrent's detail page; returns a populated
    # Torrent (the block's return value, via Base#result).
    # NOTE(review): the tr:nth-child(N) selectors are tied to the site's
    # table layout at crawl-fixture time — verify against current markup.
    def detail(tracker_id)
      doc = Nokogiri::HTML(open(self.detail_url(tracker_id), self.headers))

      result do |torrent|
        torrent.tracker_id = tracker_id
        torrent.hash = tracker_id
        torrent.title = doc.css('tr:nth-child(1) .row1:nth-child(2)').first.text.strip
        torrent.uploader = doc.css('tr:nth-child(16) a').first.text.strip
        torrent.size = doc.css('tr:nth-child(13) .row1').first.text.strip
        torrent.files = doc.css('tr:nth-child(14) .row1').first.text.gsub(/.*(\d+) files?.*/im, '\1')
        torrent.seeders = doc.css('tr:nth-child(19) .row1').first.text.gsub(/.*Seeds: (\d+).*/, '\1')
        torrent.leechers = doc.css('tr:nth-child(19) .row1').first.text.gsub(/.*Leechers: (\d+).*/, '\1')
        torrent.snatches = doc.css('tr:nth-child(18) .row1').first.text.gsub(/[^\d]+/, '')
        torrent.uploaded_at = Time.now # TODO: Parse site's value?

        torrent.tags << torrent.uploader

        torrent
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
module TorrentCrawler::Crawlers
  # Crawler for mininova.org.
  class Mininova < Base
    def index_url
      "http://www.mininova.org/"
    end

    # +tracker_id+ is mininova's numeric torrent id.
    def detail_url(tracker_id)
      "http://www.mininova.org/det/#{tracker_id}"
    end

    # Scrapes the front-page torrent table into +results+. Crawling
    # stops (returning what was collected so far) as soon as the row's
    # id equals +last_seen+, so repeated runs only pick up new torrents.
    def index(last_seen = nil)
      doc = Nokogiri::HTML(open(self.index_url, self.headers))
      doc.css('table.maintable:nth-child(2) tr').each do |tr|
        # Rows without a link in the second cell are header/spacer rows.
        next if tr.css(':nth-child(2) a').first.nil?

        result do |torrent|
          # Numeric id taken from the "/get/<id>" download link.
          torrent.tracker_id = tr.css('td:nth-child(2) a').first['href'].gsub(%r{^.*/get/(\d+)/?$}, '\1')

          return results if torrent.tracker_id == last_seen

          # torrent.hash — not shown on the index page; see #detail.
          torrent.title = tr.css('td:nth-child(2) a:nth-child(2)').first.text.strip
          # torrent.uploader
          torrent.size = tr.css('td:nth-child(3)').first.text.strip
          # torrent.files
          torrent.seeders = tr.css('td:nth-child(4)').first.text.strip
          torrent.leechers = tr.css('td:nth-child(5)').first.text.strip
          # torrent.snatches
          torrent.uploaded_at = Time.now

          # Category and sub-category become tags.
          torrent.tags << tr.css('td:nth-child(1)').first.text.strip
          torrent.tags << tr.css('td:nth-child(2) small strong').first.text.strip

          results << torrent
        end
      end

      results
    end

    # Scrapes a single torrent's detail page; returns a populated
    # Torrent (the block's return value, via Base#result).
    # NOTE(review): the p:nth-child(N) selectors are tied to the site's
    # markup at crawl-fixture time — verify against current markup.
    def detail(tracker_id)
      doc = Nokogiri::HTML(open(self.detail_url(tracker_id), headers))

      result do |torrent|
        torrent.tracker_id = tracker_id
        torrent.hash = doc.css('#torrentdetails p:nth-child(2)').first.text.strip.gsub(/Info hash:\s*(.*)/, '\1')
        torrent.title = doc.css('h1').first.text.gsub(/Details of (.*)/, '\1')
        # torrent.uploader
        torrent.size = doc.css('#torrentdetails p:nth-child(6)').first.text.strip.gsub(/(.*) in \d+ files?/, '\1')
        torrent.files = doc.css('#torrentdetails p:nth-child(6)').first.text.strip.gsub(/.*(\d+) files?$/m, '\1')
        # torrent.seeders
        # torrent.leechers
        # torrent.snatches
        torrent.uploaded_at = DateTime.parse(doc.css('#torrentdetails p:nth-child(3)').first.text.strip.gsub(/Added on:\s*(.*)/, '\1'))

        torrent
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
# Main entry point for the torrent_crawler gem: sets up dependencies,
# defines the crawler error hierarchy, then loads the Torrent value
# object and the bundled crawlers.
require 'date'
require 'open-uri'
require 'nokogiri'

module TorrentCrawler
  module Crawlers
    # Raised when a crawler class fails to implement a required method
    class BadCrawlerError < NoMethodError; end

    # Raised when a crawler fails to authenticate with the tracker
    class AuthenticationError < RuntimeError; end
  end
end

# Torrent must be loaded before the crawlers, which instantiate it.
require 'torrent_crawler/torrent'

require 'crawlers/base'
require 'crawlers/linux_tracker'
require 'crawlers/mininova'
@@ -0,0 +1,61 @@
1
module TorrentCrawler
  # Plain value object holding the attributes a crawler scrapes for a
  # single torrent. All attributes are optional; crawlers fill in
  # whatever their tracker exposes.
  class Torrent
    # Key identifying which crawler produced this torrent (see
    # Crawlers::Base#tracker_key).
    attr_accessor :tracker_key

    # The torrent's id on the originating tracker.
    attr_accessor :tracker_id

    # Convenience reader aliasing #tracker_id.
    def id
      tracker_id
    end

    # Convenience writer aliasing #tracker_id=.
    # FIX: the original body read `tracker_id = value`, which assigned a
    # throwaway local variable instead of the attribute, making this
    # setter a silent no-op.
    def id=(value)
      self.tracker_id = value
    end

    # The torrent's info-hash string. NOTE: this intentionally shadows
    # Object#hash, so Torrent instances should not be used as Hash keys.
    attr_accessor :hash

    attr_accessor :title

    attr_accessor :uploader

    # Torrent size; defaults to 0 when never assigned.
    # Units appear to be kilobytes ("MB" strings are multiplied by 1024
    # in #size=) — confirmed by the specs expecting 16_404 for "16.02 MB".
    def size
      @size ||= 0
    end

    # Accepts either a number (stored via to_i) or a human-readable
    # string such as "16.02 MB" or "1.5 GB", which is normalized to an
    # integer number of kilobytes.
    def size=(value)
      if value.respond_to? :downcase
        if value =~ /^([0-9\.]+)(.*)$/
          value = $1.strip.to_f
          unit = $2.strip.gsub(/[^A-Za-z]/, '')

          case unit
          when /tb/i
            value *= 1024 * 1024 * 1024
          when /gb/i
            value *= 1024 * 1024
          when /mb/i, /megabytes?/i
            value *= 1024
          end

          @size = value.to_i
        else
          # FIX: strings with no leading number previously left @size
          # untouched; fall back to String#to_i (0 for non-numeric).
          @size = value.to_i
        end
      else
        @size = value.to_i
      end
    end

    attr_accessor :files

    attr_accessor :seeders

    attr_accessor :leechers

    # Number of completed downloads, if the tracker reports it.
    attr_accessor :snatches

    attr_accessor :uploaded_at

    # Free-form tags (categories, uploader names, etc.); lazily
    # initialized so crawlers can append without setup.
    def tags
      @tags ||= []
    end
  end
end
@@ -0,0 +1,19 @@
1
require 'spec_helper'


module TorrentCrawler::Crawlers
  # Minimal concrete subclass used to exercise Base's default behavior.
  class TestCrawler < Base
  end

  describe Base, "#index" do
    it "should raise BadCrawlerError when not implemented" do
      expect { TestCrawler.new.index }.to raise_error(BadCrawlerError, /index has not been implemented/)
    end
  end

  describe Base, "#tracker_key" do
    # Base derives the key from the class name (demodulized, underscored).
    it "should provide a default tracker_key" do
      TestCrawler.new.tracker_key.should eql('test_crawler')
    end
  end
end
@@ -0,0 +1,68 @@
1
require 'spec_helper'

module TorrentCrawler::Crawlers
  describe LinuxTracker do
    subject { LinuxTracker.new }

    it "should implement index_url" do
      expect { subject.index_url }.to_not raise_error
    end

    it "should implement detail_url" do
      expect { subject.detail_url('abcd') }.to_not raise_error
    end
  end

  # Index crawling against a recorded fixture of the site's front page;
  # FakeWeb intercepts the HTTP request so no network access happens.
  describe LinuxTracker, "#index" do
    before do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('linux_tracker/index.html'))
      results = crawler.index
      @subject = results[0]
    end
    subject { @subject }

    its(:tracker_key) { should eql('linux_tracker') }
    its(:tracker_id) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:hash) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:title) { should eql('parabola 2010 10 01 netinstall x86 64') }
    its(:uploader) { should be_nil }
    its(:size) { should eql(174_080) }
    its(:files) { should be_nil }
    its(:seeders) { should eql('1') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should eql('0') }
    its(:uploaded_at) { should_not be_nil }
  end

  describe LinuxTracker, "#index with last_seen" do
    it "should stop crawling when it hits a previously-seen id" do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('linux_tracker/index.html'))
      results = crawler.index('cd63c50078bae05b27195159508be0787f09d002')
      results.size.should eql(2)
    end
  end

  # Detail crawling against a recorded fixture of one detail page.
  describe LinuxTracker, "#detail" do
    before do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.detail_url('c35157e2d773fcde76e0b3ae441752f01c82bcd8'), :body => file_fixture('linux_tracker/details.html'))
      @subject = crawler.detail('c35157e2d773fcde76e0b3ae441752f01c82bcd8')
    end
    subject { @subject }

    its(:tracker_key) { should eql('linux_tracker') }
    its(:tracker_id) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:hash) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:title) { should eql('parabola-2010.10.01-netinstall-x86_64') }
    its(:uploader) { should eql('youknowwho') }
    its(:size) { should eql(174_080) }
    its(:files) { should eql('1') }
    its(:seeders) { should eql('2') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should eql('0') }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should_not eql([]) }
  end
end
@@ -0,0 +1,70 @@
1
require 'spec_helper'

module TorrentCrawler::Crawlers
  describe Mininova do
    subject { Mininova.new }

    it "should implement index_url" do
      expect { subject.index_url }.to_not raise_error
    end

    it "should implement detail_url" do
      expect { subject.detail_url('abcd') }.to_not raise_error
    end
  end

  # Index crawling against a recorded fixture of the site's front page;
  # FakeWeb intercepts the HTTP request so no network access happens.
  describe Mininova, "#index" do
    before do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('mininova/index.html'))
      results = crawler.index
      @subject = results[0]
    end
    subject { @subject }

    its(:tracker_key) { should eql('mininova') }
    its(:tracker_id) { should eql('13195739') }
    its(:hash) { should be_nil }
    its(:title) { should eql(%{Nikki McKnight aka Nix "Sky's the Limit"}) }
    its(:uploader) { should be_nil }
    its(:size) { should eql(16_404) }
    its(:files) { should be_nil }
    its(:seeders) { should eql('1') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should be_nil }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should include('Music') }
    its(:tags) { should include('Hip Hop') }
  end

  describe Mininova, "#index with last_seen" do
    it "should stop crawling when it hits a previously-seen id" do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('mininova/index.html'))
      results = crawler.index('13195736')
      results.size.should eql(2)
    end
  end

  # Detail crawling against a recorded fixture of one detail page.
  describe Mininova, "#detail" do
    before do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.detail_url('13195739'), :body => file_fixture('mininova/details_det.html'))
      @subject = crawler.detail('13195739')
    end
    subject { @subject }

    its(:tracker_key) { should eql('mininova') }
    its(:tracker_id) { should eql('13195739') }
    its(:hash) { should eql('757c4ac2d2aee458aa53847a0bd24b7946efff3b') }
    its(:title) { should eql(%{Nikki McKnight aka Nix "Sky's the Limit"}) }
    its(:uploader) { should be_nil }
    its(:size) { should eql(16_404) }
    its(:files) { should eql('2') }
    its(:seeders) { should be_nil }
    its(:leechers) { should be_nil }
    its(:snatches) { should be_nil }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should eql([]) }
  end
end