torrent_crawler 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.md
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,4 @@
1
+ coverage
2
+ rdoc
3
+ pkg
4
+ autotest/
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 rspeicher
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,23 @@
1
+ # torrent_crawler
2
+
3
+ Crawl multiple torrent sites.
4
+
5
+ ## Default Crawlers
6
+
7
+ The included crawlers act as more of an example than anything else. Unless you
8
+ really want to know which Linux ISOs just got released.
9
+
10
+ * [LinuxTracker](http://linuxtracker.org)
11
+ * [Mininova](http://mininova.org)
12
+
13
+ ## Note on Patches/Pull Requests
14
+
15
+ * Fork
16
+ * Code
17
+ * Commit
18
+ * Push
19
+ * Pull Request
20
+
21
+ ## Copyright
22
+
23
+ Copyright (c) 2010 Robert Speicher. See LICENSE for details.
@@ -0,0 +1,31 @@
# Rakefile for the torrent_crawler gem.
#
# Provides gem packaging/release tasks (via Jeweler), the default RSpec
# test task, and an optional YARD documentation task.

require 'rubygems'
require 'rake'

begin
  require 'jeweler'
  # Jeweler generates gemspec/build/release tasks from the config below.
  Jeweler::Tasks.new do |gem|
    gem.name = "torrent_crawler"
    gem.summary = %Q{Crawl multiple torrent sites}
    gem.description = %Q{Crawl multiple torrent sites.}
    gem.email = "rspeicher@gmail.com"
    gem.homepage = "http://github.com/tsigo/torrent_crawler"
    gem.authors = ["rspeicher"]
    gem.add_development_dependency "rspec", "~> 2.0.0"
  end
  # Adds tasks for releasing the gem to Gemcutter (rubygems.org).
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# `rake` (with no arguments) runs the RSpec suite.
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec)
task :default => :spec

begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  # Fall back to a stub task that explains how to install YARD.
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
@@ -0,0 +1,51 @@
module TorrentCrawler
  module Crawlers
    # Abstract base class for site-specific crawlers.
    #
    # Subclasses must implement +index_url+, +detail_url+, +index+ and
    # +detail+; the base implementations raise BadCrawlerError so a missing
    # override fails with a descriptive message instead of a bare
    # NoMethodError/ArgumentError.
    class Base
      # Array of Torrent objects accumulated by #index.
      attr_accessor :results

      def initialize
        @results = []
      end

      # Default HTTP request headers sent with every crawl request.
      # The Referer is set to the tracker's index page so requests look
      # like ordinary browsing.
      def headers
        {
          'User-Agent' => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3",
          'Accept-Language' => 'en-us,en;q=0.5',
          'Referer' => index_url
        }
      end

      # URL of the tracker's torrent listing page. Must be overridden.
      def index_url
        raise BadCrawlerError, "A method named index_url has not been implemented in this Crawler class"
      end

      # URL of a single torrent's detail page. Must be overridden.
      #
      # FIX: accepts the tracker_id argument that subclasses (and callers)
      # use, so calling +detail_url(id)+ on a crawler that forgot to
      # implement it raises BadCrawlerError rather than ArgumentError.
      def detail_url(tracker_id = nil)
        raise BadCrawlerError, "A method named detail_url has not been implemented in this Crawler class"
      end

      # Crawl the index page, returning an Array of Torrents. Must be
      # overridden. +last_seen+ is a tracker_id at which to stop crawling.
      def index(last_seen = nil)
        raise BadCrawlerError, "A method named index has not been implemented in this Crawler class"
      end

      # Crawl a single torrent's detail page. Must be overridden.
      def detail(tracker_id)
        raise BadCrawlerError, "A method named detail has not been implemented in this Crawler class"
      end

      # Build a new Torrent pre-populated with this crawler's tracker_key
      # and yield it to the block; returns the block's value.
      def result(&block)
        torrent = TorrentCrawler::Torrent.new
        torrent.tracker_key = self.tracker_key

        yield torrent
      end

      # Default tracker key derived from the class name: the demodulized
      # class name, underscored — e.g. LinuxTracker => "linux_tracker".
      def tracker_key
        self.class.to_s.gsub(/::/, '/').
          gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
          gsub(/([a-z\d])([A-Z])/,'\1_\2').
          tr("-", "_").
          downcase.
          gsub(/.*\/([^\/]+)$/, '\1')
      end
    end
  end
end
@@ -0,0 +1,60 @@
module TorrentCrawler::Crawlers
  # Crawler for linuxtracker.org.
  class LinuxTracker < Base
    def index_url
      "http://linuxtracker.org/"
    end

    def detail_url(tracker_id)
      "http://linuxtracker.org/index.php?page=torrent-details&id=#{tracker_id}"
    end

    # Crawl the front-page torrent table, returning an Array of Torrents.
    # Stops early (returning what has been collected so far) as soon as
    # +last_seen+ is encountered.
    def index(last_seen = nil)
      doc = Nokogiri::HTML(open(self.index_url, self.headers))
      doc.css('#rightcol :nth-child(4) table > tr').each do |tr|
        # Skip header/spacer rows that have no link in the second cell.
        next if tr.css(':nth-child(2) a').first.nil?

        result do |torrent|
          # The id= query parameter appears to be the torrent's info hash
          # (the specs assert hash == tracker_id).
          torrent.tracker_id = tr.css(':nth-child(2) a').first['href'].gsub(/.*id=([a-z0-9]+).*/, '\1')

          # NOTE: `return` inside the block exits #index itself.
          return results if torrent.tracker_id == last_seen

          torrent.hash = torrent.tracker_id
          torrent.title = tr.css(':nth-child(2) a').first.text.strip
          # torrent.uploader
          torrent.size = tr.css(':nth-child(5)').first.text.strip
          # torrent.files
          torrent.seeders = tr.css(':nth-child(6)').first.text.strip
          torrent.leechers = tr.css(':nth-child(7)').first.text.strip
          torrent.snatches = tr.css(':nth-child(8)').first.text.strip
          # The site shows '---' when a torrent has never been snatched.
          torrent.snatches = '0' if torrent.snatches == '---'
          torrent.uploaded_at = Time.now

          results << torrent
        end
      end

      results
    end

    # Crawl a single torrent's detail page and return a populated Torrent.
    # Selectors are tied to the site's table layout (row positions).
    def detail(tracker_id)
      doc = Nokogiri::HTML(open(self.detail_url(tracker_id), self.headers))

      result do |torrent|
        torrent.tracker_id = tracker_id
        torrent.hash = tracker_id
        torrent.title = doc.css('tr:nth-child(1) .row1:nth-child(2)').first.text.strip
        torrent.uploader = doc.css('tr:nth-child(16) a').first.text.strip
        torrent.size = doc.css('tr:nth-child(13) .row1').first.text.strip
        torrent.files = doc.css('tr:nth-child(14) .row1').first.text.gsub(/.*(\d+) files?.*/im, '\1')
        torrent.seeders = doc.css('tr:nth-child(19) .row1').first.text.gsub(/.*Seeds: (\d+).*/, '\1')
        torrent.leechers = doc.css('tr:nth-child(19) .row1').first.text.gsub(/.*Leechers: (\d+).*/, '\1')
        torrent.snatches = doc.css('tr:nth-child(18) .row1').first.text.gsub(/[^\d]+/, '')
        torrent.uploaded_at = Time.now # TODO: Parse site's value?

        # The uploader name doubles as a tag.
        torrent.tags << torrent.uploader

        torrent
      end
    end
  end
end
@@ -0,0 +1,60 @@
module TorrentCrawler::Crawlers
  # Crawler for mininova.org.
  class Mininova < Base
    def index_url
      "http://www.mininova.org/"
    end

    def detail_url(tracker_id)
      "http://www.mininova.org/det/#{tracker_id}"
    end

    # Crawl the front-page torrent table, returning an Array of Torrents.
    # Stops early (returning what has been collected so far) as soon as
    # +last_seen+ is encountered.
    def index(last_seen = nil)
      doc = Nokogiri::HTML(open(self.index_url, self.headers))
      doc.css('table.maintable:nth-child(2) tr').each do |tr|
        # Skip header/spacer rows that have no link in the second cell.
        next if tr.css(':nth-child(2) a').first.nil?

        result do |torrent|
          # Tracker id is the numeric portion of the /get/<id> link.
          torrent.tracker_id = tr.css('td:nth-child(2) a').first['href'].gsub(%r{^.*/get/(\d+)/?$}, '\1')

          # NOTE: `return` inside the block exits #index itself.
          return results if torrent.tracker_id == last_seen

          # torrent.hash
          torrent.title = tr.css('td:nth-child(2) a:nth-child(2)').first.text.strip
          # torrent.uploader
          torrent.size = tr.css('td:nth-child(3)').first.text.strip
          # torrent.files
          torrent.seeders = tr.css('td:nth-child(4)').first.text.strip
          torrent.leechers = tr.css('td:nth-child(5)').first.text.strip
          # torrent.snatches
          torrent.uploaded_at = Time.now

          # First cell and the <small><strong> text become tags
          # (category/sub-category per the specs).
          torrent.tags << tr.css('td:nth-child(1)').first.text.strip
          torrent.tags << tr.css('td:nth-child(2) small strong').first.text.strip

          results << torrent
        end
      end

      results
    end

    # Crawl a single torrent's detail page and return a populated Torrent.
    def detail(tracker_id)
      doc = Nokogiri::HTML(open(self.detail_url(tracker_id), headers))

      result do |torrent|
        torrent.tracker_id = tracker_id
        torrent.hash = doc.css('#torrentdetails p:nth-child(2)').first.text.strip.gsub(/Info hash:\s*(.*)/, '\1')
        torrent.title = doc.css('h1').first.text.gsub(/Details of (.*)/, '\1')
        # torrent.uploader
        torrent.size = doc.css('#torrentdetails p:nth-child(6)').first.text.strip.gsub(/(.*) in \d+ files?/, '\1')
        torrent.files = doc.css('#torrentdetails p:nth-child(6)').first.text.strip.gsub(/.*(\d+) files?$/m, '\1')
        # torrent.seeders
        # torrent.leechers
        # torrent.snatches
        torrent.uploaded_at = DateTime.parse(doc.css('#torrentdetails p:nth-child(3)').first.text.strip.gsub(/Added on:\s*(.*)/, '\1'))

        torrent
      end
    end
  end
end
@@ -0,0 +1,19 @@
# Entry point for the torrent_crawler library: loads dependencies, defines
# the crawler error classes, then requires the Torrent model and the
# bundled crawler implementations.

require 'date'
require 'open-uri'
require 'nokogiri'

module TorrentCrawler
  module Crawlers
    # Raised when a crawler class fails to implement a required method
    class BadCrawlerError < NoMethodError; end

    # Raised when a crawler fails to authenticate with the tracker
    class AuthenticationError < RuntimeError; end
  end
end

require 'torrent_crawler/torrent'

require 'crawlers/base'
require 'crawlers/linux_tracker'
require 'crawlers/mininova'
@@ -0,0 +1,61 @@
module TorrentCrawler
  # Plain value object describing a single crawled torrent. Crawlers fill
  # in whichever attributes the tracker exposes; the rest stay nil.
  class Torrent
    # Key identifying which tracker this torrent came from
    # (see Crawlers::Base#tracker_key).
    attr_accessor :tracker_key

    # The torrent's identifier on its tracker.
    attr_accessor :tracker_id

    # Convenience reader aliasing #tracker_id.
    def id
      tracker_id
    end

    # Convenience writer aliasing #tracker_id=.
    def id=(value)
      # BUG FIX: the original wrote `tracker_id = value`, which assigns a
      # *local* variable and silently discards the value. The explicit
      # receiver is required to invoke the attribute writer.
      self.tracker_id = value
    end

    # Torrent info hash. NOTE: this accessor shadows Object#hash, so
    # instances will misbehave as Hash keys / in Sets.
    attr_accessor :hash

    attr_accessor :title

    attr_accessor :uploader

    # Size in kilobytes; defaults to 0 when never assigned.
    def size
      @size ||= 0
    end

    # Accepts either a numeric value (stored via to_i) or a human-readable
    # string such as "1.5 GB", which is normalized to kilobytes.
    # A string that doesn't start with a number is silently ignored.
    def size=(value)
      if value.respond_to? :downcase
        if value =~ /^([0-9\.]+)(.*)$/
          value = $1.strip.to_f
          unit = $2.strip.gsub(/[^A-Za-z]/, '')

          # Multipliers convert the given unit down to kilobytes.
          case unit
          when /tb/i
            value *= 1024 * 1024 * 1024
          when /gb/i
            value *= 1024 * 1024
          when /mb/i, /megabytes?/i
            value *= 1024
          end

          @size = value.to_i
        end
      else
        @size = value.to_i
      end
    end

    attr_accessor :files

    attr_accessor :seeders

    attr_accessor :leechers

    attr_accessor :snatches

    attr_accessor :uploaded_at

    # Arbitrary tags (category, uploader, ...); lazily initialized so a
    # bare Torrent.new can still append with tags <<.
    def tags
      @tags ||= []
    end
  end
end
@@ -0,0 +1,19 @@
require 'spec_helper'

# Specs for the abstract Crawlers::Base class.

module TorrentCrawler::Crawlers
  # Minimal concrete subclass that overrides nothing, used to exercise
  # Base's default (error-raising / derived-key) behavior.
  class TestCrawler < Base
  end

  describe Base, "#index" do
    it "should raise BadCrawlerError when not implemented" do
      expect { TestCrawler.new.index }.to raise_error(BadCrawlerError, /index has not been implemented/)
    end
  end

  describe Base, "#tracker_key" do
    it "should provide a default tracker_key" do
      # "TestCrawler" should demodulize/underscore to "test_crawler".
      TestCrawler.new.tracker_key.should eql('test_crawler')
    end
  end
end
@@ -0,0 +1,68 @@
require 'spec_helper'

# Specs for the LinuxTracker crawler. FakeWeb intercepts HTTP requests and
# serves canned HTML fixtures instead of hitting linuxtracker.org.

module TorrentCrawler::Crawlers
  describe LinuxTracker do
    subject { LinuxTracker.new }

    it "should implement index_url" do
      expect { subject.index_url }.to_not raise_error
    end

    it "should implement detail_url" do
      expect { subject.detail_url('abcd') }.to_not raise_error
    end
  end

  describe LinuxTracker, "#index" do
    before do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('linux_tracker/index.html'))
      results = crawler.index
      # Examine the first torrent parsed from the fixture page.
      @subject = results[0]
    end
    subject { @subject }

    its(:tracker_key) { should eql('linux_tracker') }
    its(:tracker_id) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:hash) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:title) { should eql('parabola 2010 10 01 netinstall x86 64') }
    its(:uploader) { should be_nil }
    its(:size) { should eql(174_080) }
    its(:files) { should be_nil }
    its(:seeders) { should eql('1') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should eql('0') }
    its(:uploaded_at) { should_not be_nil }
  end

  describe LinuxTracker, "#index with last_seen" do
    it "should stop crawling when it hits a previously-seen id" do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('linux_tracker/index.html'))
      # The fixture appears to list this id third, so only the two newer
      # torrents should be returned.
      results = crawler.index('cd63c50078bae05b27195159508be0787f09d002')
      results.size.should eql(2)
    end
  end

  describe LinuxTracker, "#detail" do
    before do
      crawler = LinuxTracker.new
      FakeWeb.register_uri(:get, crawler.detail_url('c35157e2d773fcde76e0b3ae441752f01c82bcd8'), :body => file_fixture('linux_tracker/details.html'))
      @subject = crawler.detail('c35157e2d773fcde76e0b3ae441752f01c82bcd8')
    end
    subject { @subject }

    its(:tracker_key) { should eql('linux_tracker') }
    its(:tracker_id) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:hash) { should eql('c35157e2d773fcde76e0b3ae441752f01c82bcd8') }
    its(:title) { should eql('parabola-2010.10.01-netinstall-x86_64') }
    its(:uploader) { should eql('youknowwho') }
    its(:size) { should eql(174_080) }
    its(:files) { should eql('1') }
    its(:seeders) { should eql('2') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should eql('0') }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should_not eql([]) }
  end
end
@@ -0,0 +1,70 @@
require 'spec_helper'

# Specs for the Mininova crawler. FakeWeb intercepts HTTP requests and
# serves canned HTML fixtures instead of hitting mininova.org.

module TorrentCrawler::Crawlers
  describe Mininova do
    subject { Mininova.new }

    it "should implement index_url" do
      expect { subject.index_url }.to_not raise_error
    end

    it "should implement detail_url" do
      expect { subject.detail_url('abcd') }.to_not raise_error
    end
  end

  describe Mininova, "#index" do
    before do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('mininova/index.html'))
      results = crawler.index
      # Examine the first torrent parsed from the fixture page.
      @subject = results[0]
    end
    subject { @subject }

    its(:tracker_key) { should eql('mininova') }
    its(:tracker_id) { should eql('13195739') }
    its(:hash) { should be_nil }
    its(:title) { should eql(%{Nikki McKnight aka Nix "Sky's the Limit"}) }
    its(:uploader) { should be_nil }
    its(:size) { should eql(16_404) }
    its(:files) { should be_nil }
    its(:seeders) { should eql('1') }
    its(:leechers) { should eql('0') }
    its(:snatches) { should be_nil }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should include('Music') }
    its(:tags) { should include('Hip Hop') }
  end

  describe Mininova, "#index with last_seen" do
    it "should stop crawling when it hits a previously-seen id" do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.index_url, :body => file_fixture('mininova/index.html'))
      results = crawler.index('13195736')
      results.size.should eql(2)
    end
  end

  describe Mininova, "#detail" do
    before do
      crawler = Mininova.new
      FakeWeb.register_uri(:get, crawler.detail_url('13195739'), :body => file_fixture('mininova/details_det.html'))
      @subject = crawler.detail('13195739')
    end
    subject { @subject }

    its(:tracker_key) { should eql('mininova') }
    its(:tracker_id) { should eql('13195739') }
    its(:hash) { should eql('757c4ac2d2aee458aa53847a0bd24b7946efff3b') }
    its(:title) { should eql(%{Nikki McKnight aka Nix "Sky's the Limit"}) }
    its(:uploader) { should be_nil }
    its(:size) { should eql(16_404) }
    its(:files) { should eql('2') }
    its(:seeders) { should be_nil }
    its(:leechers) { should be_nil }
    its(:snatches) { should be_nil }
    its(:uploaded_at) { should_not be_nil }
    its(:tags) { should eql([]) }
  end
end