ghtorrent 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +132 -0
- data/Rakefile +20 -0
- data/bin/ght-data-retrieval +119 -0
- data/bin/ght-load +242 -0
- data/bin/ght-mirror-events +154 -0
- data/bin/ght-periodic-dump +92 -0
- data/bin/ght-rm-dupl +124 -0
- data/bin/ght-torrent-index +180 -0
- data/lib/ghtorrent.rb +22 -0
- data/lib/ghtorrent/adapters/base_adapter.rb +91 -0
- data/lib/ghtorrent/adapters/mongo_persister.rb +126 -0
- data/lib/ghtorrent/adapters/noop_persister.rb +58 -0
- data/lib/ghtorrent/api_client.rb +106 -0
- data/lib/ghtorrent/call_stack.rb +119 -0
- data/lib/ghtorrent/command.rb +136 -0
- data/lib/ghtorrent/ghtorrent.rb +396 -0
- data/lib/ghtorrent/logging.rb +69 -0
- data/lib/ghtorrent/migrations/001_init_schema.rb +60 -0
- data/lib/ghtorrent/migrations/002_add_followers_created_at.rb +15 -0
- data/lib/ghtorrent/migrations/003_add_external_ref_ids.rb +40 -0
- data/lib/ghtorrent/persister.rb +48 -0
- data/lib/ghtorrent/retriever.rb +148 -0
- data/lib/ghtorrent/settings.rb +63 -0
- data/lib/ghtorrent/utils.rb +58 -0
- data/test/callstack_test.rb +67 -0
- metadata +181 -0
@@ -0,0 +1,154 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
4
|
+
#
|
5
|
+
# Redistribution and use in source and binary forms, with or
|
6
|
+
# without modification, are permitted provided that the following
|
7
|
+
# conditions are met:
|
8
|
+
#
|
9
|
+
# 1. Redistributions of source code must retain the above
|
10
|
+
# copyright notice, this list of conditions and the following
|
11
|
+
# disclaimer.
|
12
|
+
#
|
13
|
+
# 2. Redistributions in binary form must reproduce the above
|
14
|
+
# copyright notice, this list of conditions and the following
|
15
|
+
# disclaimer in the documentation and/or other materials
|
16
|
+
# provided with the distribution.
|
17
|
+
#
|
18
|
+
# THIS SOFTWARE IS PROVIDED BY BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
20
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
21
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
22
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
25
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
26
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
27
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
28
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
29
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
30
|
+
|
31
|
+
require 'rubygems'
|
32
|
+
require 'yaml'
|
33
|
+
require 'amqp'
|
34
|
+
require 'eventmachine'
|
35
|
+
require 'ghtorrent'
|
36
|
+
require 'json'
|
37
|
+
require 'logger'
|
38
|
+
|
39
|
+
# Command that periodically fetches events through GHTorrent::Mirror, stores
# the ones not seen before and publishes them on an AMQP topic exchange.
class GHTMirrorEvents < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging
  include GHTorrent::Persister

  # Lower bound (in seconds) for the adaptive polling delay; prevents the
  # periodic timer from being re-created with a zero or negative interval.
  MIN_RETRIEVAL_DELAY = 1

  attr_reader :settings

  # Retrieve events from Github, store them in the DB and publish each newly
  # stored event (as JSON) on +exchange+ with routing key "evt.<event type>".
  #
  # Returns a two element array: [number of new events, number of duplicates].
  # The counts gathered so far are returned even when an error interrupts the
  # run, so that callers can always destructure and add up the result.
  def retrieve(exchange)
    added = dupl = 0

    begin
      @gh.get_events.each do |e|
        # An event with the same id is already stored: count it and move on
        unless @persister.find(:events, {'id' => e['id']}).empty?
          info "Already got #{e['id']}"
          dupl += 1
          next
        end

        added += 1
        @persister.store(:events, e)
        info "Added #{e['id']}"

        msg = JSON.dump(e)
        key = "evt.%s" % e['type']
        exchange.publish msg, :persistent => true, :routing_key => key
      end
    rescue StandardError => err
      # Report the problem but keep the event loop alive. Signals and
      # SystemExit are deliberately not rescued here.
      STDERR.puts err.message
      STDERR.puts err.backtrace
    end

    return added, dupl
  end

  # This command declares no command-line options of its own.
  def prepare_options(options)

  end

  # Connect to the persister and to AMQP, then run two periodic timers:
  # one that calls #retrieve and one that, every 120 seconds, re-tunes the
  # retrieval interval from the ratio of duplicate to total events seen.
  def go

    @gh = GHTorrent::Mirror.new(options[:config])
    @settings = @gh.settings
    @persister = connect(:mongo, @settings)
    @logger = Logger.new(STDOUT)

    # Graceful exit
    Signal.trap('INT') { AMQP.stop { EM.stop } }
    Signal.trap('TERM') { AMQP.stop { EM.stop } }

    # The event loop
    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      # Statistics used to recalibrate event delays
      dupl_msgs = new_msgs = 1

      debug "connected to rabbit"

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      # Initial delay for the retrieve event loop
      retrieval_delay = config(:mirror_pollevery)

      # One polling step; shared by the initial and every restarted timer
      poll = lambda do
        added, dupl = retrieve(exchange)
        dupl_msgs += dupl
        new_msgs += added
      end

      # Retrieve events.
      retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)

      # Adjust event retrieval delay time to reduce load to Github
      EventMachine.add_periodic_timer(120) do
        ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)

        info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")

        new_delay = delay_adjustment(ratio)

        # Reset counters for new loop
        dupl_msgs = new_msgs = 0

        # Update the retrieval delay and restart the event retriever
        if new_delay != 0
          adjusted = [retrieval_delay + new_delay, MIN_RETRIEVAL_DELAY].max

          # Nothing to restart when the delay is already at its lower bound
          if adjusted != retrieval_delay
            # Stop the retriever task and adjust retrieval delay
            retriever.cancel
            retrieval_delay = adjusted
            info("Setting event retrieval delay to #{retrieval_delay} secs")

            # Restart the retriever
            retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)
          end
        end
      end
    end
  end

  private

  # Map the duplicate ratio of the last measurement window to a change (in
  # seconds) of the retrieval delay: few duplicates => poll faster (-1),
  # many duplicates => poll slower (+1), otherwise leave the delay alone (0).
  # A NaN ratio (no events at all were counted) also leaves the delay alone,
  # and a ratio of exactly 1.0 (only duplicates) slows polling down.
  def delay_adjustment(ratio)
    if ratio.nan?
      0
    elsif ratio < 0.3
      -1
    elsif ratio <= 0.5
      0
    else
      1
    end
  end
end
|
151
|
+
|
152
|
+
# Script entry point. `run` is not defined in this file; presumably it is
# inherited from GHTorrent::Command and ends up calling #go -- confirm.
GHTMirrorEvents.run
|
153
|
+
|
154
|
+
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
#
|
3
|
+
# Create the periodic database dump files
|
4
|
+
#
|
5
|
+
|
6
|
+
# Where the compressed archives and the torrent files are written
OUTDIR=/home/data/github-mirror/dumps

# Base URL of the HTTP directory that serves the torrents and the data
WEBSEED=http://ikaria.dmst.aueb.gr/ghtorrent/

# Start of the dump window (epoch seconds): the content of the `lastrun`
# file when it is readable, 0 otherwise
timeStart=0
if [ -r lastrun ]; then
  timeStart=$(cat lastrun)
fi

# End of the dump window (epoch seconds): now, unless a date understood by
# `date -d` was given as the first argument
if [ -z "$1" ]; then
  timeEnd=$(date +%s)
else
  timeEnd=$(date -d "$1" +%s) || exit 1
fi

# Date stamp (UTC, yyyy-mm-dd) used in the names of the generated files
dateName=$(date -d @$timeEnd -u +'%Y-%m-%d')

# The dump window is expressed as a range of ObjectIds: the first 8 hex
# digits of an _id hold the creation time, the rest is zero-filled here.
#
# _id example:
# 4f208c3e08d69a1835000077
# 000102030405060708091011
# |      ||    ||  ||    |
# time    mach  pid count

startId=$(printf '%08x0000000000000000' $timeStart)
endId=$(printf '%08x0000000000000000' $timeEnd)

echo "Dumping database from $(date -d @$timeStart) to $(date -d @$timeEnd)"

# Every collection of the `github` database, minus system collections and
# the shell's trailing "bye" line
collections=$(echo "show collections" | mongo --quiet github | egrep -v "system|bye")

# Start from a clean dump directory, then dump the window of each collection
rm -rf dump
for col in $collections
do
  echo "Dumping $col"
  mongodump --db github --collection $col -q '{"_id" : {"$gte" : ObjectId("'$startId'"), "$lt" : ObjectId("'$endId'")} }' || exit 1
done
|
50
|
+
|
51
|
+
# Report the metadata for one dumped collection.
#
# $1 - name of the collection
#
# Prints the number of documents whose _id lies in [$startId, $endId) and
# the size of dump/github/$1.bson, both in bytes and in human readable form.
# Reads the global variables startId and endId.
meta()
{
  # printf instead of `echo -n`: the latter is not portable under /bin/sh
  printf 'Number of %s: ' "$1"
  mongo --quiet --eval 'db.'$1'.find({"_id" : {"$gte" : ObjectId("'$startId'"), "$lt" : ObjectId("'$endId'")} }).count() + 0' github
  printf 'Uncompressed size of %s: ' "$1"
  wc -c "dump/github/$1.bson" | awk '{printf "%d bytes ", $1}'
  du -h "dump/github/$1.bson" | awk '{print " (" $1 ")" }'
}
|
60
|
+
|
61
|
+
# Write the per-collection metadata report both next to the script
# (README.<date>.txt) and into the dump directory.
for col in $collections; do
  (
    echo "Start date: `date -u -d @$timeStart +'%Y-%m-%dT%H:%M:%SZ'`"
    echo "End date: `date -u -d @$timeEnd +'%Y-%m-%dT%H:%M:%SZ'`"
    meta $col
  )
done |
tee README.$dateName.txt >dump/github/README.txt || exit 1

# Archive every non-empty collection dump and create a torrent for it
for col in $collections; do
  echo "Archiving $col.bson"
  if [ ! -s dump/github/$col.bson ]; then
    echo "Collection empty, skipping"
    continue
  fi

  archive=$OUTDIR/$col-dump.$dateName.tar.gz
  torrent=$OUTDIR/$col-dump.$dateName.torrent

  if ! tar zcf "$archive" dump/github/$col.bson
  then
    # Do not leave a truncated archive behind
    rm -f "$archive"
    exit 1
  fi

  # A torrent left over from an earlier run for the same date would describe
  # the previous archive, so drop it before creating the new one.
  rm -f "$torrent"

  # ${WEBSEED%/} strips one trailing slash so the web seed URL does not
  # contain "//". Abort on failure, like every other step of this script,
  # so that `lastrun` is not advanced past a dump without a torrent.
  mktorrent -a udp://tracker.openbittorrent.com:80 \
            -a udp://tracker.publicbt.com:80/announce \
            -a http://tracker.bittorrent.am/announce \
            -w "${WEBSEED%/}/$col-dump.$dateName.tar.gz" \
            -o "$torrent" "$archive" || exit 1
done

# Update last run info
echo $timeEnd >lastrun || exit 1

# Clean up
rm -rf dump
|
92
|
+
|
data/bin/ght-rm-dupl
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Knows how to remove duplicate entries from various collections.
|
4
|
+
#
|
5
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
6
|
+
#
|
7
|
+
# Redistribution and use in source and binary forms, with or
|
8
|
+
# without modification, are permitted provided that the following
|
9
|
+
# conditions are met:
|
10
|
+
#
|
11
|
+
# 1. Redistributions of source code must retain the above
|
12
|
+
# copyright notice, this list of conditions and the following
|
13
|
+
# disclaimer.
|
14
|
+
#
|
15
|
+
# 2. Redistributions in binary form must reproduce the above
|
16
|
+
# copyright notice, this list of conditions and the following
|
17
|
+
# disclaimer in the documentation and/or other materials
|
18
|
+
# provided with the distribution.
|
19
|
+
#
|
20
|
+
# THIS SOFTWARE IS PROVIDED BY BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
21
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
22
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
23
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
24
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
25
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
26
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
27
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
28
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
29
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
30
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
31
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
32
|
+
|
33
|
+
require 'rubygems'
|
34
|
+
require 'mongo'
|
35
|
+
require 'ghtorrent-old'
|
36
|
+
|
37
|
+
# Mirror object, initialised from config.yaml in the current directory
GH = Mirror.new
GH.init("config.yaml")

# For every collection this script can clean:
#   :payload - the field whose value identifies an entry; entries sharing
#              the same value are treated as duplicates of each other
#   :col     - the collection object to read from and delete from
per_col = {
  :commits => { :payload => "commit.id", :col => GH.commits_col },
  :events  => { :payload => "id",        :col => GH.events_col }
}
|
51
|
+
|
52
|
+
# Delete all but the last recorded entry for every key that has more than
# one entry.
#
# data - Hash mapping a key to the list of _ids that were seen for it
# col  - collection handed to delete_by_id for each deletion
#
# Returns the number of entries for which delete_by_id reported success.
def remove_duplicates(data, col)
  deleted = 0

  data.each do |_key, ids|
    next unless ids.size > 1

    # Everything except the final id is considered a duplicate
    ids[0..(ids.size - 2)].each do |id|
      deleted += 1 if delete_by_id(col, id)
    end
  end

  deleted
end
|
63
|
+
|
64
|
+
# Remove the document with the given _id from a collection.
#
# col - collection object; must respond to #remove (and to #name, which is
#       used in the failure message)
# id  - value of the _id field of the document to delete
#
# Returns true when the remove call completed, false (after printing a
# message) when it raised Mongo::OperationFailure.
def delete_by_id(col, id)
  col.remove({'_id' => id})
  true
rescue Mongo::OperationFailure
  puts "Cannot remove record with id #{id} from #{col.name}"
  false
end
|
73
|
+
|
74
|
+
# First argument: the collection to clean. Anything but a known name is
# fatal; continuing would leave `which` nil and fail later with an
# unrelated NoMethodError when the collection is looked up.
which = case ARGV[0]
        when "commits" then :commits
        when "events" then :events
        else abort "Not a known collection name: #{ARGV[0]}"
        end

# Optional second argument: a timestamp in epoch seconds. When given, only
# documents whose _id was generated at or after that time are examined;
# otherwise the whole collection is (empty query).
from = case ARGV[1]
       when nil then {}
       else
         t = Time.at(ARGV[1].to_i)
         STDERR.puts "Searching for duplicates after #{t}"
         {'_id' => {'$gte' => BSON::ObjectId.from_time(t)}}
       end
|
87
|
+
|
88
|
+
# Counters used for progress output and for the final report:
#   processed       - records read since the last intermediate clean-up
#   total_processed - records read overall
#   removed         - records deleted so far
processed = 0
total_processed = 0
removed = 0

# Maps a payload value to the _ids of all records carrying that value
data = {}

collection = per_col[which][:col]
payload = per_col[which][:payload]

# To cope with large datasets the ids are not collected for the complete
# collection at once: after a batch of records has been read, duplicates
# found so far are deleted and the bookkeeping starts afresh.
collection.find(from, :fields => payload).each do |r|
  _id = r["_id"]
  commit = GH.read_value(r, payload)

  if commit.empty?
    # If entries cannot be parsed, remove them
    puts "Deleting unknown entry #{_id}"
    removed += 1 if delete_by_id collection, _id
  else
    (data[commit] ||= []) << _id
  end

  processed += 1
  total_processed += 1

  print "\rProcessed #{processed} records"

  next unless processed > 500000

  # Calculate duplicates, save intermediate result
  puts "\nLoaded #{data.size} values, cleaning"
  removed += remove_duplicates data, collection
  data = {}
  processed = 0
end

# Handle whatever is left from the last, incomplete batch
removed += remove_duplicates data, collection

puts "Processed #{total_processed}, deleted #{removed} duplicates"
|
@@ -0,0 +1,180 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
4
|
+
#
|
5
|
+
# Redistribution and use in source and binary forms, with or
|
6
|
+
# without modification, are permitted provided that the following
|
7
|
+
# conditions are met:
|
8
|
+
#
|
9
|
+
# 1. Redistributions of source code must retain the above
|
10
|
+
# copyright notice, this list of conditions and the following
|
11
|
+
# disclaimer.
|
12
|
+
#
|
13
|
+
# 2. Redistributions in binary form must reproduce the above
|
14
|
+
# copyright notice, this list of conditions and the following
|
15
|
+
# disclaimer in the documentation and/or other materials
|
16
|
+
# provided with the distribution.
|
17
|
+
#
|
18
|
+
# THIS SOFTWARE IS PROVIDED BY BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
20
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
21
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
22
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
25
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
26
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
27
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
28
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
29
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
30
|
+
|
31
|
+
require 'rubygems'
|
32
|
+
require 'erb'
|
33
|
+
require 'set'
|
34
|
+
require 'date'
|
35
|
+
require 'ghtorrent'
|
36
|
+
|
37
|
+
# Data made available to the ERB template: the set of dumps, the set of
# collection names and the date passed in as last update.
class Page
  attr_reader :collections, :dumps

  # last_update - stored in @last_update; reachable from a template through
  #               the binding returned by #get_binding
  def initialize(last_update)
    @collections = Set.new
    @dumps = Set.new
    @last_update = last_update
  end

  # Register a dump; adding an equal dump twice has no effect (Set).
  def add_dump(dump)
    @dumps.add(dump)
  end

  # Register a collection name; duplicates are ignored (Set).
  def add_collection(col)
    @collections.add(col)
  end

  # Expose the private binding() method, so that a template can be
  # evaluated in the context of this object.
  def get_binding
    binding
  end
end
|
61
|
+
|
62
|
+
# The torrents that belong to one dump date.
class Dump
  attr_reader :torrents, :date

  # torrents - the torrents of this dump
  # date     - the date of this dump
  def initialize(torrents, date)
    @torrents, @date = torrents, date
  end
end
|
71
|
+
|
72
|
+
# Read-only record describing a single torrent file.
class Torrent
  attr_reader :url, :name, :size, :date

  # url  - link to the torrent file
  # name - name associated with the torrent
  # size - size value to display
  # date - date associated with the torrent
  def initialize(url, name, size, date)
    @url, @name = url, name
    @size, @date = size, date
  end
end
|
85
|
+
|
86
|
+
# Command that renders the index.erb template into an HTML listing of the
# torrent files found in a directory and prints the result on stdout.
class Indexer < GHTorrent::Command

  # Torrent file name: <collection>-<word>.<date>.torrent, for example
  # events-dump.2012-03-27.torrent. Group 1 is the collection name, group 2
  # the date. Anchored on both ends so that a collection name containing an
  # underscore is captured completely instead of only its last part.
  TORRENT_NAME = /\A([a-z0-9_]+)-[a-z]+\.(.*)\.torrent\z/

  # Set the usage banner and declare the --prefix option.
  def prepare_options(options)
    options.banner <<-BANNER
Create an HTML table from a list of torrent and data files. The expected
naming is the following:

collname-dump.2012-03-27.torrent
collname-dump.2012-03-27.tar.gz

#{command_name} [options]

#{command_name} options:
    BANNER

    options.opt :prefix, 'URL prefix to use for links',
                :short => 'p', :default => "", :type => :string
  end

  # No validation is performed for this command's options.
  def validate_options

  end

  # Build the page model from the torrents in the directory given as first
  # remaining argument (default: current directory) and print the rendered
  # template.
  def go
    url_prefix = options[:prefix]

    rhtml = ERB.new(load_template)

    # Open the dir to read entries from
    dir = ARGV.shift || "."

    torrents = Dir.entries(dir).map { |f|
      torrent_for(dir, f, url_prefix)
    }.compact

    all_dates = Set.new(torrents.map { |t| t.date })

    # One Dump per date, holding that date's torrents indexed by name
    all_dumps = all_dates.map { |d|
      name_torrents = {}
      torrents.each { |t| name_torrents[t.name] = t if t.date == d }
      Dump.new(name_torrents, d)
    }

    # nil when no torrent was found
    max_date = all_dates.max

    ghtorrent = Page.new(max_date)
    all_dumps.each { |x|
      ghtorrent.add_dump x
      x.torrents.values.each { |t|
        ghtorrent.add_collection t.name
      }
    }

    puts rhtml.result(ghtorrent.get_binding).gsub(/^\s+/, "").gsub(/\s+$/, $/).gsub(/<table>/, "\n<table>")
  end

  private

  # Return the text of the index.erb template. It is looked up in the
  # directory of the loaded ghtorrent gem or, when the gem is not loaded
  # through RubyGems, in the current directory.
  def load_template
    spec = Gem.loaded_specs['ghtorrent']

    path = if spec.nil?
             # Gem not installed yet, try current dir
             "index.erb"
           else
             # loaded_specs holds Gem::Specification objects, not paths
             File.join(spec.full_gem_path, "index.erb")
           end

    File.read(path)
  end

  # Build a Torrent for directory entry +f+ of +dir+.
  #
  # Returns nil when +f+ is not a torrent file following TORRENT_NAME, when
  # the matching .tar.gz data file is missing (a warning is printed) or when
  # the data file is smaller than 1 MB.
  # Raises ArgumentError when the date part of the name cannot be parsed.
  def torrent_for(dir, f, url_prefix)
    matches = TORRENT_NAME.match(f)
    return nil if matches.nil?

    # The data file shares the torrent's name, with a different extension
    dump = File.join(dir, f.sub(/\.torrent\z/, ".tar.gz"))
    unless File.file?(dump)
      STDERR.puts "No data file for #{f}, skipping"
      return nil
    end

    # Size of the data file in whole megabytes
    size = File.size(dump) / 1024 / 1024

    # Expects a format of yyyy-mm-dd
    date = Date.parse(matches[2])

    return nil unless size > 0

    Torrent.new(url_prefix + "/" + f, matches[1], size, date)
  end
end
|
177
|
+
|
178
|
+
# Script entry point. `run` is not defined in this file; presumably it is
# inherited from GHTorrent::Command and ends up calling #go -- confirm.
Indexer.run
|
179
|
+
|
180
|
+
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|