ghtorrent 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +132 -0
- data/Rakefile +20 -0
- data/bin/ght-data-retrieval +119 -0
- data/bin/ght-load +242 -0
- data/bin/ght-mirror-events +154 -0
- data/bin/ght-periodic-dump +92 -0
- data/bin/ght-rm-dupl +124 -0
- data/bin/ght-torrent-index +180 -0
- data/lib/ghtorrent.rb +22 -0
- data/lib/ghtorrent/adapters/base_adapter.rb +91 -0
- data/lib/ghtorrent/adapters/mongo_persister.rb +126 -0
- data/lib/ghtorrent/adapters/noop_persister.rb +58 -0
- data/lib/ghtorrent/api_client.rb +106 -0
- data/lib/ghtorrent/call_stack.rb +119 -0
- data/lib/ghtorrent/command.rb +136 -0
- data/lib/ghtorrent/ghtorrent.rb +396 -0
- data/lib/ghtorrent/logging.rb +69 -0
- data/lib/ghtorrent/migrations/001_init_schema.rb +60 -0
- data/lib/ghtorrent/migrations/002_add_followers_created_at.rb +15 -0
- data/lib/ghtorrent/migrations/003_add_external_ref_ids.rb +40 -0
- data/lib/ghtorrent/persister.rb +48 -0
- data/lib/ghtorrent/retriever.rb +148 -0
- data/lib/ghtorrent/settings.rb +63 -0
- data/lib/ghtorrent/utils.rb +58 -0
- data/test/callstack_test.rb +67 -0
- metadata +181 -0
@@ -0,0 +1,154 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
4
|
+
#
|
5
|
+
# Redistribution and use in source and binary forms, with or
|
6
|
+
# without modification, are permitted provided that the following
|
7
|
+
# conditions are met:
|
8
|
+
#
|
9
|
+
# 1. Redistributions of source code must retain the above
|
10
|
+
# copyright notice, this list of conditions and the following
|
11
|
+
# disclaimer.
|
12
|
+
#
|
13
|
+
# 2. Redistributions in binary form must reproduce the above
|
14
|
+
# copyright notice, this list of conditions and the following
|
15
|
+
# disclaimer in the documentation and/or other materials
|
16
|
+
# provided with the distribution.
|
17
|
+
#
|
18
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
20
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
21
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
22
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
25
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
26
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
27
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
28
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
29
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
30
|
+
|
31
|
+
require 'rubygems'
|
32
|
+
require 'yaml'
|
33
|
+
require 'amqp'
|
34
|
+
require 'eventmachine'
|
35
|
+
require 'ghtorrent'
|
36
|
+
require 'json'
|
37
|
+
require 'logger'
|
38
|
+
|
39
|
+
# Command that periodically polls Github for events, stores the ones not yet
# present in the :events collection and publishes each new event on an AMQP
# topic exchange under the routing key "evt.<event type>".
class GHTMirrorEvents < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging
  include GHTorrent::Persister

  attr_reader :settings

  # Lower bound (in seconds) for the polling interval, so that repeated
  # "speed up" adjustments can never produce a zero or negative timer period.
  MIN_RETRIEVAL_DELAY = 1

  # Retrieve events from Github, store the unseen ones in the DB and publish
  # them on +exchange+.
  #
  # Returns a two element array: [number of new events, number of duplicates].
  # The counts gathered so far are returned even when an error interrupts the
  # run, so callers can always add them to their statistics.
  def retrieve(exchange)
    fresh = seen = 0

    @gh.get_events.each do |e|
      unless @persister.find(:events, {'id' => e['id']}).empty?
        info "Already got #{e['id']}"
        seen += 1
        next
      end

      fresh += 1
      @persister.store(:events, e)
      info "Added #{e['id']}"

      msg = JSON.dump(e)
      key = "evt.%s" % e['type']
      exchange.publish msg, :persistent => true, :routing_key => key
    end

    [fresh, seen]
  rescue StandardError => err
    # Only StandardError is rescued: signals and exit requests must still be
    # able to terminate the process.
    STDERR.puts err.message
    STDERR.puts err.backtrace
    [fresh, seen]
  end

  # This command defines no options of its own.
  def prepare_options(options)

  end

  # Connect to the persister and the AMQP broker, then run the event loop:
  # one periodic timer polls Github, a second one re-tunes the polling
  # interval every 120 seconds based on the duplicate/new ratio.
  def go

    @gh = GHTorrent::Mirror.new(options[:config])
    @settings = @gh.settings
    @persister = connect(:mongo, @settings)
    @logger = Logger.new(STDOUT)

    # Graceful exit
    Signal.trap('INT') { AMQP.stop { EM.stop } }
    Signal.trap('TERM') { AMQP.stop { EM.stop } }

    # The event loop
    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      # Statistics used to recalibrate event delays
      dupl_msgs = new_msgs = 1

      debug "connected to rabbit"

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      # Initial delay for the retrieve event loop
      retrieval_delay = config(:mirror_pollevery)

      # One polling round: retrieve events and update the running statistics.
      poll = lambda do
        fresh, seen = retrieve(exchange)
        dupl_msgs += seen
        new_msgs += fresh
      end

      # Retrieve events.
      retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)

      # Adjust event retrieval delay time to reduce load to Github
      EventMachine.add_periodic_timer(120) do
        ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)

        info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")

        new_delay = delay_adjustment(ratio)

        # Reset counters for new loop
        dupl_msgs = new_msgs = 0

        # Update the retrieval delay and restart the event retriever
        if new_delay != 0
          adjusted = [retrieval_delay + new_delay, MIN_RETRIEVAL_DELAY].max

          if adjusted != retrieval_delay
            # Stop the retriever task and adjust retrieval delay
            retriever.cancel
            retrieval_delay = adjusted
            info("Setting event retrieval delay to #{retrieval_delay} secs")

            # Restart the retriever
            retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)
          end
        end
      end
    end
  end

  private

  # Map the duplicate ratio (duplicates / all events seen in the last window)
  # to a change of the polling interval in seconds:
  #   below 0.3   -> -1 (mostly new events, poll more often)
  #   0.3 to 0.5  ->  0 (leave the interval alone)
  #   above 0.5   -> +1 (mostly duplicates, poll less often; includes 1.0)
  # A NaN ratio (no events at all in the window) leaves the interval alone.
  def delay_adjustment(ratio)
    if ratio.nan?
      0
    elsif ratio < 0.3
      -1
    elsif ratio <= 0.5
      0
    else
      1
    end
  end
end
|
151
|
+
|
152
|
+
# Script entry point.
# NOTE(review): `run` is not defined in this file; presumably it is inherited
# from GHTorrent::Command and ends up calling #go -- confirm in command.rb.
GHTMirrorEvents.run
|
153
|
+
|
154
|
+
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
#
|
3
|
+
# Create the periodic database dump files
|
4
|
+
#
|
5
|
+
|
6
|
+
# Directory to place compressed files and torrents
OUTDIR=/home/data/github-mirror/dumps

# Base URL for HTTP dir containing torrents and data
WEBSEED=http://ikaria.dmst.aueb.gr/ghtorrent/

# Time to start dumping from: the end time recorded by the previous run
# (file "lastrun" in the current directory), or the epoch on the first run.
if [ -r lastrun ]
then
  timeStart=$(cat lastrun)
else
  timeStart=0
fi

# Time to end dumping: now, or the date given as first argument
# (anything accepted by `date -d`).
if [ -z "$1" ]
then
  timeEnd=$(date +%s)
else
  timeEnd=$(date -d "$1" +%s) || exit 1
fi

# Name used for the files
dateName=$(date -d "@$timeEnd" -u +'%Y-%m-%d')

# _id example:
#   4f208c3e08d69a1835000077
#   000102030405060708091011
#   |      ||    ||  ||    |
#   time    mach  pid count
#
# The first 8 hex digits are the creation time in seconds, so padding a
# timestamp with 16 zeros yields the smallest id created at that second.

endId=$(printf '%08x0000000000000000' "$timeEnd")
startId=$(printf '%08x0000000000000000' "$timeStart")

echo "Dumping database from $(date -d "@$timeStart") to $(date -d "@$timeEnd")"

# All collections of the "github" database, minus system collections and the
# "bye" line printed by the mongo shell.
collections=$(echo "show collections" | mongo --quiet github | grep -Ev "system|bye")

rm -rf dump
# $collections is deliberately left unquoted: it is a whitespace separated list.
for col in $collections; do

  echo "Dumping $col"
  mongodump --db github --collection "$col" -q '{"_id" : {"$gte" : ObjectId("'"$startId"'"), "$lt" : ObjectId("'"$endId"'")} }' || exit 1
done
|
50
|
+
|
51
|
+
# Report the metadata for the given collection ($1): the number of documents
# whose _id falls in [$startId, $endId) and the size of its dumped .bson file,
# both in bytes and in human readable form.
# Reads the globals $startId and $endId; expects dump/github/$1.bson to exist.
meta()
{
  # printf instead of `echo -n`: the latter is not portable under /bin/sh
  printf 'Number of %s: ' "$1"
  mongo --quiet --eval 'db.'"$1"'.find({"_id" : {"$gte" : ObjectId("'"$startId"'"), "$lt" : ObjectId("'"$endId"'")} }).count() + 0' github
  printf 'Uncompressed size of %s: ' "$1"
  wc -c "dump/github/$1.bson" | awk '{printf "%d bytes ", $1}'
  du -h "dump/github/$1.bson" | awk '{print " (" $1 ")" }'
}
|
60
|
+
|
61
|
+
# Write a README describing the dump (date range plus per-collection
# metadata), both next to the script and inside the dump directory.
for col in $collections; do
  (
    echo "Start date: $(date -u -d "@$timeStart" +'%Y-%m-%dT%H:%M:%SZ')"
    echo "End date: $(date -u -d "@$timeEnd" +'%Y-%m-%dT%H:%M:%SZ')"
    meta "$col"
  )
done |
tee "README.$dateName.txt" >dump/github/README.txt || exit 1

# Archive each collection and create a torrent for the archive
for col in $collections; do
  echo "Archiving $col.bson"
  if [ ! -s "dump/github/$col.bson" ]; then
    echo "Collection empty, skipping"
    continue
  fi

  archive="$OUTDIR/$col-dump.$dateName.tar.gz"
  torrent="$OUTDIR/$col-dump.$dateName.torrent"

  if ! tar zcf "$archive" "dump/github/$col.bson"
  then
    # Do not leave a truncated archive behind
    rm -f "$archive"
    exit 1
  fi

  # ${WEBSEED%/} drops a trailing slash so the web seed URL has no "//".
  # A torrent failure is reported but does not abort the run: the archive
  # itself is complete and the remaining collections should still be handled.
  if ! mktorrent -a udp://tracker.openbittorrent.com:80 -a udp://tracker.publicbt.com:80/announce -a http://tracker.bittorrent.am/announce -w "${WEBSEED%/}/$col-dump.$dateName.tar.gz" -o "$torrent" "$archive"
  then
    echo "Warning: could not create torrent for $col" >&2
  fi
done

# Update last run info
echo "$timeEnd" >lastrun || exit 1

# Clean up
rm -rf dump
|
92
|
+
|
data/bin/ght-rm-dupl
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Knows how to remove duplicate entries from various collections.
|
4
|
+
#
|
5
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
6
|
+
#
|
7
|
+
# Redistribution and use in source and binary forms, with or
|
8
|
+
# without modification, are permitted provided that the following
|
9
|
+
# conditions are met:
|
10
|
+
#
|
11
|
+
# 1. Redistributions of source code must retain the above
|
12
|
+
# copyright notice, this list of conditions and the following
|
13
|
+
# disclaimer.
|
14
|
+
#
|
15
|
+
# 2. Redistributions in binary form must reproduce the above
|
16
|
+
# copyright notice, this list of conditions and the following
|
17
|
+
# disclaimer in the documentation and/or other materials
|
18
|
+
# provided with the distribution.
|
19
|
+
#
|
20
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
21
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
22
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
23
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
24
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
25
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
26
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
27
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
28
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
29
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
30
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
31
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
32
|
+
|
33
|
+
require 'rubygems'
|
34
|
+
require 'mongo'
|
35
|
+
require 'ghtorrent-old'
|
36
|
+
|
37
|
+
# Shared mirror handle, initialised with the configuration file name.
# NOTE(review): Mirror is not defined in this file; presumably it comes from
# 'ghtorrent-old' and #init loads the given YAML configuration -- confirm.
GH = Mirror.new
GH.init("config.yaml")

# For every collection this script can clean up:
#   :payload - the field whose value identifies a record (two records with
#              the same value are considered duplicates)
#   :col     - the collection object to read from and delete in
per_col = {
  :commits => { :payload => "commit.id", :col => GH.commits_col },
  :events  => { :payload => "id",        :col => GH.events_col }
}
|
51
|
+
|
52
|
+
# Delete duplicate records from +col+.
#
# +data+ maps a unique key to the list of record ids sharing that key. For
# every key with more than one id, all ids except the last one are deleted
# through delete_by_id. Returns the number of successful deletions.
def remove_duplicates(data, col)
  deleted = 0

  data.each_pair do |_key, ids|
    # A single id means there is nothing duplicated for this key
    next if ids.size < 2

    # Keep the last id, drop everything before it
    ids[0...-1].each do |stale_id|
      deleted += 1 if delete_by_id(col, stale_id)
    end
  end

  deleted
end
|
63
|
+
|
64
|
+
# Remove the record whose '_id' equals +id+ from collection +col+.
#
# Returns true when the removal call completed, false when it raised
# Mongo::OperationFailure (a message is printed in that case).
def delete_by_id(col, id)
  selector = { '_id' => id }
  col.remove(selector)
  true
rescue Mongo::OperationFailure
  puts "Cannot remove record with id #{id} from #{col.name}"
  false
end
|
73
|
+
|
74
|
+
# Usage: ght-rm-dupl <commits|events> [start time in epoch seconds]
#
# Which collection to clean. Unknown (or missing) names stop the script here
# instead of failing later on a nil lookup.
which = case ARGV[0]
        when "commits" then :commits
        when "events" then :events
        else abort "Not a known collection name: #{ARGV[0]}"
        end

# Optional lower bound: only records whose _id was generated at or after the
# given time are examined.
from = if ARGV[1].nil?
         {}
       else
         begin
           # Integer() rejects garbage, where to_i would silently yield 0
           t = Time.at(Integer(ARGV[1]))
         rescue ArgumentError
           abort "Not a valid epoch timestamp: #{ARGV[1]}"
         end
         STDERR.puts "Searching for duplicates after #{t}"
         {'_id' => {'$gte' => BSON::ObjectId.from_time(t)}}
       end

col = per_col[which][:col]
payload = per_col[which][:payload]

# Various counters to report stats
processed = total_processed = removed = 0

# Unique key value => ids of all records seen with that value
data = Hash.new

# The following code needs to save intermediate results to cope
# with large datasets.
# NOTE(review): duplicates are only detected within one batch; two records
# with the same key that fall into different batches are both kept.
col.find(from, :fields => payload).each do |r|
  _id = r["_id"]
  commit = GH.read_value(r, payload)

  # If entries cannot be parsed, remove them
  if commit.empty?
    puts "Deleting unknown entry #{_id}"
    removed += 1 if delete_by_id col, _id
  else
    (data[commit] ||= []) << _id
  end

  processed += 1
  total_processed += 1

  print "\rProcessed #{processed} records"

  # Calculate duplicates, save intermediate result
  if processed > 500000
    puts "\nLoaded #{data.size} values, cleaning"
    removed += remove_duplicates data, col
    data = Hash.new
    processed = 0
  end
end

# Clean whatever is left in the last, partially filled batch
removed += remove_duplicates data, col

puts "Processed #{total_processed}, deleted #{removed} duplicates"
|
@@ -0,0 +1,180 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
|
4
|
+
#
|
5
|
+
# Redistribution and use in source and binary forms, with or
|
6
|
+
# without modification, are permitted provided that the following
|
7
|
+
# conditions are met:
|
8
|
+
#
|
9
|
+
# 1. Redistributions of source code must retain the above
|
10
|
+
# copyright notice, this list of conditions and the following
|
11
|
+
# disclaimer.
|
12
|
+
#
|
13
|
+
# 2. Redistributions in binary form must reproduce the above
|
14
|
+
# copyright notice, this list of conditions and the following
|
15
|
+
# disclaimer in the documentation and/or other materials
|
16
|
+
# provided with the distribution.
|
17
|
+
#
|
18
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
# AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
20
|
+
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
21
|
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
22
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
25
|
+
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
26
|
+
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
27
|
+
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
28
|
+
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
29
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
30
|
+
|
31
|
+
require 'rubygems'
|
32
|
+
require 'erb'
|
33
|
+
require 'set'
|
34
|
+
require 'date'
|
35
|
+
require 'ghtorrent'
|
36
|
+
|
37
|
+
# Data holder for the index page: the set of dumps, the set of collection
# names and the date of the last update. Its binding is what the ERB template
# is evaluated against.
class Page
  attr_reader :collections, :dumps

  # last_update is stored in @last_update; it has no reader and is only
  # reachable through the binding returned by #get_binding.
  def initialize(last_update)
    @last_update = last_update
    @dumps       = Set.new
    @collections = Set.new
  end

  # Register a dump; adding the same dump twice has no effect (Set semantics).
  def add_dump(dump)
    dumps.add(dump)
  end

  # Register a collection name; duplicates are ignored (Set semantics).
  def add_collection(col)
    collections.add(col)
  end

  # Expose private binding() method.
  def get_binding
    binding
  end

end
|
61
|
+
|
62
|
+
# All torrents published for one dump date.
class Dump
  attr_reader :torrents, :date

  # torrents - the torrents belonging to this dump
  # date     - the date shared by those torrents
  def initialize(torrents, date)
    @torrents, @date = torrents, date
  end
end
|
71
|
+
|
72
|
+
# Read-only description of a single torrent file.
class Torrent
  attr_reader :url, :name, :size, :date

  # url  - link to the torrent file
  # name - name of the dumped collection
  # size - size of the data file the torrent describes
  # date - date of the dump
  def initialize(url, name, size, date)
    @url, @name = url, name
    @size, @date = size, date
  end
end
|
85
|
+
|
86
|
+
# Command that scans a directory for torrent files and their data archives
# and prints an HTML index rendered from the "index.erb" template.
class Indexer < GHTorrent::Command

  # File name format expected: collname-dump.2012-03-27.torrent
  # Capture 1 is the collection name, capture 2 the dump date.
  # NOTE(review): the pattern is not anchored at the start and the name class
  # has no "_", so "commit_comments-dump.<date>.torrent" yields "comments".
  TORRENT_NAME = /([a-z0-9]+)-[a-z]+\.(.*)\.torrent\z/

  def prepare_options(options)
    options.banner <<-BANNER
Create an HTML table from a list of torrent and data files. The expected
naming is the following:

collname-dump.2012-03-27.torrent
collname-dump.2012-03-27.tar.gz

#{command_name} [options]

#{command_name} options:
    BANNER

    options.opt :prefix, 'URL prefix to use for links',
                :short => 'p', :default => "", :type => :string
  end

  # Intentionally empty: no option validation is performed by this command.
  def validate_options

  end

  # Render the index for the directory given as first positional argument
  # (default: current directory) and print it on standard output.
  def go
    url_prefix = options[:prefix]

    rhtml = ERB.new(load_template)

    # Open the dir to read entries from
    dir = ARGV.shift || "."

    torrents = find_torrents(dir, url_prefix)

    all_dates = Set.new(torrents.map { |t| t.date })

    # One Dump per date, holding that date's torrents indexed by collection
    all_dumps = all_dates.map do |d|
      by_name = {}
      torrents.each { |t| by_name[t.name] = t if t.date == d }
      Dump.new(by_name, d)
    end

    max_date = all_dates.max

    ghtorrent = Page.new(max_date)
    all_dumps.each do |x|
      ghtorrent.add_dump x
      x.torrents.values.each { |t| ghtorrent.add_collection t.name }
    end

    puts rhtml.result(ghtorrent.get_binding).gsub(/^\s+/, "").gsub(/\s+$/, $/).gsub(/<table>/, "\n<table>")
  end

  private

  # Read the ERB template: from the installed gem's directory when the
  # 'ghtorrent' gem is loaded, otherwise from the current directory.
  def load_template
    spec = Gem.loaded_specs['ghtorrent']

    path = if spec.nil?
             # Gem not installed yet, try current dir
             "index.erb"
           else
             # loaded_specs holds Gem::Specification objects, not paths
             File.join(spec.full_gem_path, "index.erb")
           end

    File.read(path)
  end

  # Build a Torrent for every torrent file in +dir+ that follows the expected
  # naming, has its .tar.gz data file next to it and whose data file is at
  # least 1 MB. Entries with a missing data file or an unparseable date are
  # reported on STDERR and skipped.
  def find_torrents(dir, url_prefix)
    Dir.entries(dir).map do |f|
      # Extract name of dumped collection and dump date
      matches = TORRENT_NAME.match(f)
      next if matches.nil?

      # Calculate original file size, in whole megabytes
      archive = File.join(dir, f.sub(/\.torrent\z/, ".tar.gz"))
      unless File.file?(archive)
        STDERR.puts "Skipping #{f}: data file #{archive} not found"
        next
      end
      size = File.size(archive) / 1024 / 1024

      # Expects a format of yyyy-mm-dd
      begin
        date = Date.parse(matches[2])
      rescue ArgumentError
        STDERR.puts "Skipping #{f}: cannot parse date '#{matches[2]}'"
        next
      end

      Torrent.new(url_prefix + "/" + f, matches[1], size, date) if size > 0
    end.compact
  end
end
|
177
|
+
|
178
|
+
# Script entry point.
# NOTE(review): `run` is not defined in this file; presumably it is inherited
# from GHTorrent::Command and ends up calling #go -- confirm in command.rb.
Indexer.run
|
179
|
+
|
180
|
+
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|