ghtorrent 0.6 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -0
- data/Gemfile +1 -11
- data/Gemfile.lock +27 -29
- data/README.md +10 -14
- data/bin/ght-mirror-events +0 -0
- data/bin/ght-process-event +0 -0
- data/bin/ght-retrieve-repo +0 -0
- data/bin/ght-retrieve-user +6 -0
- data/lib/ghtorrent.rb +1 -0
- data/lib/ghtorrent/adapters/base_adapter.rb +6 -0
- data/lib/ghtorrent/adapters/mongo_persister.rb +8 -0
- data/lib/ghtorrent/api_client.rb +8 -29
- data/lib/ghtorrent/command.rb +1 -3
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +5 -10
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +28 -17
- data/lib/ghtorrent/commands/ght_load.rb +2 -2
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +45 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +72 -0
- data/lib/ghtorrent/ghtorrent.rb +288 -209
- data/lib/ghtorrent/migrations/012_add_forks_to_projects.rb +31 -0
- data/lib/ghtorrent/migrations/013_add_merged_to_pullreqs.rb +39 -0
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +21 -0
- data/lib/ghtorrent/retriever.rb +90 -25
- data/lib/ghtorrent/settings.rb +44 -6
- data/lib/version.rb +2 -2
- metadata +52 -84
- data/bin/ght-periodic-dump +0 -130
- data/bin/ght-torrent-index +0 -150
- data/test/callstack_test.rb +0 -67
data/bin/ght-periodic-dump
DELETED
@@ -1,130 +0,0 @@
|
|
1
|
-
#!/bin/sh
|
2
|
-
#
|
3
|
-
# Create the periodic database dump files
|
4
|
-
#
|
5
|
-
|
6
|
-
# Directory to place compressed files and torrents
|
7
|
-
OUTDIR=/home/data/github-mirror/dumps
|
8
|
-
|
9
|
-
# Base URL for HTTP dir containing torrents and data
|
10
|
-
WEBSEED=http://ikaria.dmst.aueb.gr/ghtorrent/
|
11
|
-
|
12
|
-
usage()
|
13
|
-
{
|
14
|
-
echo "Usage: $0 [-f 'yyyy-mm-dd hh:mm'] [-t 'yyyy-mm-dd hh:mm']"
|
15
|
-
echo " [-c collection_to_dump]"
|
16
|
-
echo "Dump the database. -f earliest record timestamp"
|
17
|
-
echo " -t latest record timestamp"
|
18
|
-
echo " -c collection to dump (default: all)"
|
19
|
-
}
|
20
|
-
|
21
|
-
if [ -z $1 ]
|
22
|
-
then
|
23
|
-
usage
|
24
|
-
exit 1
|
25
|
-
fi
|
26
|
-
|
27
|
-
while getopts "f:t:c:" o
|
28
|
-
do
|
29
|
-
case $o in
|
30
|
-
f) timeStart=`date -d "$OPTARG" +%s` ;;
|
31
|
-
t) timeEnd=`date -d "$OPTARG" +%s` ;;
|
32
|
-
c) collection=$OPTARG ;;
|
33
|
-
\?) echo "Invalid option: -$OPTARG" >&2
|
34
|
-
usage
|
35
|
-
exit 1
|
36
|
-
;;
|
37
|
-
esac
|
38
|
-
done
|
39
|
-
|
40
|
-
|
41
|
-
# Time to start dumping from
|
42
|
-
if [ -z $timeStart ]
|
43
|
-
then
|
44
|
-
if [ -r lastrun ]
|
45
|
-
then
|
46
|
-
timeStart=`cat lastrun`
|
47
|
-
else
|
48
|
-
timeStart=0
|
49
|
-
fi
|
50
|
-
fi
|
51
|
-
|
52
|
-
# Time to end dumping
|
53
|
-
if [ -z $timeEnd ]
|
54
|
-
then
|
55
|
-
timeEnd=`date +%s`
|
56
|
-
fi
|
57
|
-
|
58
|
-
# Name used for the files
|
59
|
-
dateName=`date -d @$timeEnd -u +'%Y-%m-%d'`
|
60
|
-
|
61
|
-
# _id example:
|
62
|
-
# 4f208c3e08d69a1835000077
|
63
|
-
# 000102030405060708091011
|
64
|
-
# | || || || |
|
65
|
-
# time mach pid count
|
66
|
-
|
67
|
-
endId=`printf '%08x0000000000000000' $timeEnd`
|
68
|
-
startId=`printf '%08x0000000000000000' $timeStart`
|
69
|
-
|
70
|
-
|
71
|
-
if [ -z $collection ]
|
72
|
-
then
|
73
|
-
collections=`echo "show collections"|mongo --quiet github|egrep -v "system|bye"`
|
74
|
-
else
|
75
|
-
collections=$collection
|
76
|
-
fi
|
77
|
-
|
78
|
-
echo "Dumping database from `date -d @$timeStart` to `date -d @$timeEnd`"
|
79
|
-
|
80
|
-
rm -rf dump
|
81
|
-
mkdir -p dump/github
|
82
|
-
|
83
|
-
for col in $collections; do
|
84
|
-
|
85
|
-
echo "Dumping $col"
|
86
|
-
mongodump --db github --collection $col -q '{"_id" : {"$gte" : ObjectId("'$startId'"), "$lt" : ObjectId("'$endId'")} }' || exit 1
|
87
|
-
done
|
88
|
-
|
89
|
-
# Report the metadata for the given database
|
90
|
-
meta()
|
91
|
-
{
|
92
|
-
echo -n "Number of $1: "
|
93
|
-
mongo --quiet --eval 'db.'$1'.find({"_id" : {"$gte" : ObjectId("'$startId'"), "$lt" : ObjectId("'$endId'")} }).count() + 0' github
|
94
|
-
echo -n "Uncompressed size of $1: "
|
95
|
-
wc -c dump/github/$1.bson | awk '{printf "%d bytes ", $1}'
|
96
|
-
du -h dump/github/$1.bson | awk '{print " (" $1 ")" }'
|
97
|
-
}
|
98
|
-
|
99
|
-
for col in $collections; do
|
100
|
-
(
|
101
|
-
echo "Start date: `date -u -d @$timeStart +'%Y-%m-%dT%H:%M:%SZ'`"
|
102
|
-
echo "End date: `date -u -d @$timeEnd +'%Y-%m-%dT%H:%M:%SZ'`"
|
103
|
-
meta $col
|
104
|
-
)
|
105
|
-
done |
|
106
|
-
tee README.$dateName.txt >dump/github/README.txt || exit 1
|
107
|
-
|
108
|
-
# Do the same per collection
|
109
|
-
for col in $collections; do
|
110
|
-
echo "Archiving $col.bson"
|
111
|
-
if [ ! -s dump/github/$col.bson ]; then
|
112
|
-
echo "Collection empty, skipping"
|
113
|
-
continue
|
114
|
-
fi
|
115
|
-
|
116
|
-
if ! tar zcf $OUTDIR/$col-dump.$dateName.tar.gz dump/github/$col.bson
|
117
|
-
then
|
118
|
-
rm -f $OUTDIR/$col-dump.$dateName.tar.gz
|
119
|
-
exit 1
|
120
|
-
fi
|
121
|
-
|
122
|
-
mktorrent -a udp://tracker.openbittorrent.com:80 -a udp://tracker.publicbt.com:80/announce -a http://tracker.bittorrent.am/announce -w $WEBSEED/$col-dump.$dateName.tar.gz -o $OUTDIR/$col-dump.$dateName.torrent $OUTDIR/$col-dump.$dateName.tar.gz
|
123
|
-
done
|
124
|
-
|
125
|
-
# Update last run info
|
126
|
-
echo $timeEnd >lastrun || exit 1
|
127
|
-
|
128
|
-
# Clean up
|
129
|
-
rm -rf dump
|
130
|
-
|
data/bin/ght-torrent-index
DELETED
@@ -1,150 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'erb'
|
3
|
-
require 'set'
|
4
|
-
require 'date'
|
5
|
-
require 'ghtorrent'
|
6
|
-
|
7
|
-
class Page
|
8
|
-
attr_reader :collections
|
9
|
-
attr_reader :dumps
|
10
|
-
|
11
|
-
def initialize(last_update)
|
12
|
-
@last_update = last_update
|
13
|
-
@dumps = Set.new
|
14
|
-
@collections = Set.new
|
15
|
-
end
|
16
|
-
|
17
|
-
def add_dump(dump)
|
18
|
-
@dumps << dump
|
19
|
-
end
|
20
|
-
|
21
|
-
def add_collection(col)
|
22
|
-
@collections << col
|
23
|
-
end
|
24
|
-
|
25
|
-
# Expose private binding() method.
|
26
|
-
def get_binding
|
27
|
-
binding()
|
28
|
-
end
|
29
|
-
|
30
|
-
end
|
31
|
-
|
32
|
-
class Dump
|
33
|
-
attr_reader :torrents
|
34
|
-
attr_reader :date
|
35
|
-
|
36
|
-
def initialize(torrents, date)
|
37
|
-
@torrents = torrents
|
38
|
-
@date = date
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
class Torrent
|
43
|
-
attr_reader :url
|
44
|
-
attr_reader :name
|
45
|
-
attr_reader :size
|
46
|
-
attr_reader :date
|
47
|
-
|
48
|
-
def initialize(url, name, size, date)
|
49
|
-
@url = url
|
50
|
-
@name = name
|
51
|
-
@size = size
|
52
|
-
@date = date
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
class Indexer < GHTorrent::Command
|
57
|
-
|
58
|
-
def prepare_options(options)
|
59
|
-
options.banner <<-BANNER
|
60
|
-
Create an HTML table from a list of torrent and data files. The expected
|
61
|
-
naming is the following:
|
62
|
-
|
63
|
-
collname-dump-2012-03-27.torrent
|
64
|
-
collname-dump-2012-03-27.tar.gz
|
65
|
-
|
66
|
-
#{command_name} [options]
|
67
|
-
|
68
|
-
#{command_name} options:
|
69
|
-
BANNER
|
70
|
-
|
71
|
-
options.opt :prefix, 'URL prefix to use for links',
|
72
|
-
:short => 'p', :default => "", :type => :string
|
73
|
-
end
|
74
|
-
|
75
|
-
def validate_options
|
76
|
-
|
77
|
-
end
|
78
|
-
|
79
|
-
def go
|
80
|
-
url_prefix=options[:prefix]
|
81
|
-
|
82
|
-
# Load the template
|
83
|
-
gem_root = Gem.loaded_specs['ghtorrent']
|
84
|
-
|
85
|
-
file = if gem_root.nil?
|
86
|
-
# Gem not installed yet, try current dir
|
87
|
-
File.open("index.erb").read
|
88
|
-
else
|
89
|
-
File.open(File.join(gem_root, "index.erb")).read
|
90
|
-
end
|
91
|
-
|
92
|
-
rhtml = ERB.new(file)
|
93
|
-
|
94
|
-
# Open the dir to read entries from
|
95
|
-
dir = ARGV.shift
|
96
|
-
|
97
|
-
if dir.nil?
|
98
|
-
dir = "."
|
99
|
-
end
|
100
|
-
|
101
|
-
torrents = Dir.entries("#{dir}").map do |f|
|
102
|
-
|
103
|
-
#File name format expected: collname-dump-2012-03-27.torrent
|
104
|
-
# collname-dump-2012-03-27.tar.gz
|
105
|
-
|
106
|
-
# Go through all torrent files and extract name of
|
107
|
-
# dumped collection and dump date
|
108
|
-
matches = /([a-z0-9]+)-[a-z]+\.(.*)\.torrent/.match(f)
|
109
|
-
next if matches.nil?
|
110
|
-
|
111
|
-
# Calculate original file size
|
112
|
-
dump = f.gsub(/.torrent/, ".tar.gz")
|
113
|
-
size = File.stat(File.join(dir, dump)).size / 1024 / 1024
|
114
|
-
|
115
|
-
# Expects a format of yyyy-mm-dd
|
116
|
-
date = Date.parse(matches[2])
|
117
|
-
|
118
|
-
if size > 0
|
119
|
-
Torrent.new(url_prefix + "/" + f, matches[1], size, date)
|
120
|
-
end
|
121
|
-
end.select { |x| !x.nil? }
|
122
|
-
|
123
|
-
all_dates = torrents.inject(Set.new) { |acc, t| acc << t.date }
|
124
|
-
|
125
|
-
all_dumps = all_dates.map { |d|
|
126
|
-
date_torrents = torrents.select { |t| t.date == d }
|
127
|
-
name_torrents = date_torrents.inject(Hash.new) { |acc, a|
|
128
|
-
acc.store(a.name, a);
|
129
|
-
acc
|
130
|
-
}
|
131
|
-
Dump.new(name_torrents, d)
|
132
|
-
}
|
133
|
-
|
134
|
-
max_date = all_dates.max { |a, b| a <=> b }
|
135
|
-
|
136
|
-
ghtorrent = Page.new(max_date)
|
137
|
-
all_dumps.each { |x|
|
138
|
-
ghtorrent.add_dump x
|
139
|
-
x.torrents.values.each { |t|
|
140
|
-
ghtorrent.add_collection t.name
|
141
|
-
}
|
142
|
-
}
|
143
|
-
|
144
|
-
puts rhtml.result(ghtorrent.get_binding).gsub(/^\s+/, "").gsub(/\s+$/, $/).gsub(/<table>/, "\n<table>")
|
145
|
-
end
|
146
|
-
end
|
147
|
-
|
148
|
-
Indexer.run
|
149
|
-
|
150
|
-
# vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
|
data/test/callstack_test.rb
DELETED
@@ -1,67 +0,0 @@
|
|
1
|
-
require "test/unit"
|
2
|
-
require 'ghtorrent'
|
3
|
-
|
4
|
-
class CallStackTest < Test::Unit::TestCase
|
5
|
-
|
6
|
-
def setup
|
7
|
-
end
|
8
|
-
|
9
|
-
def teardown
|
10
|
-
end
|
11
|
-
|
12
|
-
def test_constructor
|
13
|
-
a = CallStack.new('users', 0)
|
14
|
-
b = CallStack.new('users', 0)
|
15
|
-
assert_equal a,b
|
16
|
-
end
|
17
|
-
|
18
|
-
def test_push
|
19
|
-
stack = CallStack.new('users1', 0)
|
20
|
-
assert_not_nil stack
|
21
|
-
|
22
|
-
stack.push("foo bar")
|
23
|
-
stack.push("2")
|
24
|
-
stack.push("1234421")
|
25
|
-
stack.empty
|
26
|
-
end
|
27
|
-
|
28
|
-
def test_pop
|
29
|
-
stack = CallStack.new('users2', 0)
|
30
|
-
assert_not_nil stack
|
31
|
-
|
32
|
-
stack.push("foo bar")
|
33
|
-
stack.push("2")
|
34
|
-
stack.push("1234421")
|
35
|
-
|
36
|
-
assert stack.pop == "1234421"
|
37
|
-
stack.empty
|
38
|
-
end
|
39
|
-
|
40
|
-
def test_push_pop_push
|
41
|
-
stack = CallStack.new('users3', 0)
|
42
|
-
assert_not_nil stack
|
43
|
-
|
44
|
-
stack.push("foo bar")
|
45
|
-
stack.push("2")
|
46
|
-
|
47
|
-
stack.pop
|
48
|
-
|
49
|
-
stack.push("1234421")
|
50
|
-
|
51
|
-
stack.empty
|
52
|
-
end
|
53
|
-
|
54
|
-
def test_stress
|
55
|
-
stack = CallStack.new('users4', 0)
|
56
|
-
|
57
|
-
1000.times do
|
58
|
-
txt = (0..rand(20)).map{65.+(rand(25)).chr}.join
|
59
|
-
stack.push txt
|
60
|
-
end
|
61
|
-
|
62
|
-
999.times do
|
63
|
-
stack.pop
|
64
|
-
end
|
65
|
-
stack.pop
|
66
|
-
end
|
67
|
-
end
|