ghtorrent 0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
4
+ #
5
+ # Redistribution and use in source and binary forms, with or
6
+ # without modification, are permitted provided that the following
7
+ # conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above
10
+ # copyright notice, this list of conditions and the following
11
+ # disclaimer.
12
+ #
13
+ # 2. Redistributions in binary form must reproduce the above
14
+ # copyright notice, this list of conditions and the following
15
+ # disclaimer in the documentation and/or other materials
16
+ # provided with the distribution.
17
+ #
18
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
22
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
25
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
+ # POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ require 'rubygems'
32
+ require 'yaml'
33
+ require 'amqp'
34
+ require 'eventmachine'
35
+ require 'ghtorrent'
36
+ require 'json'
37
+ require 'logger'
38
+
39
# Command that periodically fetches events through GHTorrent::Mirror, stores
# the ones not yet present in the :events collection and publishes each newly
# stored event to an AMQP topic exchange. A second timer re-tunes the polling
# interval from the observed share of duplicate events.
class GHTMirrorEvents < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging
  include GHTorrent::Persister

  # Lower bound (in seconds) for the polling interval, so that repeated
  # speed-ups can never produce a zero or negative timer interval.
  MIN_RETRIEVAL_DELAY = 1

  attr_reader :settings

  # Retrieve events from Github, store them in the DB and publish every newly
  # stored event to +exchange+ with routing key "evt.<event type>".
  #
  # Returns a two element array: [number of new events, number of duplicates].
  # Errors are reported on STDERR and do not propagate; the counts gathered
  # up to the failure are still returned so that callers can always add them
  # to their running totals.
  def retrieve(exchange)
    added = dupl = 0

    begin
      @gh.get_events.each do |e|
        unless @persister.find(:events, {'id' => e['id']}).empty?
          info "Already got #{e['id']}"
          dupl += 1
          next
        end

        added += 1
        @persister.store(:events, e)
        info "Added #{e['id']}"

        msg = JSON.dump(e)
        key = "evt.%s" % e['type']
        exchange.publish msg, :persistent => true, :routing_key => key
      end
    rescue StandardError => err
      # StandardError rather than Exception: signals and exit requests must
      # still be able to terminate the process.
      STDERR.puts err.message
      STDERR.puts err.backtrace
    end

    return added, dupl
  end

  # This command defines no options of its own.
  def prepare_options(options)

  end

  # Connect to the persister and to AMQP, then run the two periodic timers
  # (event retrieval and delay recalibration) inside the AMQP event loop.
  def go

    @gh = GHTorrent::Mirror.new(options[:config])
    @settings = @gh.settings
    @persister = connect(:mongo, @settings)
    @logger = Logger.new(STDOUT)

    # Graceful exit
    Signal.trap('INT') { AMQP.stop { EM.stop } }
    Signal.trap('TERM') { AMQP.stop { EM.stop } }

    # The event loop
    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      # Statistics used to recalibrate event delays
      dupl_msgs = new_msgs = 1

      debug "connected to rabbit"

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      # Initial delay for the retrieve event loop
      retrieval_delay = config(:mirror_pollevery)

      # One polling round: retrieve events and update the statistics. Kept in
      # a single lambda so the initial and the restarted timer share it.
      poll = lambda do
        (added, dupl) = retrieve exchange
        dupl_msgs += dupl
        new_msgs += added
      end

      # Retrieve events.
      retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)

      # Adjust event retrieval delay time to reduce load to Github
      EventMachine.add_periodic_timer(120) do
        ratio = (dupl_msgs.to_f / (dupl_msgs + new_msgs).to_f)

        info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio}")

        new_delay = delay_adjustment(ratio)

        # Reset counters for new loop
        dupl_msgs = new_msgs = 0

        # Update the retrieval delay and restart the event retriever
        if new_delay != 0
          adjusted = [retrieval_delay + new_delay, MIN_RETRIEVAL_DELAY].max

          # Nothing to restart when the delay is already at its lower bound
          if adjusted != retrieval_delay
            # Stop the retriever task and adjust retrieval delay
            retriever.cancel
            retrieval_delay = adjusted
            info("Setting event retrieval delay to #{retrieval_delay} secs")

            # Restart the retriever
            retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)
          end
        end
      end
    end
  end

  private

  # Map the share of duplicate events (0.0..1.0) to a change, in seconds, of
  # the polling interval:
  #   below 0.3   -> -1 (few duplicates, poll more often)
  #   0.3 to 0.5  ->  0 (leave as is)
  #   above 0.5   -> +1 (mostly duplicates, poll less often)
  # A NaN ratio (no events counted since the last reset) leaves the delay
  # unchanged. Always returns an Integer.
  def delay_adjustment(ratio)
    if ratio.nan?
      0
    elsif ratio < 0.3
      -1
    elsif ratio <= 0.5
      0
    else
      +1
    end
  end
end
151
+
152
+ GHTMirrorEvents.run
153
+
154
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -0,0 +1,92 @@
1
+ #!/bin/sh
2
+ #
3
+ # Create the periodic database dump files
4
+ #
5
+
6
# Directory to place compressed files and torrents
OUTDIR=/home/data/github-mirror/dumps

# Base URL for HTTP dir containing torrents and data
WEBSEED=http://ikaria.dmst.aueb.gr/ghtorrent/

# Time to start dumping from: the end time recorded by the previous run in
# the lastrun file (looked up in the current directory), or 0 if that file
# is not readable
if [ -r lastrun ]
then
  timeStart=$(cat lastrun)
else
  timeStart=0
fi

# Time to end dumping: the date given as first argument, or now
if [ -z "$1" ]
then
  timeEnd=$(date +%s)
else
  timeEnd=$(date -d "$1" +%s) || exit 1
fi

# Name used for the files
dateName=$(date -d "@$timeEnd" -u +'%Y-%m-%d')

# _id example:
# 4f208c3e08d69a1835000077
# 000102030405060708091011
# |      ||    ||  ||    |
# time    mach  pid count

# Range boundaries: the timestamp as 8 hex digits followed by 16 zeros
endId=$(printf '%08x0000000000000000' "$timeEnd")
startId=$(printf '%08x0000000000000000' "$timeStart")

echo "Dumping database from $(date -d "@$timeStart") to $(date -d "@$timeEnd")"

collections=$(echo "show collections" | mongo --quiet github | grep -Ev "system|bye")

rm -rf dump
# $collections is deliberately unquoted: it is split into one word per name
for col in $collections; do

  echo "Dumping $col"
  mongodump --db github --collection "$col" -q '{"_id" : {"$gte" : ObjectId("'"$startId"'"), "$lt" : ObjectId("'"$endId"'")} }' || exit 1
done
50
+
51
# Report the metadata for the given collection ($1): the number of documents
# whose _id lies in [$startId, $endId) and the size of dump/github/$1.bson,
# both in bytes and in the human readable form printed by du -h.
# printf is used for the labels because echo -n is not portable under /bin/sh.
meta()
{
  printf 'Number of %s: ' "$1"
  mongo --quiet --eval 'db.'"$1"'.find({"_id" : {"$gte" : ObjectId("'"$startId"'"), "$lt" : ObjectId("'"$endId"'")} }).count() + 0' github
  printf 'Uncompressed size of %s: ' "$1"
  wc -c "dump/github/$1.bson" | awk '{printf "%d bytes ", $1}'
  du -h "dump/github/$1.bson" | awk '{print " (" $1 ")" }'
}
60
+
61
# Write the dumped time range and the metadata of every collection to
# README.<date>.txt in the current directory and to dump/github/README.txt
for col in $collections; do
  (
    echo "Start date: $(date -u -d "@$timeStart" +'%Y-%m-%dT%H:%M:%SZ')"
    echo "End date: $(date -u -d "@$timeEnd" +'%Y-%m-%dT%H:%M:%SZ')"
    meta "$col"
  )
done |
tee "README.$dateName.txt" >dump/github/README.txt || exit 1

# Archive every non-empty collection dump and create a torrent for it
for col in $collections; do
  echo "Archiving $col.bson"
  if [ ! -s "dump/github/$col.bson" ]; then
    echo "Collection empty, skipping"
    continue
  fi

  archive="$OUTDIR/$col-dump.$dateName.tar.gz"

  # Do not leave a partial archive behind if tar fails
  if ! tar zcf "$archive" "dump/github/$col.bson"
  then
    rm -f "$archive"
    exit 1
  fi

  # ${WEBSEED%/} drops the trailing slash of the base URL so that the web
  # seed URL does not contain a double slash
  mktorrent -a udp://tracker.openbittorrent.com:80 -a udp://tracker.publicbt.com:80/announce -a http://tracker.bittorrent.am/announce -w "${WEBSEED%/}/$col-dump.$dateName.tar.gz" -o "$OUTDIR/$col-dump.$dateName.torrent" "$archive"
done

# Update last run info
echo "$timeEnd" >lastrun || exit 1

# Clean up
rm -rf dump
92
+
data/bin/ght-rm-dupl ADDED
@@ -0,0 +1,124 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Knows how to remove duplicate entries from various collections.
4
+ #
5
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
6
+ #
7
+ # Redistribution and use in source and binary forms, with or
8
+ # without modification, are permitted provided that the following
9
+ # conditions are met:
10
+ #
11
+ # 1. Redistributions of source code must retain the above
12
+ # copyright notice, this list of conditions and the following
13
+ # disclaimer.
14
+ #
15
+ # 2. Redistributions in binary form must reproduce the above
16
+ # copyright notice, this list of conditions and the following
17
+ # disclaimer in the documentation and/or other materials
18
+ # provided with the distribution.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
24
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
27
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31
+ # POSSIBILITY OF SUCH DAMAGE.
32
+
33
+ require 'rubygems'
34
+ require 'mongo'
35
+ require 'ghtorrent-old'
36
+
37
# Mirror instance initialised from config.yaml; it supplies the collection
# handles and the read_value helper used by this script.
GH = Mirror.new
GH.init("config.yaml")

# Unique keys per known collection. For every collection name:
#   :payload - path of the field whose value must be unique
#   :col     - collection handle obtained from the mirror
per_col = {
  :commits => { :payload => "commit.id", :col => GH.commits_col },
  :events  => { :payload => "id",        :col => GH.events_col }
}
51
+
52
# Delete all but the last recorded entry for every key that was seen more
# than once.
#
# data - Hash mapping a unique key to the Array of _ids that carry it
# col  - collection handle passed on to delete_by_id
#
# Returns the number of entries delete_by_id reported as removed.
def remove_duplicates(data, col)
  removed = 0
  data.each do |_key, ids|
    next unless ids.size > 1

    # Keep the last id, remove every one before it
    ids[0...-1].each do |id|
      removed += 1 if delete_by_id(col, id)
    end
  end
  removed
end
63
+
64
# Remove the document with the given _id from col.
#
# Returns true when the remove call completed, false (after printing a
# message) when it raised Mongo::OperationFailure.
def delete_by_id(col, id)
  selector = {'_id' => id}
  col.remove(selector)
  true
rescue Mongo::OperationFailure
  puts "Cannot remove record with id #{id} from #{col.name}"
  false
end
73
+
74
# Collection to clean, selected by the first command line argument
which = case ARGV[0]
        when "commits" then :commits
        when "events" then :events
        else
          # Without a known collection the per_col lookup below would yield
          # nil, so stop here with a non-zero exit status
          abort "Not a known collection name: #{ARGV[0]}\n"
        end

# Optional second argument, read as seconds since the epoch: restricts the
# search to documents whose _id is not lower than the ObjectId generated for
# that time. Without it all documents are examined.
from = case ARGV[1]
       when nil then {}
       else
         t = Time.at(ARGV[1].to_i)
         STDERR.puts "Searching for duplicates after #{t}"
         {'_id' => {'$gte' => BSON::ObjectId.from_time(t)}}
       end
87
+
88
# Counters used for the final report: records seen since the last cleaning
# pass, records seen overall, and records deleted
removed = 0
total_processed = 0
processed = 0

# Unique key value => _ids of the records carrying it
data = {}

target = per_col[which]
collection = target[:col]
key_path = target[:payload]

# Duplicates are cleaned in passes and the lookup table is emptied after
# each pass, so that large datasets do not have to be held in memory at once
collection.find(from, :fields => key_path).each do |r|
  _id = r["_id"]
  commit = GH.read_value(r, key_path)

  if commit.empty?
    # If entries cannot be parsed, remove them
    puts "Deleting unknown entry #{_id}"
    removed += 1 if delete_by_id collection, _id
  else
    (data[commit] ||= []) << _id
  end

  processed += 1
  total_processed += 1

  print "\rProcessed #{processed} records"

  # Calculate duplicates, save intermediate result
  next unless processed > 500000

  puts "\nLoaded #{data.size} values, cleaning"
  removed += remove_duplicates data, collection
  data = {}
  processed = 0
end

# Clean whatever is left from the last, incomplete pass
removed += remove_duplicates data, collection

puts "Processed #{total_processed}, deleted #{removed} duplicates"
@@ -0,0 +1,180 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
4
+ #
5
+ # Redistribution and use in source and binary forms, with or
6
+ # without modification, are permitted provided that the following
7
+ # conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above
10
+ # copyright notice, this list of conditions and the following
11
+ # disclaimer.
12
+ #
13
+ # 2. Redistributions in binary form must reproduce the above
14
+ # copyright notice, this list of conditions and the following
15
+ # disclaimer in the documentation and/or other materials
16
+ # provided with the distribution.
17
+ #
18
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
22
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
25
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
+ # POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ require 'rubygems'
32
+ require 'erb'
33
+ require 'set'
34
+ require 'date'
35
+ require 'ghtorrent'
36
+
37
# View model handed to the ERB template: the date of the last update plus
# the sets of dumps and collection names gathered so far.
class Page
  attr_reader :collections, :dumps

  # last_update - stored in @last_update, reachable through get_binding
  def initialize(last_update)
    @last_update = last_update
    @dumps, @collections = Set.new, Set.new
  end

  # Register a dump; adding the same dump twice has no further effect.
  def add_dump(dump)
    dumps.add(dump)
  end

  # Register a collection name; repeated names are stored once.
  def add_collection(col)
    collections.add(col)
  end

  # Expose private binding() method.
  def get_binding
    binding
  end

end
61
+
62
# The torrents that belong to one dump date.
class Dump
  attr_reader :torrents, :date

  # torrents - the torrents of this dump (Indexer passes a Hash keyed by
  #            collection name)
  # date     - the date of the dump
  def initialize(torrents, date)
    @torrents, @date = torrents, date
  end
end
71
+
72
# Read-only record describing one torrent file: its link, the name of the
# dumped collection, the size of the data file and the dump date.
class Torrent
  attr_reader :url, :name, :size, :date

  def initialize(url, name, size, date)
    @url, @name = url, name
    @size, @date = size, date
  end
end
85
+
86
# Command that scans a directory for torrent files and their data files and
# prints an HTML index rendered from the index.erb template.
class Indexer < GHTorrent::Command

  # File name format expected: collname-dump.2012-03-27.torrent
  # Capture 1 is the collection name, capture 2 the dump date. The pattern is
  # anchored so that only complete file names match, and collection names
  # may contain underscores.
  TORRENT_NAME = /\A([a-z0-9_]+)-[a-z]+\.(.*)\.torrent\z/

  def prepare_options(options)
    options.banner <<-BANNER
Create an HTML table from a list of torrent and data files. The expected
naming is the following:

collname-dump.2012-03-27.torrent
collname-dump.2012-03-27.tar.gz

#{command_name} [options]

#{command_name} options:
    BANNER

    options.opt :prefix, 'URL prefix to use for links',
                :short => 'p', :default => "", :type => :string
  end

  # This command has nothing to validate.
  def validate_options

  end

  # Build the page model from the torrents found in the directory given as
  # first remaining command line argument (default: current directory) and
  # print the rendered template on standard output.
  def go
    url_prefix = options[:prefix]

    rhtml = ERB.new(load_template)

    # Open the dir to read entries from
    dir = ARGV.shift || "."

    torrents = find_torrents(dir, url_prefix)

    all_dates = torrents.inject(Set.new) { |acc, t| acc << t.date }

    # One Dump per date, holding that date's torrents keyed by collection
    all_dumps = all_dates.map { |d|
      date_torrents = torrents.select { |t| t.date == d }
      name_torrents = date_torrents.inject(Hash.new) { |acc, a|
        acc.store(a.name, a)
        acc
      }
      Dump.new(name_torrents, d)
    }

    max_date = all_dates.max

    ghtorrent = Page.new(max_date)
    all_dumps.each { |x|
      ghtorrent.add_dump x
      x.torrents.values.each { |t|
        ghtorrent.add_collection t.name
      }
    }

    puts rhtml.result(ghtorrent.get_binding).gsub(/^\s+/, "").gsub(/\s+$/, $/).gsub(/<table>/, "\n<table>")
  end

  private

  # Read index.erb from the directory of the loaded ghtorrent gem, or from
  # the current directory when the gem is not loaded.
  def load_template
    spec = Gem.loaded_specs['ghtorrent']

    path = if spec.nil?
             # Gem not installed yet, try current dir
             "index.erb"
           else
             # loaded_specs holds Gem::Specification objects, not paths
             File.join(spec.full_gem_path, "index.erb")
           end

    File.read(path)
  end

  # Return a Torrent for every torrent file in dir that follows the expected
  # naming and whose data file is at least 1 MB in size. Torrent files whose
  # data file is missing are reported on STDERR and skipped.
  def find_torrents(dir, url_prefix)
    Dir.entries(dir).map do |f|
      # Go through all torrent files and extract name of
      # dumped collection and dump date
      matches = TORRENT_NAME.match(f)
      next if matches.nil?

      # Calculate original file size, in whole megabytes
      dump = File.join(dir, f.sub(/\.torrent\z/, ".tar.gz"))
      unless File.exist?(dump)
        STDERR.puts "Skipping #{f}: data file #{dump} not found"
        next
      end
      size = File.stat(dump).size / 1024 / 1024

      # Expects a format of yyyy-mm-dd
      date = Date.parse(matches[2])

      Torrent.new(url_prefix + "/" + f, matches[1], size, date) if size > 0
    end.compact
  end
end
177
+
178
+ Indexer.run
179
+
180
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :