ghtorrent 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
4
+ #
5
+ # Redistribution and use in source and binary forms, with or
6
+ # without modification, are permitted provided that the following
7
+ # conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above
10
+ # copyright notice, this list of conditions and the following
11
+ # disclaimer.
12
+ #
13
+ # 2. Redistributions in binary form must reproduce the above
14
+ # copyright notice, this list of conditions and the following
15
+ # disclaimer in the documentation and/or other materials
16
+ # provided with the distribution.
17
+ #
18
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
22
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
25
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
+ # POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ require 'rubygems'
32
+ require 'yaml'
33
+ require 'amqp'
34
+ require 'eventmachine'
35
+ require 'ghtorrent'
36
+ require 'json'
37
+ require 'logger'
38
+
39
# Command that periodically polls Github for events, stores the ones not
# seen before and publishes them to an AMQP topic exchange. The polling
# delay is re-tuned every two minutes from the ratio of duplicate events.
class GHTMirrorEvents < GHTorrent::Command

  include GHTorrent::Settings
  include GHTorrent::Logging
  include GHTorrent::Persister

  # Never poll more often than once per this many seconds.
  MIN_RETRIEVAL_DELAY = 1

  attr_reader :settings

  # Retrieve events from Github, store them in the DB and publish each new
  # event to +exchange+ using the routing key "evt.<event type>".
  #
  # Always returns a two-element array [new, duplicate], even when the
  # retrieval fails half way through; the counts then cover the events
  # handled before the failure. Errors are reported on STDERR.
  def retrieve(exchange)
    added = dupl = 0

    begin
      @gh.get_events.each do |e|
        unless @persister.find(:events, {'id' => e['id']}).empty?
          info "Already got #{e['id']}"
          dupl += 1
          next
        end

        added += 1
        @persister.store(:events, e)
        info "Added #{e['id']}"

        msg = JSON.dump(e)
        key = "evt.%s" % e['type']
        exchange.publish msg, :persistent => true, :routing_key => key
      end
    rescue StandardError => e
      # Do not rescue Exception: that would also swallow exit requests.
      STDERR.puts e.message
      STDERR.puts e.backtrace
    end

    # The callers do arithmetic on the result, so it must never be nil.
    return added, dupl
  end

  def prepare_options(options)

  end

  # Connect to the persister and to the AMQP broker, then run the event loop.
  def go

    @gh = GHTorrent::Mirror.new(options[:config])
    @settings = @gh.settings
    @persister = connect(:mongo, @settings)
    @logger = Logger.new(STDOUT)

    # Graceful exit
    Signal.trap('INT') { AMQP.stop { EM.stop } }
    Signal.trap('TERM') { AMQP.stop { EM.stop } }

    # The event loop
    AMQP.start(:host => config(:amqp_host),
               :port => config(:amqp_port),
               :username => config(:amqp_username),
               :password => config(:amqp_password)) do |connection|

      # Statistics used to recalibrate event delays
      dupl_msgs = new_msgs = 0

      debug "connected to rabbit"

      channel = AMQP::Channel.new(connection)
      exchange = channel.topic(config(:amqp_exchange), :durable => true,
                               :auto_delete => false)

      # Initial delay for the retrieve event loop
      retrieval_delay = config(:mirror_pollevery)

      # One polling round; shared by the initial and the restarted timers.
      poll = proc do
        added, dupl = retrieve exchange
        dupl_msgs += dupl
        new_msgs += added
      end

      # Retrieve events.
      retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)

      # Adjust event retrieval delay time to reduce load to Github
      EventMachine.add_periodic_timer(120) do
        total = dupl_msgs + new_msgs
        # No ratio can be computed when nothing was retrieved in this round.
        ratio = total > 0 ? dupl_msgs.to_f / total : nil

        info("Stats: #{new_msgs} new, #{dupl_msgs} duplicate, ratio: #{ratio || 'n/a'}")

        adjusted = retrieval_delay + delay_adjustment(ratio)
        # Refuse to speed up beyond the minimum polling delay.
        adjusted = retrieval_delay if adjusted < MIN_RETRIEVAL_DELAY

        # Reset counters for new loop
        dupl_msgs = new_msgs = 0

        # Update the retrieval delay and restart the event retriever
        if adjusted != retrieval_delay

          # Stop the retriever task and adjust retrieval delay
          retriever.cancel
          retrieval_delay = adjusted
          info("Setting event retrieval delay to #{retrieval_delay} secs")

          # Restart the retriever
          retriever = EventMachine.add_periodic_timer(retrieval_delay, &poll)
        end
      end
    end
  end

  private

  # Number of seconds to add to the retrieval delay for a given ratio of
  # duplicate events: few duplicates mean events are being missed (poll
  # faster), many duplicates mean polling is too frequent (poll slower).
  # A nil ratio (no data) leaves the delay unchanged.
  def delay_adjustment(ratio)
    return 0 if ratio.nil?

    if ratio < 0.3
      -1
    elsif ratio <= 0.5
      0
    else
      1
    end
  end
end
151
+
152
+ GHTMirrorEvents.run
153
+
154
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :
@@ -0,0 +1,92 @@
1
+ #!/bin/sh
2
+ #
3
+ # Create the periodic database dump files
4
+ #
5
+
6
# Directory to place compressed files and torrents
OUTDIR=/home/data/github-mirror/dumps

# Base URL for HTTP dir containing torrents and data
WEBSEED=http://ikaria.dmst.aueb.gr/ghtorrent/

# Time to start dumping from: the end of the previous run, if recorded
if [ -r lastrun ]; then
  timeStart=$(cat lastrun)
else
  timeStart=0
fi

# Time to end dumping: now, unless a date is given as first argument
if [ -z "$1" ]; then
  timeEnd=$(date +%s)
else
  timeEnd=$(date -d "$1" +%s) || exit 1
fi

# Name used for the files
dateName=$(date -d "@$timeEnd" -u +'%Y-%m-%d')

# _id example:
# 4f208c3e08d69a1835000077
# 000102030405060708091011
# |      ||    ||  ||    |
#   time   mach pid count

# Object ids delimiting the dump: the timestamp followed by zeroes
endId=$(printf '%08x0000000000000000' "$timeEnd")
startId=$(printf '%08x0000000000000000' "$timeStart")

echo "Dumping database from $(date -d "@$timeStart") to $(date -d "@$timeEnd")"

collections=$(echo "show collections" | mongo --quiet github | egrep -v "system|bye")

rm -rf dump
# $collections is deliberately unquoted: one word per collection name
for col in $collections; do

  echo "Dumping $col"
  mongodump --db github --collection "$col" -q '{"_id" : {"$gte" : ObjectId("'"$startId"'"), "$lt" : ObjectId("'"$endId"'")} }' || exit 1
done
50
+
51
# Report the metadata for the given collection.
#   $1: collection name
# Reads the globals startId and endId and expects dump/github/$1.bson to
# exist. Prints the number of records in the dumped id range followed by
# the uncompressed size of the dump file.
meta()
{
  # printf instead of "echo -n", whose behaviour differs between sh variants
  printf 'Number of %s: ' "$1"
  mongo --quiet --eval 'db.'"$1"'.find({"_id" : {"$gte" : ObjectId("'"$startId"'"), "$lt" : ObjectId("'"$endId"'")} }).count() + 0' github
  printf 'Uncompressed size of %s: ' "$1"
  wc -c "dump/github/$1.bson" | awk '{printf "%d bytes ", $1}'
  du -h "dump/github/$1.bson" | awk '{print " (" $1 ")" }'
}
60
+
61
# Write the per-collection metadata both next to the script and into the dump
for col in $collections; do
  (
    echo "Start date: $(date -u -d "@$timeStart" +'%Y-%m-%dT%H:%M:%SZ')"
    echo "End date: $(date -u -d "@$timeEnd" +'%Y-%m-%dT%H:%M:%SZ')"
    meta "$col"
  )
done |
tee "README.$dateName.txt" >dump/github/README.txt || exit 1

# Archive every non-empty collection dump and create a torrent for it
for col in $collections; do
  echo "Archiving $col.bson"
  if [ ! -s "dump/github/$col.bson" ]; then
    echo "Collection empty, skipping"
    continue
  fi

  archive="$OUTDIR/$col-dump.$dateName.tar.gz"

  if ! tar zcf "$archive" "dump/github/$col.bson"
  then
    # Do not leave a truncated archive behind
    rm -f "$archive"
    exit 1
  fi

  # ${WEBSEED%/} drops the trailing slash so the URL has no "//" in its path
  mktorrent -a udp://tracker.openbittorrent.com:80 -a udp://tracker.publicbt.com:80/announce -a http://tracker.bittorrent.am/announce -w "${WEBSEED%/}/$col-dump.$dateName.tar.gz" -o "$OUTDIR/$col-dump.$dateName.torrent" "$archive"
done

# Update last run info
echo "$timeEnd" >lastrun || exit 1

# Clean up
rm -rf dump
92
+
data/bin/ght-rm-dupl ADDED
@@ -0,0 +1,124 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Knows how to remove duplicate entries from various collections.
4
+ #
5
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
6
+ #
7
+ # Redistribution and use in source and binary forms, with or
8
+ # without modification, are permitted provided that the following
9
+ # conditions are met:
10
+ #
11
+ # 1. Redistributions of source code must retain the above
12
+ # copyright notice, this list of conditions and the following
13
+ # disclaimer.
14
+ #
15
+ # 2. Redistributions in binary form must reproduce the above
16
+ # copyright notice, this list of conditions and the following
17
+ # disclaimer in the documentation and/or other materials
18
+ # provided with the distribution.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
24
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
27
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31
+ # POSSIBILITY OF SUCH DAMAGE.
32
+
33
+ require 'rubygems'
34
+ require 'mongo'
35
+ require 'ghtorrent-old'
36
+
37
GH = Mirror.new
GH.init("config.yaml")

# For every collection this script can clean: the field whose value must be
# unique (:payload) and the collection handle to read from and delete in (:col).
per_col = {
  :commits => { :payload => "commit.id", :col => GH.commits_col },
  :events  => { :payload => "id",        :col => GH.events_col }
}
51
+
52
+ # Print MongoDB remove statements that
53
+ # remove all but one entries for each commit.
54
# Delete all but the last record id of every key in +data+ that maps to
# more than one id. +data+ is a hash of unique value => array of record
# ids; +col+ is the collection to delete from.
# Returns the number of records that were actually removed.
def remove_duplicates(data, col)
  removed = 0

  data.each do |_key, ids|
    next unless ids.size > 1

    # Keep the last id, delete everything before it
    ids[0..-2].each do |id|
      removed += 1 if delete_by_id(col, id)
    end
  end

  removed
end
63
+
64
# Remove the record whose _id is +id+ from collection +col+.
# Returns true on success; when the driver reports an operation failure a
# message is printed and false is returned.
def delete_by_id(col, id)
  col.remove({'_id' => id})
  true
rescue Mongo::OperationFailure
  puts "Cannot remove record with id #{id} from #{col.name}"
  false
end
73
+
74
# Collection to clean, given as first argument. Stop right away on an
# unknown name instead of failing later on a nil lookup.
which = case ARGV[0]
        when "commits" then :commits
        when "events" then :events
        else
          STDERR.puts "Not a known collection name: #{ARGV[0]}"
          exit 1
        end

# Optional second argument: a Unix timestamp; only records created at or
# after it are examined.
from = case ARGV[1]
       when nil then {}
       else
         t = Time.at(ARGV[1].to_i)
         STDERR.puts "Searching for duplicates after #{t}"
         {'_id' => {'$gte' => BSON::ObjectId.from_time(t)}}
       end

col = per_col[which][:col]
payload = per_col[which][:payload]

# Various counters to report stats
processed = total_processed = removed = 0

# Unique value => ids of all records carrying it
data = Hash.new

# The following code needs to save intermediate results to cope
# with large datasets
col.find(from, :fields => payload).each do |r|
  _id = r["_id"]
  value = GH.read_value(r, payload)

  # If entries cannot be parsed, remove them
  if value.nil? || value.empty?
    puts "Deleting unknown entry #{_id}"
    removed += 1 if delete_by_id col, _id
  else
    (data[value] ||= []) << _id
  end

  processed += 1
  total_processed += 1

  print "\rProcessed #{processed} records"

  # Calculate duplicates, save intermediate result
  if processed > 500000
    puts "\nLoaded #{data.size} values, cleaning"
    removed += remove_duplicates data, col
    data = Hash.new
    processed = 0
  end
end

# Clean whatever is left from the last, incomplete batch
removed += remove_duplicates data, col

puts "Processed #{total_processed}, deleted #{removed} duplicates"
@@ -0,0 +1,180 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Copyright 2012 Georgios Gousios <gousiosg@gmail.com>
4
+ #
5
+ # Redistribution and use in source and binary forms, with or
6
+ # without modification, are permitted provided that the following
7
+ # conditions are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above
10
+ # copyright notice, this list of conditions and the following
11
+ # disclaimer.
12
+ #
13
+ # 2. Redistributions in binary form must reproduce the above
14
+ # copyright notice, this list of conditions and the following
15
+ # disclaimer in the documentation and/or other materials
16
+ # provided with the distribution.
17
+ #
18
+ # THIS SOFTWARE IS PROVIDED BY BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ # AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20
+ # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
22
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
25
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26
+ # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28
+ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
+ # POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ require 'rubygems'
32
+ require 'erb'
33
+ require 'set'
34
+ require 'date'
35
+ require 'ghtorrent'
36
+
37
# Data made available to the index template: the date of the last update
# (reachable as @last_update through the binding), the set of dumps and
# the set of collection names.
class Page
  attr_reader :collections, :dumps

  def initialize(last_update)
    @last_update = last_update
    @collections = Set.new
    @dumps = Set.new
  end

  # Register a dump; returns the set of dumps.
  def add_dump(dump)
    @dumps.add(dump)
  end

  # Register a collection name; returns the set of collections.
  def add_collection(col)
    @collections.add(col)
  end

  # Expose private binding() method.
  def get_binding
    binding
  end
end
61
+
62
# The torrents that belong to one dump, together with the dump's date.
class Dump
  attr_reader :torrents, :date

  def initialize(torrents, date)
    @torrents, @date = torrents, date
  end
end
71
+
72
# A single torrent file: the URL to link to, the name of the dumped
# collection, the size of the data file and the dump date.
class Torrent
  attr_reader :url, :name, :size, :date

  def initialize(url, name, size, date)
    @url, @name, @size, @date = url, name, size, date
  end
end
85
+
86
# Command that prints an HTML index of the torrent files found in a
# directory (first command line argument, default: current directory).
class Indexer < GHTorrent::Command

  # File name format expected: collname-dump.2012-03-27.torrent
  # Capture 1 is the collection name, capture 2 the dump date.
  TORRENT_NAME = /\A([a-z0-9_]+)-[a-z]+\.(.*)\.torrent\z/

  def prepare_options(options)
    options.banner <<-BANNER
Create an HTML table from a list of torrent and data files. The expected
naming is the following:

collname-dump.2012-03-27.torrent
collname-dump.2012-03-27.tar.gz

#{command_name} [options]

#{command_name} options:
    BANNER

    options.opt :prefix, 'URL prefix to use for links',
                :short => 'p', :default => "", :type => :string
  end

  def validate_options

  end

  # Render the index template over all torrents found and print the result.
  def go
    url_prefix = options[:prefix]

    rhtml = ERB.new(load_template)

    # Open the dir to read entries from
    dir = ARGV.shift || "."

    torrents = find_torrents(dir, url_prefix)

    all_dates = torrents.inject(Set.new) { |acc, t| acc << t.date }

    # One dump per date, holding that date's torrents indexed by collection
    all_dumps = all_dates.map do |d|
      by_name = {}
      torrents.each { |t| by_name[t.name] = t if t.date == d }
      Dump.new(by_name, d)
    end

    ghtorrent = Page.new(all_dates.max)
    all_dumps.each do |dump|
      ghtorrent.add_dump dump
      dump.torrents.each_value { |t| ghtorrent.add_collection t.name }
    end

    puts rhtml.result(ghtorrent.get_binding).gsub(/^\s+/, "").gsub(/\s+$/, $/).gsub(/<table>/, "\n<table>")
  end

  private

  # Read the index.erb template from the installed gem or, when the gem is
  # not installed yet, from the current directory.
  def load_template
    spec = Gem.loaded_specs['ghtorrent']

    # loaded_specs holds Gem::Specification objects, not paths
    path = spec.nil? ? "index.erb" : File.join(spec.full_gem_path, "index.erb")
    File.read(path)
  end

  # Build a Torrent for every properly named torrent file in +dir+ whose
  # data file exists and is at least one megabyte big.
  def find_torrents(dir, url_prefix)
    Dir.entries(dir).map do |f|
      # Extract name of dumped collection and dump date
      matches = TORRENT_NAME.match(f)
      next if matches.nil?

      # Expects a format of yyyy-mm-dd
      date = Date.parse(matches[2])

      # Calculate original file size, in megabytes
      dump = File.join(dir, f.sub(/\.torrent\z/, ".tar.gz"))
      next unless File.file?(dump)
      size = File.size(dump) / 1024 / 1024

      Torrent.new(url_prefix + "/" + f, matches[1], size, date) if size > 0
    end.compact
  end
end
177
+
178
+ Indexer.run
179
+
180
+ # vim: set sta sts=2 shiftwidth=2 sw=2 et ai :