ghtorrent 0.10 → 0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/Gemfile.lock +12 -27
- data/README.md +20 -33
- data/Rakefile +1 -9
- data/bin/ght-log-analyzer +11 -6
- data/bin/ght-log-influx +190 -0
- data/bin/ght-queue-grep.rb +55 -0
- data/bin/ght-retrieve-users +6 -0
- data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
- data/lib/ghtorrent.rb +4 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
- data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
- data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
- data/lib/ghtorrent/api_client.rb +45 -119
- data/lib/ghtorrent/command.rb +25 -8
- data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
- data/lib/ghtorrent/commands/ght_load.rb +1 -2
- data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
- data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
- data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
- data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
- data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
- data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
- data/lib/ghtorrent/event_processing.rb +140 -0
- data/lib/ghtorrent/ghtorrent.rb +330 -396
- data/lib/ghtorrent/logging.rb +65 -12
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
- data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
- data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
- data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
- data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
- data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
- data/lib/ghtorrent/retriever.rb +100 -57
- data/lib/ghtorrent/settings.rb +14 -17
- data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
- data/lib/version.rb +1 -1
- metadata +14 -46
- data/bin/ght-process-event +0 -35
- data/lib/ghtorrent/cache.rb +0 -97
- data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
- data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
- data/spec/api_client_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -21
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 68b11841a8dfbd0418723fce0620a5d625b4cca1
|
|
4
|
+
data.tar.gz: d39f30596d257cfe5cb365e1365388169e50cc18
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6443373ff38703c8113c23716db591a1574eb7f2e36eba1ae68d78ce322b20e42f1c0c840c3a113bb870b7c8b5aae817cb07a60f46d6bccb07ed4058cabc23d2
|
|
7
|
+
data.tar.gz: cfad88e464fad602f38f6f4cfb7963a0b7ac28f5c205ddf954939937c398d513e004725ef70e920b682b06239c2adc6663c3a29f6eea90913aad3611c9c2a310
|
data/CHANGELOG
CHANGED
|
@@ -1,3 +1,19 @@
|
|
|
1
|
+
= Version 0.11
|
|
2
|
+
* Retrieve members by processing MemberEvents to counter API change
|
|
3
|
+
* Removed the request caching layer. Requests are cached in the persister
|
|
4
|
+
* Change default DB isolation to REPEATABLE READ for stronger isolation
|
|
5
|
+
* Finer-grained (commit level) transactions when processing forks
|
|
6
|
+
* More accurate and uniform logging
|
|
7
|
+
* Tool to push logs to InfluxDB for monitoring
|
|
8
|
+
* Drop ext_ref_id from all tables
|
|
9
|
+
* More efficient retrieval of events, 100 in one go
|
|
10
|
+
* Tool to retrieve all user details and support for marking users deleted
|
|
11
|
+
* Support for retrieving repo events when using ght-retrieve-repo
|
|
12
|
+
* Non-recursive retrieval of pull requests leads to 1/3 API calls
|
|
13
|
+
* Custom rate limits for Github API tokens
|
|
14
|
+
* Tooling for MySQL dumps in CSV files
|
|
15
|
+
* General bug fixes and cleanups
|
|
16
|
+
|
|
1
17
|
= Version 0.10
|
|
2
18
|
* Base class for multiprocess queue clients
|
|
3
19
|
* Make retrieval of pull request commits faster
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
ghtorrent (0.
|
|
4
|
+
ghtorrent (0.11)
|
|
5
5
|
bson_ext (~> 1.9, >= 1.9.0)
|
|
6
6
|
bunny (~> 1.0, >= 1.0.0)
|
|
7
7
|
mongo (~> 1.9, >= 1.9.0)
|
|
@@ -11,32 +11,16 @@ PATH
|
|
|
11
11
|
GEM
|
|
12
12
|
remote: https://rubygems.org/
|
|
13
13
|
specs:
|
|
14
|
-
addressable (2.3.5)
|
|
15
14
|
amq-protocol (1.9.2)
|
|
16
|
-
bson (1.
|
|
17
|
-
bson_ext (1.
|
|
18
|
-
bson (~> 1.
|
|
19
|
-
bunny (1.
|
|
15
|
+
bson (1.12.2)
|
|
16
|
+
bson_ext (1.12.2)
|
|
17
|
+
bson (~> 1.12.2)
|
|
18
|
+
bunny (1.3.1)
|
|
20
19
|
amq-protocol (>= 1.9.2)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
bson (~> 1.10.0)
|
|
26
|
-
rspec (2.14.1)
|
|
27
|
-
rspec-core (~> 2.14.0)
|
|
28
|
-
rspec-expectations (~> 2.14.0)
|
|
29
|
-
rspec-mocks (~> 2.14.0)
|
|
30
|
-
rspec-core (2.14.7)
|
|
31
|
-
rspec-expectations (2.14.4)
|
|
32
|
-
diff-lcs (>= 1.1.3, < 2.0)
|
|
33
|
-
rspec-mocks (2.14.4)
|
|
34
|
-
safe_yaml (0.9.7)
|
|
35
|
-
sequel (4.10.0)
|
|
36
|
-
trollop (2.0)
|
|
37
|
-
webmock (1.16.0)
|
|
38
|
-
addressable (>= 2.2.7)
|
|
39
|
-
crack (>= 0.3.2)
|
|
20
|
+
mongo (1.12.2)
|
|
21
|
+
bson (= 1.12.2)
|
|
22
|
+
sequel (4.23.0)
|
|
23
|
+
trollop (2.1.2)
|
|
40
24
|
|
|
41
25
|
PLATFORMS
|
|
42
26
|
ruby
|
|
@@ -44,5 +28,6 @@ PLATFORMS
|
|
|
44
28
|
DEPENDENCIES
|
|
45
29
|
ghtorrent!
|
|
46
30
|
jdbc-mysql
|
|
47
|
-
|
|
48
|
-
|
|
31
|
+
|
|
32
|
+
BUNDLED WITH
|
|
33
|
+
1.10.2
|
data/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# ghtorrent: Mirror and
|
|
1
|
+
# ghtorrent: Mirror and index data from the Github API
|
|
2
2
|
|
|
3
3
|
A library and a collection of scripts used to retrieve data from the Github API
|
|
4
4
|
and extract metadata in an SQL database, in a modular and scalable manner. The
|
|
@@ -10,8 +10,7 @@ GHTorrent can be used for a variety of purposes, such as:
|
|
|
10
10
|
* Mirror the Github API event stream and follow links from events to actual data
|
|
11
11
|
to gradually build a [Github index](http://ghtorrent.org/)
|
|
12
12
|
* Create a queriable metadata index for a specific repository
|
|
13
|
-
*
|
|
14
|
-
|
|
13
|
+
* Construct a data source for [extracting process analytics](http://www.gousios.gr/blog/ghtorrent-project-statistics/) (see for example [those](http://ghtorrent.org/pullreq-perf/)) for one or more repositories
|
|
15
14
|
|
|
16
15
|
## Components
|
|
17
16
|
|
|
@@ -19,8 +18,8 @@ GHTorrents components (which can be used individually) are:
|
|
|
19
18
|
|
|
20
19
|
* [APIClient](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/api_client.rb): Knows how to query the Github API (both single entities and
|
|
21
20
|
pages) and respect the API request limit. Can be configured to override the
|
|
22
|
-
default IP address, in case of multihomed hosts.
|
|
23
|
-
* [Retriever](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/retriever.rb): Knows how to retrieve specific Github entities (users, repositories, watchers) by name. Uses an optional persister to avoid
|
|
21
|
+
default IP address, in case of multihomed hosts.
|
|
22
|
+
* [Retriever](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/retriever.rb): Knows how to retrieve specific Github entities (users, repositories, watchers) by name. Uses an optional persister to avoid
|
|
24
23
|
retrieving data that have not changed.
|
|
25
24
|
* [Persister](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/persister.rb): A key/value store, which can be backed by a real key/value store,
|
|
26
25
|
to store Github JSON replies and query them on request. The backing key/value
|
|
@@ -28,25 +27,23 @@ store must support arbitrary queries to the stored JSON objects.
|
|
|
28
27
|
* [GHTorrent](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/ghtorrent.rb): Knows how to extract information from the data retrieved by
|
|
29
28
|
the retriever in order to update an SQL database (see [schema](http://ghtorrent.org/relational.html)) with metadata.
|
|
30
29
|
|
|
31
|
-
### Component Configuration
|
|
30
|
+
### Component Configuration
|
|
32
31
|
|
|
33
32
|
The Persister and GHTorrent components have configurable back ends:
|
|
34
33
|
|
|
35
34
|
* **Persister:** Either uses MongoDB > 2.0 (`mongo` driver) or no persister (`noop` driver)
|
|
36
|
-
* **GHTorrent:** GHTorrent is tested mainly with MySQL, but can theoretically be
|
|
37
|
-
used with any SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html). Your mileage may vary.
|
|
38
|
-
|
|
39
|
-
The distributed mirroring scripts also require RabbitMQ >= 2.8 or other
|
|
35
|
+
* **GHTorrent:** GHTorrent is tested mainly with MySQL and SQLite, but can theoretically be used with any SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html). Your mileage may vary.
|
|
40
36
|
|
|
37
|
+
For distributed mirroring you also need RabbitMQ >= 3
|
|
41
38
|
|
|
42
39
|
## Installation
|
|
43
40
|
|
|
44
41
|
|
|
45
42
|
### 1. Install GHTorrent
|
|
46
|
-
GHTorrent is written in Ruby (tested with
|
|
43
|
+
GHTorrent is written in Ruby (tested with 2.0). To install it as a Gem do:
|
|
47
44
|
|
|
48
45
|
<code>
|
|
49
|
-
sudo gem install ghtorrent
|
|
46
|
+
sudo gem install ghtorrent
|
|
50
47
|
</code>
|
|
51
48
|
|
|
52
49
|
|
|
@@ -56,14 +53,14 @@ Depending on which SQL database you want to use, install the appropriate
|
|
|
56
53
|
dependency gem.
|
|
57
54
|
|
|
58
55
|
<code>
|
|
59
|
-
sudo gem install mysql2 # or
|
|
56
|
+
sudo gem install mysql2 # or sqlite3
|
|
60
57
|
</code>
|
|
61
58
|
|
|
62
59
|
|
|
63
60
|
## Configuration
|
|
64
61
|
|
|
65
62
|
Copy [config.yaml.tmpl](https://github.com/gousiosg/github-mirror/blob/master/config.yaml.tmpl)
|
|
66
|
-
to a file in your home directory.
|
|
63
|
+
to a file in your home directory.
|
|
67
64
|
|
|
68
65
|
All provided scripts accept the `-c` option, which accepts the location of the configuration file as
|
|
69
66
|
a parameter.
|
|
@@ -74,7 +71,7 @@ to retrieve data in parallel on the [Wiki](https://github.com/gousiosg/github-mi
|
|
|
74
71
|
|
|
75
72
|
## Using GHTorrent
|
|
76
73
|
|
|
77
|
-
To mirror the event stream and capture all data:
|
|
74
|
+
To mirror the event stream and capture all data:
|
|
78
75
|
|
|
79
76
|
* `ght-mirror-events.rb` periodically polls Github's event
|
|
80
77
|
queue (`https://api.github.com/events`), stores all new events in the
|
|
@@ -85,7 +82,7 @@ RabbitMQ.
|
|
|
85
82
|
functions. The functions use the appropriate Github API call to retrieve the
|
|
86
83
|
linked contents, extract metadata (for database storage), and store the
|
|
87
84
|
retrieved data in the appropriate collection in the persister, to avoid
|
|
88
|
-
duplicate API calls.
|
|
85
|
+
duplicate API calls.
|
|
89
86
|
Data in the SQL database contain pointers (the `ext_ref_id` field) to the
|
|
90
87
|
"raw" data in the persister.
|
|
91
88
|
|
|
@@ -98,32 +95,29 @@ To perform maintenance:
|
|
|
98
95
|
|
|
99
96
|
* `ght-load` loads selected events from the persister to the queue in order for
|
|
100
97
|
the `ght-data-retrieval` script to reprocess them
|
|
101
|
-
* `ght-get-more-commits` retrieves all commits for a specific repository
|
|
102
|
-
|
|
103
98
|
|
|
104
|
-
### Data
|
|
99
|
+
### Data
|
|
105
100
|
|
|
106
|
-
|
|
107
|
-
[
|
|
101
|
+
The code in this repository is used to power the data collection process of
|
|
102
|
+
the [GHTorrent.org](http://ghtorrent.org/) project.
|
|
103
|
+
You can find all data collected by the project in the
|
|
104
|
+
[Downloads](https://ghtorrent.org/downloads.html) page.
|
|
108
105
|
|
|
109
106
|
There are two sets of data:
|
|
110
107
|
|
|
111
108
|
* **Raw events:** Github's [event stream](https://api.github.com/events). These
|
|
112
109
|
are the roots for mirroring operations. The `ght-data-retrieval` crawler starts
|
|
113
110
|
from an event and goes deep into the rabbit hole.
|
|
114
|
-
* **SQL dumps + Linked data:** Data dumps from the SQL database and the corresponding
|
|
115
|
-
MongoDB entities.
|
|
116
|
-
|
|
111
|
+
* **SQL dumps + Linked data:** Data dumps from the SQL database and the corresponding MongoDB entities.
|
|
117
112
|
|
|
118
113
|
## Bugs & Feature Requests
|
|
119
114
|
|
|
120
|
-
Please tell us about features you'd like or bugs you've discovered on our
|
|
115
|
+
Please tell us about features you'd like or bugs you've discovered on our
|
|
121
116
|
[Issue Tracker](https://github.com/gousiosg/github-mirror/issues).
|
|
122
117
|
|
|
123
118
|
Patches, bug fixes, etc are welcome. Please fork the repository and create
|
|
124
119
|
a pull request when done fixing/implementing the new feature.
|
|
125
120
|
|
|
126
|
-
|
|
127
121
|
## Citing GHTorrent in your Research
|
|
128
122
|
|
|
129
123
|
If you find GHTorrent and the accompanying datasets useful in your research,
|
|
@@ -131,18 +125,11 @@ please consider citing the following paper:
|
|
|
131
125
|
|
|
132
126
|
> Georgios Gousios and Diomidis Spinellis, "GHTorrent: GitHub’s data from a firehose," in _MSR '12: Proceedings of the 9th Working Conference on Mining Software Repositories_, June 2–3, 2012. Zurich, Switzerland.
|
|
133
127
|
|
|
134
|
-
See also the following presentation:
|
|
135
|
-
|
|
136
|
-
<iframe src="http://www.slideshare.net/slideshow/embed_code/13184524?rel=0" width="342" height="291" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" style="border:1px solid #CCC;border-width:1px 1px 0;margin-bottom:5px" allowfullscreen />
|
|
137
|
-
<div style="margin-bottom:5px"> <strong> <a href="http://www.slideshare.net/gousiosg/ghtorrent-githubs-data-from-a-firehose-13184524" title="GHTorrent: Github's Data from a Firehose" target="_blank">GHTorrent: Github's Data from a Firehose</a> </strong> </div>
|
|
138
|
-
|
|
139
|
-
|
|
140
128
|
## Authors
|
|
141
129
|
|
|
142
130
|
* [Georgios Gousios](http://istlab.dmst.aueb.gr/~george) <gousiosg@gmail.com>
|
|
143
131
|
* [Diomidis Spinellis](http://www.dmst.aueb.gr/dds) <dds@aueb.gr>
|
|
144
132
|
|
|
145
|
-
|
|
146
133
|
## License
|
|
147
134
|
|
|
148
135
|
[2-clause BSD](http://www.opensource.org/licenses/bsd-license.php)
|
data/Rakefile
CHANGED
|
@@ -1,15 +1,7 @@
|
|
|
1
1
|
require 'rake'
|
|
2
|
-
require 'rake/testtask'
|
|
3
2
|
require 'rake/rdoctask'
|
|
4
3
|
|
|
5
|
-
task :default => [:
|
|
6
|
-
|
|
7
|
-
desc "Run basic tests"
|
|
8
|
-
Rake::TestTask.new(:spec) do |t|
|
|
9
|
-
t.pattern = 'spec/*_test.rb'
|
|
10
|
-
t.verbose = true
|
|
11
|
-
t.warning = true
|
|
12
|
-
end
|
|
4
|
+
task :default => [:rdoc]
|
|
13
5
|
|
|
14
6
|
desc "Run Rdoc"
|
|
15
7
|
Rake::RDocTask.new(:rdoc) do |rd|
|
data/bin/ght-log-analyzer
CHANGED
|
@@ -9,7 +9,7 @@ Thread.new do
|
|
|
9
9
|
puts "Collecting data..."
|
|
10
10
|
while (true) do
|
|
11
11
|
sleep(1)
|
|
12
|
-
system
|
|
12
|
+
system 'clear' or system 'cls'
|
|
13
13
|
|
|
14
14
|
stats.each do |k,v|
|
|
15
15
|
unless v[:time_in].nil?
|
|
@@ -68,7 +68,7 @@ end
|
|
|
68
68
|
|
|
69
69
|
ARGF.each do |x|
|
|
70
70
|
|
|
71
|
-
next unless x =~ /
|
|
71
|
+
next unless x =~ /api_client.rb/
|
|
72
72
|
|
|
73
73
|
if x =~ /sleeping/
|
|
74
74
|
ts, pid, remaining = x.match(/\[([^.]+).*#([0-9]+)\].*for ([0-9]+).*/).captures
|
|
@@ -82,15 +82,20 @@ ARGF.each do |x|
|
|
|
82
82
|
end
|
|
83
83
|
|
|
84
84
|
elsif x =~ /Not Found|Gone|Conflict/
|
|
85
|
-
pid = x.match(/.*#([0-9]+).*
|
|
85
|
+
pid = x.match(/.*#([0-9]+).*api_client.rb.*/).captures[0]
|
|
86
86
|
if stats[pid][:not_found].nil?
|
|
87
87
|
stats[pid][:not_found] = 0
|
|
88
|
-
|
|
89
|
-
|
|
88
|
+
end
|
|
89
|
+
stats[pid][:not_found] += 1
|
|
90
|
+
elsif x =~ /Forbidden/
|
|
91
|
+
if stats[pid][:forbidden].nil?
|
|
92
|
+
stats[pid][:forbidden] = 0
|
|
93
|
+
end
|
|
94
|
+
stats[pid][:forbidden] += 1
|
|
90
95
|
else
|
|
91
96
|
begin
|
|
92
97
|
ts, pid, ip, url, remaining, time =
|
|
93
|
-
x.match(/.*\[([^.]+).*#([0-9]+)\].*
|
|
98
|
+
x.match(/.*\[([^.]+).*#([0-9]+)\].*api_client.rb: \[(.*)\].*(https:\/\/.*) \(([0-9]+) remaining\).* ([0-9]+) ms$/).captures
|
|
94
99
|
rescue
|
|
95
100
|
puts x
|
|
96
101
|
next
|
data/bin/ght-log-influx
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
|
|
4
|
+
require 'rubygems'
|
|
5
|
+
require 'trollop'
|
|
6
|
+
require 'influxdb'
|
|
7
|
+
require 'pp'
|
|
8
|
+
require 'time'
|
|
9
|
+
|
|
10
|
+
def parse_api_client_line(line)
|
|
11
|
+
if line.start_with?("Successful")
|
|
12
|
+
# Successful request. URL: https://api.github.com/repos/amizony/self-destructing-task-list/comments/11518274?per_page=100, Remaining: 3949, Total: 423 ms
|
|
13
|
+
remaining, elapsed = line.match(/.*Remaining: ([\d]+), Total: ([\d]+) ms/).captures
|
|
14
|
+
{
|
|
15
|
+
:outcome => "success",
|
|
16
|
+
:elapsed => elapsed.to_i,
|
|
17
|
+
:remaining => remaining.to_i
|
|
18
|
+
}
|
|
19
|
+
elsif line.start_with?("Failed")
|
|
20
|
+
# Failed request. URL: https://api.github.com/repos/mingliang7/hotel/commits?per_page=100, Status code: 409, Status: Conflict, Access: ghtorrent, IP: 0.0.0.0, Remaining: 3332
|
|
21
|
+
code, elapsed = line.match(/.*Status code: ([^,]+), .*Remaining: ([\d]+)/).captures
|
|
22
|
+
{
|
|
23
|
+
:outcome => "error",
|
|
24
|
+
:error_code => code.to_i,
|
|
25
|
+
:remaining => remaining.to_i
|
|
26
|
+
}
|
|
27
|
+
else
|
|
28
|
+
{}
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def parse_data_retrieval_line(line)
|
|
33
|
+
#Success processing event. Type: PushEvent, ID: 2863181313, Time: 967 ms
|
|
34
|
+
return {} unless line.start_with?("Success") or line.start_with?("Error")
|
|
35
|
+
outcome, evt_type, time = line.match(/([^\ ]+) processing event\. Type: ([\D]+)Event, .*, Time: ([\d]+) ms/).captures
|
|
36
|
+
|
|
37
|
+
{
|
|
38
|
+
:outcome => outcome.downcase,
|
|
39
|
+
:evt_type => evt_type,
|
|
40
|
+
:elapsed => time.to_i
|
|
41
|
+
}
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def parse_retriever_line(line)
|
|
45
|
+
|
|
46
|
+
if line.start_with?("Added")
|
|
47
|
+
# Added repo hiropong -> googlemaplesson
|
|
48
|
+
outcome = "success"
|
|
49
|
+
entity = line.split(/ /)[1]
|
|
50
|
+
elsif line.start_with?("Could not find")
|
|
51
|
+
# Could not find commit_comment 12106552. Deleted?
|
|
52
|
+
outcome = "failure"
|
|
53
|
+
entity = line.split(/ /)[3]
|
|
54
|
+
else
|
|
55
|
+
return {}
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
{
|
|
59
|
+
:outcome => outcome,
|
|
60
|
+
:entity => entity
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def parse_ghtorrent_line(line)
|
|
66
|
+
|
|
67
|
+
if line.start_with?("Added")
|
|
68
|
+
# Added user hayjohnny2000
|
|
69
|
+
# Added issue_event etsy/logster -> 1/etsy/logster -> 1/417355
|
|
70
|
+
outcome = "success"
|
|
71
|
+
entity = line.split(/ /)[1]
|
|
72
|
+
elsif line.start_with?("Could not retrieve")
|
|
73
|
+
# Could not retrieve commit_comment 12106552. Deleted?
|
|
74
|
+
outcome = "failure"
|
|
75
|
+
entity = line.split(/ /)[3]
|
|
76
|
+
else
|
|
77
|
+
return {}
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
{
|
|
81
|
+
:outcome => outcome,
|
|
82
|
+
:entity => entity
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def parse_log_line(line)
|
|
88
|
+
begin
|
|
89
|
+
severity, time, progname, stage, msg =
|
|
90
|
+
line.match(/([A-Z]+), (.+), (.+) -- ([^:]*?): (.*)/).captures
|
|
91
|
+
rescue
|
|
92
|
+
puts "Error parsing line: #{line}"
|
|
93
|
+
return {}
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
return {} if severity.downcase == 'debug'
|
|
97
|
+
stage = stage.split(/\./)[0]
|
|
98
|
+
data = {
|
|
99
|
+
:time => Time.iso8601(time).to_f,
|
|
100
|
+
:client => progname,
|
|
101
|
+
:severity => severity,
|
|
102
|
+
:stage => stage
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
return {} if msg.nil? or msg.length == 0
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
stage_specific =
|
|
109
|
+
begin
|
|
110
|
+
case stage
|
|
111
|
+
when 'api_client'
|
|
112
|
+
parse_api_client_line(msg)
|
|
113
|
+
when 'ght_data_retrieval'
|
|
114
|
+
parse_data_retrieval_line(msg)
|
|
115
|
+
when 'retriever'
|
|
116
|
+
parse_retriever_line(msg)
|
|
117
|
+
when 'ghtorrent'
|
|
118
|
+
parse_ghtorrent_line(msg)
|
|
119
|
+
else
|
|
120
|
+
{}
|
|
121
|
+
end
|
|
122
|
+
rescue
|
|
123
|
+
puts "Error parsing line: #{msg}"
|
|
124
|
+
{}
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
return {} if stage_specific.empty?
|
|
128
|
+
data.merge(stage_specific)
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
opts = Trollop::options do
|
|
132
|
+
banner <<-END
|
|
133
|
+
Store GHTorrent log output to InfluxDB. By default reads from STDIN.
|
|
134
|
+
Can be configured to watch files in directories.
|
|
135
|
+
|
|
136
|
+
Options:
|
|
137
|
+
END
|
|
138
|
+
|
|
139
|
+
opt :watch, "Use watch mode", :sort => 'w'
|
|
140
|
+
opt :watch_pattern, "Pattern for files to watch",
|
|
141
|
+
:short => 'p', :default => '*.log'
|
|
142
|
+
|
|
143
|
+
opt :db_server, "InfluxDB server to use", :type => String,
|
|
144
|
+
:short => 's', :default => '127.0.0.1'
|
|
145
|
+
opt :database, "InfluxDB database to use", :type => String,
|
|
146
|
+
:short => 'd', :default => 'ghtorrent'
|
|
147
|
+
opt :db_uname, "Username for the Influx database", :type => String,
|
|
148
|
+
:short => 'u', :default => 'ghtorrent'
|
|
149
|
+
opt :db_passwd, "Password for the Influx database", :type => String,
|
|
150
|
+
:short => 'x', :default => ''
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
unless opts[:db_passwd_given].nil?
|
|
154
|
+
influx = InfluxDB::Client.new(opts[:database],
|
|
155
|
+
:host => opts[:db_server],
|
|
156
|
+
:username => opts[:db_uname],
|
|
157
|
+
:password => opts[:db_passwd])
|
|
158
|
+
else
|
|
159
|
+
influx = InfluxDB::Client.new("ghtorrent",
|
|
160
|
+
:host => opts[:db_server])
|
|
161
|
+
end
|
|
162
|
+
influx.get_database_list
|
|
163
|
+
|
|
164
|
+
if opts[:watch]
|
|
165
|
+
require 'filewatch/tail'
|
|
166
|
+
|
|
167
|
+
t = FileWatch::Tail.new
|
|
168
|
+
t.tail(opts[:watch_pattern])
|
|
169
|
+
|
|
170
|
+
t.subscribe do |path, line|
|
|
171
|
+
p = parse_log_line(line)
|
|
172
|
+
next if p.empty?
|
|
173
|
+
|
|
174
|
+
pp p
|
|
175
|
+
#influx.write_point(p[:stage], p)
|
|
176
|
+
end
|
|
177
|
+
else
|
|
178
|
+
puts "Reading from STDIN..."
|
|
179
|
+
ARGF.each do |line|
|
|
180
|
+
next if line !~ /^[IDEW]/
|
|
181
|
+
|
|
182
|
+
begin
|
|
183
|
+
p = parse_log_line(line)
|
|
184
|
+
next if p.empty?
|
|
185
|
+
pp p
|
|
186
|
+
influx.write_point(p[:stage], p)
|
|
187
|
+
rescue
|
|
188
|
+
end
|
|
189
|
+
end
|
|
190
|
+
end
|