ghtorrent 0.10 → 0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +16 -0
- data/Gemfile.lock +12 -27
- data/README.md +20 -33
- data/Rakefile +1 -9
- data/bin/ght-log-analyzer +11 -6
- data/bin/ght-log-influx +190 -0
- data/bin/ght-queue-grep.rb +55 -0
- data/bin/ght-retrieve-users +6 -0
- data/bin/{ght-rm-dupl → ght-update-repo} +1 -1
- data/lib/ghtorrent.rb +4 -4
- data/lib/ghtorrent/adapters/base_adapter.rb +4 -11
- data/lib/ghtorrent/adapters/mongo_persister.rb +5 -9
- data/lib/ghtorrent/adapters/noop_persister.rb +0 -5
- data/lib/ghtorrent/api_client.rb +45 -119
- data/lib/ghtorrent/command.rb +25 -8
- data/lib/ghtorrent/commands/full_user_retriever.rb +50 -0
- data/lib/ghtorrent/commands/ght_data_retrieval.rb +12 -98
- data/lib/ghtorrent/commands/ght_get_more_commits.rb +13 -17
- data/lib/ghtorrent/commands/ght_load.rb +1 -2
- data/lib/ghtorrent/commands/ght_mirror_events.rb +8 -12
- data/lib/ghtorrent/commands/ght_retrieve_dependents.rb +0 -5
- data/lib/ghtorrent/commands/ght_retrieve_one.rb +1 -6
- data/lib/ghtorrent/commands/ght_retrieve_repo.rb +56 -26
- data/lib/ghtorrent/commands/ght_retrieve_repos.rb +5 -15
- data/lib/ghtorrent/commands/ght_retrieve_user.rb +13 -54
- data/lib/ghtorrent/commands/ght_retrieve_users.rb +49 -0
- data/lib/ghtorrent/commands/ght_update_repo.rb +126 -0
- data/lib/ghtorrent/event_processing.rb +140 -0
- data/lib/ghtorrent/ghtorrent.rb +330 -396
- data/lib/ghtorrent/logging.rb +65 -12
- data/lib/ghtorrent/migrations/014_add_deleted_to_projects.rb +1 -1
- data/lib/ghtorrent/migrations/019_add_fake_to_users.rb +1 -1
- data/lib/ghtorrent/migrations/020_add_deleted_to_users.rb +19 -0
- data/lib/ghtorrent/migrations/021_remove_ext_ref_id.rb +42 -0
- data/lib/ghtorrent/migrations/022_add_project_languages.rb +24 -0
- data/lib/ghtorrent/multiprocess_queue_client.rb +25 -5
- data/lib/ghtorrent/retriever.rb +100 -57
- data/lib/ghtorrent/settings.rb +14 -17
- data/lib/ghtorrent/{transacted_ghtorrent.rb → transacted_gh_torrent.rb} +28 -5
- data/lib/version.rb +1 -1
- metadata +14 -46
- data/bin/ght-process-event +0 -35
- data/lib/ghtorrent/cache.rb +0 -97
- data/lib/ghtorrent/commands/ght_rm_dupl.rb +0 -132
- data/lib/ghtorrent/gh_torrent_exception.rb +0 -6
- data/spec/api_client_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 68b11841a8dfbd0418723fce0620a5d625b4cca1
|
4
|
+
data.tar.gz: d39f30596d257cfe5cb365e1365388169e50cc18
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6443373ff38703c8113c23716db591a1574eb7f2e36eba1ae68d78ce322b20e42f1c0c840c3a113bb870b7c8b5aae817cb07a60f46d6bccb07ed4058cabc23d2
|
7
|
+
data.tar.gz: cfad88e464fad602f38f6f4cfb7963a0b7ac28f5c205ddf954939937c398d513e004725ef70e920b682b06239c2adc6663c3a29f6eea90913aad3611c9c2a310
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
= Version 0.11
|
2
|
+
* Retrieve members by processing MemberEvents to counter API change
|
3
|
+
* Removed the request caching layer. Requests are cached in the persister
|
4
|
+
* Change default DB isolation to REPEATABLE READ for stronger isolation
|
5
|
+
* Finer-grained (commit level) transactions when processing forks
|
6
|
+
* More accurate and uniform logging
|
7
|
+
* Tool to push logs to InfluxDB for monitoring
|
8
|
+
* Drop ext_ref_id from all tables
|
9
|
+
* More efficient retrieval of events, 100 in one go
|
10
|
+
* Tool to retrieve all user details and support for marking users deleted
|
11
|
+
* Support for retrieving repo events when using ght-retrieve-repo
|
12
|
+
* Non-recursive retrieval of pull requests leads to 1/3 API calls
|
13
|
+
* Custom rate limits for Github API tokens
|
14
|
+
* Tooling for MySQL dumps in CSV files
|
15
|
+
* General bug fixes and cleanups
|
16
|
+
|
1
17
|
= Version 0.10
|
2
18
|
* Base class for multiprocess queue clients
|
3
19
|
* Make retrieval of pull request commits faster
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
ghtorrent (0.
|
4
|
+
ghtorrent (0.11)
|
5
5
|
bson_ext (~> 1.9, >= 1.9.0)
|
6
6
|
bunny (~> 1.0, >= 1.0.0)
|
7
7
|
mongo (~> 1.9, >= 1.9.0)
|
@@ -11,32 +11,16 @@ PATH
|
|
11
11
|
GEM
|
12
12
|
remote: https://rubygems.org/
|
13
13
|
specs:
|
14
|
-
addressable (2.3.5)
|
15
14
|
amq-protocol (1.9.2)
|
16
|
-
bson (1.
|
17
|
-
bson_ext (1.
|
18
|
-
bson (~> 1.
|
19
|
-
bunny (1.
|
15
|
+
bson (1.12.2)
|
16
|
+
bson_ext (1.12.2)
|
17
|
+
bson (~> 1.12.2)
|
18
|
+
bunny (1.3.1)
|
20
19
|
amq-protocol (>= 1.9.2)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
bson (~> 1.10.0)
|
26
|
-
rspec (2.14.1)
|
27
|
-
rspec-core (~> 2.14.0)
|
28
|
-
rspec-expectations (~> 2.14.0)
|
29
|
-
rspec-mocks (~> 2.14.0)
|
30
|
-
rspec-core (2.14.7)
|
31
|
-
rspec-expectations (2.14.4)
|
32
|
-
diff-lcs (>= 1.1.3, < 2.0)
|
33
|
-
rspec-mocks (2.14.4)
|
34
|
-
safe_yaml (0.9.7)
|
35
|
-
sequel (4.10.0)
|
36
|
-
trollop (2.0)
|
37
|
-
webmock (1.16.0)
|
38
|
-
addressable (>= 2.2.7)
|
39
|
-
crack (>= 0.3.2)
|
20
|
+
mongo (1.12.2)
|
21
|
+
bson (= 1.12.2)
|
22
|
+
sequel (4.23.0)
|
23
|
+
trollop (2.1.2)
|
40
24
|
|
41
25
|
PLATFORMS
|
42
26
|
ruby
|
@@ -44,5 +28,6 @@ PLATFORMS
|
|
44
28
|
DEPENDENCIES
|
45
29
|
ghtorrent!
|
46
30
|
jdbc-mysql
|
47
|
-
|
48
|
-
|
31
|
+
|
32
|
+
BUNDLED WITH
|
33
|
+
1.10.2
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# ghtorrent: Mirror and
|
1
|
+
# ghtorrent: Mirror and index data from the Github API
|
2
2
|
|
3
3
|
A library and a collection of scripts used to retrieve data from the Github API
|
4
4
|
and extract metadata in an SQL database, in a modular and scalable manner. The
|
@@ -10,8 +10,7 @@ GHTorrent can be used for a variety of purposes, such as:
|
|
10
10
|
* Mirror the Github API event stream and follow links from events to actual data
|
11
11
|
to gradually build a [Github index](http://ghtorrent.org/)
|
12
12
|
* Create a queriable metadata index for a specific repository
|
13
|
-
*
|
14
|
-
|
13
|
+
* Construct a data source for [extracting process analytics](http://www.gousios.gr/blog/ghtorrent-project-statistics/) (see for example [those](http://ghtorrent.org/pullreq-perf/)) for one or more repositories
|
15
14
|
|
16
15
|
## Components
|
17
16
|
|
@@ -19,8 +18,8 @@ GHTorrents components (which can be used individually) are:
|
|
19
18
|
|
20
19
|
* [APIClient](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/api_client.rb): Knows how to query the Github API (both single entities and
|
21
20
|
pages) and respect the API request limit. Can be configured to override the
|
22
|
-
default IP address, in case of multihomed hosts.
|
23
|
-
* [Retriever](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/retriever.rb): Knows how to retrieve specific Github entities (users, repositories, watchers) by name. Uses an optional persister to avoid
|
21
|
+
default IP address, in case of multihomed hosts.
|
22
|
+
* [Retriever](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/retriever.rb): Knows how to retrieve specific Github entities (users, repositories, watchers) by name. Uses an optional persister to avoid
|
24
23
|
retrieving data that have not changed.
|
25
24
|
* [Persister](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/persister.rb): A key/value store, which can be backed by a real key/value store,
|
26
25
|
to store Github JSON replies and query them on request. The backing key/value
|
@@ -28,25 +27,23 @@ store must support arbitrary queries to the stored JSON objects.
|
|
28
27
|
* [GHTorrent](https://github.com/gousiosg/github-mirror/blob/master/lib/ghtorrent/ghtorrent.rb): Knows how to extract information from the data retrieved by
|
29
28
|
the retriever in order to update an SQL database (see [schema](http://ghtorrent.org/relational.html)) with metadata.
|
30
29
|
|
31
|
-
### Component Configuration
|
30
|
+
### Component Configuration
|
32
31
|
|
33
32
|
The Persister and GHTorrent components have configurable back ends:
|
34
33
|
|
35
34
|
* **Persister:** Either uses MongoDB > 2.0 (`mongo` driver) or no persister (`noop` driver)
|
36
|
-
* **GHTorrent:** GHTorrent is tested mainly with MySQL, but can theoretically be
|
37
|
-
used with any SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html). Your milaege may vary.
|
38
|
-
|
39
|
-
The distributed mirroring scripts also require RabbitMQ >= 2.8 or other
|
35
|
+
* **GHTorrent:** GHTorrent is tested mainly with MySQL and SQLite, but can theoretically be used with any SQL database compatible with [Sequel](http://sequel.rubyforge.org/rdoc/files/doc/opening_databases_rdoc.html). Your mileage may vary.
|
40
36
|
|
37
|
+
For distributed mirroring you also need RabbitMQ >= 3
|
41
38
|
|
42
39
|
## Installation
|
43
40
|
|
44
41
|
|
45
42
|
### 1. Install GHTorrent
|
46
|
-
GHTorrent is written in Ruby (tested with
|
43
|
+
GHTorrent is written in Ruby (tested with 2.0). To install it as a Gem do:
|
47
44
|
|
48
45
|
<code>
|
49
|
-
sudo gem install ghtorrent
|
46
|
+
sudo gem install ghtorrent
|
50
47
|
</code>
|
51
48
|
|
52
49
|
|
@@ -56,14 +53,14 @@ Depending on which SQL database you want to use, install the appropriate
|
|
56
53
|
dependency gem.
|
57
54
|
|
58
55
|
<code>
|
59
|
-
sudo gem install mysql2 # or
|
56
|
+
sudo gem install mysql2 # or sqlite3
|
60
57
|
</code>
|
61
58
|
|
62
59
|
|
63
60
|
## Configuration
|
64
61
|
|
65
62
|
Copy [config.yaml.tmpl](https://github.com/gousiosg/github-mirror/blob/master/config.yaml.tmpl)
|
66
|
-
to a file in your home directory.
|
63
|
+
to a file in your home directory.
|
67
64
|
|
68
65
|
All provided scripts accept the `-c` option, which accepts the location of the configuration file as
|
69
66
|
a parameter.
|
@@ -74,7 +71,7 @@ to retrieve data in parallel on the [Wiki](https://github.com/gousiosg/github-mi
|
|
74
71
|
|
75
72
|
## Using GHTorrent
|
76
73
|
|
77
|
-
To mirror the event stream and capture all data:
|
74
|
+
To mirror the event stream and capture all data:
|
78
75
|
|
79
76
|
* `ght-mirror-events.rb` periodically polls Github's event
|
80
77
|
queue (`https://api.github.com/events`), stores all new events in the
|
@@ -85,7 +82,7 @@ RabbitMQ.
|
|
85
82
|
functions. The functions use the appropriate Github API call to retrieve the
|
86
83
|
linked contents, extract metadata (for database storage), and store the
|
87
84
|
retrieved data in the appropriate collection in the persister, to avoid
|
88
|
-
duplicate API calls.
|
85
|
+
duplicate API calls.
|
89
86
|
Data in the SQL database contain pointers (the `ext_ref_id` field) to the
|
90
87
|
"raw" data in the persister.
|
91
88
|
|
@@ -98,32 +95,29 @@ To perform maintenance:
|
|
98
95
|
|
99
96
|
* `ght-load` loads selected events from the persister to the queue in order for
|
100
97
|
the `ght-data-retrieval` script to reprocess them
|
101
|
-
* `ght-get-more-commits` retrieves all commits for a specific repository
|
102
|
-
|
103
98
|
|
104
|
-
### Data
|
99
|
+
### Data
|
105
100
|
|
106
|
-
|
107
|
-
[
|
101
|
+
The code in this repository is used to power the data collection process of
|
102
|
+
the [GHTorrent.org](http://ghtorrent.org/) project.
|
103
|
+
You can find all data collected by the project in the
|
104
|
+
[Downloads](https://ghtorrent.org/downloads.html) page.
|
108
105
|
|
109
106
|
There are two sets of data:
|
110
107
|
|
111
108
|
* **Raw events:** Github's [event stream](https://api.github.com/events). These
|
112
109
|
are the roots for mirroring operations. The `ght-data-retrieval` crawler starts
|
113
110
|
from an event and goes deep into the rabbit hole.
|
114
|
-
* **SQL dumps + Linked data:** Data dumps from the SQL database and the corresponding
|
115
|
-
MongoDB entities.
|
116
|
-
|
111
|
+
* **SQL dumps + Linked data:** Data dumps from the SQL database and the corresponding MongoDB entities.
|
117
112
|
|
118
113
|
## Bugs & Feature Requests
|
119
114
|
|
120
|
-
Please tell us about features you'd like or bugs you've discovered on our
|
115
|
+
Please tell us about features you'd like or bugs you've discovered on our
|
121
116
|
[Issue Tracker](https://github.com/gousiosg/github-mirror/issues).
|
122
117
|
|
123
118
|
Patches, bug fixes, etc are welcome. Please fork the repository and create
|
124
119
|
a pull request when done fixing/implementing the new feature.
|
125
120
|
|
126
|
-
|
127
121
|
## Citing GHTorrent in your Research
|
128
122
|
|
129
123
|
If you find GHTorrent and the accompanying datasets useful in your research,
|
@@ -131,18 +125,11 @@ please consider citing the following paper:
|
|
131
125
|
|
132
126
|
> Georgios Gousios and Diomidis Spinellis, "GHTorrent: GitHub’s data from a firehose," in _MSR '12: Proceedings of the 9th Working Conference on Mining Software Repositories_, June 2–3, 2012. Zurich, Switzerland.
|
133
127
|
|
134
|
-
See also the following presentation:
|
135
|
-
|
136
|
-
<iframe src="http://www.slideshare.net/slideshow/embed_code/13184524?rel=0" width="342" height="291" frameborder="0" marginwidth="0" marginheight="0" scrolling="no" style="border:1px solid #CCC;border-width:1px 1px 0;margin-bottom:5px" allowfullscreen />
|
137
|
-
<div style="margin-bottom:5px"> <strong> <a href="http://www.slideshare.net/gousiosg/ghtorrent-githubs-data-from-a-firehose-13184524" title="GHTorrent: Github's Data from a Firehose" target="_blank">GHTorrent: Github's Data from a Firehose</a> </strong> </div>
|
138
|
-
|
139
|
-
|
140
128
|
## Authors
|
141
129
|
|
142
130
|
* [Georgios Gousios](http://istlab.dmst.aueb.gr/~george) <gousiosg@gmail.com>
|
143
131
|
* [Diomidis Spinellis](http://www.dmst.aueb.gr/dds) <dds@aueb.gr>
|
144
132
|
|
145
|
-
|
146
133
|
## License
|
147
134
|
|
148
135
|
[2-clause BSD](http://www.opensource.org/licenses/bsd-license.php)
|
data/Rakefile
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
require 'rake'
|
2
|
-
require 'rake/testtask'
|
3
2
|
require 'rake/rdoctask'
|
4
3
|
|
5
|
-
task :default => [:
|
6
|
-
|
7
|
-
desc "Run basic tests"
|
8
|
-
Rake::TestTask.new(:spec) do |t|
|
9
|
-
t.pattern = 'spec/*_test.rb'
|
10
|
-
t.verbose = true
|
11
|
-
t.warning = true
|
12
|
-
end
|
4
|
+
task :default => [:rdoc]
|
13
5
|
|
14
6
|
desc "Run Rdoc"
|
15
7
|
Rake::RDocTask.new(:rdoc) do |rd|
|
data/bin/ght-log-analyzer
CHANGED
@@ -9,7 +9,7 @@ Thread.new do
|
|
9
9
|
puts "Collecting data..."
|
10
10
|
while (true) do
|
11
11
|
sleep(1)
|
12
|
-
system
|
12
|
+
system 'clear' or system 'cls'
|
13
13
|
|
14
14
|
stats.each do |k,v|
|
15
15
|
unless v[:time_in].nil?
|
@@ -68,7 +68,7 @@ end
|
|
68
68
|
|
69
69
|
ARGF.each do |x|
|
70
70
|
|
71
|
-
next unless x =~ /
|
71
|
+
next unless x =~ /api_client.rb/
|
72
72
|
|
73
73
|
if x =~ /sleeping/
|
74
74
|
ts, pid, remaining = x.match(/\[([^.]+).*#([0-9]+)\].*for ([0-9]+).*/).captures
|
@@ -82,15 +82,20 @@ ARGF.each do |x|
|
|
82
82
|
end
|
83
83
|
|
84
84
|
elsif x =~ /Not Found|Gone|Conflict/
|
85
|
-
pid = x.match(/.*#([0-9]+).*
|
85
|
+
pid = x.match(/.*#([0-9]+).*api_client.rb.*/).captures[0]
|
86
86
|
if stats[pid][:not_found].nil?
|
87
87
|
stats[pid][:not_found] = 0
|
88
|
-
|
89
|
-
|
88
|
+
end
|
89
|
+
stats[pid][:not_found] += 1
|
90
|
+
elsif x =~ /Forbidden/
|
91
|
+
if stats[pid][:forbidden].nil?
|
92
|
+
stats[pid][:forbidden] = 0
|
93
|
+
end
|
94
|
+
stats[pid][:forbidden] += 1
|
90
95
|
else
|
91
96
|
begin
|
92
97
|
ts, pid, ip, url, remaining, time =
|
93
|
-
x.match(/.*\[([^.]+).*#([0-9]+)\].*
|
98
|
+
x.match(/.*\[([^.]+).*#([0-9]+)\].*api_client.rb: \[(.*)\].*(https:\/\/.*) \(([0-9]+) remaining\).* ([0-9]+) ms$/).captures
|
94
99
|
rescue
|
95
100
|
puts x
|
96
101
|
next
|
data/bin/ght-log-influx
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'trollop'
|
6
|
+
require 'influxdb'
|
7
|
+
require 'pp'
|
8
|
+
require 'time'
|
9
|
+
|
10
|
+
# Extracts request metrics from the message part of an api_client log line.
#
# line - the log message (the text after the "stage: " prefix).
#
# Returns a Hash:
#   * messages starting with "Successful":
#       { :outcome => "success", :elapsed => <ms>, :remaining => <requests left> }
#   * messages starting with "Failed":
#       { :outcome => "error", :error_code => <HTTP status>, :remaining => <requests left> }
#   * anything else: {}
#
# Raises NoMethodError when a "Successful"/"Failed" message does not contain
# the expected fields (the regexp does not match, so +captures+ is called on
# nil).
def parse_api_client_line(line)
  if line.start_with?("Successful")
    # Successful request. URL: https://api.github.com/repos/amizony/self-destructing-task-list/comments/11518274?per_page=100, Remaining: 3949, Total: 423 ms
    remaining, elapsed = line.match(/.*Remaining: ([\d]+), Total: ([\d]+) ms/).captures
    {
      :outcome => "success",
      :elapsed => elapsed.to_i,
      :remaining => remaining.to_i
    }
  elsif line.start_with?("Failed")
    # Failed request. URL: https://api.github.com/repos/mingliang7/hotel/commits?per_page=100, Status code: 409, Status: Conflict, Access: ghtorrent, IP: 0.0.0.0, Remaining: 3332
    # The second capture is the remaining request quota. It used to be bound
    # to a variable named `elapsed`, which left `remaining` nil and made the
    # reported :remaining always 0 for failed requests.
    code, remaining = line.match(/.*Status code: ([^,]+), .*Remaining: ([\d]+)/).captures
    {
      :outcome => "error",
      :error_code => code.to_i,
      :remaining => remaining.to_i
    }
  else
    {}
  end
end
|
31
|
+
|
32
|
+
# Extracts event-processing metrics from a ght_data_retrieval log message.
#
# Example message:
#   Success processing event. Type: PushEvent, ID: 2863181313, Time: 967 ms
#
# line - the log message (the text after the "stage: " prefix).
#
# Returns {} unless the message starts with "Success" or "Error"; otherwise
# { :outcome => <first word, downcased>, :evt_type => <event type without the
# "Event" suffix>, :elapsed => <ms> }.
#
# Raises NoMethodError when such a message does not have the expected layout
# (the regexp does not match, so +captures+ is called on nil).
def parse_data_retrieval_line(line)
  return {} unless line.start_with?("Success", "Error")

  fields = line.match(/([^\ ]+) processing event\. Type: ([\D]+)Event, .*, Time: ([\d]+) ms/).captures

  {
    :outcome => fields[0].downcase,
    :evt_type => fields[1],
    :elapsed => fields[2].to_i
  }
end
|
43
|
+
|
44
|
+
# Classifies a retriever log message as a successful or failed retrieval.
#
# Example messages:
#   Added repo hiropong -> googlemaplesson
#   Could not find commit_comment 12106552. Deleted?
#
# line - the log message (the text after the "stage: " prefix).
#
# Returns { :outcome => "success" | "failure", :entity => <entity kind> } for
# the two recognised message shapes, {} for everything else. The entity is
# the single-space-separated word at index 1 ("Added ...") or index 3
# ("Could not find ..."); it is nil when the message has too few words.
def parse_retriever_line(line)
  if line.start_with?("Added")
    return { :outcome => "success", :entity => line.split(/ /)[1] }
  end

  if line.start_with?("Could not find")
    return { :outcome => "failure", :entity => line.split(/ /)[3] }
  end

  {}
end
|
64
|
+
|
65
|
+
# Classifies a ghtorrent log message as a successful or failed addition.
#
# Example messages:
#   Added user hayjohnny2000
#   Added issue_event etsy/logster -> 1/etsy/logster -> 1/417355
#   Could not retrieve commit_comment 12106552. Deleted?
#
# line - the log message (the text after the "stage: " prefix).
#
# Returns { :outcome => "success" | "failure", :entity => <entity kind> } for
# the recognised message shapes, {} for everything else. The entity is the
# single-space-separated word at index 1 ("Added ...") or index 3
# ("Could not retrieve ..."); it is nil when the message has too few words.
def parse_ghtorrent_line(line)
  tokens = line.split(/ /)

  case
  when line.start_with?("Added")
    { :outcome => "success", :entity => tokens[1] }
  when line.start_with?("Could not retrieve")
    { :outcome => "failure", :entity => tokens[3] }
  else
    {}
  end
end
|
86
|
+
|
87
|
+
# Converts one raw log line into a Hash of fields.
#
# The line is expected to look like
#   SEVERITY, <ISO 8601 time>, <program name> -- <stage>: <message>
# Everything after the first "." in the stage is dropped, and the stage picks
# the message parser ('api_client', 'ght_data_retrieval', 'retriever' or
# 'ghtorrent').
#
# line - a single raw log line.
#
# Returns {} when the line cannot be split into its five parts (an error is
# printed), when the severity is debug, when the message is empty, when the
# stage has no parser, or when the stage parser yields nothing (an error is
# printed if it raises). Otherwise returns the common fields (:time as float
# seconds, :client, :severity, :stage) merged with the stage-specific ones.
#
# Raises whatever Time.iso8601 raises when the time part is not valid
# ISO 8601 (that call is not guarded).
def parse_log_line(line)
  captured =
    begin
      line.match(/([A-Z]+), (.+), (.+) -- ([^:]*?): (.*)/).captures
    rescue
      puts "Error parsing line: #{line}"
      return {}
    end
  severity, timestamp, progname, raw_stage, msg = captured

  return {} if severity.downcase == 'debug'

  stage = raw_stage.split(/\./)[0]

  # Built before the empty-message check so that a bad timestamp raises
  # regardless of the message content.
  common = {
    :time => Time.iso8601(timestamp).to_f,
    :client => progname,
    :severity => severity,
    :stage => stage
  }

  return {} if msg.nil? || msg.empty?

  # Stage name -> name of the method that understands its messages.
  parser = {
    'api_client' => :parse_api_client_line,
    'ght_data_retrieval' => :parse_data_retrieval_line,
    'retriever' => :parse_retriever_line,
    'ghtorrent' => :parse_ghtorrent_line
  }[stage]

  details =
    begin
      parser.nil? ? {} : send(parser, msg)
    rescue
      puts "Error parsing line: #{msg}"
      {}
    end

  return {} if details.empty?

  common.merge(details)
end
|
130
|
+
|
131
|
+
# Script entry point: parse the command line, connect to InfluxDB and feed it
# with data points parsed from GHTorrent log lines (see parse_log_line).
opts = Trollop::options do
  banner <<-END
Store GHTorrent log output to InfluxDB. By default reads from STDIN.
Can be configured to watch files in directories.

Options:
  END

  # Was `:sort => 'w'`, a typo for `:short`.
  opt :watch, "Use watch mode", :short => 'w'
  opt :watch_pattern, "Pattern for files to watch",
      :short => 'p', :default => '*.log'

  opt :db_server, "InfluxDB server to use", :type => String,
      :short => 's', :default => '127.0.0.1'
  opt :database, "InfluxDB database to use", :type => String,
      :short => 'd', :default => 'ghtorrent'
  opt :db_uname, "Username for the Influx database", :type => String,
      :short => 'u', :default => 'ghtorrent'
  opt :db_passwd, "Password for the Influx database", :type => String,
      :short => 'x', :default => ''
end

# Credentials are only sent when a password was given on the command line.
# Both branches honour --database; the credential-less branch used to
# hard-code "ghtorrent" (which is also the option's default, so default
# invocations behave as before).
influx =
  if opts[:db_passwd_given].nil?
    InfluxDB::Client.new(opts[:database],
                         :host => opts[:db_server])
  else
    InfluxDB::Client.new(opts[:database],
                         :host => opts[:db_server],
                         :username => opts[:db_uname],
                         :password => opts[:db_passwd])
  end

# NOTE(review): the result is discarded; presumably this call is only here to
# fail early when the server is unreachable -- confirm.
influx.get_database_list

if opts[:watch]
  # Loaded lazily so that STDIN mode does not need the filewatch gem.
  require 'filewatch/tail'

  tail = FileWatch::Tail.new
  tail.tail(opts[:watch_pattern])

  tail.subscribe do |path, line|
    point = parse_log_line(line)
    next if point.empty?

    pp point
    # NOTE(review): in watch mode points are only printed; the write below is
    # disabled in the original script -- confirm whether that is intentional.
    #influx.write_point(point[:stage], point)
  end
else
  puts "Reading from STDIN..."
  ARGF.each do |line|
    # Only lines that start with a severity initial (I, D, E, W) are parsed.
    next if line !~ /^[IDEW]/

    begin
      point = parse_log_line(line)
      next if point.empty?
      pp point
      influx.write_point(point[:stage], point)
    rescue => e
      # Stay best-effort (one bad line or failed write must not stop the
      # import), but report the failure instead of swallowing it silently.
      warn "Could not store log line: #{e.message}"
    end
  end
end
|