github_archive_parser 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/github_archive_parser +2 -4
- data/lib/github_archive_parser.rb +24 -56
- data/lib/github_archive_parser/XCust.rb +6 -1
- data/lib/github_archive_parser/initialize.rb +3 -0
- data/lib/github_archive_parser/processor.rb +74 -0
- data/lib/github_archive_parser/utilities.rb +22 -0
- data/lib/github_archive_parser/version.rb +1 -1
- data/spec/processor_spec.rb +95 -0
- data/spec/utilities_spec.rb +48 -0
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cc577ab9b44ecf0384854bcabef6d03407beb841
|
4
|
+
data.tar.gz: 3e0af6d7aab095936d5865bc9f9b538cc6e60cd0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 253ce14d775c224434c7a498db8545ac9e4e6fd2be77cf8ed429280fabfb923b25c87a2a60787c279f80903a13e520342664859f92d14f4aa2e825a5912ed874
|
7
|
+
data.tar.gz: 5a479e595bf17b801e89b0af6293383161817585b6b35949e7aa327c16a666a64f5652c0761419ed438928b0e4279d702f31a916d5cfac9557d21c727f1dff0c
|
data/bin/github_archive_parser
CHANGED
@@ -3,7 +3,5 @@
|
|
3
3
|
$:.unshift File.expand_path("../../lib", __FILE__)
|
4
4
|
require 'github_archive_parser'
|
5
5
|
|
6
|
-
github_archive_parser = GitHubArchiveParser::
|
7
|
-
ARGV
|
8
|
-
github_archive_parser.process(url)
|
9
|
-
end
|
6
|
+
github_archive_parser = GitHubArchiveParser::CLI.new
|
7
|
+
github_archive_parser.process(ARGV)
|
@@ -1,42 +1,37 @@
|
|
1
1
|
require 'github_archive_parser/initialize'
|
2
2
|
|
3
3
|
module GitHubArchiveParser
|
4
|
-
class
|
5
|
-
|
4
|
+
class CLI
|
5
|
+
attr_accessor :options
|
6
6
|
|
7
7
|
def initialize
|
8
8
|
@options = OpenStruct.new(
|
9
9
|
debug: false,
|
10
|
+
quite: false,
|
11
|
+
since: nil,
|
12
|
+
until: nil,
|
10
13
|
)
|
11
14
|
parse_options
|
12
15
|
determine_log_level
|
13
|
-
|
14
|
-
# Create the concrete handlers and store them for future use
|
15
|
-
create_event_handlers
|
16
16
|
end
|
17
17
|
|
18
|
-
def process(
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
event_handler = @event_handlers[event_class]
|
28
|
-
|
29
|
-
event_handler.each { |handler|
|
30
|
-
if !handler.nil? && handler.respond_to?(:parse)
|
31
|
-
handler.parse(event)
|
32
|
-
end
|
33
|
-
}
|
18
|
+
def process(args)
|
19
|
+
processor = Processor.new
|
20
|
+
begin
|
21
|
+
if !@options.until.nil? && !@options.since.nil?
|
22
|
+
processor.process_between(@options.since, @options.until)
|
23
|
+
elsif !@options.since.nil?
|
24
|
+
processor.process_since(@options.since)
|
25
|
+
else
|
26
|
+
args.each {|url| processor.process_url(url) }
|
34
27
|
end
|
35
|
-
|
36
|
-
Log.
|
28
|
+
rescue Exception => e
|
29
|
+
Log.error e
|
37
30
|
end
|
38
31
|
end
|
39
32
|
|
33
|
+
private
|
34
|
+
|
40
35
|
def parse_options
|
41
36
|
OptionParser.new do |opt|
|
42
37
|
opt.version = VERSION
|
@@ -46,11 +41,15 @@ module GitHubArchiveParser
|
|
46
41
|
opt.on "-q", "--quite", "Hide all output (shows only UNKNOWN level log statements)" do
|
47
42
|
options.quite = true
|
48
43
|
end
|
44
|
+
opt.on "-s", "--since TIME", "Process all events since the provided date (can be specified in natural language)" do |time|
|
45
|
+
options.since = time
|
46
|
+
end
|
47
|
+
opt.on "-u", "--until TIME", "Process all events until the provided date (can be specified in natural language) starting from February 12, 2011" do |time|
|
48
|
+
options.until = time
|
49
|
+
end
|
49
50
|
end.parse!
|
50
51
|
end
|
51
52
|
|
52
|
-
private
|
53
|
-
|
54
53
|
def determine_log_level
|
55
54
|
if options.debug
|
56
55
|
Log.level = Logger::DEBUG
|
@@ -62,36 +61,5 @@ module GitHubArchiveParser
|
|
62
61
|
end
|
63
62
|
end
|
64
63
|
|
65
|
-
def class_from_string(string)
|
66
|
-
begin
|
67
|
-
string.split('::').inject(Object) do |mod, class_name|
|
68
|
-
mod.const_get(class_name)
|
69
|
-
end
|
70
|
-
rescue Exception
|
71
|
-
Log.warn "Event #{string} not found"
|
72
|
-
nil
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
def create_event_handlers
|
77
|
-
@event_handlers = {}
|
78
|
-
|
79
|
-
# Probably can do something to not hardcode this
|
80
|
-
# Iterate over the event types
|
81
|
-
[CommitCommentEvent, CreateEvent, DeleteEvent,
|
82
|
-
DeploymentEvent, DeploymentStatusEvent, DownloadEvent,
|
83
|
-
FollowEvent, ForkApplyEvent, ForkEvent,
|
84
|
-
GistEvent, GollumEvent, IssueCommentEvent,
|
85
|
-
IssueCommentEvent, IssuesEvent, MemberEvent,
|
86
|
-
PublicEvent, PullRequestEvent, PullRequestReviewCommentEvent,
|
87
|
-
PushEvent, ReleaseEvent, StatusEvent,
|
88
|
-
TeamAddEvent, WatchEvent].each do | event_type |
|
89
|
-
|
90
|
-
# Map list of concrete event handler to their event type
|
91
|
-
@event_handlers[event_type] = event_type.descendants.map { |handler|
|
92
|
-
handler.new
|
93
|
-
}
|
94
|
-
end
|
95
|
-
end
|
96
64
|
end
|
97
65
|
end
|
@@ -2,9 +2,14 @@ module AwesomeApplication
|
|
2
2
|
class PrintCreateEvent
|
3
3
|
include GitHubArchiveParser::CreateEvent
|
4
4
|
|
5
|
+
attr_accessor :counter
|
6
|
+
|
5
7
|
def parse(event)
|
6
8
|
# The event is a Hashie::Mash object for easy (dot) access
|
7
|
-
puts "#{event.repository.owner}/#{event.repository.name}"
|
9
|
+
#puts "#{event.repository.owner}/#{event.repository.name}"
|
10
|
+
@counter = 0 if @counter.nil?
|
11
|
+
@counter = 1 + @counter
|
12
|
+
puts @counter
|
8
13
|
end
|
9
14
|
end
|
10
15
|
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module GitHubArchiveParser
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
def initialize
|
5
|
+
create_event_handlers
|
6
|
+
end
|
7
|
+
|
8
|
+
def process_between(since_time, until_time)
|
9
|
+
start_time = Utilities.time_from_natural_language(since_time)
|
10
|
+
end_time = Utilities.time_from_natural_language(until_time)
|
11
|
+
Log.info "Processing between #{start_time} and #{end_time}"
|
12
|
+
|
13
|
+
# Start hourly iterator from start time, but exclude last hour (could be incomplete)
|
14
|
+
iterator = start_time
|
15
|
+
while iterator < end_time - 3600
|
16
|
+
process_url("http://data.githubarchive.org/#{iterator.year}-#{iterator.month.to_s.rjust(2, '0')}-#{iterator.day.to_s.rjust(2, '0')}-#{iterator.hour}.json.gz")
|
17
|
+
iterator += 3600
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def process_since(since_time)
|
22
|
+
process_between(since_time, 'Now')
|
23
|
+
end
|
24
|
+
|
25
|
+
def process_url(url)
|
26
|
+
Log.info "Processing #{url}"
|
27
|
+
if (!url.nil? && url.start_with?("http://data.githubarchive.org/"))
|
28
|
+
|
29
|
+
begin
|
30
|
+
gz = open(url)
|
31
|
+
js = Zlib::GzipReader.new(gz).read
|
32
|
+
rescue => e
|
33
|
+
raise e, "URL[#{url}] #{e.message}"
|
34
|
+
end
|
35
|
+
|
36
|
+
Yajl::Parser.parse(js) do |event|
|
37
|
+
event = Hashie::Mash.new(event)
|
38
|
+
event_class = Utilities.class_from_string("GitHubArchiveParser::#{event.type}")
|
39
|
+
event_handler = @event_handlers[event_class]
|
40
|
+
|
41
|
+
event_handler.each { |handler|
|
42
|
+
if !handler.nil? && handler.respond_to?(:parse)
|
43
|
+
handler.parse(event)
|
44
|
+
end
|
45
|
+
}
|
46
|
+
end
|
47
|
+
else
|
48
|
+
raise "URL[#{url}] does not belong to http://data.githubarchive.org/"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
private
|
53
|
+
|
54
|
+
def create_event_handlers
|
55
|
+
@event_handlers = {}
|
56
|
+
|
57
|
+
# Probably can do something to not hardcode this
|
58
|
+
# Iterate over the event types
|
59
|
+
[CommitCommentEvent, CreateEvent, DeleteEvent,
|
60
|
+
DeploymentEvent, DeploymentStatusEvent, DownloadEvent,
|
61
|
+
FollowEvent, ForkApplyEvent, ForkEvent,
|
62
|
+
GistEvent, GollumEvent, IssueCommentEvent,
|
63
|
+
IssueCommentEvent, IssuesEvent, MemberEvent,
|
64
|
+
PublicEvent, PullRequestEvent, PullRequestReviewCommentEvent,
|
65
|
+
PushEvent, ReleaseEvent, StatusEvent,
|
66
|
+
TeamAddEvent, WatchEvent].each do | event_type |
|
67
|
+
|
68
|
+
# Map list of concrete event handler to their event type
|
69
|
+
@event_handlers[event_type] = event_type.descendants.map { |handler| handler.new }
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module GitHubArchiveParser
|
2
|
+
module Utilities
|
3
|
+
module_function
|
4
|
+
|
5
|
+
def class_from_string(string)
|
6
|
+
begin
|
7
|
+
string.split('::').inject(Object) do |mod, class_name|
|
8
|
+
mod.const_get(class_name)
|
9
|
+
end
|
10
|
+
rescue Exception
|
11
|
+
Log.warn "Event #{string} not found"
|
12
|
+
nil
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def time_from_natural_language(natural_language)
|
17
|
+
time = Chronic.parse(natural_language)
|
18
|
+
raise "Invalid time: #{natural_language}" if time.nil?
|
19
|
+
time.getutc
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
include GitHubArchiveParser
|
4
|
+
|
5
|
+
describe Processor do
|
6
|
+
let (:processor) { GitHubArchiveParser::Processor.new }
|
7
|
+
describe "#process_url" do
|
8
|
+
it "should fail on an invalid text" do
|
9
|
+
expect {processor.process_url("dummydata")}.to raise_error(RuntimeError)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should fail on an invalid URL" do
|
13
|
+
expect {processor.process_url("http://data.githubarchive.org/dummydata.json.gz")}.to raise_error(OpenURI::HTTPError)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should fail on an wrong domain URL" do
|
17
|
+
expect {processor.process_url("http://data.wrongdomain.org/dummydata.json.gz")}.to raise_error(RuntimeError)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "process_since" do
|
22
|
+
it "should call process_between(since, 'Now')" do
|
23
|
+
since_time = 'Yesterday'
|
24
|
+
until_time = 'Now'
|
25
|
+
processor.should receive(:process_between).with(since_time, until_time)
|
26
|
+
processor.process_since(since_time)
|
27
|
+
end
|
28
|
+
|
29
|
+
context "with invalid time" do
|
30
|
+
it "should fail with invalid since time" do
|
31
|
+
since_time = 'nil'
|
32
|
+
allow(processor).to receive(:process_url)
|
33
|
+
expect {processor.process_since(since_time)}.to raise_error(RuntimeError)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe "process_between" do
|
39
|
+
it "should call process_url(url) for as many hour intervals between two times (hours)" do
|
40
|
+
since_time = '12 hours ago'
|
41
|
+
until_time = '6 hours ago'
|
42
|
+
processor.should receive(:process_url).exactly(6).times
|
43
|
+
processor.process_between(since_time, until_time)
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should call process_url(url) for as many hour intervals between two times (days)" do
|
47
|
+
since_time = '3 days ago'
|
48
|
+
until_time = '1 day ago'
|
49
|
+
processor.should receive(:process_url).exactly(48).times
|
50
|
+
processor.process_between(since_time, until_time)
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should call process_url(url) for as many hour intervals between two times (months)" do
|
54
|
+
since_time = 'April 1 2012 at 12pm'
|
55
|
+
until_time = 'June 1 2012 at 12pm'
|
56
|
+
|
57
|
+
# (24 hours * (April 30 days + May 31 days)) - last hour = 1463 hours
|
58
|
+
processor.should receive(:process_url).exactly(1463).times
|
59
|
+
processor.process_between(since_time, until_time)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should call process_url(url) for as many hour intervals between two times (years)" do
|
63
|
+
since_time = 'April 1 2011 at 12pm'
|
64
|
+
until_time = 'April 1 2013 at 12pm'
|
65
|
+
|
66
|
+
# ((Number of days between April 1 2011 and April 1 2013) * 24 hours) - last hour = 731 days * 24 hours = 17543 hours
|
67
|
+
processor.should receive(:process_url).exactly(17543).times
|
68
|
+
processor.process_between(since_time, until_time)
|
69
|
+
end
|
70
|
+
|
71
|
+
context "with invalid time" do
|
72
|
+
it "should fail with invalid since time" do
|
73
|
+
since_time = 'nil'
|
74
|
+
until_time = 'Now'
|
75
|
+
allow(processor).to receive(:process_url)
|
76
|
+
expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should fail with invalid until time" do
|
80
|
+
since_time = '1 day ago'
|
81
|
+
until_time = 'nil'
|
82
|
+
allow(processor).to receive(:process_url)
|
83
|
+
expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should fail with both invalid until/since time" do
|
87
|
+
since_time = 'nil'
|
88
|
+
until_time = 'nil'
|
89
|
+
allow(processor).to receive(:process_url)
|
90
|
+
expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
include GitHubArchiveParser
|
4
|
+
|
5
|
+
describe Utilities do
|
6
|
+
describe "#time_from_natural_language" do
|
7
|
+
it "should raise an error with an invalid time" do
|
8
|
+
expect { Utilities.time_from_natural_language('nil') }.to raise_error(RuntimeError)
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should return a Time object" do
|
12
|
+
time = Utilities.time_from_natural_language('August 1 2013 at 4am')
|
13
|
+
time.should be_a(Time)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should return a UTC time" do
|
17
|
+
time = Utilities.time_from_natural_language('August 1 2013 at 4am')
|
18
|
+
time.utc?.should be_true
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should return the correct time" do
|
22
|
+
time = Utilities.time_from_natural_language('August 1 2013 at 4am')
|
23
|
+
time.getlocal.year.should eq(2013)
|
24
|
+
time.getlocal.month.should eq(8)
|
25
|
+
time.getlocal.day.should eq(1)
|
26
|
+
time.getlocal.hour.should eq(4)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe "#class_from_string" do
|
31
|
+
it "should return a valid name of a module" do
|
32
|
+
Utilities.class_from_string('Utilities').should eq(Utilities)
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should return a valid name of a class" do
|
36
|
+
Utilities.class_from_string('Processor').should eq(Processor)
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should return a valid name of a class using full path" do
|
40
|
+
Utilities.class_from_string('GitHubArchiveParser::Processor').should eq(Processor)
|
41
|
+
end
|
42
|
+
|
43
|
+
it "should return nil if class cannot be found" do
|
44
|
+
Utilities.class_from_string('SpecialApplication::MassUnit').should be_nil
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: github_archive_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin Jalbert
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-01-
|
11
|
+
date: 2014-01-29 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Gem which parses GitHub Archive data
|
14
14
|
email:
|
@@ -46,8 +46,12 @@ files:
|
|
46
46
|
- lib/github_archive_parser/event_handlers/watch_event.rb
|
47
47
|
- lib/github_archive_parser/initialize.rb
|
48
48
|
- lib/github_archive_parser/log.rb
|
49
|
+
- lib/github_archive_parser/processor.rb
|
50
|
+
- lib/github_archive_parser/utilities.rb
|
49
51
|
- lib/github_archive_parser/version.rb
|
50
52
|
- spec/log_spec.rb
|
53
|
+
- spec/processor_spec.rb
|
54
|
+
- spec/utilities_spec.rb
|
51
55
|
homepage: https://github.com/kevinjalbert/github_archive_parser
|
52
56
|
licenses: []
|
53
57
|
metadata: {}
|
@@ -73,3 +77,5 @@ specification_version: 4
|
|
73
77
|
summary: Gem which parses GitHub Archive data
|
74
78
|
test_files:
|
75
79
|
- spec/log_spec.rb
|
80
|
+
- spec/processor_spec.rb
|
81
|
+
- spec/utilities_spec.rb
|