github_archive_parser 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/github_archive_parser +2 -4
- data/lib/github_archive_parser.rb +24 -56
- data/lib/github_archive_parser/XCust.rb +6 -1
- data/lib/github_archive_parser/initialize.rb +3 -0
- data/lib/github_archive_parser/processor.rb +74 -0
- data/lib/github_archive_parser/utilities.rb +22 -0
- data/lib/github_archive_parser/version.rb +1 -1
- data/spec/processor_spec.rb +95 -0
- data/spec/utilities_spec.rb +48 -0
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cc577ab9b44ecf0384854bcabef6d03407beb841
|
4
|
+
data.tar.gz: 3e0af6d7aab095936d5865bc9f9b538cc6e60cd0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 253ce14d775c224434c7a498db8545ac9e4e6fd2be77cf8ed429280fabfb923b25c87a2a60787c279f80903a13e520342664859f92d14f4aa2e825a5912ed874
|
7
|
+
data.tar.gz: 5a479e595bf17b801e89b0af6293383161817585b6b35949e7aa327c16a666a64f5652c0761419ed438928b0e4279d702f31a916d5cfac9557d21c727f1dff0c
|
data/bin/github_archive_parser
CHANGED
@@ -3,7 +3,5 @@
|
|
3
3
|
$:.unshift File.expand_path("../../lib", __FILE__)
|
4
4
|
require 'github_archive_parser'
|
5
5
|
|
6
|
-
github_archive_parser = GitHubArchiveParser::
|
7
|
-
ARGV
|
8
|
-
github_archive_parser.process(url)
|
9
|
-
end
|
6
|
+
github_archive_parser = GitHubArchiveParser::CLI.new
|
7
|
+
github_archive_parser.process(ARGV)
|
@@ -1,42 +1,37 @@
|
|
1
1
|
require 'github_archive_parser/initialize'
|
2
2
|
|
3
3
|
module GitHubArchiveParser
|
4
|
-
class
|
5
|
-
|
4
|
+
class CLI
|
5
|
+
attr_accessor :options
|
6
6
|
|
7
7
|
def initialize
|
8
8
|
@options = OpenStruct.new(
|
9
9
|
debug: false,
|
10
|
+
quite: false,
|
11
|
+
since: nil,
|
12
|
+
until: nil,
|
10
13
|
)
|
11
14
|
parse_options
|
12
15
|
determine_log_level
|
13
|
-
|
14
|
-
# Create the concrete handlers and store them for future use
|
15
|
-
create_event_handlers
|
16
16
|
end
|
17
17
|
|
18
|
-
def process(
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
event_handler = @event_handlers[event_class]
|
28
|
-
|
29
|
-
event_handler.each { |handler|
|
30
|
-
if !handler.nil? && handler.respond_to?(:parse)
|
31
|
-
handler.parse(event)
|
32
|
-
end
|
33
|
-
}
|
18
|
+
def process(args)
|
19
|
+
processor = Processor.new
|
20
|
+
begin
|
21
|
+
if !@options.until.nil? && !@options.since.nil?
|
22
|
+
processor.process_between(@options.since, @options.until)
|
23
|
+
elsif !@options.since.nil?
|
24
|
+
processor.process_since(@options.since)
|
25
|
+
else
|
26
|
+
args.each {|url| processor.process_url(url) }
|
34
27
|
end
|
35
|
-
|
36
|
-
Log.
|
28
|
+
rescue Exception => e
|
29
|
+
Log.error e
|
37
30
|
end
|
38
31
|
end
|
39
32
|
|
33
|
+
private
|
34
|
+
|
40
35
|
def parse_options
|
41
36
|
OptionParser.new do |opt|
|
42
37
|
opt.version = VERSION
|
@@ -46,11 +41,15 @@ module GitHubArchiveParser
|
|
46
41
|
opt.on "-q", "--quite", "Hide all output (shows only UNKNOWN level log statements)" do
|
47
42
|
options.quite = true
|
48
43
|
end
|
44
|
+
opt.on "-s", "--since TIME", "Process all events since the provided date (can be specified in natural language)" do |time|
|
45
|
+
options.since = time
|
46
|
+
end
|
47
|
+
opt.on "-u", "--until TIME", "Process all events until the provided date (can be specified in natural language) starting from February 12, 2011" do |time|
|
48
|
+
options.until = time
|
49
|
+
end
|
49
50
|
end.parse!
|
50
51
|
end
|
51
52
|
|
52
|
-
private
|
53
|
-
|
54
53
|
def determine_log_level
|
55
54
|
if options.debug
|
56
55
|
Log.level = Logger::DEBUG
|
@@ -62,36 +61,5 @@ module GitHubArchiveParser
|
|
62
61
|
end
|
63
62
|
end
|
64
63
|
|
65
|
-
def class_from_string(string)
|
66
|
-
begin
|
67
|
-
string.split('::').inject(Object) do |mod, class_name|
|
68
|
-
mod.const_get(class_name)
|
69
|
-
end
|
70
|
-
rescue Exception
|
71
|
-
Log.warn "Event #{string} not found"
|
72
|
-
nil
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
def create_event_handlers
|
77
|
-
@event_handlers = {}
|
78
|
-
|
79
|
-
# Probably can do something to not hardcode this
|
80
|
-
# Iterate over the event types
|
81
|
-
[CommitCommentEvent, CreateEvent, DeleteEvent,
|
82
|
-
DeploymentEvent, DeploymentStatusEvent, DownloadEvent,
|
83
|
-
FollowEvent, ForkApplyEvent, ForkEvent,
|
84
|
-
GistEvent, GollumEvent, IssueCommentEvent,
|
85
|
-
IssueCommentEvent, IssuesEvent, MemberEvent,
|
86
|
-
PublicEvent, PullRequestEvent, PullRequestReviewCommentEvent,
|
87
|
-
PushEvent, ReleaseEvent, StatusEvent,
|
88
|
-
TeamAddEvent, WatchEvent].each do | event_type |
|
89
|
-
|
90
|
-
# Map list of concrete event handler to their event type
|
91
|
-
@event_handlers[event_type] = event_type.descendants.map { |handler|
|
92
|
-
handler.new
|
93
|
-
}
|
94
|
-
end
|
95
|
-
end
|
96
64
|
end
|
97
65
|
end
|
@@ -2,9 +2,14 @@ module AwesomeApplication
|
|
2
2
|
class PrintCreateEvent
|
3
3
|
include GitHubArchiveParser::CreateEvent
|
4
4
|
|
5
|
+
attr_accessor :counter
|
6
|
+
|
5
7
|
def parse(event)
|
6
8
|
# The event is a Hashie::Mash object for easy (dot) access
|
7
|
-
puts "#{event.repository.owner}/#{event.repository.name}"
|
9
|
+
#puts "#{event.repository.owner}/#{event.repository.name}"
|
10
|
+
@counter = 0 if @counter.nil?
|
11
|
+
@counter = 1 + @counter
|
12
|
+
puts @counter
|
8
13
|
end
|
9
14
|
end
|
10
15
|
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module GitHubArchiveParser
|
2
|
+
class Processor
|
3
|
+
|
4
|
+
def initialize
|
5
|
+
create_event_handlers
|
6
|
+
end
|
7
|
+
|
8
|
+
def process_between(since_time, until_time)
|
9
|
+
start_time = Utilities.time_from_natural_language(since_time)
|
10
|
+
end_time = Utilities.time_from_natural_language(until_time)
|
11
|
+
Log.info "Processing between #{start_time} and #{end_time}"
|
12
|
+
|
13
|
+
# Start hourly iterator from start time, but exclude last hour (could be incomplete)
|
14
|
+
iterator = start_time
|
15
|
+
while iterator < end_time - 3600
|
16
|
+
process_url("http://data.githubarchive.org/#{iterator.year}-#{iterator.month.to_s.rjust(2, '0')}-#{iterator.day.to_s.rjust(2, '0')}-#{iterator.hour}.json.gz")
|
17
|
+
iterator += 3600
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def process_since(since_time)
|
22
|
+
process_between(since_time, 'Now')
|
23
|
+
end
|
24
|
+
|
25
|
+
def process_url(url)
|
26
|
+
Log.info "Processing #{url}"
|
27
|
+
if (!url.nil? && url.start_with?("http://data.githubarchive.org/"))
|
28
|
+
|
29
|
+
begin
|
30
|
+
gz = open(url)
|
31
|
+
js = Zlib::GzipReader.new(gz).read
|
32
|
+
rescue => e
|
33
|
+
raise e, "URL[#{url}] #{e.message}"
|
34
|
+
end
|
35
|
+
|
36
|
+
Yajl::Parser.parse(js) do |event|
|
37
|
+
event = Hashie::Mash.new(event)
|
38
|
+
event_class = Utilities.class_from_string("GitHubArchiveParser::#{event.type}")
|
39
|
+
event_handler = @event_handlers[event_class]
|
40
|
+
|
41
|
+
event_handler.each { |handler|
|
42
|
+
if !handler.nil? && handler.respond_to?(:parse)
|
43
|
+
handler.parse(event)
|
44
|
+
end
|
45
|
+
}
|
46
|
+
end
|
47
|
+
else
|
48
|
+
raise "URL[#{url}] does not belong to http://data.githubarchive.org/"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
private
|
53
|
+
|
54
|
+
def create_event_handlers
|
55
|
+
@event_handlers = {}
|
56
|
+
|
57
|
+
# Probably can do something to not hardcode this
|
58
|
+
# Iterate over the event types
|
59
|
+
[CommitCommentEvent, CreateEvent, DeleteEvent,
|
60
|
+
DeploymentEvent, DeploymentStatusEvent, DownloadEvent,
|
61
|
+
FollowEvent, ForkApplyEvent, ForkEvent,
|
62
|
+
GistEvent, GollumEvent, IssueCommentEvent,
|
63
|
+
IssueCommentEvent, IssuesEvent, MemberEvent,
|
64
|
+
PublicEvent, PullRequestEvent, PullRequestReviewCommentEvent,
|
65
|
+
PushEvent, ReleaseEvent, StatusEvent,
|
66
|
+
TeamAddEvent, WatchEvent].each do | event_type |
|
67
|
+
|
68
|
+
# Map each event type to the list of its concrete event handler instances
|
69
|
+
@event_handlers[event_type] = event_type.descendants.map { |handler| handler.new }
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module GitHubArchiveParser
|
2
|
+
module Utilities
|
3
|
+
module_function
|
4
|
+
|
5
|
+
def class_from_string(string)
|
6
|
+
begin
|
7
|
+
string.split('::').inject(Object) do |mod, class_name|
|
8
|
+
mod.const_get(class_name)
|
9
|
+
end
|
10
|
+
rescue Exception
|
11
|
+
Log.warn "Event #{string} not found"
|
12
|
+
nil
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def time_from_natural_language(natural_language)
|
17
|
+
time = Chronic.parse(natural_language)
|
18
|
+
raise "Invalid time: #{natural_language}" if time.nil?
|
19
|
+
time.getutc
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
include GitHubArchiveParser
|
4
|
+
|
5
|
+
describe Processor do
|
6
|
+
let (:processor) { GitHubArchiveParser::Processor.new }
|
7
|
+
describe "#process_url" do
|
8
|
+
it "should fail on an invalid text" do
|
9
|
+
expect {processor.process_url("dummydata")}.to raise_error(RuntimeError)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should fail on an invalid URL" do
|
13
|
+
expect {processor.process_url("http://data.githubarchive.org/dummydata.json.gz")}.to raise_error(OpenURI::HTTPError)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should fail on an wrong domain URL" do
|
17
|
+
expect {processor.process_url("http://data.wrongdomain.org/dummydata.json.gz")}.to raise_error(RuntimeError)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "process_since" do
|
22
|
+
it "should call process_between(since, 'Now')" do
|
23
|
+
since_time = 'Yesterday'
|
24
|
+
until_time = 'Now'
|
25
|
+
processor.should receive(:process_between).with(since_time, until_time)
|
26
|
+
processor.process_since(since_time)
|
27
|
+
end
|
28
|
+
|
29
|
+
context "with invalid time" do
|
30
|
+
it "should fail with invalid since time" do
|
31
|
+
since_time = 'nil'
|
32
|
+
allow(processor).to receive(:process_url)
|
33
|
+
expect {processor.process_since(since_time)}.to raise_error(RuntimeError)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe "process_between" do
|
39
|
+
it "should call process_url(url) for as many hour intervals between two times (hours)" do
|
40
|
+
since_time = '12 hours ago'
|
41
|
+
until_time = '6 hours ago'
|
42
|
+
processor.should receive(:process_url).exactly(6).times
|
43
|
+
processor.process_between(since_time, until_time)
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should call process_url(url) for as many hour intervals between two times (days)" do
|
47
|
+
since_time = '3 days ago'
|
48
|
+
until_time = '1 day ago'
|
49
|
+
processor.should receive(:process_url).exactly(48).times
|
50
|
+
processor.process_between(since_time, until_time)
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should call process_url(url) for as many hour intervals between two times (months)" do
|
54
|
+
since_time = 'April 1 2012 at 12pm'
|
55
|
+
until_time = 'June 1 2012 at 12pm'
|
56
|
+
|
57
|
+
# (24 hours * (April 30 days + May 31 days)) - last hour = 1463 hours
|
58
|
+
processor.should receive(:process_url).exactly(1463).times
|
59
|
+
processor.process_between(since_time, until_time)
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should call process_url(url) for as many hour intervals between two times (years)" do
|
63
|
+
since_time = 'April 1 2011 at 12pm'
|
64
|
+
until_time = 'April 1 2013 at 12pm'
|
65
|
+
|
66
|
+
# ((Number of days between April 1 2011 and April 1 2013) * 24 hours) - last hour = (731 days * 24 hours) - 1 hour = 17543 hours
|
67
|
+
processor.should receive(:process_url).exactly(17543).times
|
68
|
+
processor.process_between(since_time, until_time)
|
69
|
+
end
|
70
|
+
|
71
|
+
context "with invalid time" do
|
72
|
+
it "should fail with invalid since time" do
|
73
|
+
since_time = 'nil'
|
74
|
+
until_time = 'Now'
|
75
|
+
allow(processor).to receive(:process_url)
|
76
|
+
expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should fail with invalid until time" do
|
80
|
+
since_time = '1 day ago'
|
81
|
+
until_time = 'nil'
|
82
|
+
allow(processor).to receive(:process_url)
|
83
|
+
expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should fail with both invalid until/since time" do
|
87
|
+
since_time = 'nil'
|
88
|
+
until_time = 'nil'
|
89
|
+
allow(processor).to receive(:process_url)
|
90
|
+
expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
include GitHubArchiveParser
|
4
|
+
|
5
|
+
describe Utilities do
|
6
|
+
describe "#time_from_natural_language" do
|
7
|
+
it "should raise an error with an invalid time" do
|
8
|
+
expect { Utilities.time_from_natural_language('nil') }.to raise_error(RuntimeError)
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should return a Time object" do
|
12
|
+
time = Utilities.time_from_natural_language('August 1 2013 at 4am')
|
13
|
+
time.should be_a(Time)
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should return a UTC time" do
|
17
|
+
time = Utilities.time_from_natural_language('August 1 2013 at 4am')
|
18
|
+
time.utc?.should be_true
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should return the correct time" do
|
22
|
+
time = Utilities.time_from_natural_language('August 1 2013 at 4am')
|
23
|
+
time.getlocal.year.should eq(2013)
|
24
|
+
time.getlocal.month.should eq(8)
|
25
|
+
time.getlocal.day.should eq(1)
|
26
|
+
time.getlocal.hour.should eq(4)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe "#class_from_string" do
|
31
|
+
it "should return a valid name of a module" do
|
32
|
+
Utilities.class_from_string('Utilities').should eq(Utilities)
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should return a valid name of a class" do
|
36
|
+
Utilities.class_from_string('Processor').should eq(Processor)
|
37
|
+
end
|
38
|
+
|
39
|
+
it "should return a valid name of a class using full path" do
|
40
|
+
Utilities.class_from_string('GitHubArchiveParser::Processor').should eq(Processor)
|
41
|
+
end
|
42
|
+
|
43
|
+
it "should return nil if class cannot be found" do
|
44
|
+
Utilities.class_from_string('SpecialApplication::MassUnit').should be_nil
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: github_archive_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin Jalbert
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-01-
|
11
|
+
date: 2014-01-29 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Gem which parses GitHub Archive data
|
14
14
|
email:
|
@@ -46,8 +46,12 @@ files:
|
|
46
46
|
- lib/github_archive_parser/event_handlers/watch_event.rb
|
47
47
|
- lib/github_archive_parser/initialize.rb
|
48
48
|
- lib/github_archive_parser/log.rb
|
49
|
+
- lib/github_archive_parser/processor.rb
|
50
|
+
- lib/github_archive_parser/utilities.rb
|
49
51
|
- lib/github_archive_parser/version.rb
|
50
52
|
- spec/log_spec.rb
|
53
|
+
- spec/processor_spec.rb
|
54
|
+
- spec/utilities_spec.rb
|
51
55
|
homepage: https://github.com/kevinjalbert/github_archive_parser
|
52
56
|
licenses: []
|
53
57
|
metadata: {}
|
@@ -73,3 +77,5 @@ specification_version: 4
|
|
73
77
|
summary: Gem which parses GitHub Archive data
|
74
78
|
test_files:
|
75
79
|
- spec/log_spec.rb
|
80
|
+
- spec/processor_spec.rb
|
81
|
+
- spec/utilities_spec.rb
|