github_archive_parser 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dbbcc01faaf08baed69942f2c59ffe2fca262966
4
- data.tar.gz: 4aaba6c19c5caa3e2a4b17fa331e2a0411cac792
3
+ metadata.gz: cc577ab9b44ecf0384854bcabef6d03407beb841
4
+ data.tar.gz: 3e0af6d7aab095936d5865bc9f9b538cc6e60cd0
5
5
  SHA512:
6
- metadata.gz: f4e608379a73168cf4de92cd2ef11e62fb022f3f778afa43d57dd3508e26d9b188cc26114d86a5cac4b55c8638a856545a0e3ce2f525f17abdd19c633feb9cf3
7
- data.tar.gz: 08d9faf62bdad2358c01d2f65440b8812fc91965ecc4c16f08af6fc98fb5dd4703914d43c9541b31d9b21ac7e4459b7b99ef1d7e7c2e4b2a9bf752eb3f0f9de9
6
+ metadata.gz: 253ce14d775c224434c7a498db8545ac9e4e6fd2be77cf8ed429280fabfb923b25c87a2a60787c279f80903a13e520342664859f92d14f4aa2e825a5912ed874
7
+ data.tar.gz: 5a479e595bf17b801e89b0af6293383161817585b6b35949e7aa327c16a666a64f5652c0761419ed438928b0e4279d702f31a916d5cfac9557d21c727f1dff0c
@@ -3,7 +3,5 @@
3
3
  $:.unshift File.expand_path("../../lib", __FILE__)
4
4
  require 'github_archive_parser'
5
5
 
6
- github_archive_parser = GitHubArchiveParser::Processor.new
7
- ARGV.each do |url|
8
- github_archive_parser.process(url)
9
- end
6
+ github_archive_parser = GitHubArchiveParser::CLI.new
7
+ github_archive_parser.process(ARGV)
@@ -1,42 +1,37 @@
1
1
  require 'github_archive_parser/initialize'
2
2
 
3
3
  module GitHubArchiveParser
4
- class Processor
5
- attr_reader :options
4
+ class CLI
5
+ attr_accessor :options
6
6
 
7
7
  def initialize
8
8
  @options = OpenStruct.new(
9
9
  debug: false,
10
+ quite: false,
11
+ since: nil,
12
+ until: nil,
10
13
  )
11
14
  parse_options
12
15
  determine_log_level
13
-
14
- # Create the concrete handlers and store them for future use
15
- create_event_handlers
16
16
  end
17
17
 
18
- def process(url)
19
- Log.info "Processing #{url}"
20
- if (!url.nil? && url.start_with?("http://data.githubarchive.org/"))
21
- gz = open(url)
22
- js = Zlib::GzipReader.new(gz).read
23
-
24
- Yajl::Parser.parse(js) do |event|
25
- event = Hashie::Mash.new(event)
26
- event_class = class_from_string("GitHubArchiveParser::#{event.type}")
27
- event_handler = @event_handlers[event_class]
28
-
29
- event_handler.each { |handler|
30
- if !handler.nil? && handler.respond_to?(:parse)
31
- handler.parse(event)
32
- end
33
- }
18
+ def process(args)
19
+ processor = Processor.new
20
+ begin
21
+ if !@options.until.nil? && !@options.since.nil?
22
+ processor.process_between(@options.since, @options.until)
23
+ elsif !@options.since.nil?
24
+ processor.process_since(@options.since)
25
+ else
26
+ args.each {|url| processor.process_url(url) }
34
27
  end
35
- else
36
- Log.warn "URL[#{url}] does not belong to http://data.githubarchive.org/"
28
+ rescue Exception => e
29
+ Log.error e
37
30
  end
38
31
  end
39
32
 
33
+ private
34
+
40
35
  def parse_options
41
36
  OptionParser.new do |opt|
42
37
  opt.version = VERSION
@@ -46,11 +41,15 @@ module GitHubArchiveParser
46
41
  opt.on "-q", "--quite", "Hide all output (shows only UNKNOWN level log statements)" do
47
42
  options.quite = true
48
43
  end
44
+ opt.on "-s", "--since TIME", "Process all events since the provided date (can be specified in natural language)" do |time|
45
+ options.since = time
46
+ end
47
+ opt.on "-u", "--until TIME", "Process all events until the provided date (can be specified in natural language) starting from February 12, 2011" do |time|
48
+ options.until = time
49
+ end
49
50
  end.parse!
50
51
  end
51
52
 
52
- private
53
-
54
53
  def determine_log_level
55
54
  if options.debug
56
55
  Log.level = Logger::DEBUG
@@ -62,36 +61,5 @@ module GitHubArchiveParser
62
61
  end
63
62
  end
64
63
 
65
- def class_from_string(string)
66
- begin
67
- string.split('::').inject(Object) do |mod, class_name|
68
- mod.const_get(class_name)
69
- end
70
- rescue Exception
71
- Log.warn "Event #{string} not found"
72
- nil
73
- end
74
- end
75
-
76
- def create_event_handlers
77
- @event_handlers = {}
78
-
79
- # Probably can do something to not hardcode this
80
- # Iterate over the event types
81
- [CommitCommentEvent, CreateEvent, DeleteEvent,
82
- DeploymentEvent, DeploymentStatusEvent, DownloadEvent,
83
- FollowEvent, ForkApplyEvent, ForkEvent,
84
- GistEvent, GollumEvent, IssueCommentEvent,
85
- IssueCommentEvent, IssuesEvent, MemberEvent,
86
- PublicEvent, PullRequestEvent, PullRequestReviewCommentEvent,
87
- PushEvent, ReleaseEvent, StatusEvent,
88
- TeamAddEvent, WatchEvent].each do | event_type |
89
-
90
- # Map list of concrete event handler to their event type
91
- @event_handlers[event_type] = event_type.descendants.map { |handler|
92
- handler.new
93
- }
94
- end
95
- end
96
64
  end
97
65
  end
@@ -2,9 +2,14 @@ module AwesomeApplication
2
2
  class PrintCreateEvent
3
3
  include GitHubArchiveParser::CreateEvent
4
4
 
5
+ attr_accessor :counter
6
+
5
7
  def parse(event)
6
8
  # The event is a Hashie::Mash object for easy (dot) access
7
- puts "#{event.repository.owner}/#{event.repository.name}"
9
+ #puts "#{event.repository.owner}/#{event.repository.name}"
10
+ @counter = 0 if @counter.nil?
11
+ @counter = 1 + @counter
12
+ puts @counter
8
13
  end
9
14
  end
10
15
  end
@@ -5,5 +5,8 @@ require 'open-uri'
5
5
  require 'zlib'
6
6
  require 'yajl'
7
7
  require 'hashie'
8
+ require 'chronic'
8
9
 
10
+ # Require event_handler first to prevent load order issues
11
+ require_relative './event_handler'
9
12
  Dir.glob(File.dirname(__FILE__) + '/**/*.rb') { |file| require file }
@@ -0,0 +1,74 @@
1
+ module GitHubArchiveParser
2
+ class Processor
3
+
4
+ def initialize
5
+ create_event_handlers
6
+ end
7
+
8
+ def process_between(since_time, until_time)
9
+ start_time = Utilities.time_from_natural_language(since_time)
10
+ end_time = Utilities.time_from_natural_language(until_time)
11
+ Log.info "Processing between #{start_time} and #{end_time}"
12
+
13
+ # Start hourly iterator from start time, but exclude last hour (could be incomplete)
14
+ iterator = start_time
15
+ while iterator < end_time - 3600
16
+ process_url("http://data.githubarchive.org/#{iterator.year}-#{iterator.month.to_s.rjust(2, '0')}-#{iterator.day.to_s.rjust(2, '0')}-#{iterator.hour}.json.gz")
17
+ iterator += 3600
18
+ end
19
+ end
20
+
21
+ def process_since(since_time)
22
+ process_between(since_time, 'Now')
23
+ end
24
+
25
+ def process_url(url)
26
+ Log.info "Processing #{url}"
27
+ if (!url.nil? && url.start_with?("http://data.githubarchive.org/"))
28
+
29
+ begin
30
+ gz = open(url)
31
+ js = Zlib::GzipReader.new(gz).read
32
+ rescue => e
33
+ raise e, "URL[#{url}] #{e.message}"
34
+ end
35
+
36
+ Yajl::Parser.parse(js) do |event|
37
+ event = Hashie::Mash.new(event)
38
+ event_class = Utilities.class_from_string("GitHubArchiveParser::#{event.type}")
39
+ event_handler = @event_handlers[event_class]
40
+
41
+ event_handler.each { |handler|
42
+ if !handler.nil? && handler.respond_to?(:parse)
43
+ handler.parse(event)
44
+ end
45
+ }
46
+ end
47
+ else
48
+ raise "URL[#{url}] does not belong to http://data.githubarchive.org/"
49
+ end
50
+ end
51
+
52
+ private
53
+
54
+ def create_event_handlers
55
+ @event_handlers = {}
56
+
57
+ # Probably can do something to not hardcode this
58
+ # Iterate over the event types
59
+ [CommitCommentEvent, CreateEvent, DeleteEvent,
60
+ DeploymentEvent, DeploymentStatusEvent, DownloadEvent,
61
+ FollowEvent, ForkApplyEvent, ForkEvent,
62
+ GistEvent, GollumEvent, IssueCommentEvent,
63
+ IssueCommentEvent, IssuesEvent, MemberEvent,
64
+ PublicEvent, PullRequestEvent, PullRequestReviewCommentEvent,
65
+ PushEvent, ReleaseEvent, StatusEvent,
66
+ TeamAddEvent, WatchEvent].each do | event_type |
67
+
68
+ # Map list of concrete event handler to their event type
69
+ @event_handlers[event_type] = event_type.descendants.map { |handler| handler.new }
70
+ end
71
+ end
72
+
73
+ end
74
+ end
@@ -0,0 +1,22 @@
1
+ module GitHubArchiveParser
2
+ module Utilities
3
+ module_function
4
+
5
+ def class_from_string(string)
6
+ begin
7
+ string.split('::').inject(Object) do |mod, class_name|
8
+ mod.const_get(class_name)
9
+ end
10
+ rescue Exception
11
+ Log.warn "Event #{string} not found"
12
+ nil
13
+ end
14
+ end
15
+
16
+ def time_from_natural_language(natural_language)
17
+ time = Chronic.parse(natural_language)
18
+ raise "Invalid time: #{natural_language}" if time.nil?
19
+ time.getutc
20
+ end
21
+ end
22
+ end
@@ -1,3 +1,3 @@
1
1
  module GitHubArchiveParser
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -0,0 +1,95 @@
1
+ require 'spec_helper'
2
+
3
+ include GitHubArchiveParser
4
+
5
+ describe Processor do
6
+ let (:processor) { GitHubArchiveParser::Processor.new }
7
+ describe "#process_url" do
8
+ it "should fail on an invalid text" do
9
+ expect {processor.process_url("dummydata")}.to raise_error(RuntimeError)
10
+ end
11
+
12
+ it "should fail on an invalid URL" do
13
+ expect {processor.process_url("http://data.githubarchive.org/dummydata.json.gz")}.to raise_error(OpenURI::HTTPError)
14
+ end
15
+
16
+ it "should fail on an wrong domain URL" do
17
+ expect {processor.process_url("http://data.wrongdomain.org/dummydata.json.gz")}.to raise_error(RuntimeError)
18
+ end
19
+ end
20
+
21
+ describe "process_since" do
22
+ it "should call process_between(since, 'Now')" do
23
+ since_time = 'Yesterday'
24
+ until_time = 'Now'
25
+ processor.should receive(:process_between).with(since_time, until_time)
26
+ processor.process_since(since_time)
27
+ end
28
+
29
+ context "with invalid time" do
30
+ it "should fail with invalid since time" do
31
+ since_time = 'nil'
32
+ allow(processor).to receive(:process_url)
33
+ expect {processor.process_since(since_time)}.to raise_error(RuntimeError)
34
+ end
35
+ end
36
+ end
37
+
38
+ describe "process_between" do
39
+ it "should call process_url(url) for as many hour intervals between two times (hours)" do
40
+ since_time = '12 hours ago'
41
+ until_time = '6 hours ago'
42
+ processor.should receive(:process_url).exactly(6).times
43
+ processor.process_between(since_time, until_time)
44
+ end
45
+
46
+ it "should call process_url(url) for as many hour intervals between two times (days)" do
47
+ since_time = '3 days ago'
48
+ until_time = '1 day ago'
49
+ processor.should receive(:process_url).exactly(48).times
50
+ processor.process_between(since_time, until_time)
51
+ end
52
+
53
+ it "should call process_url(url) for as many hour intervals between two times (months)" do
54
+ since_time = 'April 1 2012 at 12pm'
55
+ until_time = 'June 1 2012 at 12pm'
56
+
57
+ # (24 hours * (April 30 days + May 31 days)) - last hour = 1463 hours
58
+ processor.should receive(:process_url).exactly(1463).times
59
+ processor.process_between(since_time, until_time)
60
+ end
61
+
62
+ it "should call process_url(url) for as many hour intervals between two times (years)" do
63
+ since_time = 'April 1 2011 at 12pm'
64
+ until_time = 'April 1 2013 at 12pm'
65
+
66
+ # ((Number of days between April 1 2011 and April 1 2013) * 24 hours) - last hour = (731 days * 24 hours) - 1 hour = 17543 hours
67
+ processor.should receive(:process_url).exactly(17543).times
68
+ processor.process_between(since_time, until_time)
69
+ end
70
+
71
+ context "with invalid time" do
72
+ it "should fail with invalid since time" do
73
+ since_time = 'nil'
74
+ until_time = 'Now'
75
+ allow(processor).to receive(:process_url)
76
+ expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
77
+ end
78
+
79
+ it "should fail with invalid until time" do
80
+ since_time = '1 day ago'
81
+ until_time = 'nil'
82
+ allow(processor).to receive(:process_url)
83
+ expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
84
+ end
85
+
86
+ it "should fail with both invalid until/since time" do
87
+ since_time = 'nil'
88
+ until_time = 'nil'
89
+ allow(processor).to receive(:process_url)
90
+ expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
91
+ end
92
+ end
93
+ end
94
+ end
95
+
@@ -0,0 +1,48 @@
1
+ require 'spec_helper'
2
+
3
+ include GitHubArchiveParser
4
+
5
+ describe Utilities do
6
+ describe "#time_from_natural_language" do
7
+ it "should raise an error with an invalid time" do
8
+ expect { Utilities.time_from_natural_language('nil') }.to raise_error(RuntimeError)
9
+ end
10
+
11
+ it "should return a Time object" do
12
+ time = Utilities.time_from_natural_language('August 1 2013 at 4am')
13
+ time.should be_a(Time)
14
+ end
15
+
16
+ it "should return a UTC time" do
17
+ time = Utilities.time_from_natural_language('August 1 2013 at 4am')
18
+ time.utc?.should be_true
19
+ end
20
+
21
+ it "should return the correct time" do
22
+ time = Utilities.time_from_natural_language('August 1 2013 at 4am')
23
+ time.getlocal.year.should eq(2013)
24
+ time.getlocal.month.should eq(8)
25
+ time.getlocal.day.should eq(1)
26
+ time.getlocal.hour.should eq(4)
27
+ end
28
+ end
29
+
30
+ describe "#class_from_string" do
31
+ it "should return a valid name of a module" do
32
+ Utilities.class_from_string('Utilities').should eq(Utilities)
33
+ end
34
+
35
+ it "should return a valid name of a class" do
36
+ Utilities.class_from_string('Processor').should eq(Processor)
37
+ end
38
+
39
+ it "should return a valid name of a class using full path" do
40
+ Utilities.class_from_string('GitHubArchiveParser::Processor').should eq(Processor)
41
+ end
42
+
43
+ it "should return nil if class cannot be found" do
44
+ Utilities.class_from_string('SpecialApplication::MassUnit').should be_nil
45
+ end
46
+ end
47
+ end
48
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: github_archive_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin Jalbert
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-01-17 00:00:00.000000000 Z
11
+ date: 2014-01-29 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Gem which parses GitHub Archive data
14
14
  email:
@@ -46,8 +46,12 @@ files:
46
46
  - lib/github_archive_parser/event_handlers/watch_event.rb
47
47
  - lib/github_archive_parser/initialize.rb
48
48
  - lib/github_archive_parser/log.rb
49
+ - lib/github_archive_parser/processor.rb
50
+ - lib/github_archive_parser/utilities.rb
49
51
  - lib/github_archive_parser/version.rb
50
52
  - spec/log_spec.rb
53
+ - spec/processor_spec.rb
54
+ - spec/utilities_spec.rb
51
55
  homepage: https://github.com/kevinjalbert/github_archive_parser
52
56
  licenses: []
53
57
  metadata: {}
@@ -73,3 +77,5 @@ specification_version: 4
73
77
  summary: Gem which parses GitHub Archive data
74
78
  test_files:
75
79
  - spec/log_spec.rb
80
+ - spec/processor_spec.rb
81
+ - spec/utilities_spec.rb