github_archive_parser 0.1.0 → 0.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: dbbcc01faaf08baed69942f2c59ffe2fca262966
-  data.tar.gz: 4aaba6c19c5caa3e2a4b17fa331e2a0411cac792
+  metadata.gz: cc577ab9b44ecf0384854bcabef6d03407beb841
+  data.tar.gz: 3e0af6d7aab095936d5865bc9f9b538cc6e60cd0
 SHA512:
-  metadata.gz: f4e608379a73168cf4de92cd2ef11e62fb022f3f778afa43d57dd3508e26d9b188cc26114d86a5cac4b55c8638a856545a0e3ce2f525f17abdd19c633feb9cf3
-  data.tar.gz: 08d9faf62bdad2358c01d2f65440b8812fc91965ecc4c16f08af6fc98fb5dd4703914d43c9541b31d9b21ac7e4459b7b99ef1d7e7c2e4b2a9bf752eb3f0f9de9
+  metadata.gz: 253ce14d775c224434c7a498db8545ac9e4e6fd2be77cf8ed429280fabfb923b25c87a2a60787c279f80903a13e520342664859f92d14f4aa2e825a5912ed874
+  data.tar.gz: 5a479e595bf17b801e89b0af6293383161817585b6b35949e7aa327c16a666a64f5652c0761419ed438928b0e4279d702f31a916d5cfac9557d21c727f1dff0c
@@ -3,7 +3,5 @@
 $:.unshift File.expand_path("../../lib", __FILE__)
 require 'github_archive_parser'
 
-github_archive_parser = GitHubArchiveParser::Processor.new
-ARGV.each do |url|
-  github_archive_parser.process(url)
-end
+github_archive_parser = GitHubArchiveParser::CLI.new
+github_archive_parser.process(ARGV)
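
The executable now defers everything to GitHubArchiveParser::CLI. A rough plain-Ruby equivalent of an invocation with the new time flags (defined further down) — the time phrases are illustrative, and this is a sketch rather than the installed script itself:

# Rough equivalent of running: github_archive_parser --since "3 days ago" --until "yesterday"
ARGV.replace(["--since", "3 days ago", "--until", "yesterday"])

require 'github_archive_parser'

cli = GitHubArchiveParser::CLI.new   # CLI#initialize parses and consumes the flags from ARGV
cli.process(ARGV)                    # with --since/--until set, this dispatches to Processor#process_between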
@@ -1,42 +1,37 @@
 require 'github_archive_parser/initialize'
 
 module GitHubArchiveParser
-  class Processor
-    attr_reader :options
+  class CLI
+    attr_accessor :options
 
     def initialize
       @options = OpenStruct.new(
         debug: false,
+        quite: false,
+        since: nil,
+        until: nil,
       )
       parse_options
       determine_log_level
-
-      # Create the concrete handlers and store them for future use
-      create_event_handlers
     end
 
-    def process(url)
-      Log.info "Processing #{url}"
-      if (!url.nil? && url.start_with?("http://data.githubarchive.org/"))
-        gz = open(url)
-        js = Zlib::GzipReader.new(gz).read
-
-        Yajl::Parser.parse(js) do |event|
-          event = Hashie::Mash.new(event)
-          event_class = class_from_string("GitHubArchiveParser::#{event.type}")
-          event_handler = @event_handlers[event_class]
-
-          event_handler.each { |handler|
-            if !handler.nil? && handler.respond_to?(:parse)
-              handler.parse(event)
-            end
-          }
+    def process(args)
+      processor = Processor.new
+      begin
+        if !@options.until.nil? && !@options.since.nil?
+          processor.process_between(@options.since, @options.until)
+        elsif !@options.since.nil?
+          processor.process_since(@options.since)
+        else
+          args.each {|url| processor.process_url(url) }
         end
-      else
-        Log.warn "URL[#{url}] does not belong to http://data.githubarchive.org/"
+      rescue Exception => e
+        Log.error e
       end
     end
 
+    private
+
     def parse_options
       OptionParser.new do |opt|
         opt.version = VERSION
@@ -46,11 +41,15 @@ module GitHubArchiveParser
         opt.on "-q", "--quite", "Hide all output (shows only UNKNOWN level log statements)" do
           options.quite = true
         end
+        opt.on "-s", "--since TIME", "Process all events since the provided date (can be specified in natural language)" do |time|
+          options.since = time
+        end
+        opt.on "-u", "--until TIME", "Process all events until the provided date (can be specified in natural language) starting from February 12, 2011" do |time|
+          options.until = time
+        end
       end.parse!
     end
 
-    private
-
     def determine_log_level
       if options.debug
         Log.level = Logger::DEBUG
@@ -62,36 +61,5 @@ module GitHubArchiveParser
       end
     end
 
-    def class_from_string(string)
-      begin
-        string.split('::').inject(Object) do |mod, class_name|
-          mod.const_get(class_name)
-        end
-      rescue Exception
-        Log.warn "Event #{string} not found"
-        nil
-      end
-    end
-
-    def create_event_handlers
-      @event_handlers = {}
-
-      # Probably can do something to not hardcode this
-      # Iterate over the event types
-      [CommitCommentEvent, CreateEvent, DeleteEvent,
-       DeploymentEvent, DeploymentStatusEvent, DownloadEvent,
-       FollowEvent, ForkApplyEvent, ForkEvent,
-       GistEvent, GollumEvent, IssueCommentEvent,
-       IssueCommentEvent, IssuesEvent, MemberEvent,
-       PublicEvent, PullRequestEvent, PullRequestReviewCommentEvent,
-       PushEvent, ReleaseEvent, StatusEvent,
-       TeamAddEvent, WatchEvent].each do | event_type |
-
-        # Map list of concrete event handler to their event type
-        @event_handlers[event_type] = event_type.descendants.map { |handler|
-          handler.new
-        }
-      end
-    end
   end
 end
@@ -2,9 +2,14 @@ module AwesomeApplication
   class PrintCreateEvent
     include GitHubArchiveParser::CreateEvent
 
+    attr_accessor :counter
+
     def parse(event)
       # The event is a Hashie::Mash object for easy (dot) access
-      puts "#{event.repository.owner}/#{event.repository.name}"
+      #puts "#{event.repository.owner}/#{event.repository.name}"
+      @counter = 0 if @counter.nil?
+      @counter = 1 + @counter
+      puts @counter
     end
   end
 end
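
Handler discovery in 0.2.0 still works by class: Processor#create_event_handlers (new file below) instantiates every descendant of each event module, so registering a handler is simply a matter of including the matching module, as in the example above. A sketch of a second, hypothetical handler — PrintPushEvent is not part of the gem, and the repository fields just mirror the CreateEvent example:

module AwesomeApplication
  # Hypothetical handler for PushEvent, discovered the same way as PrintCreateEvent
  class PrintPushEvent
    include GitHubArchiveParser::PushEvent

    def parse(event)
      # event is a Hashie::Mash, so fields are dot-accessible
      puts "push to #{event.repository.owner}/#{event.repository.name}"
    end
  end
end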
@@ -5,5 +5,8 @@ require 'open-uri'
 require 'zlib'
 require 'yajl'
 require 'hashie'
+require 'chronic'
 
+# Require event_handler first to prevent load order issues
+require_relative './event_handler'
 Dir.glob(File.dirname(__FILE__) + '/**/*.rb') { |file| require file }
@@ -0,0 +1,74 @@
+module GitHubArchiveParser
+  class Processor
+
+    def initialize
+      create_event_handlers
+    end
+
+    def process_between(since_time, until_time)
+      start_time = Utilities.time_from_natural_language(since_time)
+      end_time = Utilities.time_from_natural_language(until_time)
+      Log.info "Processing between #{start_time} and #{end_time}"
+
+      # Start hourly iterator from start time, but exclude last hour (could be incomplete)
+      iterator = start_time
+      while iterator < end_time - 3600
+        process_url("http://data.githubarchive.org/#{iterator.year}-#{iterator.month.to_s.rjust(2, '0')}-#{iterator.day.to_s.rjust(2, '0')}-#{iterator.hour}.json.gz")
+        iterator += 3600
+      end
+    end
+
+    def process_since(since_time)
+      process_between(since_time, 'Now')
+    end
+
+    def process_url(url)
+      Log.info "Processing #{url}"
+      if (!url.nil? && url.start_with?("http://data.githubarchive.org/"))
+
+        begin
+          gz = open(url)
+          js = Zlib::GzipReader.new(gz).read
+        rescue => e
+          raise e, "URL[#{url}] #{e.message}"
+        end
+
+        Yajl::Parser.parse(js) do |event|
+          event = Hashie::Mash.new(event)
+          event_class = Utilities.class_from_string("GitHubArchiveParser::#{event.type}")
+          event_handler = @event_handlers[event_class]
+
+          event_handler.each { |handler|
+            if !handler.nil? && handler.respond_to?(:parse)
+              handler.parse(event)
+            end
+          }
+        end
+      else
+        raise "URL[#{url}] does not belong to http://data.githubarchive.org/"
+      end
+    end
+
+    private
+
+    def create_event_handlers
+      @event_handlers = {}
+
+      # Probably can do something to not hardcode this
+      # Iterate over the event types
+      [CommitCommentEvent, CreateEvent, DeleteEvent,
+       DeploymentEvent, DeploymentStatusEvent, DownloadEvent,
+       FollowEvent, ForkApplyEvent, ForkEvent,
+       GistEvent, GollumEvent, IssueCommentEvent,
+       IssueCommentEvent, IssuesEvent, MemberEvent,
+       PublicEvent, PullRequestEvent, PullRequestReviewCommentEvent,
+       PushEvent, ReleaseEvent, StatusEvent,
+       TeamAddEvent, WatchEvent].each do | event_type |
+
+        # Map list of concrete event handler to their event type
+        @event_handlers[event_type] = event_type.descendants.map { |handler| handler.new }
+      end
+    end
+
+  end
+end
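
The metadata below lists this new file as lib/github_archive_parser/processor.rb; it carries over the download-and-parse loop from the old Processor and adds hourly iteration over a Chronic-parsed time range. A minimal sketch of driving it directly, without the CLI — the URL and time phrases are illustrative:

require 'github_archive_parser'

processor = GitHubArchiveParser::Processor.new

# Fetch and parse a single hourly archive by URL (note the hour is not zero-padded)...
processor.process_url("http://data.githubarchive.org/2014-01-28-15.json.gz")

# ...or hand over natural-language bounds and let process_between walk the hours.
processor.process_between('12 hours ago', '6 hours ago')
processor.process_since('Yesterday')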
@@ -0,0 +1,22 @@
+module GitHubArchiveParser
+  module Utilities
+    module_function
+
+    def class_from_string(string)
+      begin
+        string.split('::').inject(Object) do |mod, class_name|
+          mod.const_get(class_name)
+        end
+      rescue Exception
+        Log.warn "Event #{string} not found"
+        nil
+      end
+    end
+
+    def time_from_natural_language(natural_language)
+      time = Chronic.parse(natural_language)
+      raise "Invalid time: #{natural_language}" if time.nil?
+      time.getutc
+    end
+  end
+end
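
This helper module (lib/github_archive_parser/utilities.rb in the metadata file list) backs both the new time options and event-class lookup. Expected behaviour, per the specs further down — return values are sketched, not exact:

require 'github_archive_parser'
include GitHubArchiveParser

Utilities.time_from_natural_language('August 1 2013 at 4am')
# => the corresponding Time, converted to UTC
Utilities.time_from_natural_language('nil')
# => raises RuntimeError, "Invalid time: nil"

Utilities.class_from_string('GitHubArchiveParser::Processor')
# => GitHubArchiveParser::Processor
Utilities.class_from_string('SpecialApplication::MassUnit')
# => nil (after logging "Event SpecialApplication::MassUnit not found")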
@@ -1,3 +1,3 @@
 module GitHubArchiveParser
-  VERSION = "0.1.0"
+  VERSION = "0.2.0"
 end
@@ -0,0 +1,95 @@
+require 'spec_helper'
+
+include GitHubArchiveParser
+
+describe Processor do
+  let (:processor) { GitHubArchiveParser::Processor.new }
+  describe "#process_url" do
+    it "should fail on an invalid text" do
+      expect {processor.process_url("dummydata")}.to raise_error(RuntimeError)
+    end
+
+    it "should fail on an invalid URL" do
+      expect {processor.process_url("http://data.githubarchive.org/dummydata.json.gz")}.to raise_error(OpenURI::HTTPError)
+    end
+
+    it "should fail on an wrong domain URL" do
+      expect {processor.process_url("http://data.wrongdomain.org/dummydata.json.gz")}.to raise_error(RuntimeError)
+    end
+  end
+
+  describe "process_since" do
+    it "should call process_between(since, 'Now')" do
+      since_time = 'Yesterday'
+      until_time = 'Now'
+      processor.should receive(:process_between).with(since_time, until_time)
+      processor.process_since(since_time)
+    end
+
+    context "with invalid time" do
+      it "should fail with invalid since time" do
+        since_time = 'nil'
+        allow(processor).to receive(:process_url)
+        expect {processor.process_since(since_time)}.to raise_error(RuntimeError)
+      end
+    end
+  end
+
+  describe "process_between" do
+    it "should call process_url(url) for as many hour intervals between two times (hours)" do
+      since_time = '12 hours ago'
+      until_time = '6 hours ago'
+      processor.should receive(:process_url).exactly(6).times
+      processor.process_between(since_time, until_time)
+    end
+
+    it "should call process_url(url) for as many hour intervals between two times (days)" do
+      since_time = '3 days ago'
+      until_time = '1 day ago'
+      processor.should receive(:process_url).exactly(48).times
+      processor.process_between(since_time, until_time)
+    end
+
+    it "should call process_url(url) for as many hour intervals between two times (months)" do
+      since_time = 'April 1 2012 at 12pm'
+      until_time = 'June 1 2012 at 12pm'
+
+      # (24 hours * (April 30 days + May 31 days)) - last hour = 1463 hours
+      processor.should receive(:process_url).exactly(1463).times
+      processor.process_between(since_time, until_time)
+    end
+
+    it "should call process_url(url) for as many hour intervals between two times (years)" do
+      since_time = 'April 1 2011 at 12pm'
+      until_time = 'April 1 2013 at 12pm'
+
+      # ((Number of days between April 1 2011 and April 1 2013) * 24 hours) - last hour = 731 days * 24 hours = 17543 hours
+      processor.should receive(:process_url).exactly(17543).times
+      processor.process_between(since_time, until_time)
+    end
+
+    context "with invalid time" do
+      it "should fail with invalid since time" do
+        since_time = 'nil'
+        until_time = 'Now'
+        allow(processor).to receive(:process_url)
+        expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
+      end
+
+      it "should fail with invalid until time" do
+        since_time = '1 day ago'
+        until_time = 'nil'
+        allow(processor).to receive(:process_url)
+        expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
+      end
+
+      it "should fail with both invalid until/since time" do
+        since_time = 'nil'
+        until_time = 'nil'
+        allow(processor).to receive(:process_url)
+        expect {processor.process_between(since_time, until_time)}.to raise_error(RuntimeError)
+      end
+    end
+  end
+end
+
@@ -0,0 +1,48 @@
+require 'spec_helper'
+
+include GitHubArchiveParser
+
+describe Utilities do
+  describe "#time_from_natural_language" do
+    it "should raise an error with an invalid time" do
+      expect { Utilities.time_from_natural_language('nil') }.to raise_error(RuntimeError)
+    end
+
+    it "should return a Time object" do
+      time = Utilities.time_from_natural_language('August 1 2013 at 4am')
+      time.should be_a(Time)
+    end
+
+    it "should return a UTC time" do
+      time = Utilities.time_from_natural_language('August 1 2013 at 4am')
+      time.utc?.should be_true
+    end
+
+    it "should return the correct time" do
+      time = Utilities.time_from_natural_language('August 1 2013 at 4am')
+      time.getlocal.year.should eq(2013)
+      time.getlocal.month.should eq(8)
+      time.getlocal.day.should eq(1)
+      time.getlocal.hour.should eq(4)
+    end
+  end
+
+  describe "#class_from_string" do
+    it "should return a valid name of a module" do
+      Utilities.class_from_string('Utilities').should eq(Utilities)
+    end
+
+    it "should return a valid name of a class" do
+      Utilities.class_from_string('Processor').should eq(Processor)
+    end
+
+    it "should return a valid name of a class using full path" do
+      Utilities.class_from_string('GitHubArchiveParser::Processor').should eq(Processor)
+    end
+
+    it "should return nil if class cannot be found" do
+      Utilities.class_from_string('SpecialApplication::MassUnit').should be_nil
+    end
+  end
+end
+
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: github_archive_parser
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Kevin Jalbert
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-01-17 00:00:00.000000000 Z
+date: 2014-01-29 00:00:00.000000000 Z
 dependencies: []
 description: Gem which parses GitHub Archive data
 email:
@@ -46,8 +46,12 @@ files:
 - lib/github_archive_parser/event_handlers/watch_event.rb
 - lib/github_archive_parser/initialize.rb
 - lib/github_archive_parser/log.rb
+- lib/github_archive_parser/processor.rb
+- lib/github_archive_parser/utilities.rb
 - lib/github_archive_parser/version.rb
 - spec/log_spec.rb
+- spec/processor_spec.rb
+- spec/utilities_spec.rb
 homepage: https://github.com/kevinjalbert/github_archive_parser
 licenses: []
 metadata: {}
@@ -73,3 +77,5 @@ specification_version: 4
 summary: Gem which parses GitHub Archive data
 test_files:
 - spec/log_spec.rb
+- spec/processor_spec.rb
+- spec/utilities_spec.rb