ghtorrent 0.9 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
+ require 'sequel'
2
+
3
+ require 'ghtorrent/migrations/mysql_defaults'
4
+
5
+ Sequel.migration do
6
+ up do
7
+ puts 'Dropping table forks'
8
+ drop_table :forks
9
+ end
10
+
11
+ down do
12
+ puts 'Adding table forks'
13
+
14
+ create_table :forks do
15
+ foreign_key :forked_project_id, :projects, :null => false
16
+ foreign_key :forked_from_id, :projects, :null => false
17
+ Integer :fork_id, :null => false, :unique => true
18
+ DateTime :created_at, :null => false,
19
+ :default => Sequel::CURRENT_TIMESTAMP
20
+ String :ext_ref_id, :null => false, :size => 24, :default => '0'
21
+ primary_key([:forked_project_id, :forked_from_id])
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,23 @@
1
+ require 'sequel'
2
+
3
+ require 'ghtorrent/migrations/mysql_defaults'
4
+
5
+ Sequel.migration do
6
+
7
+ up do
8
+ puts 'Dropping columns merged and user_id from pull_requests'
9
+ alter_table :pull_requests do
10
+ drop_column :merged
11
+ drop_foreign_key :user_id
12
+ end
13
+ end
14
+
15
+ down do
16
+ puts 'Adding columns merged and user_id to pull_requests'
17
+ add_column :pull_requests, :merged, TrueClass, :null => false,
18
+ :default => false
19
+
20
+ add_foreign_key :user_id, :users, :null => false
21
+
22
+ end
23
+ end
@@ -0,0 +1,33 @@
1
+ require 'sequel'
2
+
3
+ require 'ghtorrent/migrations/mysql_defaults'
4
+
5
+
6
+ Sequel.migration do
7
+
8
+ up do
9
+ puts 'Add column fake to users'
10
+ add_column :users, :fake, TrueClass, :null => false, :default => false
11
+
12
+ if self.database_type == :mysql
13
+ self.transaction(:rollback => :reraise, :isolation => :committed) do
14
+ self << "update users
15
+ set fake = '1'
16
+ where CAST(users.login AS BINARY) regexp '[A-Z]{8}'
17
+ and not exists (select * from pull_request_history where users.id = actor_id)
18
+ and not exists (select * from issue_events where actor_id = users.id)
19
+ and not exists (select * from project_members where users.id = user_id)
20
+ and not exists (select * from issues where reporter_id=users.id )
21
+ and not exists (select * from issues where assignee_id=users.id )
22
+ and not exists (select * from organization_members where user_id = users.id);"
23
+ end
24
+ end
25
+ end
26
+
27
+ down do
28
+ puts 'Drop column fake from users'
29
+ alter_table :users do
30
+ drop_column :fake
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,105 @@
1
+ class MultiprocessQueueClient < GHTorrent::Command
2
+
3
+ include GHTorrent::Settings
4
+ include GHTorrent::Logging
5
+
6
+ def clazz
7
+ raise('Unimplemented')
8
+ end
9
+
10
+ def prepare_options(options)
11
+ options.banner <<-BANNER
12
+ Retrieve data for multiple repos in parallel. To work, it requires
13
+ a mapping file formatted as either of the follow formats:
14
+
15
+ U IP UNAME PASSWD NUM_PROCS
16
+ T IP TOKEN NUM_PROCS
17
+
18
+ {U,T}: U signifies that a username/password pair is provided, T that an OAuth
19
+ token is specified instead
20
+ IP: address to use for outgoing requests (use 0.0.0.0 on non-multihomed hosts)
21
+ UNAME: Github user name to use for outgoing requests
22
+ PASSWD: Github password to use for outgoing requests
23
+ TOKEN: Github OAuth token
24
+ NUM_PROCS: Number of processes to spawn for this IP/UNAME combination
25
+
26
+ Values in the config.yaml file set with the -c command are overridden.
27
+
28
+ #{command_name} [options] mapping-file
29
+
30
+ BANNER
31
+ options.opt :queue, 'Queue to retrieve project names from',
32
+ :short => 'q', :default => 'multiprocess-queue-client',
33
+ :type => :string
34
+ end
35
+
36
+ def logger
37
+ @logger ||= Logger.new(STDOUT)
38
+ @logger
39
+ end
40
+
41
+ def validate
42
+ super
43
+ Trollop::die 'Argument mapping-file is required' unless not args[0].nil?
44
+ end
45
+
46
+ def go
47
+
48
+ configs = File.open(ARGV[0]).readlines.map do |line|
49
+ next if line =~ /^#/
50
+ case line.strip.split(/ /)[0]
51
+ when 'U'
52
+ type, ip, name, passwd, instances = line.strip.split(/ /)
53
+ when 'T'
54
+ type, ip, token, instances = line.strip.split(/ /)
55
+ end
56
+
57
+ (1..instances.to_i).map do |i|
58
+ newcfg = self.settings.clone
59
+ newcfg = override_config(newcfg, :attach_ip, ip)
60
+
61
+ case type
62
+ when 'U'
63
+ newcfg = override_config(newcfg, :github_username, name)
64
+ newcfg = override_config(newcfg, :github_passwd, passwd)
65
+ when 'T'
66
+ newcfg = override_config(newcfg, :github_token, token)
67
+ end
68
+
69
+ newcfg = override_config(newcfg, :mirror_history_pages_back, 100000)
70
+ newcfg
71
+ end
72
+ end.flatten.select { |x| !x.nil? }
73
+
74
+ children = configs.map do |config|
75
+ pid = Process::fork
76
+
77
+ if pid.nil?
78
+ retriever = clazz.new(config, options[:queue])
79
+
80
+ Signal.trap('TERM') {
81
+ retriever.stop
82
+ }
83
+
84
+ retriever.run(self)
85
+ exit
86
+ else
87
+ debug "Parent #{Process.pid} forked child #{pid}"
88
+ pid
89
+ end
90
+ end
91
+
92
+ debug 'Waiting for children'
93
+ begin
94
+ children.each do |pid|
95
+ debug "Waiting for child #{pid}"
96
+ Process.waitpid(pid, 0)
97
+ debug "Child #{pid} exited"
98
+ end
99
+ rescue Interrupt
100
+ debug 'Stopping'
101
+ end
102
+ end
103
+ end
104
+
105
+ # vim: ft=ruby:
@@ -19,5 +19,8 @@ module GHTorrent
19
19
  driver.new(settings)
20
20
  end
21
21
 
22
+ def disconnect
23
+ driver.close
24
+ end
22
25
  end
23
- end
26
+ end
@@ -145,20 +145,22 @@ module GHTorrent
145
145
  end
146
146
  end
147
147
 
148
- # Retrieve up to 30 * +:mirror_commit_pages_new_repo+ commits
149
- # starting from the provided +sha+
150
- def retrieve_commits(repo, sha, user, num_pages = config(:mirror_commit_pages_new_repo))
151
- last_sha = if sha == "head" then "master" else sha end
148
+ # Retrieve commits starting from the provided +sha+
149
+ def retrieve_commits(repo, sha, user, pages = -1)
152
150
 
153
- url = ghurl "repos/#{user}/#{repo}/commits?sha=#{last_sha}"
154
- commits = paged_api_request(url, num_pages)
151
+ url = if sha.nil?
152
+ ghurl "repos/#{user}/#{repo}/commits"
153
+ else
154
+ ghurl "repos/#{user}/#{repo}/commits?sha=#{sha}"
155
+ end
156
+
157
+ commits = restricted_page_request(url, pages)
155
158
 
156
159
  commits.map do |c|
157
160
  retrieve_commit(repo, c['sha'], user)
158
161
  end
159
162
  end
160
163
 
161
-
162
164
  def retrieve_repo(user, repo)
163
165
  stored_repo = persister.find(:repos, {'owner.login' => user,
164
166
  'name' => repo })
@@ -253,15 +255,15 @@ module GHTorrent
253
255
  # Retrieve all collaborators for a repository
254
256
  def retrieve_repo_collaborators(user, repo)
255
257
  repo_bound_items(user, repo, :repo_collaborators,
256
- "repos/#{user}/#{repo}/collaborators",
258
+ ["repos/#{user}/#{repo}/collaborators"],
257
259
  {'repo' => repo, 'owner' => user},
258
- 'login')
260
+ 'login', item = nil, refresh = false, order = :asc)
259
261
  end
260
262
 
261
263
  # Retrieve a single repository collaborator
262
264
  def retrieve_repo_collaborator(user, repo, new_member)
263
265
  repo_bound_item(user, repo, new_member, :repo_collaborators,
264
- "repos/#{user}/#{repo}/collaborators",
266
+ ["repos/#{user}/#{repo}/collaborators"],
265
267
  {'repo' => repo, 'owner' => user},
266
268
  'login')
267
269
  end
@@ -269,17 +271,17 @@ module GHTorrent
269
271
  # Retrieve all watchers for a repository
270
272
  def retrieve_watchers(user, repo)
271
273
  repo_bound_items(user, repo, :watchers,
272
- "repos/#{user}/#{repo}/stargazers",
274
+ ["repos/#{user}/#{repo}/stargazers"],
273
275
  {'repo' => repo, 'owner' => user},
274
- 'login')
276
+ 'login', item = nil, refresh = false, order = :desc)
275
277
  end
276
278
 
277
- # Retrieve a single watcher for a repositry
279
+ # Retrieve a single watcher for a repository
278
280
  def retrieve_watcher(user, repo, watcher)
279
281
  repo_bound_item(user, repo, watcher, :watchers,
280
- "repos/#{user}/#{repo}/stargazers",
282
+ ["repos/#{user}/#{repo}/stargazers"],
281
283
  {'repo' => repo, 'owner' => user},
282
- 'login')
284
+ 'login', order = :desc)
283
285
  end
284
286
 
285
287
  def retrieve_pull_requests(user, repo, refr = false)
@@ -288,7 +290,7 @@ module GHTorrent
288
290
  repo_bound_items(user, repo, :pull_requests,
289
291
  [open, closed],
290
292
  {'repo' => repo, 'owner' => user},
291
- 'number', item = nil, refresh = refr)
293
+ 'number', item = nil, refresh = refr, order = :asc)
292
294
  end
293
295
 
294
296
  def retrieve_pull_request(user, repo, pullreq_id)
@@ -303,51 +305,27 @@ module GHTorrent
303
305
 
304
306
  def retrieve_forks(user, repo)
305
307
  repo_bound_items(user, repo, :forks,
306
- "repos/#{user}/#{repo}/forks",
308
+ ["repos/#{user}/#{repo}/forks"],
307
309
  {'repo' => repo, 'owner' => user},
308
- 'id')
310
+ 'id', item = nil, refresh = false, order = :asc)
309
311
  end
310
312
 
311
313
  def retrieve_fork(user, repo, fork_id)
312
314
  repo_bound_item(user, repo, fork_id, :forks,
313
- "repos/#{user}/#{repo}/forks",
315
+ ["repos/#{user}/#{repo}/forks"],
314
316
  {'repo' => repo, 'owner' => user},
315
317
  'id')
316
318
  end
317
319
 
318
320
  def retrieve_pull_req_commits(user, repo, pullreq_id)
319
- def is_intra_branch(req)
320
- return false if req['head'].nil? or req['head']['repo'].nil?
321
- req['head']['repo']['owner']['login'] ==
322
- req['base']['repo']['owner']['login'] and
323
- req['head']['repo']['full_name'] == req['base']['repo']['full_name']
324
- end
325
-
326
- pull_req = retrieve_pull_request(user, repo, pullreq_id)
321
+ pr_commits = paged_api_request(ghurl "repos/#{user}/#{repo}/pulls/#{pullreq_id}/commits")
327
322
 
328
- unless is_intra_branch(pull_req)
329
-
330
- # Head repo has been deleted
331
- unless pull_req['head']['repo'].nil?
332
- head_user = pull_req['head']['repo']['owner']['login']
333
- head_repo = pull_req['head']['repo']['name']
334
- else
335
- # Try to find the commits in the base repo, in case the pull req
336
- # has been merged
337
- head_user = pull_req['base']['repo']['owner']['login']
338
- head_repo = pull_req['base']['repo']['name']
339
- end
323
+ pr_commits.map do |x|
324
+ head_user = x['url'].split(/\//)[4]
325
+ head_repo = x['url'].split(/\//)[5]
340
326
 
341
- commits = paged_api_request(ghurl "repos/#{user}/#{repo}/pulls/#{pullreq_id}/commits")
342
- commits.map { |x|
343
- retrieve_commit(head_repo, x['sha'], head_user)
344
- }
345
- else
346
- commits = paged_api_request(ghurl "repos/#{user}/#{repo}/pulls/#{pullreq_id}/commits")
347
- commits.map { |x|
348
- retrieve_commit(repo, x['sha'], user)
349
- }
350
- end
327
+ retrieve_commit(head_repo, x['sha'], head_user)
328
+ end.select{|x| not x.nil?}
351
329
  end
352
330
 
353
331
  def retrieve_pull_req_comments(owner, repo, pullreq_id)
@@ -406,7 +384,7 @@ module GHTorrent
406
384
  repo_bound_items(user, repo, :issues,
407
385
  [open, closed],
408
386
  {'repo' => repo, 'owner' => user},
409
- 'number', item = nil, refresh = refr)
387
+ 'number', item = nil, refresh = refr, order = :asc)
410
388
  end
411
389
 
412
390
  def retrieve_issue(user, repo, issue_id)
@@ -519,20 +497,16 @@ module GHTorrent
519
497
  end
520
498
  end
521
499
 
522
- def retrieve_issue_labels(owner, repo, issue_id)
523
-
524
- end
525
-
526
500
  def retrieve_repo_labels(owner, repo, refr = false)
527
501
  repo_bound_items(owner, repo, :repo_labels,
528
- "repos/#{owner}/#{repo}/labels",
502
+ ["repos/#{owner}/#{repo}/labels"],
529
503
  {'repo' => repo, 'owner' => owner},
530
- 'name', item = nil, refresh = refr)
504
+ 'name', item = nil, refresh = refr, order = :asc)
531
505
  end
532
506
 
533
507
  def retrieve_repo_label(owner, repo, name)
534
508
  repo_bound_item(owner, repo, name, :repo_labels,
535
- "repos/#{owner}/#{repo}/labels",
509
+ ["repos/#{owner}/#{repo}/labels"],
536
510
  {'repo' => repo, 'owner' => owner},
537
511
  'name')
538
512
  end
@@ -569,74 +543,96 @@ module GHTorrent
569
543
 
570
544
  private
571
545
 
572
- def repo_bound_items(user, repo, entity, urls, selector, descriminator,
573
- item_id = nil, refresh = false)
546
+ def restricted_page_request(url, pages)
547
+ if pages != -1
548
+ paged_api_request(url, pages)
549
+ else
550
+ paged_api_request(url)
551
+ end
552
+ end
553
+
554
+ def repo_bound_items(user, repo, entity, urls, selector, discriminator,
555
+ item_id = nil, refresh = false, order = :asc)
574
556
 
575
- items = if urls.class == Array
576
- urls.map { |url| paged_api_request(ghurl url) }.flatten
577
- else
578
- paged_api_request(ghurl urls)
579
- end
557
+ urls.each do |url|
558
+ total_pages = num_pages(ghurl url)
580
559
 
581
- items = items.map do |x|
582
- x['repo'] = repo
583
- x['owner'] = user
560
+ page_range = if order == :asc
561
+ (1..total_pages)
562
+ else
563
+ total_pages.downto(1)
564
+ end
584
565
 
585
- instances = repo_bound_instance(entity, selector,
586
- descriminator, x[descriminator])
587
- exists = !instances.empty?
566
+ page_range.each do |page|
567
+ items = api_request(ghurl(url, page))
588
568
 
589
- unless exists
590
- persister.store(entity, x)
591
- info "Retriever: Added #{entity} #{user}/#{repo} -> #{x[descriminator]}"
592
- else
593
- if refresh
594
- instances.each do |i|
569
+ items.each do |x|
570
+ x['repo'] = repo
571
+ x['owner'] = user
595
572
 
596
- id = if i[descriminator].to_i.to_s != i[descriminator]
597
- i[descriminator] # item_id is int
573
+ instances = repo_bound_instance(entity, selector,
574
+ discriminator, x[discriminator])
575
+ exists = !instances.empty?
576
+
577
+ unless exists
578
+ persister.store(entity, x)
579
+ info "Retriever: Added #{entity} #{user}/#{repo} -> #{x[discriminator]}"
580
+ else
581
+ if refresh
582
+ instances.each do |i|
583
+
584
+ id = if i[discriminator].to_i.to_s != i[discriminator]
585
+ i[discriminator] # item_id is int
586
+ else
587
+ i[discriminator].to_i # convert to int
588
+ end
589
+
590
+ instance_selector = selector.merge({discriminator => id})
591
+ persister.del(entity, instance_selector)
592
+ persister.store(entity, x)
593
+ debug "Retriever: Refreshing #{entity} #{user}/#{repo} -> #{x[discriminator]}"
594
+ end
598
595
  else
599
- i[descriminator].to_i # convert to int
596
+ debug "Retriever: #{entity} #{user}/#{repo} -> #{x[discriminator]} exists"
600
597
  end
598
+ end
601
599
 
602
- instance_selector = selector.merge({descriminator => id})
603
- persister.del(entity, instance_selector)
604
- persister.store(entity, x)
605
- debug "Retriever: Refreshing #{entity} #{user}/#{repo} -> #{x[descriminator]}"
600
+ # If we are just looking for a single item, give the method a chance
601
+ # to return as soon as we find it. This is to avoid loading all
602
+ # items before we actually search for what we are looking for.
603
+ unless item_id.nil?
604
+ a = repo_bound_instance(entity, selector, discriminator, item_id)
605
+ unless a.empty?
606
+ return a
607
+ end
606
608
  end
607
- else
608
- debug "Retriever: #{entity} #{user}/#{repo} -> #{x[descriminator]} exists"
609
609
  end
610
610
  end
611
- # If the persistence driver does not set an ext_ref_id key, set a dummy
612
- # one here
613
- unless x.has_key? ext_uniq
614
- x[ext_uniq] = '0'
615
- end
616
- x
617
611
  end
618
612
 
619
613
  if item_id.nil?
620
- a = persister.find(entity, selector)
621
- if a.empty? then items else a end
614
+ persister.find(entity, selector)
622
615
  else
623
- a = repo_bound_instance(entity, selector, descriminator, item_id)
624
- if a.empty? then [items.find{|x| x[descriminator] == item_id}] else a end
616
+ # If the item we are looking for has been found, the method should
617
+ # have returned earlier. So just return an empty result to indicate
618
+ # that the item has not been found.
619
+ []
625
620
  end
626
621
  end
627
622
 
628
- def repo_bound_item(user, repo, item_id, entity, url, selector, descriminator)
629
- stored_item = repo_bound_instance(entity, selector, descriminator, item_id)
623
+ def repo_bound_item(user, repo, item_id, entity, url, selector,
624
+ discriminator, order = :asc)
625
+ stored_item = repo_bound_instance(entity, selector, discriminator, item_id)
630
626
 
631
627
  if stored_item.empty?
632
- repo_bound_items(user, repo, entity, url, selector, descriminator,
633
- item_id).first
628
+ repo_bound_items(user, repo, entity, url, selector, discriminator,
629
+ item_id, false, order).first
634
630
  else
635
631
  stored_item.first
636
632
  end
637
633
  end
638
634
 
639
- def repo_bound_instance(entity, selector, descriminator, item_id)
635
+ def repo_bound_instance(entity, selector, discriminator, item_id)
640
636
 
641
637
  id = if item_id.to_i.to_s != item_id
642
638
  item_id # item_id is int
@@ -644,21 +640,35 @@ module GHTorrent
644
640
  item_id.to_i # convert to int
645
641
  end
646
642
 
647
- instance_selector = selector.merge({descriminator => id})
643
+ instance_selector = selector.merge({discriminator => id})
648
644
  result = persister.find(entity, instance_selector)
649
645
  if result.empty?
650
- # Try without type conversions. Useful when the descriminator type
646
+ # Try without type conversions. Useful when the discriminator type
651
647
  # is string and an item_id that can be converted to int is passed.
652
648
  # Having no types sucks occasionaly...
653
- instance_selector = selector.merge({descriminator => item_id})
649
+ instance_selector = selector.merge({discriminator => item_id})
654
650
  persister.find(entity, instance_selector)
655
651
  else
656
652
  result
657
653
  end
658
654
  end
659
655
 
660
- def ghurl(path)
661
- config(:mirror_urlbase) + path
656
+ def ghurl(path, page = -1, per_page = 100)
657
+ if page > 0
658
+ if path.include?('?')
659
+ path = path + "&page=#{page}&per_page=#{per_page}"
660
+ else
661
+ path = path + "?page=#{page}&per_page=#{per_page}"
662
+ end
663
+ config(:mirror_urlbase) + path
664
+ else
665
+ if path.include?('?')
666
+ path = path + "&per_page=#{per_page}"
667
+ else
668
+ path = path + "?per_page=#{per_page}"
669
+ end
670
+ config(:mirror_urlbase) + path
671
+ end
662
672
  end
663
673
 
664
674
  end