mobilize-base 1.0.2 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +5 -0
  2. data/LICENSE.txt +202 -20
  3. data/README.md +219 -138
  4. data/Rakefile +1 -2
  5. data/lib/mobilize-base/extensions/google_drive/acl.rb +25 -0
  6. data/lib/mobilize-base/extensions/google_drive/client_login_fetcher.rb +49 -0
  7. data/lib/mobilize-base/extensions/google_drive/file.rb +80 -0
  8. data/lib/mobilize-base/extensions/{google_drive.rb → google_drive/worksheet.rb} +46 -173
  9. data/lib/mobilize-base/extensions/resque.rb +18 -24
  10. data/lib/mobilize-base/extensions/string.rb +12 -0
  11. data/lib/mobilize-base/handlers/gbook.rb +14 -47
  12. data/lib/mobilize-base/handlers/gdrive.rb +17 -18
  13. data/lib/mobilize-base/handlers/gfile.rb +18 -39
  14. data/lib/mobilize-base/handlers/gridfs.rb +43 -0
  15. data/lib/mobilize-base/handlers/gsheet.rb +48 -99
  16. data/lib/mobilize-base/jobtracker.rb +29 -15
  17. data/lib/mobilize-base/models/dataset.rb +33 -35
  18. data/lib/mobilize-base/models/job.rb +21 -168
  19. data/lib/mobilize-base/models/runner.rb +178 -0
  20. data/lib/mobilize-base/models/task.rb +137 -0
  21. data/lib/mobilize-base/models/user.rb +47 -0
  22. data/lib/mobilize-base/rakes.rb +59 -0
  23. data/lib/mobilize-base/version.rb +1 -1
  24. data/lib/mobilize-base.rb +20 -9
  25. data/lib/samples/gdrive.yml +12 -12
  26. data/lib/samples/gridfs.yml +9 -0
  27. data/lib/samples/gsheet.yml +6 -0
  28. data/lib/samples/jobtracker.yml +9 -9
  29. data/lib/samples/mongoid.yml +3 -3
  30. data/mobilize-base.gemspec +1 -1
  31. data/test/base1_task1.yml +3 -0
  32. data/test/base_job_rows.yml +13 -0
  33. data/test/mobilize-base_test.rb +59 -0
  34. metadata +20 -9
  35. data/lib/mobilize-base/handlers/mongodb.rb +0 -32
  36. data/lib/mobilize-base/models/requestor.rb +0 -232
  37. data/lib/mobilize-base/tasks.rb +0 -43
  38. data/test/mobilize_test.rb +0 -108
@@ -17,6 +17,18 @@ class String
17
17
  raise stderr.read if (stderr.read.length>0 and except==true)
18
18
  return stdout.read
19
19
  end
20
+ def escape_regex
21
+ str = self
22
+ new_str = str.clone
23
+ char_string = "[\/^$. |?*+()"
24
+ char_string.chars.to_a.each{|c|
25
+ new_str.gsub!(c,"\\#{c}")}
26
+ new_str
27
+ end
28
+ def gridsafe
29
+ str = self
30
+ str.downcase.gsub(/[^a-z0-9]/,"")
31
+ end
20
32
  def googlesafe
21
33
  v=self
22
34
  return "" if v.to_s==""
@@ -1,16 +1,16 @@
1
1
  module Mobilize
2
2
  module Gbook
3
- def Gbook.find_all_by_title(title,email=nil)
4
- Gdrive.books(email,{"title"=>title,"title-exact"=>"true"})
3
+ def Gbook.find_all_by_path(path,gdrive_slot)
4
+ Gdrive.books(gdrive_slot,{"title"=>path,"title-exact"=>"true"})
5
5
  end
6
- def Gbook.find_or_create_by_title(title,email)
7
- books = Gdrive.books(email,{"title"=>title,"title-exact"=>"true"})
8
- #there should only be one book with each title, otherwise we have fail
6
+ def Gbook.find_or_create_by_path(path,gdrive_slot)
7
+ books = Gdrive.books(gdrive_slot,{"title"=>path,"title-exact"=>"true"})
8
+ dst = Dataset.find_or_create_by_handler_and_path('gbook',path)
9
+ #there should only be one book with each path, otherwise we have fail
9
10
  book = nil
10
- if books.length>1
11
+ if books.length>1 and dst.url.to_s.length>0
11
12
  #some idiot process created a duplicate book.
12
13
  #Fix by renaming all but one with dst entry's key
13
- dst = Dataset.find_by_handler_and_name('gbook',title)
14
14
  dkey = dst.url.split("key=").last
15
15
  books.each do |b|
16
16
  bkey = b.resource_id.split(":").last
@@ -19,55 +19,22 @@ module Mobilize
19
19
  else
20
20
  #delete the invalid book
21
21
  b.delete
22
- ("Deleted duplicate book #{title}").oputs
22
+ ("Deleted duplicate book #{path}").oputs
23
23
  end
24
24
  end
25
25
  else
26
26
  book = books.first
27
27
  end
28
28
  if book.nil?
29
- #add book using owner email
30
- #http
31
- book = Gdrive.root.create_spreadsheet(title)
32
- ("Created book #{title} at #{Time.now.utc.to_s}").oputs
29
+ #always use owner email to make sure all books are owned by owner account
30
+ book = Gdrive.root(Gdrive.owner_email).create_spreadsheet(path)
31
+ ("Created book #{path} at #{Time.now.utc.to_s}; Access at #{book.human_url}").oputs
33
32
  end
34
- #delete Sheet1 if there are other sheets
35
- #http
36
- if (sheets = book.worksheets).length>1
37
- sheet1 = sheets.select{|s| s.title == "Sheet1"}.first
38
- #http
39
- sheet1.delete if sheet1
40
- end
41
- #always make sure books have admin acl
33
+ #always make sure book dataset URL is up to date
34
+ #and that book has admin acl
35
+ dst.update_attributes(:url=>book.human_url)
42
36
  book.add_admin_acl
43
37
  return book
44
38
  end
45
-
46
- def Gbook.find_or_create_by_dst_id(dst_id,email=nil)
47
- #creates by title, updates acl, updates dataset with url
48
- dst = Dataset.find(dst_id)
49
- r = Requestor.find(dst.requestor_id)
50
- book = nil
51
- #http
52
- book = Gdrive.root.spreadsheet_by_url(dst.url) if dst.url
53
- #manually try 5 times to validate sheet since we can't just try again and again
54
- 5.times.each do
55
- begin
56
- book.resource_id
57
- #if no error then break loop
58
- break
59
- rescue=>exc
60
- if book.nil? or exc.to_s.index('Invalid document id')
61
- book = Gbook.find_or_create_by_title(dst.name,email)
62
- #if invalid doc then update url w new book and break loop
63
- dst.update_attributes(:url=>book.human_url)
64
- break
65
- end
66
- end
67
- end
68
- #add requestor write access
69
- book.update_acl(r.email)
70
- return book
71
- end
72
39
  end
73
40
  end
@@ -1,7 +1,7 @@
1
1
  module Mobilize
2
2
  module Gdrive
3
3
  def Gdrive.config
4
- Base.config('gdrive')[Base.env]
4
+ Base.config('gdrive')
5
5
  end
6
6
 
7
7
  def Gdrive.domain
@@ -9,7 +9,7 @@ module Mobilize
9
9
  end
10
10
 
11
11
  def Gdrive.owner_email
12
- Gdrive.config['owner']['email']
12
+ [Gdrive.config['owner']['name'],Gdrive.domain].join("@")
13
13
  end
14
14
 
15
15
  def Gdrive.password(email)
@@ -29,44 +29,43 @@ module Mobilize
29
29
  if email.nil?
30
30
  Gdrive.config['workers']
31
31
  else
32
- Gdrive.workers.select{|w| w['email'] == email}.first
32
+ Gdrive.workers.select{|w| [w['name'],Gdrive.domain].join("@") == email}.first
33
33
  end
34
34
  end
35
35
 
36
36
  def Gdrive.worker_emails
37
- Gdrive.workers.map{|w| w['email']}
37
+ Gdrive.workers.map{|w| [w['name'],Gdrive.domain].join("@")}
38
38
  end
39
39
 
40
40
  def Gdrive.admin_emails
41
- Gdrive.admins.map{|w| w['email']}
41
+ Gdrive.admins.map{|w| [w['name'],Gdrive.domain].join("@")}
42
42
  end
43
43
 
44
44
  #email management - used to make sure not too many emails get used at the same time
45
- def Gdrive.get_worker_email_by_mongo_id(mongo_id)
46
- active_emails = Mobilize::Resque.jobs('working').map{|j| j['email'] if j['email']}.compact
45
+ def Gdrive.slot_worker_by_path(path)
46
+ working_slots = Mobilize::Resque.jobs('working').map{|j| j['gdrive_slot'] if j['gdrive_slot']}.compact
47
47
  Gdrive.workers.sort_by{rand}.each do |w|
48
- if !(active_emails.include?(w['email']))
49
- Mobilize::Resque.update_job_email(mongo_id,w['email'])
50
- return w['email']
48
+ unless working_slots.include?([w['name'],Gdrive.domain].join("@"))
49
+ Mobilize::Resque.set_worker_args_by_path(path,{'gdrive_slot'=>[w['name'],Gdrive.domain].join("@")})
50
+ return [w['name'],Gdrive.domain].join("@")
51
51
  end
52
52
  end
53
53
  #return false if none are available
54
54
  return false
55
55
  end
56
56
 
57
- def Gdrive.root(email=nil)
58
- email ||= Gdrive.owner_email
59
- pw = Gdrive.password(email)
60
- GoogleDrive.login(email,pw)
57
+ def Gdrive.root(gdrive_slot=nil)
58
+ pw = Gdrive.password(gdrive_slot)
59
+ GoogleDrive.login(gdrive_slot,pw)
61
60
  end
62
61
 
63
- def Gdrive.files(email=nil,params={})
64
- root = Gdrive.root(email)
62
+ def Gdrive.files(gdrive_slot=nil,params={})
63
+ root = Gdrive.root(gdrive_slot)
65
64
  root.files(params)
66
65
  end
67
66
 
68
- def Gdrive.books(email=nil,params={})
69
- Gdrive.files(email,params).select{|f| f.class==GoogleDrive::Spreadsheet}
67
+ def Gdrive.books(gdrive_slot=nil,params={})
68
+ Gdrive.files(gdrive_slot,params).select{|f| f.class==GoogleDrive::Spreadsheet}
70
69
  end
71
70
  end
72
71
  end
@@ -1,55 +1,34 @@
1
1
  module Mobilize
2
- class Gfile
3
- def Gfile.find_by_title(title,email=nil)
4
- Gdriver.files(email).select{|f| f.title==title}.first
5
- end
6
-
7
- def Gfile.find_by_dst_id(dst_id,email=nil)
8
- dst = Dataset.find(dst_id)
9
- Gfile.find_by_title(dst.path,email)
10
- end
11
-
12
- def Gfile.add_admin_acl_by_dst_id(dst_id)
13
- #adds admins and workers as writers
14
- file = Gfile.find_by_dst_id(dst_id)
2
+ module Gfile
3
+ def Gfile.add_admin_acl_by_path(path)
4
+ file = Gfile.find_by_path(path)
15
5
  file.add_admin_acl
16
6
  return true
17
7
  end
18
8
 
19
- def Gfile.add_admin_acl_by_title(title)
20
- file = Gfile.find_by_title(title)
21
- file.add_admin_acl
22
- return true
23
- end
24
-
25
- def Gfile.add_worker_acl_by_title(title)
26
- file = Gfile.find_by_title(title)
9
+ def Gfile.add_worker_acl_by_path(path)
10
+ file = Gfile.find_by_path(path)
27
11
  file.add_worker_acl
28
12
  return true
29
13
  end
30
14
 
31
- def Gfile.update_acl_by_dst_id(dst_id,email,role="writer",edit_email=nil)
32
- dst = Dataset.find(dst_id)
33
- Gfile.update_acl_by_title(dst.path,email,role,edit_email)
34
- end
35
-
36
- def Gfile.update_acl_by_title(title,email,role="writer",edit_email=nil)
37
- file = Gfile.find_by_title(title,edit_email)
38
- raise "File #{title} not found" unless file
39
- file.update_acl(email,role)
15
+ def Gfile.update_acl_by_path(path,gdrive_slot,role="writer",target_email=nil)
16
+ file = Gfile.find_by_path(path,target_email)
17
+ raise "File #{path} not found" unless file
18
+ file.update_acl(gdrive_slot,role)
40
19
  end
41
20
 
42
- def Gfile.read_by_name()
43
-
21
+ def Gfile.find_by_path(path,gdrive_slot)
22
+ Gdrive.files(gdrive_slot,{"title"=>path,"title-exact"=>"true"}).first
44
23
  end
45
24
 
46
- def Gfile.read_by_url()
47
-
25
+ def Gfile.read_by_task_path(task_path)
26
+ #reserve gdrive_slot account for read
27
+ gdrive_slot = Gdrive.slot_worker_by_path(t.path)
28
+ return false unless gdrive_slot
29
+ t = Task.where(:path=>task_path)
30
+ gfile_path = t.params.first
31
+ Gfile.find_by_path(gfile_path,gdrive_slot).read
48
32
  end
49
-
50
- def Gfile.read_by_job_id(job_id)
51
- j = Job.find(job_id)
52
- end
53
-
54
33
  end
55
34
  end
@@ -0,0 +1,43 @@
1
+ module Mobilize
2
+ module Gridfs
3
+ def Gridfs.config
4
+ Base.config('gridfs')
5
+ end
6
+
7
+ def Gridfs.grid
8
+ session = ::Mongoid.configure.sessions['default']
9
+ database_name = session['database']
10
+ host,port = session['hosts'].first.split(":")
11
+ return ::Mongo::GridFileSystem.new(::Mongo::Connection.new(host,port).db(database_name))
12
+ end
13
+
14
+ def Gridfs.read(path)
15
+ begin
16
+ zs=Gridfs.grid.open(path.gridsafe,'r').read
17
+ return ::Zlib::Inflate.inflate(zs)
18
+ rescue
19
+ return nil
20
+ end
21
+ end
22
+
23
+ def Gridfs.write(path,string)
24
+ zs = ::Zlib::Deflate.deflate(string)
25
+ raise "compressed string too large for Gridfs write" if zs.length > Gridfs.config['max_compressed_write_size']
26
+ curr_zs = Gridfs.read(path.gridsafe).to_s
27
+ #write a new version when there is a change
28
+ if curr_zs != zs
29
+ Gridfs.grid.open(path.gridsafe,'w',:versions => Gridfs.config['max_versions']){|f| f.write(zs)}
30
+ end
31
+ return true
32
+ end
33
+
34
+ def Gridfs.delete(path)
35
+ begin
36
+ Gridfs.grid.delete(path.gridsafe)
37
+ return true
38
+ rescue
39
+ return nil
40
+ end
41
+ end
42
+ end
43
+ end
@@ -1,122 +1,71 @@
1
1
  module Mobilize
2
2
  module Gsheet
3
3
 
4
- def Gsheet.max_cells
5
- 400000
4
+ def Gsheet.config
5
+ Base.config('gsheet')
6
6
  end
7
7
 
8
- def Gsheet.read(name,email=nil)
9
- sheet = Gsheet.find_or_create_by_name(name,email)
10
- sheet.to_tsv
8
+ def Gsheet.max_cells
9
+ Gsheet.config['max_cells']
11
10
  end
12
11
 
13
- def Gsheet.write(name,tsv,email=nil)
14
- sheet = Gsheet.find_or_create_by_name(name,email)
12
+ def Gsheet.write(path,tsv,gdrive_slot)
13
+ sheet = Gsheet.find_or_create_by_path(path,gdrive_slot)
15
14
  sheet.write(tsv)
16
15
  end
17
16
 
18
- def Gsheet.find_all_by_name(name,email)
19
- book_title,sheet_title = name.split("/")
20
- books = Gdrive.books(email,{"title"=>book_title,"title-exact"=>"true"})
21
- sheets = books.map{|b| b.worksheets}.flatten.select{|w| w.title == sheet_title }
22
- sheets
17
+ def Gsheet.find_by_path(path,gdrive_slot)
18
+ book_path,sheet_name = path.split("/")
19
+ book = Gdrive.books(gdrive_slot,{"title"=>book_path,"title-exact"=>"true"}).first
20
+ return book.worksheet_by_title(sheet_name) if book
23
21
  end
24
22
 
25
- def Gsheet.find_or_create_by_name(name,email=nil,rows=100,cols=20)
26
- book_title,sheet_title = name.split("/")
27
- book = Gbook.find_or_create_by_title(book_title,email)
28
- #http
29
- sheet = book.worksheets.select{|w| w.title==sheet_title}.first
23
+ def Gsheet.find_or_create_by_path(path,gdrive_slot,rows=100,cols=20)
24
+ book_path,sheet_name = path.split("/")
25
+ book = Gbook.find_or_create_by_path(book_path,gdrive_slot)
26
+ sheet = book.worksheet_by_title(sheet_name)
30
27
  if sheet.nil?
31
- #http
32
- sheet = book.add_worksheet(sheet_title,rows,cols)
33
- ("Created sheet #{name} at #{Time.now.utc.to_s}").oputs
28
+ sheet = book.add_worksheet(sheet_name,rows,cols)
29
+ ("Created gsheet #{path} at #{Time.now.utc.to_s}").oputs
34
30
  end
31
+ Dataset.find_or_create_by_handler_and_path("gsheet",path)
35
32
  return sheet
36
33
  end
37
34
 
38
- def Gsheet.find_or_create_by_dst_id(dst_id,email=nil)
39
- #creates by title, updates acl, updates dataset with url
40
- dst = Dataset.find(dst_id)
41
- r = Requestor.find(dst.requestor_id)
42
- name = dst.name
43
- book_title,sheet_title = name.split("/")
44
- #make sure book exists and is assigned to this user
45
- r.find_or_create_gbook_by_title(book_title,email)
46
- #add admin write access
47
- sheet = Gsheet.find_or_create_by_name(name)
48
- sheet_title = nil
49
- return sheet
50
- end
51
-
52
- def Gsheet.read_by_dst_id(dst_id,email=nil)
53
- dst = Dataset.find(dst_id)
54
- name = dst.name
55
- sheet = Gsheet.find_or_create_by_name(name,email)
56
- output = sheet.to_tsv
57
- return output
58
- end
59
-
60
- def Gsheet.read_by_job_id(job_id)
61
- j = Job.find(job_id)
62
- #reserve email account for read
63
- email = Gdrive.get_worker_email_by_mongo_id(job_id)
64
- return false unless email
65
- #pull tsv from cache
66
- j.dataset_array.first.read_cache
35
+ def Gsheet.read_by_task_path(task_path)
36
+ #reserve gdrive_slot account for read
37
+ gdrive_slot = Gdrive.slot_worker_by_path(task_path)
38
+ return false unless gdrive_slot
39
+ t = Task.where(:path=>task_path).first
40
+ gsheet_path = t.params.first
41
+ Gsheet.find_by_path(gsheet_path,gdrive_slot).to_tsv
67
42
  end
68
43
 
69
- def Gsheet.write_by_dst_id(dst_id,tsv,email=nil)
70
- dst = Dataset.find(dst_id)
71
- #see if this is a specific cell
72
- name = dst.name
73
- return false unless email
74
- #create temp tab, write data to it, checksum it against the source
75
- temp_sheet = Gsheet.find_or_create_by_name("#{name}_temp")
76
- temp_sheet.write(tsv)
77
- #delete current sheet, replace it with temp one
78
- sheet = Gsheet.find_or_create_by_name(dst.name)
79
- title = sheet.title
80
- #http
81
- sheet.delete
82
- begin
83
- temp_sheet.rename(title)
84
- rescue
85
- #need this because sometimes it gets confused and tries to rename twice
86
- end
87
- "Write successful for #{write_name}".oputs
88
- return true
89
- end
90
-
91
- def Gsheet.write_by_job_id(job_id)
92
- j = Job.find(job_id)
93
- r = j.requestor
94
- dest_name = if j.destination.split("/").length==1
95
- "#{r.jobspec_title}#{"/"}#{j.destination}"
96
- else
97
- j.destination
98
- end
99
- sheet_dst = Dataset.find_or_create_by_handler_and_name('gsheet',dest_name)
100
- sheet_dst.update_attributes(:requestor_id=>r.id.to_s) if sheet_dst.requestor_id.nil?
101
- email = Gdrive.get_worker_email_by_mongo_id(job_id)
44
+ def Gsheet.write_by_task_path(task_path)
45
+ gdrive_slot = Gdrive.slot_worker_by_path(task_path)
102
46
  #return false if there are no emails available
103
- return false unless email
104
- #create temp tab, write data to it, checksum it against the source
105
- temp_sheet_dst = Dataset.find_or_create_by_handler_and_name('gsheet',"#{dest_name}_temp")
106
- temp_sheet_dst.update_attributes(:requestor_id=>r.id.to_s) if temp_sheet_dst.requestor_id.nil?
107
- temp_sheet = Gsheet.find_or_create_by_name(temp_sheet_dst.name,email)
108
- #tsv is the prior task's output
109
- tsv = j.task_output_dsts[j.task_idx-1].read
110
- temp_sheet.write(tsv,true,job_id)
111
- #delete current sheet, replace it with temp one
112
- sheet = Gsheet.find_or_create_by_name(dest_name,email)
113
- title = sheet.title
114
- #http
115
- sheet.delete
116
- temp_sheet.title = title
117
- temp_sheet.save
118
- sheet_dst.update_attributes(:url=>temp_sheet.spreadsheet.human_url)
119
- "Write successful for #{dest_name}".oputs
47
+ return false unless gdrive_slot
48
+ t = Task.where(:path=>task_path).first
49
+ source = t.params.first
50
+ target_path = t.params.second
51
+ source_job_name, source_task_name = if source.index("/")
52
+ source.split("/")
53
+ else
54
+ [nil, source]
55
+ end
56
+ source_task_path = "#{t.job.runner.path}/#{source_job_name || t.job.name}/#{source_task_name}"
57
+ source_task = Task.where(:path=>source_task_path).first
58
+ tsv = source_task.stdout_dataset.read_cache
59
+ sheet_name = target_path.split("/").last
60
+ temp_path = [task_path.gridsafe,sheet_name].join("/")
61
+ temp_sheet = Gsheet.find_or_create_by_path(temp_path,gdrive_slot)
62
+ temp_sheet.write(tsv)
63
+ temp_sheet.check_and_fix(tsv)
64
+ target_sheet = Gsheet.find_or_create_by_path(target_path,gdrive_slot)
65
+ target_sheet.merge(temp_sheet)
66
+ #delete the temp sheet's book
67
+ temp_sheet.spreadsheet.delete
68
+ "Write successful for #{target_path}".oputs
120
69
  return true
121
70
  end
122
71
  end