mobilize-base 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. data/.gitignore +5 -0
  2. data/LICENSE.txt +202 -20
  3. data/README.md +219 -138
  4. data/Rakefile +1 -2
  5. data/lib/mobilize-base/extensions/google_drive/acl.rb +25 -0
  6. data/lib/mobilize-base/extensions/google_drive/client_login_fetcher.rb +49 -0
  7. data/lib/mobilize-base/extensions/google_drive/file.rb +80 -0
  8. data/lib/mobilize-base/extensions/{google_drive.rb → google_drive/worksheet.rb} +46 -173
  9. data/lib/mobilize-base/extensions/resque.rb +18 -24
  10. data/lib/mobilize-base/extensions/string.rb +12 -0
  11. data/lib/mobilize-base/handlers/gbook.rb +14 -47
  12. data/lib/mobilize-base/handlers/gdrive.rb +17 -18
  13. data/lib/mobilize-base/handlers/gfile.rb +18 -39
  14. data/lib/mobilize-base/handlers/gridfs.rb +43 -0
  15. data/lib/mobilize-base/handlers/gsheet.rb +48 -99
  16. data/lib/mobilize-base/jobtracker.rb +29 -15
  17. data/lib/mobilize-base/models/dataset.rb +33 -35
  18. data/lib/mobilize-base/models/job.rb +21 -168
  19. data/lib/mobilize-base/models/runner.rb +178 -0
  20. data/lib/mobilize-base/models/task.rb +137 -0
  21. data/lib/mobilize-base/models/user.rb +47 -0
  22. data/lib/mobilize-base/rakes.rb +59 -0
  23. data/lib/mobilize-base/version.rb +1 -1
  24. data/lib/mobilize-base.rb +20 -9
  25. data/lib/samples/gdrive.yml +12 -12
  26. data/lib/samples/gridfs.yml +9 -0
  27. data/lib/samples/gsheet.yml +6 -0
  28. data/lib/samples/jobtracker.yml +9 -9
  29. data/lib/samples/mongoid.yml +3 -3
  30. data/mobilize-base.gemspec +1 -1
  31. data/test/base1_task1.yml +3 -0
  32. data/test/base_job_rows.yml +13 -0
  33. data/test/mobilize-base_test.rb +59 -0
  34. metadata +20 -9
  35. data/lib/mobilize-base/handlers/mongodb.rb +0 -32
  36. data/lib/mobilize-base/models/requestor.rb +0 -232
  37. data/lib/mobilize-base/tasks.rb +0 -43
  38. data/test/mobilize_test.rb +0 -108
@@ -17,6 +17,18 @@ class String
17
17
  raise stderr.read if (stderr.read.length>0 and except==true)
18
18
  return stdout.read
19
19
  end
20
+ def escape_regex
21
+ str = self
22
+ new_str = str.clone
23
+ char_string = "[\/^$. |?*+()"
24
+ char_string.chars.to_a.each{|c|
25
+ new_str.gsub!(c,"\\#{c}")}
26
+ new_str
27
+ end
28
+ def gridsafe
29
+ str = self
30
+ str.downcase.gsub(/[^a-z0-9]/,"")
31
+ end
20
32
  def googlesafe
21
33
  v=self
22
34
  return "" if v.to_s==""
@@ -1,16 +1,16 @@
1
1
  module Mobilize
2
2
  module Gbook
3
- def Gbook.find_all_by_title(title,email=nil)
4
- Gdrive.books(email,{"title"=>title,"title-exact"=>"true"})
3
+ def Gbook.find_all_by_path(path,gdrive_slot)
4
+ Gdrive.books(gdrive_slot,{"title"=>path,"title-exact"=>"true"})
5
5
  end
6
- def Gbook.find_or_create_by_title(title,email)
7
- books = Gdrive.books(email,{"title"=>title,"title-exact"=>"true"})
8
- #there should only be one book with each title, otherwise we have fail
6
+ def Gbook.find_or_create_by_path(path,gdrive_slot)
7
+ books = Gdrive.books(gdrive_slot,{"title"=>path,"title-exact"=>"true"})
8
+ dst = Dataset.find_or_create_by_handler_and_path('gbook',path)
9
+ #there should only be one book with each path, otherwise we have fail
9
10
  book = nil
10
- if books.length>1
11
+ if books.length>1 and dst.url.to_s.length>0
11
12
  #some idiot process created a duplicate book.
12
13
  #Fix by renaming all but one with dst entry's key
13
- dst = Dataset.find_by_handler_and_name('gbook',title)
14
14
  dkey = dst.url.split("key=").last
15
15
  books.each do |b|
16
16
  bkey = b.resource_id.split(":").last
@@ -19,55 +19,22 @@ module Mobilize
19
19
  else
20
20
  #delete the invalid book
21
21
  b.delete
22
- ("Deleted duplicate book #{title}").oputs
22
+ ("Deleted duplicate book #{path}").oputs
23
23
  end
24
24
  end
25
25
  else
26
26
  book = books.first
27
27
  end
28
28
  if book.nil?
29
- #add book using owner email
30
- #http
31
- book = Gdrive.root.create_spreadsheet(title)
32
- ("Created book #{title} at #{Time.now.utc.to_s}").oputs
29
+ #always use owner email to make sure all books are owned by owner account
30
+ book = Gdrive.root(Gdrive.owner_email).create_spreadsheet(path)
31
+ ("Created book #{path} at #{Time.now.utc.to_s}; Access at #{book.human_url}").oputs
33
32
  end
34
- #delete Sheet1 if there are other sheets
35
- #http
36
- if (sheets = book.worksheets).length>1
37
- sheet1 = sheets.select{|s| s.title == "Sheet1"}.first
38
- #http
39
- sheet1.delete if sheet1
40
- end
41
- #always make sure books have admin acl
33
+ #always make sure book dataset URL is up to date
34
+ #and that book has admin acl
35
+ dst.update_attributes(:url=>book.human_url)
42
36
  book.add_admin_acl
43
37
  return book
44
38
  end
45
-
46
- def Gbook.find_or_create_by_dst_id(dst_id,email=nil)
47
- #creates by title, updates acl, updates dataset with url
48
- dst = Dataset.find(dst_id)
49
- r = Requestor.find(dst.requestor_id)
50
- book = nil
51
- #http
52
- book = Gdrive.root.spreadsheet_by_url(dst.url) if dst.url
53
- #manually try 5 times to validate sheet since we can't just try again and again
54
- 5.times.each do
55
- begin
56
- book.resource_id
57
- #if no error then break loop
58
- break
59
- rescue=>exc
60
- if book.nil? or exc.to_s.index('Invalid document id')
61
- book = Gbook.find_or_create_by_title(dst.name,email)
62
- #if invalid doc then update url w new book and break loop
63
- dst.update_attributes(:url=>book.human_url)
64
- break
65
- end
66
- end
67
- end
68
- #add requestor write access
69
- book.update_acl(r.email)
70
- return book
71
- end
72
39
  end
73
40
  end
@@ -1,7 +1,7 @@
1
1
  module Mobilize
2
2
  module Gdrive
3
3
  def Gdrive.config
4
- Base.config('gdrive')[Base.env]
4
+ Base.config('gdrive')
5
5
  end
6
6
 
7
7
  def Gdrive.domain
@@ -9,7 +9,7 @@ module Mobilize
9
9
  end
10
10
 
11
11
  def Gdrive.owner_email
12
- Gdrive.config['owner']['email']
12
+ [Gdrive.config['owner']['name'],Gdrive.domain].join("@")
13
13
  end
14
14
 
15
15
  def Gdrive.password(email)
@@ -29,44 +29,43 @@ module Mobilize
29
29
  if email.nil?
30
30
  Gdrive.config['workers']
31
31
  else
32
- Gdrive.workers.select{|w| w['email'] == email}.first
32
+ Gdrive.workers.select{|w| [w['name'],Gdrive.domain].join("@") == email}.first
33
33
  end
34
34
  end
35
35
 
36
36
  def Gdrive.worker_emails
37
- Gdrive.workers.map{|w| w['email']}
37
+ Gdrive.workers.map{|w| [w['name'],Gdrive.domain].join("@")}
38
38
  end
39
39
 
40
40
  def Gdrive.admin_emails
41
- Gdrive.admins.map{|w| w['email']}
41
+ Gdrive.admins.map{|w| [w['name'],Gdrive.domain].join("@")}
42
42
  end
43
43
 
44
44
  #email management - used to make sure not too many emails get used at the same time
45
- def Gdrive.get_worker_email_by_mongo_id(mongo_id)
46
- active_emails = Mobilize::Resque.jobs('working').map{|j| j['email'] if j['email']}.compact
45
+ def Gdrive.slot_worker_by_path(path)
46
+ working_slots = Mobilize::Resque.jobs('working').map{|j| j['gdrive_slot'] if j['gdrive_slot']}.compact
47
47
  Gdrive.workers.sort_by{rand}.each do |w|
48
- if !(active_emails.include?(w['email']))
49
- Mobilize::Resque.update_job_email(mongo_id,w['email'])
50
- return w['email']
48
+ unless working_slots.include?([w['name'],Gdrive.domain].join("@"))
49
+ Mobilize::Resque.set_worker_args_by_path(path,{'gdrive_slot'=>[w['name'],Gdrive.domain].join("@")})
50
+ return [w['name'],Gdrive.domain].join("@")
51
51
  end
52
52
  end
53
53
  #return false if none are available
54
54
  return false
55
55
  end
56
56
 
57
- def Gdrive.root(email=nil)
58
- email ||= Gdrive.owner_email
59
- pw = Gdrive.password(email)
60
- GoogleDrive.login(email,pw)
57
+ def Gdrive.root(gdrive_slot=nil)
58
+ pw = Gdrive.password(gdrive_slot)
59
+ GoogleDrive.login(gdrive_slot,pw)
61
60
  end
62
61
 
63
- def Gdrive.files(email=nil,params={})
64
- root = Gdrive.root(email)
62
+ def Gdrive.files(gdrive_slot=nil,params={})
63
+ root = Gdrive.root(gdrive_slot)
65
64
  root.files(params)
66
65
  end
67
66
 
68
- def Gdrive.books(email=nil,params={})
69
- Gdrive.files(email,params).select{|f| f.class==GoogleDrive::Spreadsheet}
67
+ def Gdrive.books(gdrive_slot=nil,params={})
68
+ Gdrive.files(gdrive_slot,params).select{|f| f.class==GoogleDrive::Spreadsheet}
70
69
  end
71
70
  end
72
71
  end
@@ -1,55 +1,34 @@
1
1
  module Mobilize
2
- class Gfile
3
- def Gfile.find_by_title(title,email=nil)
4
- Gdriver.files(email).select{|f| f.title==title}.first
5
- end
6
-
7
- def Gfile.find_by_dst_id(dst_id,email=nil)
8
- dst = Dataset.find(dst_id)
9
- Gfile.find_by_title(dst.path,email)
10
- end
11
-
12
- def Gfile.add_admin_acl_by_dst_id(dst_id)
13
- #adds admins and workers as writers
14
- file = Gfile.find_by_dst_id(dst_id)
2
+ module Gfile
3
+ def Gfile.add_admin_acl_by_path(path)
4
+ file = Gfile.find_by_path(path)
15
5
  file.add_admin_acl
16
6
  return true
17
7
  end
18
8
 
19
- def Gfile.add_admin_acl_by_title(title)
20
- file = Gfile.find_by_title(title)
21
- file.add_admin_acl
22
- return true
23
- end
24
-
25
- def Gfile.add_worker_acl_by_title(title)
26
- file = Gfile.find_by_title(title)
9
+ def Gfile.add_worker_acl_by_path(path)
10
+ file = Gfile.find_by_path(path)
27
11
  file.add_worker_acl
28
12
  return true
29
13
  end
30
14
 
31
- def Gfile.update_acl_by_dst_id(dst_id,email,role="writer",edit_email=nil)
32
- dst = Dataset.find(dst_id)
33
- Gfile.update_acl_by_title(dst.path,email,role,edit_email)
34
- end
35
-
36
- def Gfile.update_acl_by_title(title,email,role="writer",edit_email=nil)
37
- file = Gfile.find_by_title(title,edit_email)
38
- raise "File #{title} not found" unless file
39
- file.update_acl(email,role)
15
+ def Gfile.update_acl_by_path(path,gdrive_slot,role="writer",target_email=nil)
16
+ file = Gfile.find_by_path(path,target_email)
17
+ raise "File #{path} not found" unless file
18
+ file.update_acl(gdrive_slot,role)
40
19
  end
41
20
 
42
- def Gfile.read_by_name()
43
-
21
+ def Gfile.find_by_path(path,gdrive_slot)
22
+ Gdrive.files(gdrive_slot,{"title"=>path,"title-exact"=>"true"}).first
44
23
  end
45
24
 
46
- def Gfile.read_by_url()
47
-
25
+ def Gfile.read_by_task_path(task_path)
26
+ #reserve gdrive_slot account for read
27
+ gdrive_slot = Gdrive.slot_worker_by_path(t.path)
28
+ return false unless gdrive_slot
29
+ t = Task.where(:path=>task_path)
30
+ gfile_path = t.params.first
31
+ Gfile.find_by_path(gfile_path,gdrive_slot).read
48
32
  end
49
-
50
- def Gfile.read_by_job_id(job_id)
51
- j = Job.find(job_id)
52
- end
53
-
54
33
  end
55
34
  end
@@ -0,0 +1,43 @@
1
+ module Mobilize
2
+ module Gridfs
3
+ def Gridfs.config
4
+ Base.config('gridfs')
5
+ end
6
+
7
+ def Gridfs.grid
8
+ session = ::Mongoid.configure.sessions['default']
9
+ database_name = session['database']
10
+ host,port = session['hosts'].first.split(":")
11
+ return ::Mongo::GridFileSystem.new(::Mongo::Connection.new(host,port).db(database_name))
12
+ end
13
+
14
+ def Gridfs.read(path)
15
+ begin
16
+ zs=Gridfs.grid.open(path.gridsafe,'r').read
17
+ return ::Zlib::Inflate.inflate(zs)
18
+ rescue
19
+ return nil
20
+ end
21
+ end
22
+
23
+ def Gridfs.write(path,string)
24
+ zs = ::Zlib::Deflate.deflate(string)
25
+ raise "compressed string too large for Gridfs write" if zs.length > Gridfs.config['max_compressed_write_size']
26
+ curr_zs = Gridfs.read(path.gridsafe).to_s
27
+ #write a new version when there is a change
28
+ if curr_zs != zs
29
+ Gridfs.grid.open(path.gridsafe,'w',:versions => Gridfs.config['max_versions']){|f| f.write(zs)}
30
+ end
31
+ return true
32
+ end
33
+
34
+ def Gridfs.delete(path)
35
+ begin
36
+ Gridfs.grid.delete(path.gridsafe)
37
+ return true
38
+ rescue
39
+ return nil
40
+ end
41
+ end
42
+ end
43
+ end
@@ -1,122 +1,71 @@
1
1
  module Mobilize
2
2
  module Gsheet
3
3
 
4
- def Gsheet.max_cells
5
- 400000
4
+ def Gsheet.config
5
+ Base.config('gsheet')
6
6
  end
7
7
 
8
- def Gsheet.read(name,email=nil)
9
- sheet = Gsheet.find_or_create_by_name(name,email)
10
- sheet.to_tsv
8
+ def Gsheet.max_cells
9
+ Gsheet.config['max_cells']
11
10
  end
12
11
 
13
- def Gsheet.write(name,tsv,email=nil)
14
- sheet = Gsheet.find_or_create_by_name(name,email)
12
+ def Gsheet.write(path,tsv,gdrive_slot)
13
+ sheet = Gsheet.find_or_create_by_path(path,gdrive_slot)
15
14
  sheet.write(tsv)
16
15
  end
17
16
 
18
- def Gsheet.find_all_by_name(name,email)
19
- book_title,sheet_title = name.split("/")
20
- books = Gdrive.books(email,{"title"=>book_title,"title-exact"=>"true"})
21
- sheets = books.map{|b| b.worksheets}.flatten.select{|w| w.title == sheet_title }
22
- sheets
17
+ def Gsheet.find_by_path(path,gdrive_slot)
18
+ book_path,sheet_name = path.split("/")
19
+ book = Gdrive.books(gdrive_slot,{"title"=>book_path,"title-exact"=>"true"}).first
20
+ return book.worksheet_by_title(sheet_name) if book
23
21
  end
24
22
 
25
- def Gsheet.find_or_create_by_name(name,email=nil,rows=100,cols=20)
26
- book_title,sheet_title = name.split("/")
27
- book = Gbook.find_or_create_by_title(book_title,email)
28
- #http
29
- sheet = book.worksheets.select{|w| w.title==sheet_title}.first
23
+ def Gsheet.find_or_create_by_path(path,gdrive_slot,rows=100,cols=20)
24
+ book_path,sheet_name = path.split("/")
25
+ book = Gbook.find_or_create_by_path(book_path,gdrive_slot)
26
+ sheet = book.worksheet_by_title(sheet_name)
30
27
  if sheet.nil?
31
- #http
32
- sheet = book.add_worksheet(sheet_title,rows,cols)
33
- ("Created sheet #{name} at #{Time.now.utc.to_s}").oputs
28
+ sheet = book.add_worksheet(sheet_name,rows,cols)
29
+ ("Created gsheet #{path} at #{Time.now.utc.to_s}").oputs
34
30
  end
31
+ Dataset.find_or_create_by_handler_and_path("gsheet",path)
35
32
  return sheet
36
33
  end
37
34
 
38
- def Gsheet.find_or_create_by_dst_id(dst_id,email=nil)
39
- #creates by title, updates acl, updates dataset with url
40
- dst = Dataset.find(dst_id)
41
- r = Requestor.find(dst.requestor_id)
42
- name = dst.name
43
- book_title,sheet_title = name.split("/")
44
- #make sure book exists and is assigned to this user
45
- r.find_or_create_gbook_by_title(book_title,email)
46
- #add admin write access
47
- sheet = Gsheet.find_or_create_by_name(name)
48
- sheet_title = nil
49
- return sheet
50
- end
51
-
52
- def Gsheet.read_by_dst_id(dst_id,email=nil)
53
- dst = Dataset.find(dst_id)
54
- name = dst.name
55
- sheet = Gsheet.find_or_create_by_name(name,email)
56
- output = sheet.to_tsv
57
- return output
58
- end
59
-
60
- def Gsheet.read_by_job_id(job_id)
61
- j = Job.find(job_id)
62
- #reserve email account for read
63
- email = Gdrive.get_worker_email_by_mongo_id(job_id)
64
- return false unless email
65
- #pull tsv from cache
66
- j.dataset_array.first.read_cache
35
+ def Gsheet.read_by_task_path(task_path)
36
+ #reserve gdrive_slot account for read
37
+ gdrive_slot = Gdrive.slot_worker_by_path(task_path)
38
+ return false unless gdrive_slot
39
+ t = Task.where(:path=>task_path).first
40
+ gsheet_path = t.params.first
41
+ Gsheet.find_by_path(gsheet_path,gdrive_slot).to_tsv
67
42
  end
68
43
 
69
- def Gsheet.write_by_dst_id(dst_id,tsv,email=nil)
70
- dst = Dataset.find(dst_id)
71
- #see if this is a specific cell
72
- name = dst.name
73
- return false unless email
74
- #create temp tab, write data to it, checksum it against the source
75
- temp_sheet = Gsheet.find_or_create_by_name("#{name}_temp")
76
- temp_sheet.write(tsv)
77
- #delete current sheet, replace it with temp one
78
- sheet = Gsheet.find_or_create_by_name(dst.name)
79
- title = sheet.title
80
- #http
81
- sheet.delete
82
- begin
83
- temp_sheet.rename(title)
84
- rescue
85
- #need this because sometimes it gets confused and tries to rename twice
86
- end
87
- "Write successful for #{write_name}".oputs
88
- return true
89
- end
90
-
91
- def Gsheet.write_by_job_id(job_id)
92
- j = Job.find(job_id)
93
- r = j.requestor
94
- dest_name = if j.destination.split("/").length==1
95
- "#{r.jobspec_title}#{"/"}#{j.destination}"
96
- else
97
- j.destination
98
- end
99
- sheet_dst = Dataset.find_or_create_by_handler_and_name('gsheet',dest_name)
100
- sheet_dst.update_attributes(:requestor_id=>r.id.to_s) if sheet_dst.requestor_id.nil?
101
- email = Gdrive.get_worker_email_by_mongo_id(job_id)
44
+ def Gsheet.write_by_task_path(task_path)
45
+ gdrive_slot = Gdrive.slot_worker_by_path(task_path)
102
46
  #return false if there are no emails available
103
- return false unless email
104
- #create temp tab, write data to it, checksum it against the source
105
- temp_sheet_dst = Dataset.find_or_create_by_handler_and_name('gsheet',"#{dest_name}_temp")
106
- temp_sheet_dst.update_attributes(:requestor_id=>r.id.to_s) if temp_sheet_dst.requestor_id.nil?
107
- temp_sheet = Gsheet.find_or_create_by_name(temp_sheet_dst.name,email)
108
- #tsv is the prior task's output
109
- tsv = j.task_output_dsts[j.task_idx-1].read
110
- temp_sheet.write(tsv,true,job_id)
111
- #delete current sheet, replace it with temp one
112
- sheet = Gsheet.find_or_create_by_name(dest_name,email)
113
- title = sheet.title
114
- #http
115
- sheet.delete
116
- temp_sheet.title = title
117
- temp_sheet.save
118
- sheet_dst.update_attributes(:url=>temp_sheet.spreadsheet.human_url)
119
- "Write successful for #{dest_name}".oputs
47
+ return false unless gdrive_slot
48
+ t = Task.where(:path=>task_path).first
49
+ source = t.params.first
50
+ target_path = t.params.second
51
+ source_job_name, source_task_name = if source.index("/")
52
+ source.split("/")
53
+ else
54
+ [nil, source]
55
+ end
56
+ source_task_path = "#{t.job.runner.path}/#{source_job_name || t.job.name}/#{source_task_name}"
57
+ source_task = Task.where(:path=>source_task_path).first
58
+ tsv = source_task.stdout_dataset.read_cache
59
+ sheet_name = target_path.split("/").last
60
+ temp_path = [task_path.gridsafe,sheet_name].join("/")
61
+ temp_sheet = Gsheet.find_or_create_by_path(temp_path,gdrive_slot)
62
+ temp_sheet.write(tsv)
63
+ temp_sheet.check_and_fix(tsv)
64
+ target_sheet = Gsheet.find_or_create_by_path(target_path,gdrive_slot)
65
+ target_sheet.merge(temp_sheet)
66
+ #delete the temp sheet's book
67
+ temp_sheet.spreadsheet.delete
68
+ "Write successful for #{target_path}".oputs
120
69
  return true
121
70
  end
122
71
  end