inst_data_shipper 0.1.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/README.md +35 -0
  3. data/Rakefile +21 -0
  4. data/app/models/hosted_data_dumper/dump_batch.rb +10 -0
  5. data/db/migrate/20240301090836_create_canvas_sync_sync_batches.rb +17 -0
  6. data/lib/inst_data_shipper/basic_dumper.rb +27 -0
  7. data/lib/inst_data_shipper/concerns/hooks.rb +32 -0
  8. data/lib/inst_data_shipper/data_sources/base.rb +7 -0
  9. data/lib/inst_data_shipper/data_sources/canvas_reports.rb +113 -0
  10. data/lib/inst_data_shipper/data_sources/local_tables.rb +33 -0
  11. data/lib/inst_data_shipper/destinations/base.rb +104 -0
  12. data/lib/inst_data_shipper/destinations/concerns/chunking.rb +34 -0
  13. data/lib/inst_data_shipper/destinations/hosted_data.rb +133 -0
  14. data/lib/inst_data_shipper/destinations/s3.rb +72 -0
  15. data/lib/inst_data_shipper/dumper.rb +159 -0
  16. data/lib/inst_data_shipper/engine.rb +8 -0
  17. data/lib/inst_data_shipper/jobs/async_caller.rb +19 -0
  18. data/lib/inst_data_shipper/jobs/base.rb +27 -0
  19. data/lib/inst_data_shipper/jobs/basic_dump_job.rb +11 -0
  20. data/lib/inst_data_shipper/record.rb +6 -0
  21. data/lib/inst_data_shipper/schema_builder.rb +93 -0
  22. data/lib/inst_data_shipper/version.rb +3 -0
  23. data/lib/inst_data_shipper.rb +71 -0
  24. data/spec/dummy/README.rdoc +1 -0
  25. data/spec/dummy/Rakefile +6 -0
  26. data/spec/dummy/bin/rails +4 -0
  27. data/spec/dummy/config/application.rb +37 -0
  28. data/spec/dummy/config/boot.rb +5 -0
  29. data/spec/dummy/config/database.yml +25 -0
  30. data/spec/dummy/config/environment.rb +5 -0
  31. data/spec/dummy/config/environments/development.rb +41 -0
  32. data/spec/dummy/config/environments/test.rb +44 -0
  33. data/spec/dummy/config/initializers/assets.rb +11 -0
  34. data/spec/dummy/config/initializers/session_store.rb +3 -0
  35. data/spec/dummy/config/initializers/wrap_parameters.rb +14 -0
  36. data/spec/dummy/config/routes.rb +2 -0
  37. data/spec/dummy/config/secrets.yml +22 -0
  38. data/spec/dummy/config.ru +4 -0
  39. data/spec/dummy/db/schema.rb +45 -0
  40. data/spec/spec_helper.rb +70 -0
  41. data/spec/support/fixtures/reports/provisioning_csv_unzipped/courses.csv +3 -0
  42. data/spec/support/fixtures/reports/provisioning_csv_unzipped/users.csv +4 -0
  43. metadata +452 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: bf2f1cdd4b4181e945c5f36e7680ed0a054429dc191197fafbee60de9598305b
+   data.tar.gz: 5fb781dc8aa17bf7d672fdfc8942d70365edb68fd324d8aeab70297270af0b1e
+ SHA512:
+   metadata.gz: 9212cd9c647193aa7256f15f6da12cd4ee3c56a12e011ac269a1d801d15e0cb7182a71c2fa8d8e2d2ea808aff73ff4f7c974c3720db54414eb43c24658ca554f
+   data.tar.gz: c4cb69ad7ea635833aa5051dec5a8c14f3aa13e2b11dd3e8fbdd4d12c2a9d63ac9dbb5b235915da0d80898d0b048d5677fb807d6947911e3e10a87956b8ee1fc
data/README.md ADDED
@@ -0,0 +1,35 @@
+ # InstDataShipper
+
+ This gem is intended to facilitate fast and easy syncing of Canvas data.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'inst_data_shipper'
+ ```
+
+ Then run the migrations:
+
+ ```
+ bundle exec rake db:migrate
+ ```
+
+ ## Development
+
+ When adding to or updating this gem, make sure you do the following:
+
+ - Update the yardoc comments where necessary, and confirm the changes by running `yard server`
+ - Write specs
+ - If you modify the model or migration templates, run `bundle exec rake update_test_schema` to update them in the Rails Dummy application (and commit those changes)
+
+ ## Docs
+
+ Docs can be generated using [yard](https://yardoc.org/). To view the docs:
+
+ - Clone this gem's repository
+ - `bundle install`
+ - `yard server --reload`
+
+ The yard server will give you a URL you can visit to view the docs.
data/Rakefile ADDED
@@ -0,0 +1,21 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+ require "open3"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task default: :spec
+
+ desc 'This updates the migrations used by the testing Rails Dummy app and should be run whenever those are updated.'
+ task :update_test_schema do
+   puts "Updating the test database and schema..."
+   stream_command("cd spec/dummy; bundle exec rake db:drop; bundle exec rake db:create; bundle exec rake db:migrate")
+ end
+
+ def stream_command(cmd)
+   Open3.popen2e(cmd) do |stdin, stdout_stderr, wait_thr|
+     while line = stdout_stderr.gets
+       puts line
+     end
+   end
+ end
data/app/models/hosted_data_dumper/dump_batch.rb ADDED
@@ -0,0 +1,10 @@
+ module InstDataShipper
+   class DumpBatch < ApplicationRecord
+     serialize :job_arguments, Array
+
+     ERROR_STATUS = "error".freeze
+     SUCCESS_STATUS = "success".freeze
+     ENQUEUED_STATUS = "enqueued".freeze
+     RUNNING_STATUS = "running".freeze
+   end
+ end
data/db/migrate/20240301090836_create_canvas_sync_sync_batches.rb ADDED
@@ -0,0 +1,17 @@
+ class InstDataShipperDumpBatches < CanvasSync::MiscHelper::MigrationClass
+   def change
+     create_table :inst_data_shipper_dump_batches do |t|
+       t.datetime :started_at
+       t.datetime :completed_at
+       t.string :status
+
+       t.string :job_class
+       t.string :exception
+       t.text :backtrace
+       t.text :metadata
+       t.text :job_arguments
+
+       t.timestamps
+     end
+   end
+ end
data/lib/inst_data_shipper/basic_dumper.rb ADDED
@@ -0,0 +1,27 @@
+ module InstDataShipper
+   class BasicDumper < Dumper
+     def self.perform_dump(destinations:, schema:, &block)
+       raise "Schema must be a constantizable string" unless schema.is_a?(String)
+
+       dumper = new(destinations)
+       dumper.instance_variable_set(:@schema_pointer, schema)
+       dumper.instance_variable_set(:@body_block, block)
+       dumper.begin_dump
+
+       dumper.tracker
+     end
+
+     hook :initialize_dump_batch do |context|
+       context[:schema_pointer] = @schema_pointer
+     end
+
+     def enqueue_tasks
+       instance_exec(&@body_block)
+     end
+
+     def table_schemas
+       pointer = @schema_pointer || batch_context[:schema_pointer]
+       safe_constantize(pointer)
+     end
+   end
+ end
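
For orientation, the following is a rough sketch of how `BasicDumper.perform_dump` appears intended to be invoked, based only on the signature above. The schema constant and destination string are hypothetical placeholders, not values defined by this gem.

```ruby
# Hypothetical usage sketch — "HD::TableSchemas" and the destination URI are placeholders.
InstDataShipper::BasicDumper.perform_dump(
  destinations: ["hosted-data://SOME_JWT@hosted-data.example.com"],
  schema: "HD::TableSchemas",
) do
  # enqueue_tasks instance_execs this block, so DataSources helpers are available here.
  import_local_table("User")
end
```
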
data/lib/inst_data_shipper/concerns/hooks.rb ADDED
@@ -0,0 +1,32 @@
+ module InstDataShipper
+   module Hooks
+     extend ActiveSupport::Concern
+
+     class_methods do
+       def define_hook(name)
+         @hooks ||= {}
+         @hooks[name] ||= []
+       end
+
+       def hook(name, prepend: false, &block)
+         hooks = @hooks[name]
+         prepend ? hooks.unshift(block) : hooks << block
+       end
+     end
+
+     def run_hook(name, *args, **kwargs)
+       hooks = @hooks[name]
+       hooks.each do |blk|
+         instance_exec(*args, **kwargs, &blk)
+       end
+     end
+
+     def run_hook_safe(name, *args, **kwargs)
+       hooks = @hooks[name]
+       hooks.each do |blk|
+         instance_exec(*args, **kwargs, &blk)
+       rescue StandardError
+       end
+     end
+   end
+ end
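
A rough sketch of how this concern appears to be used, inferred from `BasicDumper` above: `define_hook` registers a hook name at the class level, and `hook` appends (or prepends) blocks to it. The class and hook names below are illustrative only.

```ruby
# Illustrative only — class and hook names are placeholders, not part of the gem.
class ExamplePipeline
  include InstDataShipper::Hooks

  define_hook :before_run

  hook :before_run do |context|
    context[:started_at] = Time.now
  end

  # prepend: true runs this block ahead of previously registered blocks.
  hook :before_run, prepend: true do |context|
    context[:prepared] = true
  end
end
```
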
data/lib/inst_data_shipper/data_sources/base.rb ADDED
@@ -0,0 +1,7 @@
+ module InstDataShipper
+   module DataSources
+     module Base
+       extend ActiveSupport::Concern
+     end
+   end
+ end
data/lib/inst_data_shipper/data_sources/canvas_reports.rb ADDED
@@ -0,0 +1,113 @@
+ module InstDataShipper
+   module DataSources
+     # This module contains the logic for processing Canvas reports
+     module CanvasReports
+       extend ActiveSupport::Concern
+
+       included do
+         hook :initialize_dump_batch do |context|
+           report_processor_pool = CanvasSync::JobBatches::Pool.new(
+             description: "HD #{export_genre} Export #{tracker.id} Canvas Report Pool",
+             concurrency: 4,
+             clean_when_empty: false,
+           )
+           context[:report_processor_pool] = report_processor_pool.pid
+         end
+
+         hook :finalize_dump_batch do
+           if batch_context[:report_processor_pool]
+             CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool]).cleanup_redis
+           end
+         end
+       end
+
+       public
+
+       def import_canvas_report(*args, **kwargs)
+         _in_canvas_report_pool(:_import_canvas_report, *args, **kwargs)
+       end
+
+       def import_canvas_report_by_terms(target_table, report_name, terms: [], params: {}, **kwargs)
+         term_ids = (terms || []).map do |term|
+           term.is_a?(Term) ? term.canvas_id : term
+         end
+
+         Sidekiq::Batch.new.tap do |b|
+           b.description = "Term Scoped #{report_name} Runners"
+           b.context = {
+             report_bid: b.bid,
+           }
+           b.jobs do
+             terms_query = term_ids.present? ? Term.where(canvas_id: term_ids) : Term
+             terms_query.find_each do |t|
+               import_canvas_report(target_table, report_name, params: { **params, enrollment_term_id: t.canvas_id }, **kwargs)
+             end
+           end
+         end
+       end
+
+       def import_existing_report(table, report)
+         delayed(:_process_canvas_report, table, report: report)
+       end
+
+       private
+
+       def _import_canvas_report(target_table, report_name, retry_count: 3, params: {}, **kwargs)
+         report = canvas_sync_client.start_report(
+           'self', report_name,
+           parameters: params,
+         )
+
+         CanvasSync::Jobs::CanvasProcessWaiter.perform_later(
+           "/api/v1/accounts/self/reports/#{report_name}/#{report[:id]}",
+           {
+             instance_of: origin_class,
+             method: :_process_canvas_report,
+             args: [target_table],
+             kwargs: kwargs,
+           },
+           on_failure: {
+             instance_of: origin_class,
+             method: :_handle_failed_canvas_report,
+             args: [target_table, report_name, kwargs],
+             kwargs: { retry_count: retry_count },
+           },
+           status_key: :status,
+           progress_as: :report,
+         )
+       end
+
+       def _in_canvas_report_pool(mthd, *args, **kwargs)
+         pool = CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool])
+         AsyncCaller.call_from_pool(pool, self.class, mthd, *args, **kwargs)
+       end
+
+       def _process_canvas_report(table, report:)
+         table_def = table_schemas.find { |t| t[:warehouse_name].to_s == table }
+
+         IO.copy_stream(URI.parse(report['attachment']['url']).open, "#{working_dir}/#{table}.csv")
+
+         inner_block = ->(file) {
+           CSV.foreach("#{working_dir}/#{table}.csv", headers: true) do |m|
+             file << table_def[:columns].map do |c|
+               c[:transformer].present? ? m.instance_exec(&c[:transformer]) : m[c[:local_name].to_s]
+             end
+           end
+         }
+
+         upload_data(table_def, extra: report['id'], &inner_block)
+       end
+
+       def _handle_failed_canvas_report(table, report_name, kwargs, retry_count:, report:) # rubocop:disable Lint/UnusedMethodArgument
+         if retry_count.positive?
+           tbid = batch_context[:report_bid] || batch_context[:root_bid]
+           Sidekiq::Batch.new(tbid).jobs do
+             import_canvas_report(table, report_name, retry_count: retry_count - 1, **kwargs.symbolize_keys)
+           end
+         else
+           cleanup_fatal_error!
+         end
+       end
+     end
+   end
+ end
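
Inside a dump body, these helpers would presumably be called roughly as follows; the table names, report name, parameters, and the `already_fetched_report` variable are placeholders, not values defined by this gem.

```ruby
# Hypothetical dump body — table/report names and params are placeholders.
import_canvas_report("users", "provisioning_csv", params: { "parameters[users]" => true })

# Scope a report to specific enrollment terms (Term records or raw Canvas term ids):
import_canvas_report_by_terms("courses", "provisioning_csv", terms: [42, 43])

# Reuse a report that has already been run elsewhere:
import_existing_report("users", already_fetched_report)
```
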
data/lib/inst_data_shipper/data_sources/local_tables.rb ADDED
@@ -0,0 +1,33 @@
+ module InstDataShipper
+   module DataSources
+     # This module contains the logic for processing local AR tables
+     module LocalTables
+       extend ActiveSupport::Concern
+
+       public
+
+       def import_local_table(*args, **kwargs)
+         delayed(:_import_local_table, *args, **kwargs)
+       end
+
+       private
+
+       def _import_local_table(table_name)
+         table_def = table_schemas.find { |t| t[:model].to_s == table_name }
+         model = table_def[:model]
+
+         inner_block = ->(file) {
+           query = model
+           query = query.includes(table_def[:includes]) if table_def[:includes].present?
+           query.find_each do |m|
+             file << table_def[:columns].map do |c|
+               c[:transformer].present? ? m.instance_exec(&c[:transformer]) : m[c[:local_name].to_s]
+             end
+           end
+         }
+
+         upload_data(table_def, &inner_block)
+       end
+     end
+   end
+ end
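
The `table_schemas` entries used by both data sources appear to be plain hashes. The shape below is inferred from the keys read in this file and in `HostedData#convert_schema`; all values are illustrative.

```ruby
# Inferred schema-entry shape — values are illustrative, not part of the gem.
{
  model: User,                       # AR class; matched by import_local_table("User")
  warehouse_name: "users",           # table name at the destination
  description: "Local user records",
  incremental: false,
  columns: [
    { local_name: :id, warehouse_name: "canvas_id", type: "bigint" },
    { local_name: :name, warehouse_name: "name" },
    # When present, :transformer is instance_exec'd against each record/CSV row:
    { local_name: :email, warehouse_name: "email_domain", transformer: -> { email.to_s.split("@").last } },
  ],
}
```
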
data/lib/inst_data_shipper/destinations/base.rb ADDED
@@ -0,0 +1,104 @@
+ module InstDataShipper
+   module Destinations
+     class Base
+       attr_reader :dumper
+
+       delegate :tracker, :table_schemas, :working_dir, to: :dumper
+
+       def initialize(cache_key, config, dumper)
+         @cache_key = cache_key
+         @_config = config
+         @dumper = dumper
+       end
+
+       # This method is called before processing any data.
+       # It should be used to initialize any external resources needed for the dump.
+       def initialize_dump; end
+
+       # Yields an object (can be anything) that will be passed to `upload_data_chunk` as `chunk`.
+       #
+       # If multiple Destinations have the same `group_key`, `chunk_data` will only be called on the first and the chunk will be passed to each destination.
+       # Thus, if chunking is config-dependent, your Destination must modify the `group_key` to be unique for each configuration.
+       #
+       # This must be overridden, but you may call super with a block to iterate individual rows. Manually batch the rows, or include Concerns::Chunking to pre-batch them.
+       def chunk_data(generator, **kwargs)
+         raise NotImplementedError if method(__method__).owner == Base
+
+         enum = Enumerator.new(&generator)
+         enum.each do |row|
+           yield format_row(row)
+         end
+       end
+
+       # Called with any values yielded from chunk_data.
+       # This method should upload the chunk to the destination.
+       def upload_data_chunk(table_def, chunk)
+         raise NotImplementedError
+       end
+
+       # This method is called after processing all data.
+       # It should be used to finalize any external resources created by the dump.
+       def finalize_dump; end
+
+       # This method is called if a fatal error occurs.
+       # It should cleanup any external resources created by the dump.
+       def cleanup_fatal_error; end
+
+       def config
+         return @_config if @_config.is_a?(Hash)
+         @config ||= parse_configuration(@_config)
+       end
+
+       def user_config
+         config[:extra]
+       end
+
+       def group_key
+         { class: self.class }
+       end
+
+       protected
+
+       def parse_configuration(uri)
+         if block_given?
+           parsed = URI.parse(uri)
+           cfg = {
+             params: parsed.query.present? ? Rack::Utils.parse_nested_query(parsed.query) : {},
+             extra: (parsed.fragment.present? && parsed.fragment.match?(/^\w+=/) && Rack::Utils.parse_nested_query(parsed.fragment)).presence || parsed.fragment || nil,
+           }
+           yield parsed, cfg
+           cfg
+         else
+           raise NotImplementedError
+         end
+       end
+
+       def rk(key)
+         "#{@cache_key}:#{key}"
+       end
+
+       def redis(*args, &blk)
+         InstDataShipper.redis(*args, &blk)
+       end
+
+       # This is a base/generic implementation and may need to be overridden
+       def format_row(row, override_nils: true)
+         if row.is_a?(Array)
+           row = row.map do |v|
+             v = '\N' if v.nil? && override_nils
+             v = v.utc.strftime('%Y-%m-%d %H:%M:%S') if v.is_a?(DateTime) || v.is_a?(Time)
+             v = v.strftime('%Y-%m-%d') if v.is_a?(Date)
+             v = JSON.dump(v) if v.is_a?(Hash) || v.is_a?(Array)
+             if v.is_a?(String)
+               v = v.gsub("\t", '\t')
+               v = v.gsub("\n", '\n')
+             end
+             v
+           end
+         end
+         row
+       end
+
+     end
+   end
+ end
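
To illustrate the contract, a minimal custom destination might look like the sketch below. It is not part of this gem; it simply batches rows with `Concerns::Chunking`, writes each batch to a CSV, and "uploads" it by copying the file to a local directory.

```ruby
require "csv"
require "fileutils"

# Minimal illustrative Destination — a sketch of the contract, not part of the gem.
class LocalFileDestination < InstDataShipper::Destinations::Base
  include InstDataShipper::Destinations::Concerns::Chunking

  # Pre-batch rows via Chunking, write each batch to a CSV, and yield the file path as the chunk.
  def chunk_data(generator, table:, extra: nil)
    super(generator) do |batch, idx|
      path = File.join(working_dir, "#{table[:warehouse_name]}.#{idx}.csv")
      CSV.open(path, "w") { |csv| batch.each { |row| csv << row } }
      yield path
      File.delete(path)
    end
  end

  # Receives whatever chunk_data yielded — here, a file path.
  def upload_data_chunk(table_def, chunk)
    FileUtils.cp(chunk, File.join("/tmp", File.basename(chunk)))
  end
end
```
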
data/lib/inst_data_shipper/destinations/concerns/chunking.rb ADDED
@@ -0,0 +1,34 @@
+ module InstDataShipper
+   module Destinations
+     module Concerns
+       module Chunking
+         extend ActiveSupport::Concern
+
+         DEFAULT_CHUNK_SIZE = 100_000
+
+         def chunk_data(generator, chunk_size: nil, **kwargs)
+           chunk_size ||= config.dig(:params, :chunk_size) || DEFAULT_CHUNK_SIZE
+           slice = 1
+
+           btchr = CanvasSync::BatchProcessor.new(of: chunk_size) do |batch|
+             yield batch, slice
+             slice += 1
+           end
+
+           super(generator, **kwargs) do |row|
+             btchr << row
+           end
+
+           btchr.flush
+         end
+
+         def group_key
+           super.tap do |k|
+             k[:chunk_size] = config.dig(:params, :chunk_size) || DEFAULT_CHUNK_SIZE
+           end
+         end
+
+       end
+     end
+   end
+ end
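
Since `chunk_size` is read from `config.dig(:params, :chunk_size)`, and `params` comes from the destination URI's query string (see `Base#parse_configuration`), the chunk size can presumably be tuned per destination via the URI, for example:

```ruby
# Presumed example — query-string params become config[:params] via Rack::Utils.parse_nested_query.
destination = "hosted-data://SOME_JWT@hosted-data.example.com?chunk_size=50000"
```

Note that the concern looks the value up with a symbol key, so this presumably relies on the parsed config being indifferent-access (or on the default being used otherwise).
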
data/lib/inst_data_shipper/destinations/hosted_data.rb ADDED
@@ -0,0 +1,133 @@
+ module InstDataShipper
+   module Destinations
+     class HostedData < Base
+       include Concerns::Chunking
+
+       def initialize_dump
+         dump = hosted_data_client.post(
+           'api/v1/custom_dumps/',
+           reference_id: tracker.id,
+           schema: convert_schema,
+         ).body.with_indifferent_access
+
+         redis.hset(rk(:state), :dump_id, dump[:id])
+         redis.expire(rk(:state), 30.days.to_i)
+       end
+
+       def chunk_data(generator, table:, extra: nil)
+         warehouse_name = table[:warehouse_name]
+
+         super(generator) do |batch, idx|
+           bits = [warehouse_name, extra, idx].compact
+           temp_file = "#{working_dir}/#{bits.join('.')}.tsv.gz"
+
+           Zlib::GzipWriter.open(temp_file) do |gz|
+             batch.each do |row|
+               row = row.join("\t") if row.is_a?(Array)
+               gz.puts(row)
+             end
+           end
+
+           yield temp_file
+
+           File.delete(temp_file)
+         end
+       end
+
+       def upload_data_chunk(table_def, chunk)
+         hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", artifacts: {
+           table_def[:warehouse_name] => [Faraday::UploadIO.new(chunk, 'application/gzip')],
+         })
+       end
+
+       def finalize_dump
+         hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", start_import: true) if hd_dump_id.present?
+         redis.delete(rk(:state))
+       end
+
+       def cleanup_fatal_error
+         hosted_data_client.delete("api/v1/custom_dumps/#{hd_dump_id}/", reason: 'Failure during extraction or transformation') if hd_dump_id.present?
+         redis.delete(rk(:state))
+       end
+
+       # TODO Support/allow single-table fatal errors?
+
+       protected
+
+       def hd_dump_id
+         @hd_dump_id ||= redis.hget(rk(:state), :dump_id)
+       end
+
+       def convert_schema
+         table_prefix = config[:table_prefix]
+         table_prefix = table_prefix.present? ? "#{table_prefix}_" : nil
+
+         definitions = {}
+         table_schemas.each do |ts|
+           ts = ts.dup
+
+           table_name = ts[:warehouse_name]
+           table_name = table_prefix + table_name if table_prefix.present?
+
+           definitions[ts[:warehouse_name]] = {
+             dw_type: 'dimension',
+             description: ts[:description],
+             incremental: !!ts[:incremental],
+             incremental_on: ts[:incremental] && ts[:incremental] != true ? ts[:incremental] : nil,
+             # indexed_columns
+             tableName: table_name,
+             columns: ts[:columns].map do |col|
+               {
+                 name: col[:warehouse_name],
+                 description: col[:description],
+                 type: col[:type] || ts[:model].column_for_attribute(col[:local_name]).sql_type,
+               }
+             end,
+           }
+         end
+
+         {
+           version: "#{dumper.export_genre.downcase}-#{Digest::MD5.hexdigest(definitions.to_json)[0...6]}",
+           definition: definitions,
+         }
+       end
+
+       def hosted_data_client
+         @hosted_data_client ||= begin
+           token = config[:token]
+
+           host = config[:host]
+           unless host.present?
+             tok_content = JWT.decode(token, nil, false).first
+             host = tok_content['host']
+           end
+
+           Faraday.new(url: host) do |faraday|
+             faraday.request :multipart
+             faraday.request :json
+             faraday.response :raise_error
+             faraday.response :follow_redirects
+             faraday.response :json, :content_type => /\bjson$/
+             faraday.headers[:accept] = 'application/json'
+             faraday.headers[:authorization] = "Bearer #{token}"
+             faraday.adapter Faraday.default_adapter
+           end
+         end
+       end
+
+       def parse_configuration(uri)
+         super do |parsed_uri, cfg|
+           if parsed_uri.username.present?
+             # hosted-data://<JWT>@<hosted_data_domain>
+             cfg[:token] = parsed_uri.username
+             cfg[:host] = parsed_uri.host
+           else
+             # hosted-data://<JWT>
+             cfg[:token] = parsed_uri.host
+           end
+         end
+       end
+
+     end
+   end
+ end
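
For reference, the two configuration string forms accepted by `parse_configuration` above would presumably look like the following; the token and host values are placeholders.

```ruby
# Placeholder values — forms inferred from HostedData#parse_configuration.
"hosted-data://SOME_JWT"                               # host is read from the JWT's `host` claim
"hosted-data://SOME_JWT@hosted-data.example.com"       # token plus an explicit Hosted Data host
```
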
data/lib/inst_data_shipper/destinations/s3.rb ADDED
@@ -0,0 +1,72 @@
+ module InstDataShipper
+   module Destinations
+     class S3 < Base
+       include Concerns::Chunking
+
+       def chunk_data(generator, table:, extra: nil)
+         warehouse_name = table[:warehouse_name]
+
+         super(generator) do |batch, idx|
+           bits = [warehouse_name, extra, idx].compact
+           temp_file = "#{working_dir}/#{bits.join('.')}.csv"
+
+           CSV.open(temp_file, 'w', headers: false) do |csv|
+             csv << table[:columns].map { |c| c[:warehouse_name] }
+             batch.each do |batch_row|
+               csv << batch_row
+             end
+           end
+
+           yield temp_file
+
+           File.delete(temp_file)
+         end
+       end
+
+       def upload_data_chunk(table_def, chunk)
+         s3 = Aws::S3::Resource.new(client: aws_client)
+         dir_key = tracker.created_at.strftime("%Y-%m-%dT%H:%M") + "_#{tracker.id}"
+         bucket = s3.bucket(config[:bucket])
+
+         subpath = config[:path].presence || "/"
+         subpath = subpath[1..-1] if subpath.starts_with?("/")
+         subpath = "instructure" unless subpath.present?
+
+         obj_path = File.join(subpath, dir_key, File.basename(chunk))
+         object = bucket.object(obj_path)
+
+         File.open(chunk, 'rb') do |file|
+           object.put(body: file)
+         end
+       end
+
+       protected
+
+       def aws_client
+         @aws_client ||= Aws::S3::Client.new(
+           region: config[:region],
+           credentials: Aws::Credentials.new(
+             config[:access_key_id],
+             config[:access_key_secret],
+           )
+         )
+       end
+
+       def parse_configuration(uri)
+         # s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>
+         super do |parsed_uri, cfg|
+           split_path = parsed_uri.path.split('/')
+
+           cfg.merge!({
+             region: parsed_uri.host,
+             bucket: split_path[0],
+             access_key_id: parsed_uri.user,
+             access_key_secret: parsed_uri.password,
+             path: split_path[1..-1].join('/').presence,
+           })
+         end
+       end
+
+     end
+   end
+ end
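
Putting the pieces together, a dump targeting both destinations might be kicked off roughly like this; every credential, bucket, schema constant, and table name below is a placeholder.

```ruby
# All values are placeholders; the URI formats follow the parse_configuration comments above.
destinations = [
  "hosted-data://SOME_JWT@hosted-data.example.com",
  "s3://ACCESS_KEY_ID:ACCESS_KEY_SECRET@us-east-1/my-bucket/exports/canvas",
]

InstDataShipper::BasicDumper.perform_dump(destinations: destinations, schema: "HD::TableSchemas") do
  import_local_table("User")
  import_canvas_report("courses", "provisioning_csv")
end
```
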