inst_data_shipper 0.1.0.beta1

Files changed (43)
  1. checksums.yaml +7 -0
  2. data/README.md +35 -0
  3. data/Rakefile +21 -0
  4. data/app/models/hosted_data_dumper/dump_batch.rb +10 -0
  5. data/db/migrate/20240301090836_create_canvas_sync_sync_batches.rb +17 -0
  6. data/lib/inst_data_shipper/basic_dumper.rb +27 -0
  7. data/lib/inst_data_shipper/concerns/hooks.rb +32 -0
  8. data/lib/inst_data_shipper/data_sources/base.rb +7 -0
  9. data/lib/inst_data_shipper/data_sources/canvas_reports.rb +113 -0
  10. data/lib/inst_data_shipper/data_sources/local_tables.rb +33 -0
  11. data/lib/inst_data_shipper/destinations/base.rb +104 -0
  12. data/lib/inst_data_shipper/destinations/concerns/chunking.rb +34 -0
  13. data/lib/inst_data_shipper/destinations/hosted_data.rb +133 -0
  14. data/lib/inst_data_shipper/destinations/s3.rb +72 -0
  15. data/lib/inst_data_shipper/dumper.rb +159 -0
  16. data/lib/inst_data_shipper/engine.rb +8 -0
  17. data/lib/inst_data_shipper/jobs/async_caller.rb +19 -0
  18. data/lib/inst_data_shipper/jobs/base.rb +27 -0
  19. data/lib/inst_data_shipper/jobs/basic_dump_job.rb +11 -0
  20. data/lib/inst_data_shipper/record.rb +6 -0
  21. data/lib/inst_data_shipper/schema_builder.rb +93 -0
  22. data/lib/inst_data_shipper/version.rb +3 -0
  23. data/lib/inst_data_shipper.rb +71 -0
  24. data/spec/dummy/README.rdoc +1 -0
  25. data/spec/dummy/Rakefile +6 -0
  26. data/spec/dummy/bin/rails +4 -0
  27. data/spec/dummy/config/application.rb +37 -0
  28. data/spec/dummy/config/boot.rb +5 -0
  29. data/spec/dummy/config/database.yml +25 -0
  30. data/spec/dummy/config/environment.rb +5 -0
  31. data/spec/dummy/config/environments/development.rb +41 -0
  32. data/spec/dummy/config/environments/test.rb +44 -0
  33. data/spec/dummy/config/initializers/assets.rb +11 -0
  34. data/spec/dummy/config/initializers/session_store.rb +3 -0
  35. data/spec/dummy/config/initializers/wrap_parameters.rb +14 -0
  36. data/spec/dummy/config/routes.rb +2 -0
  37. data/spec/dummy/config/secrets.yml +22 -0
  38. data/spec/dummy/config.ru +4 -0
  39. data/spec/dummy/db/schema.rb +45 -0
  40. data/spec/spec_helper.rb +70 -0
  41. data/spec/support/fixtures/reports/provisioning_csv_unzipped/courses.csv +3 -0
  42. data/spec/support/fixtures/reports/provisioning_csv_unzipped/users.csv +4 -0
  43. metadata +452 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: bf2f1cdd4b4181e945c5f36e7680ed0a054429dc191197fafbee60de9598305b
+   data.tar.gz: 5fb781dc8aa17bf7d672fdfc8942d70365edb68fd324d8aeab70297270af0b1e
+ SHA512:
+   metadata.gz: 9212cd9c647193aa7256f15f6da12cd4ee3c56a12e011ac269a1d801d15e0cb7182a71c2fa8d8e2d2ea808aff73ff4f7c974c3720db54414eb43c24658ca554f
+   data.tar.gz: c4cb69ad7ea635833aa5051dec5a8c14f3aa13e2b11dd3e8fbdd4d12c2a9d63ac9dbb5b235915da0d80898d0b048d5677fb807d6947911e3e10a87956b8ee1fc
data/README.md ADDED
@@ -0,0 +1,35 @@
+ # InstDataShipper
+
+ This gem is intended to facilitate fast and easy syncing of Canvas data.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'inst_data_shipper'
+ ```
+
+ Then run the migrations:
+
+ ```
+ bundle exec rake db:migrate
+ ```
+
+ ## Development
+
+ When adding to or updating this gem, make sure you do the following:
+
+ - Update the yardoc comments where necessary, and confirm the changes by running `yard server`
+ - Write specs
+ - If you modify the model or migration templates, run `bundle exec rake update_test_schema` to update them in the Rails Dummy application (and commit those changes)
+
+ ## Docs
+
+ Docs can be generated using [yard](https://yardoc.org/). To view the docs:
+
+ - Clone this gem's repository
+ - `bundle install`
+ - `yard server --reload`
+
+ The yard server will give you a URL you can visit to view the docs.
data/Rakefile ADDED
@@ -0,0 +1,21 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+ require "open3"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task default: :spec
+
+ desc 'This updates the migrations used by the testing Rails Dummy app and should be run whenever those are updated.'
+ task :update_test_schema do
+   puts "Updating the test database and schema..."
+   stream_command("cd spec/dummy; bundle exec rake db:drop; bundle exec rake db:create; bundle exec rake db:migrate")
+ end
+
+ def stream_command(cmd)
+   Open3.popen2e(cmd) do |stdin, stdout_stderr, wait_thr|
+     while line = stdout_stderr.gets
+       puts line
+     end
+   end
+ end
data/app/models/hosted_data_dumper/dump_batch.rb ADDED
@@ -0,0 +1,10 @@
+ module InstDataShipper
+   class DumpBatch < ApplicationRecord
+     serialize :job_arguments, Array
+
+     ERROR_STATUS = "error".freeze
+     SUCCESS_STATUS = "success".freeze
+     ENQUEUED_STATUS = "enqueued".freeze
+     RUNNING_STATUS = "running".freeze
+   end
+ end
data/db/migrate/20240301090836_create_canvas_sync_sync_batches.rb ADDED
@@ -0,0 +1,17 @@
+ class InstDataShipperDumpBatches < CanvasSync::MiscHelper::MigrationClass
+   def change
+     create_table :inst_data_shipper_dump_batches do |t|
+       t.datetime :started_at
+       t.datetime :completed_at
+       t.string :status
+
+       t.string :job_class
+       t.string :exception
+       t.text :backtrace
+       t.text :metadata
+       t.text :job_arguments
+
+       t.timestamps
+     end
+   end
+ end
data/lib/inst_data_shipper/basic_dumper.rb ADDED
@@ -0,0 +1,27 @@
+ module InstDataShipper
+   class BasicDumper < Dumper
+     def self.perform_dump(destinations:, schema:, &block)
+       raise "Schema must be a constantizable string" unless schema.is_a?(String)
+
+       dumper = new(destinations)
+       dumper.instance_variable_set(:@schema_pointer, schema)
+       dumper.instance_variable_set(:@body_block, block)
+       dumper.begin_dump
+
+       dumper.tracker
+     end
+
+     hook :initialize_dump_batch do |context|
+       context[:schema_pointer] = @schema_pointer
+     end
+
+     def enqueue_tasks
+       instance_exec(&@body_block)
+     end
+
+     def table_schemas
+       pointer = @schema_pointer || batch_context[:schema_pointer]
+       safe_constantize(pointer)
+     end
+   end
+ end
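
For orientation: `perform_dump` captures the destination list, a schema constant name, and a block that later runs as `enqueue_tasks`. A minimal, hypothetical invocation sketch; the destination URIs, environment variables, `MY_SCHEMA` constant, and table/report names below are placeholders, not part of this gem:

```ruby
# Illustrative only; everything named here is a placeholder.
InstDataShipper::BasicDumper.perform_dump(
  destinations: [
    "hosted-data://#{ENV['HOSTED_DATA_JWT']}",                                 # format per hosted_data.rb below
    "s3://#{ENV['AWS_KEY']}:#{ENV['AWS_SECRET']}@us-east-1/my-bucket/exports", # format per s3.rb below
  ],
  schema: "MY_SCHEMA", # resolved later via safe_constantize in #table_schemas
) do
  # Runs as the dumper's #enqueue_tasks, so the data-source helpers defined
  # in lib/inst_data_shipper/data_sources/ are available here.
  import_local_table("User")
  import_canvas_report("courses", "provisioning_csv", params: { "courses" => true })
end
```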
data/lib/inst_data_shipper/concerns/hooks.rb ADDED
@@ -0,0 +1,32 @@
+ module InstDataShipper
+   module Hooks
+     extend ActiveSupport::Concern
+
+     class_methods do
+       def define_hook(name)
+         @hooks ||= {}
+         @hooks[name] ||= []
+       end
+
+       def hook(name, prepend: false, &block)
+         hooks = @hooks[name]
+         prepend ? hooks.unshift(block) : hooks << block
+       end
+     end
+
+     def run_hook(name, *args, **kwargs)
+       hooks = @hooks[name]
+       hooks.each do |blk|
+         instance_exec(*args, **kwargs, &blk)
+       end
+     end
+
+     def run_hook_safe(name, *args, **kwargs)
+       hooks = @hooks[name]
+       hooks.each do |blk|
+         instance_exec(*args, **kwargs, &blk)
+       rescue StandardError
+       end
+     end
+   end
+ end
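
Read alongside `basic_dumper.rb` above, the intended flow appears to be: `define_hook` declares a named hook list on the class, `hook` appends (or prepends) a block to it, and `run_hook`/`run_hook_safe` replay the registered blocks against an instance via `instance_exec`. A rough sketch, assuming the class-level list is the one consulted at run time; the class and context key below are made up:

```ruby
# Illustrative sketch of the hook API; ExampleDumper is not part of this gem.
class ExampleDumper
  include InstDataShipper::Hooks

  define_hook :initialize_dump_batch

  hook :initialize_dump_batch do |context|
    context[:started_by] = "example"   # appended; pass prepend: true to run first
  end
end

# Inside an instance, the registered blocks would then be replayed with
# run_hook(:initialize_dump_batch, context); run_hook_safe does the same
# but swallows StandardError from each block instead of aborting the dump.
```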
data/lib/inst_data_shipper/data_sources/base.rb ADDED
@@ -0,0 +1,7 @@
+ module InstDataShipper
+   module DataSources
+     module Base
+       extend ActiveSupport::Concern
+     end
+   end
+ end
data/lib/inst_data_shipper/data_sources/canvas_reports.rb ADDED
@@ -0,0 +1,113 @@
+ module InstDataShipper
+   module DataSources
+     # This module contains the logic for processing Canvas reports
+     module CanvasReports
+       extend ActiveSupport::Concern
+
+       included do
+         hook :initialize_dump_batch do |context|
+           report_processor_pool = CanvasSync::JobBatches::Pool.new(
+             description: "HD #{export_genre} Export #{tracker.id} Canvas Report Pool",
+             concurrency: 4,
+             clean_when_empty: false,
+           )
+           context[:report_processor_pool] = report_processor_pool.pid
+         end
+
+         hook :finalize_dump_batch do
+           if batch_context[:report_processor_pool]
+             CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool]).cleanup_redis
+           end
+         end
+       end
+
+       public
+
+       def import_canvas_report(*args, **kwargs)
+         _in_canvas_report_pool(:_import_canvas_report, *args, **kwargs)
+       end
+
+       def import_canvas_report_by_terms(target_table, report_name, terms: [], params: {}, **kwargs)
+         term_ids = (terms || []).map do |term|
+           term.is_a?(Term) ? term.canvas_id : term
+         end
+
+         Sidekiq::Batch.new.tap do |b|
+           b.description = "Term Scoped #{report_name} Runners"
+           b.context = {
+             report_bid: b.bid,
+           }
+           b.jobs do
+             terms_query = term_ids.present? ? Term.where(canvas_id: term_ids) : Term
+             terms_query.find_each do |t|
+               import_canvas_report(target_table, report_name, params: { **params, enrollment_term_id: t.canvas_id }, **kwargs)
+             end
+           end
+         end
+       end
+
+       def import_existing_report(table, report)
+         delayed(:_process_canvas_report, table, report: report)
+       end
+
+       private
+
+       def _import_canvas_report(target_table, report_name, retry_count: 3, params: {}, **kwargs)
+         report = canvas_sync_client.start_report(
+           'self', report_name,
+           parameters: params,
+         )
+
+         CanvasSync::Jobs::CanvasProcessWaiter.perform_later(
+           "/api/v1/accounts/self/reports/#{report_name}/#{report[:id]}",
+           {
+             instance_of: origin_class,
+             method: :_process_canvas_report,
+             args: [target_table],
+             kwargs: kwargs,
+           },
+           on_failure: {
+             instance_of: origin_class,
+             method: :_handle_failed_canvas_report,
+             args: [target_table, report_name, kwargs],
+             kwargs: { retry_count: retry_count },
+           },
+           status_key: :status,
+           progress_as: :report,
+         )
+       end
+
+       def _in_canvas_report_pool(mthd, *args, **kwargs)
+         pool = CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool])
+         AsyncCaller.call_from_pool(pool, self.class, mthd, *args, **kwargs)
+       end
+
+       def _process_canvas_report(table, report:)
+         table_def = table_schemas.find { |t| t[:warehouse_name].to_s == table }
+
+         IO.copy_stream(URI.parse(report['attachment']['url']).open, "#{working_dir}/#{table}.csv")
+
+         inner_block = ->(file) {
+           CSV.foreach("#{working_dir}/#{table}.csv", headers: true) do |m|
+             file << table_def[:columns].map do |c|
+               c[:transformer].present? ? m.instance_exec(&c[:transformer]) : m[c[:local_name].to_s]
+             end
+           end
+         }
+
+         upload_data(table_def, extra: report['id'], &inner_block)
+       end
+
+       def _handle_failed_canvas_report(table, report_name, kwargs, retry_count:, report:) # rubocop:disable Lint/UnusedMethodArgument
+         if retry_count.positive?
+           tbid = batch_context[:report_bid] || batch_context[:root_bid]
+           Sidekiq::Batch.new(tbid).jobs do
+             import_canvas_report(table, report_name, retry_count: retry_count - 1, **kwargs.symbolize_keys)
+           end
+         else
+           cleanup_fatal_error!
+         end
+       end
+     end
+   end
+ end
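
In practice these helpers would be called from a dumper's `enqueue_tasks` block. A hedged usage sketch; the table names, report name, parameters, and term ids below are assumptions, not values defined by this gem:

```ruby
# Hypothetical calls from inside a dumper's enqueue_tasks.
def enqueue_tasks
  # One provisioning report per term, feeding the "courses" table:
  import_canvas_report_by_terms("courses", "provisioning_csv",
                                terms: [42, 43], params: { "courses" => true })

  # A single, unscoped report for the "users" table:
  import_canvas_report("users", "provisioning_csv", params: { "users" => true })
end
```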
data/lib/inst_data_shipper/data_sources/local_tables.rb ADDED
@@ -0,0 +1,33 @@
+ module InstDataShipper
+   module DataSources
+     # This module contains the logic for processing local AR tables
+     module LocalTables
+       extend ActiveSupport::Concern
+
+       public
+
+       def import_local_table(*args, **kwargs)
+         delayed(:_import_local_table, *args, **kwargs)
+       end
+
+       private
+
+       def _import_local_table(table_name)
+         table_def = table_schemas.find { |t| t[:model].to_s == table_name }
+         model = table_def[:model]
+
+         inner_block = ->(file) {
+           query = model
+           query = query.includes(table_def[:includes]) if table_def[:includes].present?
+           query.find_each do |m|
+             file << table_def[:columns].map do |c|
+               c[:transformer].present? ? m.instance_exec(&c[:transformer]) : m[c[:local_name].to_s]
+             end
+           end
+         }
+
+         upload_data(table_def, &inner_block)
+       end
+     end
+   end
+ end
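
Both data sources look tables up in `table_schemas`, so the entries presumably look roughly like the sketch below. The exact keys are produced by `InstDataShipper::SchemaBuilder` (listed in this release but not shown here), so treat this as inferred, not authoritative:

```ruby
# Inferred from the lookups above (t[:model], t[:warehouse_name], c[:local_name], ...).
EXAMPLE_TABLE_SCHEMAS = [
  {
    model: User,                       # AR class iterated by import_local_table
    warehouse_name: "users",           # name used by destinations and report imports
    description: "Local users table",
    incremental: false,
    columns: [
      { local_name: :id, warehouse_name: :id, type: "bigint" },
      { local_name: :name, warehouse_name: :full_name,
        transformer: -> { name&.strip } },  # instance_exec'd against each source row
    ],
  },
]
```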
data/lib/inst_data_shipper/destinations/base.rb ADDED
@@ -0,0 +1,104 @@
+ module InstDataShipper
+   module Destinations
+     class Base
+       attr_reader :dumper
+
+       delegate :tracker, :table_schemas, :working_dir, to: :dumper
+
+       def initialize(cache_key, config, dumper)
+         @cache_key = cache_key
+         @_config = config
+         @dumper = dumper
+       end
+
+       # This method is called before processing any data.
+       # It should be used to initialize any external resources needed for the dump.
+       def initialize_dump; end
+
+       # Yields an object (can be anything) that will be passed to `upload_data_chunk` as `chunk`.
+       #
+       # If multiple Destinations have the same `group_key`, `chunk_data` will only be called on the first and the chunk will be passed to each destination.
+       # Thus, if chunking is config-dependent, your Destination must modify the `group_key` to be unique for each configuration.
+       #
+       # This must be overridden, but you may call super with a block to iterate individual rows. Manually batch the rows, or include Concerns::Chunking to pre-batch them.
+       def chunk_data(generator, **kwargs)
+         raise NotImplementedError if method(__method__).owner == Base
+
+         enum = Enumerator.new(&generator)
+         enum.each do |row|
+           yield format_row(row)
+         end
+       end
+
+       # Called with any values yielded from chunk_data.
+       # This method should upload the chunk to the destination.
+       def upload_data_chunk(table_def, chunk)
+         raise NotImplementedError
+       end
+
+       # This method is called after processing all data.
+       # It should be used to finalize any external resources created by the dump.
+       def finalize_dump; end
+
+       # This method is called if a fatal error occurs.
+       # It should cleanup any external resources created by the dump.
+       def cleanup_fatal_error; end
+
+       def config
+         return @_config if @_config.is_a?(Hash)
+         @config ||= parse_configuration(@_config)
+       end
+
+       def user_config
+         config[:extra]
+       end
+
+       def group_key
+         { class: self.class }
+       end
+
+       protected
+
+       def parse_configuration(uri)
+         if block_given?
+           parsed = URI.parse(uri)
+           cfg = {
+             params: parsed.query.present? ? Rack::Utils.parse_nested_query(parsed.query) : {},
+             extra: (parsed.fragment.present? && parsed.fragment.match?(/^\w+=/) && Rack::Utils.parse_nested_query(parsed.fragment)).presence || parsed.fragment || nil,
+           }
+           yield parsed, cfg
+           cfg
+         else
+           raise NotImplementedError
+         end
+       end
+
+       def rk(key)
+         "#{@cache_key}:#{key}"
+       end
+
+       def redis(*args, &blk)
+         InstDataShipper.redis(*args, &blk)
+       end
+
+       # This is a base/generic implementation and may need to be overridden
+       def format_row(row, override_nils: true)
+         if row.is_a?(Array)
+           row = row.map do |v|
+             v = '\N' if v.nil? && override_nils
+             v = v.utc.strftime('%Y-%m-%d %H:%M:%S') if v.is_a?(DateTime) || v.is_a?(Time)
+             v = v.strftime('%Y-%m-%d') if v.is_a?(Date)
+             v = JSON.dump(v) if v.is_a?(Hash) || v.is_a?(Array)
+             if v.is_a?(String)
+               v = v.gsub("\t", '\t')
+               v = v.gsub("\n", '\n')
+             end
+             v
+           end
+         end
+         row
+       end
+
+     end
+   end
+ end
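
The comments above spell out the subclass contract: override `chunk_data` (optionally delegating to `super` and/or `Concerns::Chunking` for batching) and `upload_data_chunk` to ship each yielded chunk. A bare-bones, hypothetical destination following that contract; the class and its output are invented for illustration:

```ruby
# Illustrative only: a destination that just logs chunk sizes.
module InstDataShipper
  module Destinations
    class Debug < Base
      include Concerns::Chunking   # pre-batches rows before they reach the block below

      def chunk_data(generator, table:, extra: nil)
        super(generator) do |batch, idx|
          yield [table[:warehouse_name], idx, batch]
        end
      end

      def upload_data_chunk(_table_def, chunk)
        name, idx, rows = chunk
        Rails.logger.info("#{name} chunk #{idx}: #{rows.length} rows")
      end
    end
  end
end
```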
data/lib/inst_data_shipper/destinations/concerns/chunking.rb ADDED
@@ -0,0 +1,34 @@
+ module InstDataShipper
+   module Destinations
+     module Concerns
+       module Chunking
+         extend ActiveSupport::Concern
+
+         DEFAULT_CHUNK_SIZE = 100_000
+
+         def chunk_data(generator, chunk_size: nil, **kwargs)
+           chunk_size ||= config.dig(:params, :chunk_size) || DEFAULT_CHUNK_SIZE
+           slice = 1
+
+           btchr = CanvasSync::BatchProcessor.new(of: chunk_size) do |batch|
+             yield batch, slice
+             slice += 1
+           end
+
+           super(generator, **kwargs) do |row|
+             btchr << row
+           end
+
+           btchr.flush
+         end
+
+         def group_key
+           super.tap do |k|
+             k[:chunk_size] = config.dig(:params, :chunk_size) || DEFAULT_CHUNK_SIZE
+           end
+         end
+
+       end
+     end
+   end
+ end
data/lib/inst_data_shipper/destinations/hosted_data.rb ADDED
@@ -0,0 +1,133 @@
+ module InstDataShipper
+   module Destinations
+     class HostedData < Base
+       include Concerns::Chunking
+
+       def initialize_dump
+         dump = hosted_data_client.post(
+           'api/v1/custom_dumps/',
+           reference_id: tracker.id,
+           schema: convert_schema,
+         ).body.with_indifferent_access
+
+         redis.hset(rk(:state), :dump_id, dump[:id])
+         redis.expire(rk(:state), 30.days.to_i)
+       end
+
+       def chunk_data(generator, table:, extra: nil)
+         warehouse_name = table[:warehouse_name]
+
+         super(generator) do |batch, idx|
+           bits = [warehouse_name, extra, idx].compact
+           temp_file = "#{working_dir}/#{bits.join('.')}.tsv.gz"
+
+           Zlib::GzipWriter.open(temp_file) do |gz|
+             batch.each do |row|
+               row = row.join("\t") if row.is_a?(Array)
+               gz.puts(row)
+             end
+           end
+
+           yield temp_file
+
+           File.delete(temp_file)
+         end
+       end
+
+       def upload_data_chunk(table_def, chunk)
+         hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", artifacts: {
+           table_def[:warehouse_name] => [Faraday::UploadIO.new(chunk, 'application/gzip')],
+         })
+       end
+
+       def finalize_dump
+         hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", start_import: true) if hd_dump_id.present?
+         redis.del(rk(:state))
+       end
+
+       def cleanup_fatal_error
+         hosted_data_client.delete("api/v1/custom_dumps/#{hd_dump_id}/", reason: 'Failure during extraction or transformation') if hd_dump_id.present?
+         redis.del(rk(:state))
+       end
+
+       # TODO Support/allow single-table fatal errors?
+
+       protected
+
+       def hd_dump_id
+         @hd_dump_id ||= redis.hget(rk(:state), :dump_id)
+       end
+
+       def convert_schema
+         table_prefix = config[:table_prefix]
+         table_prefix = table_prefix.present? ? "#{table_prefix}_" : nil
+
+         definititions = {}
+         table_schemas.each do |ts|
+           ts = ts.dup
+
+           table_name = ts[:warehouse_name]
+           table_name = table_prefix + table_name if table_prefix.present?
+
+           definititions[ts[:warehouse_name]] = {
+             dw_type: 'dimension',
+             description: ts[:description],
+             incremental: !!ts[:incremental],
+             incremental_on: ts[:incremental] && ts[:incremental] != true ? ts[:incremental] : nil,
+             # indexed_columns
+             tableName: table_name,
+             columns: ts[:columns].map do |col|
+               {
+                 name: col[:warehouse_name],
+                 description: col[:description],
+                 type: col[:type] || ts[:model].column_for_attribute(col[:local_name]).sql_type,
+               }
+             end,
+           }
+         end
+
+         {
+           version: "#{dumper.export_genre.downcase}-#{Digest::MD5.hexdigest(definititions.to_json)[0...6]}",
+           definition: definititions,
+         }
+       end
+
+       def hosted_data_client
+         @hosted_data_client ||= begin
+           token = config[:token]
+
+           host = config[:host]
+           unless host.present?
+             tok_content = JWT.decode(token, nil, false).first
+             host = tok_content['host']
+           end
+
+           Faraday.new(url: host) do |faraday|
+             faraday.request :multipart
+             faraday.request :json
+             faraday.response :raise_error
+             faraday.response :follow_redirects
+             faraday.response :json, :content_type => /\bjson$/
+             faraday.headers[:accept] = 'application/json'
+             faraday.headers[:authorization] = "Bearer #{token}"
+             faraday.adapter Faraday.default_adapter
+           end
+         end
+       end
+
+       def parse_configuration(uri)
+         super do |parsed_uri, cfg|
+           if parsed_uri.username.present?
+             # hosted-data://<JWT>:<hosted_data_domain>
+             cfg[:token] = parsed_uri.username
+             cfg[:host] = parsed_uri.host
+           else
+             # hosted-data://<JWT>
+             cfg[:token] = parsed_uri.host
+           end
+         end
+       end
+
+     end
+   end
+ end
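
The two accepted configuration forms come straight from the `parse_configuration` comments above; as destination strings they would look like the following, where the JWT and host are placeholders:

```ruby
# Placeholders only; formats follow the comments in parse_configuration above.
destinations = [
  "hosted-data://#{jwt}",                          # host read from the JWT's `host` claim
  "hosted-data://#{jwt}:hosteddata.example.com",   # explicit Hosted Data host
]
```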
data/lib/inst_data_shipper/destinations/s3.rb ADDED
@@ -0,0 +1,72 @@
+ module InstDataShipper
+   module Destinations
+     class S3 < Base
+       include Concerns::Chunking
+
+       def chunk_data(generator, table:, extra: nil)
+         warehouse_name = table[:warehouse_name]
+
+         super(generator) do |batch, idx|
+           bits = [warehouse_name, extra, idx].compact
+           temp_file = "#{working_dir}/#{bits.join('.')}.csv"
+
+           CSV.open(temp_file, 'w', headers: false) do |row|
+             row << table[:columns].map { |c| c[:warehouse_name] }
+             batch.each do |batch_row|
+               row << batch_row
+             end
+           end
+
+           yield temp_file
+
+           File.delete(temp_file)
+         end
+       end
+
+       def upload_data_chunk(table_def, chunk)
+         s3 = Aws::S3::Resource.new(client: aws_client)
+         dir_key = tracker.created_at.strftime("%Y-%m-%dT%H:%M") + "_#{tracker.id}"
+         bucket = s3.bucket(config[:bucket])
+
+         subpath = config[:path].presence || "/"
+         subpath = subpath[1..-1] if subpath.starts_with?("/")
+         subpath = "instructure" unless subpath.present?
+
+         obj_path = File.join(subpath, dir_key, File.basename(chunk))
+         object = bucket.object(obj_path)
+
+         File.open(chunk, 'rb') do |file|
+           object.put(body: file)
+         end
+       end
+
+       protected
+
+       def aws_client
+         @aws_client ||= Aws::S3::Client.new(
+           region: config[:region],
+           credentials: Aws::Credentials.new(
+             config[:access_key_id],
+             config[:access_key_secret],
+           )
+         )
+       end
+
+       def parse_configuration(uri)
+         # s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>
+         super do |parsed_uri, cfg|
+           split_path = parsed_uri.path.split('/')
+
+           cfg.merge!({
+             region: parsed_uri.host,
+             bucket: split_path[0],
+             access_key_id: parsed_uri.user,
+             access_key_secret: parsed_uri.password,
+             path: split_path[1..-1].join('/').presence,
+           })
+         end
+       end
+
+     end
+   end
+ end
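
Following the comment in `parse_configuration` above, an S3 destination string carries the credentials, region, bucket, and key prefix in a single URI; all values below are placeholders:

```ruby
# s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>
s3_destination = "s3://AKIAEXAMPLE:secretEXAMPLE@us-east-1/my-bucket/exports/canvas"
```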