inst_data_shipper 0.1.0.beta1 → 0.2.0
- checksums.yaml +4 -4
- data/README.md +139 -1
- data/db/migrate/{20240301090836_create_canvas_sync_sync_batches.rb → 20240301090836_create_inst_data_shipper_dump_batches.rb} +6 -3
- data/lib/inst_data_shipper/basic_dumper.rb +2 -2
- data/lib/inst_data_shipper/concerns/hooks.rb +18 -4
- data/lib/inst_data_shipper/data_sources/canvas_reports.rb +58 -21
- data/lib/inst_data_shipper/data_sources/local_tables.rb +35 -7
- data/lib/inst_data_shipper/destinations/base.rb +33 -6
- data/lib/inst_data_shipper/destinations/hosted_data.rb +63 -19
- data/lib/inst_data_shipper/destinations/s3.rb +1 -1
- data/lib/inst_data_shipper/dumper.rb +158 -50
- data/lib/inst_data_shipper/engine.rb +6 -0
- data/lib/inst_data_shipper/jobs/async_caller.rb +10 -2
- data/lib/inst_data_shipper/schema_builder.rb +99 -37
- data/lib/inst_data_shipper/version.rb +1 -1
- data/lib/inst_data_shipper.rb +13 -3
- data/spec/spec_helper.rb +2 -2
- metadata +22 -9
- data/lib/inst_data_shipper/jobs/basic_dump_job.rb +0 -11
- data/app/models/{hosted_data_dumper → inst_data_shipper}/dump_batch.rb +0 -0

checksums.yaml CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f7909aa44e9dabd1d43d58a5a3c2c081891104d64336294dce287c06804804df
+  data.tar.gz: 5da874689ac1de3e016a7feefce5866b211e6f7595021b565564f796685ed104
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cd81e6c26e2416ce1a32de588e04f560496cfb7cfdac3f4c837828a1c65798bec405d98197032b0d8935a1ba2b24a291aa25f1b73a469ac7a9c6ef8d2286103f
+  data.tar.gz: 66c5ccfd82128e8c5dc39c7c937ee7f4f9412743b7202e221e53c575d4b0e572f0b014b4f41ae5924b1d2d119a05cd5de2acbae4eb81022df844a1fea181faec
```

data/README.md CHANGED

````diff
@@ -1,6 +1,6 @@
 # InstDataShipper
 
-This gem is intended to facilitate
+This gem is intended to facilitate easy upload of LTI datasets to Instructure Hosted Data.
 
 ## Installation
 
@@ -16,6 +16,144 @@ Then run the migrations:
 bundle exec rake db:migrate
 ```
 
+## Usage
+
+### Dumper
+
+The main tool provided by this Gem is the `InstDataShipper::Dumper` class. It is used to define a "Dump", which is a combination of tasks and schema.
+
+Here is an example `Dumper` implementation, wrapped in an ActiveJob job:
+```ruby
+class HostedDataPushJob < ApplicationJob
+  # The schema serves two purposes: defining the schema and mapping data
+  SCHEMA = InstDataShipper::SchemaBuilder.build do
+    # You can augment the Table-builder DSL with custom methods like so:
+    extend_table_builder do
+      # It may be useful to define custom column definition helpers:
+      def custom_column(*args, from: nil, **kwargs, &blk)
+        # In this example, the helper reads the value from a `data` jsonb column - without it, you'd need
+        # to define `from: ->(row) { row.data["<KEY>"] }` on each column that needs to read from the jsonb
+        from ||= args[0].to_s
+        from = ->(row) { row.data[from] } if from.is_a?(String)
+        column(*args, **kwargs, from: from, &blk)
+      end
+
+      # `extend_table_builder` uses `class_eval`, so you could alternatively write your helpers in a Concern or Module and include them like normal:
+      include SomeConcern
+    end
+
+    table(ALocalModel, "<TABLE DESCRIPTION>") do
+      # If you define a table as incremental, it'll only export changes made since the start of the last successful Dumper run.
+      # The first argument "scope" can be interpreted in different ways:
+      #   If exporting a local model it may be a: (default: `updated_at`)
+      #     Proc that will receive a Relation and return a Relation (use `incremental_since`)
+      #     String of a column to compare with `incremental_since`
+      #   If exporting a Canvas report it may be a: (default: `updated_after`)
+      #     Proc that will receive report params and return modified report params (use `incremental_since`)
+      #     String of a report param to set to `incremental_since`
+      # `on:` is passed to Hosted Data and is used as the unique key. It may be an array to form a composite key.
+      # `if:` may be a Proc or a Symbol (of a method on the Dumper)
+      incremental "updated_at", on: [:id], if: ->() {}
+
+      column :name_in_destinations, :maybe_optional_sql_type, "Optional description of column"
+
+      # The type may usually be omitted if the `table()` is passed a Model class, but strings are an exception to this
+      custom_column :name, :"varchar(128)"
+
+      # `from:` may be...
+      # A Symbol of a method to be called on the record
+      custom_column :sis_type, :"varchar(32)", from: :some_model_method
+      # A String of a column to read from the record
+      custom_column :sis_type, :"varchar(32)", from: "sis_source_type"
+      # A Proc to be called with each record
+      custom_column :sis_type, :"varchar(32)", from: ->(rec) { ... }
+      # Not specified. Will default to using the Schema Column Name as a String ("sis_type" in this case)
+      custom_column :sis_type, :"varchar(32)"
+    end
+
+    table("my_table", model: ALocalModel) do
+      # ...
+    end
+
+    table("proserv_student_submissions_csv") do
+      column :canvas_id, :bigint, from: "canvas user id"
+      column :sis_id, :"varchar(64)", from: "sis user id"
+      column :name, :"varchar(64)", from: "user name"
+      column :submission_id, :bigint, from: "submission id"
+    end
+  end
+
+  Dumper = InstDataShipper::Dumper.define(schema: SCHEMA, include: [
+    InstDataShipper::DataSources::LocalTables,
+    InstDataShipper::DataSources::CanvasReports,
+  ]) do
+    import_local_table(ALocalModel)
+    import_canvas_report_by_terms("proserv_student_submissions_csv", terms: Term.all.pluck(:canvas_id))
+
+    # If the report_name/Model don't directly match the Schema, a schema_name: parameter may be passed:
+    import_local_table(SomeModel, schema_name: "my_table")
+    import_canvas_report_by_terms("some_report", terms: Term.all.pluck(:canvas_id), schema_name: "my_table")
+  end
+
+  def perform
+    Dumper.perform_dump([
+      "hosted-data://<JWT>@<HOSTED DATA SERVER>?table_prefix=example",
+      "s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>",
+    ])
+  end
+end
+```
+
+`Dumper`s may also be formed as a normal Ruby subclass:
+```ruby
+class HostedDataPushJob < ApplicationJob
+  SCHEMA = InstDataShipper::SchemaBuilder.build do
+    # ...
+  end
+
+  class Dumper < InstDataShipper::Dumper
+    include InstDataShipper::DataSources::LocalTables
+    include InstDataShipper::DataSources::CanvasReports
+
+    def enqueue_tasks
+      import_local_table(ALocalModel)
+      import_canvas_report_by_terms("proserv_student_submissions_csv", terms: Term.all.pluck(:canvas_id))
+    end
+
+    def table_schemas
+      SCHEMA
+    end
+  end
+
+  def perform
+    Dumper.perform_dump([
+      "hosted-data://<JWT>@<HOSTED DATA SERVER>?table_prefix=example",
+      "s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>",
+    ])
+  end
+end
+```
+
+### Destinations
+
+This Gem is mainly designed for use with Hosted Data, but it tries to abstract that a little to allow for other destinations/backends. Out of the box, support for Hosted Data and S3 is included.
+
+Destinations are passed as URI-formatted strings. Passing Hashes is also supported, but the format/keys are destination specific.
+
+Destinations blindly accept URI Fragments (the `#` chunk at the end of the URI). These options are not used internally but will be made available as `dest.user_config`. Ideally these are in the same format as query parameters (`x=1&y=2`, which it will try to parse into a Hash), but it can be any string.
+
+#### Hosted Data
+`hosted-data://<JWT>@<HOSTED DATA SERVER>`
+
+##### Optional Parameters:
+- `table_prefix`: An optional string to prefix onto each table name in the schema when declaring the schema in Hosted Data
+
+#### S3
+`s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<optional path>`
+
+##### Optional Parameters:
+_None_
+
 ## Development
 
 When adding to or updating this gem, make sure you do the following:
````
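
A concrete illustration of the fragment behavior the README describes: the parsing lives in `ConfigURI#hash_params` in the `base.rb` diff further down, and the bucket and keys here are invented:

```ruby
require "uri"
require "rack"

# Query-shaped fragments parse into a Hash; anything else passes through verbatim.
frag = URI.parse("s3://key:secret@us-east-1/my-bucket/dumps#label=nightly&owner=data-team").fragment
user_config = frag&.match?(/^\w+=/) ? Rack::Utils.parse_nested_query(frag) : frag
user_config # => { "label" => "nightly", "owner" => "data-team" }

frag = URI.parse("s3://key:secret@us-east-1/my-bucket/dumps#nightly-dump").fragment
user_config = frag&.match?(/^\w+=/) ? Rack::Utils.parse_nested_query(frag) : frag
user_config # => "nightly-dump"
```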

data/db/migrate/{20240301090836_create_canvas_sync_sync_batches.rb → 20240301090836_create_inst_data_shipper_dump_batches.rb} CHANGED

```diff
@@ -1,4 +1,4 @@
-class
+class CreateInstDataShipperDumpBatches < CanvasSync::MiscHelper::MigrationClass
   def change
     create_table :inst_data_shipper_dump_batches do |t|
       t.datetime :started_at
@@ -6,10 +6,13 @@ class InstDataShipperDumpBatches < CanvasSync::MiscHelper::MigrationClass
       t.string :status
 
       t.string :job_class
+      t.string :genre
+      t.string :batch_id
+
       t.string :exception
       t.text :backtrace
-      t.text :metadata
-      t.text :job_arguments
+      # t.text :metadata
+      # t.text :job_arguments
 
       t.timestamps
     end
```

data/lib/inst_data_shipper/concerns/hooks.rb CHANGED

```diff
@@ -9,21 +9,35 @@ module InstDataShipper
     end
 
     def hook(name, prepend: false, &block)
+      _assert_hook_defined(name)
+      @hooks ||= {}
+      @hooks[name] ||= []
       hooks = @hooks[name]
       prepend ? hooks.unshift(block) : hooks << block
     end
+
+    def _assert_hook_defined(name)
+      return true if @hooks&.key?(name)
+      return if superclass.respond_to?(:_assert_hook_defined) && superclass._assert_hook_defined(name)
+      raise ArgumentError, "Hook #{name} is not defined"
+    end
+
+    def _list_hooks(name)
+      list = []
+      list.push(*superclass._list_hooks(name)) if superclass.respond_to?(:_list_hooks)
+      list.push(*@hooks[name]) if (@hooks || {})[name]
+      list
+    end
   end
 
   def run_hook(name, *args, **kwargs)
-
-    hooks.each do |blk|
+    self.class._list_hooks(name).each do |blk|
       instance_exec(*args, **kwargs, &blk)
     end
   end
 
   def run_hook_safe(name, *args, **kwargs)
-
-    hooks.each do |blk|
+    self.class._list_hooks(name).each do |blk|
       instance_exec(*args, **kwargs, &blk)
     rescue StandardError
     end
```
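
The rework above moves hook storage from a single flat list to per-class registries traversed through `superclass`, so a subclass runs its parent's hooks before its own. A minimal self-contained sketch of that behavior (hypothetical `BaseDumper`/`CsvDumper` classes, not the gem's API):

```ruby
class BaseDumper
  def self.hooks
    @hooks ||= {} # class-level ivar: each subclass gets its own registry
  end

  def self.hook(name, prepend: false, &block)
    list = (hooks[name] ||= [])
    prepend ? list.unshift(block) : list << block
  end

  def self._list_hooks(name)
    # Walk up the ancestry first, so inherited hooks run before our own
    inherited = superclass.respond_to?(:_list_hooks) ? superclass._list_hooks(name) : []
    inherited + (hooks[name] || [])
  end

  def run_hook(name, *args)
    # instance_exec runs each block in the instance's context, as in run_hook above
    self.class._list_hooks(name).each { |blk| instance_exec(*args, &blk) }
  end
end

class CsvDumper < BaseDumper
  hook(:before_dump) { |tag| puts "CsvDumper before_dump (#{tag})" }
end

BaseDumper.hook(:before_dump) { |tag| puts "BaseDumper before_dump (#{tag})" }

CsvDumper.new.run_hook(:before_dump, "demo")
# Prints:
#   BaseDumper before_dump (demo)
#   CsvDumper before_dump (demo)
```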

data/lib/inst_data_shipper/data_sources/canvas_reports.rb CHANGED

```diff
@@ -27,11 +27,25 @@ module InstDataShipper
       _in_canvas_report_pool(:_import_canvas_report, *args, **kwargs)
     end
 
-    def import_canvas_report_by_terms(
+    def import_canvas_report_by_terms(*args, **kwargs)
+      _in_canvas_report_pool(:_import_canvas_report_by_terms, *args, **kwargs)
+    end
+
+    def import_existing_report(report, **kwargs)
+      delayed(:_process_canvas_report, report: report, **kwargs)
+    end
+
+    private
+
+    def _import_canvas_report_by_terms(report_name, terms: [], params: {}, **kwargs)
       term_ids = (terms || []).map do |term|
         term.is_a?(Term) ? term.canvas_id : term
       end
 
+      table_def = lookup_table_schema!(kwargs[:schema_name], report_name)
+
+      _resolve_report_incremenal_parameters(table_def, params)
+
       Sidekiq::Batch.new.tap do |b|
         b.description = "Term Scoped #{report_name} Runners"
         b.context = {
@@ -40,19 +54,21 @@ module InstDataShipper
         b.jobs do
           terms_query = term_ids.present? ? Term.where(canvas_id: term_ids) : Term
           terms_query.find_each do |t|
-
+            _in_canvas_report_pool(:_trigger_canvas_report, report_name, params: { **params, enrollment_term_id: t.canvas_id }, **kwargs)
           end
         end
       end
     end
 
-    def
-
-    end
+    def _import_canvas_report(report_name, params: {}, **kwargs)
+      table_def = lookup_table_schema!(kwargs[:schema_name], report_name)
 
-
+      _resolve_report_incremenal_parameters(table_def, params)
+
+      _trigger_canvas_report(report_name, params: params, **kwargs)
+    end
 
-    def
+    def _trigger_canvas_report(report_name, retry_count: 3, params: {}, **kwargs)
       report = canvas_sync_client.start_report(
         'self', report_name,
         parameters: params,
@@ -61,15 +77,13 @@ module InstDataShipper
       CanvasSync::Jobs::CanvasProcessWaiter.perform_later(
         "/api/v1/accounts/self/reports/#{report_name}/#{report[:id]}",
         {
-
-
-          args: [target_table],
+          job: Jobs::AsyncCaller,
+          args: [origin_class, :_process_canvas_report],
           kwargs: kwargs,
         },
         on_failure: {
-
-
-          args: [target_table, report_name, kwargs],
+          job: Jobs::AsyncCaller,
+          args: [origin_class, :_handle_failed_canvas_report, report_name, kwargs],
           kwargs: { retry_count: retry_count },
         },
         status_key: :status,
@@ -79,18 +93,18 @@ module InstDataShipper
 
     def _in_canvas_report_pool(mthd, *args, **kwargs)
       pool = CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool])
-      AsyncCaller.call_from_pool(pool, self.class, mthd, *args, **kwargs)
+      Jobs::AsyncCaller.call_from_pool(pool, self.class, mthd, *args, **kwargs)
     end
 
-    def _process_canvas_report(
-      table_def =
+    def _process_canvas_report(report:, schema_name: nil)
+      table_def = lookup_table_schema!(schema_name, report[:report])
 
-      IO.copy_stream(URI.parse(report['attachment']['url']).open, "#{working_dir}
+      IO.copy_stream(URI.parse(report['attachment']['url']).open, "#{working_dir}/temp_report.csv")
 
       inner_block = ->(file) {
-        CSV.foreach("#{working_dir}
+        CSV.foreach("#{working_dir}/temp_report.csv", headers: true) do |m|
           file << table_def[:columns].map do |c|
-
+            instance_exec(m, &c[:block])
           end
         end
       }
@@ -98,13 +112,36 @@ module InstDataShipper
       upload_data(table_def, extra: report['id'], &inner_block)
     end
 
-    def
+    def _resolve_report_incremenal_parameters(table_def, params)
+      if table_is_incremental?(table_def)
+        inc = table_def[:incremental]
+        scope = inc[:scope]
+
+        if scope != false
+          scope ||= "updated_after"
+
+          if scope.is_a?(Proc)
+            scope = instance_exec(params, &scope)
+            if scope.is_a?(Hash) && scope != params
+              params.merge!(scope)
+            end
+          elsif scope.is_a?(String) || scope.is_a?(Symbol)
+            params[scope] = incremental_since
+          end
+        end
+      end
+
+      params
+    end
+
+    def _handle_failed_canvas_report(report_name, kwargs, retry_count:, report:)
       if retry_count.positive?
         tbid = batch_context[:report_bid] || batch_context[:root_bid]
         Sidekiq::Batch.new(tbid).jobs do
-
+          _in_canvas_report_pool(:_trigger_canvas_report, report_name, retry_count: retry_count - 1, **kwargs.symbolize_keys)
         end
       else
+        # TODO Allow marking the table as incomplete. Destination code can then decide how to handle incomplete tables since (eg) incremental imports wouldn't mind too much
        cleanup_fatal_error!
       end
     end
```
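
The new `_resolve_report_incremenal_parameters` (the spelling is the gem's own method name) is the report-side half of incremental support. A standalone sketch of its resolution rules, with `since` standing in for the Dumper's `incremental_since` and the incrementality check assumed to have already passed:

```ruby
def resolve_incremental_params(table_def, params, since)
  scope = table_def.dig(:incremental, :scope)
  return params if scope == false # incremental, but explicitly unscoped

  scope = "updated_after" if scope.nil? # the documented report default

  if scope.is_a?(Proc)
    # The gem instance_execs the Proc on the Dumper; a plain call suffices here
    result = scope.call(params)
    params.merge!(result) if result.is_a?(Hash) && result != params
  elsif scope.is_a?(String) || scope.is_a?(Symbol)
    params[scope] = since
  end

  params
end

since = Time.now - 86_400
resolve_incremental_params({ incremental: {} }, {}, since)
# => { "updated_after" => since }
resolve_incremental_params({ incremental: { scope: "created_after" } }, {}, since)
# => { "created_after" => since }
```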

data/lib/inst_data_shipper/data_sources/local_tables.rb CHANGED

```diff
@@ -12,22 +12,50 @@ module InstDataShipper
 
     private
 
-    def _import_local_table(
-
-
+    def _import_local_table(model, schema_name: nil)
+      model = model.safe_constantize if model.is_a?(String)
+
+      table_def = lookup_table_schema!(schema_name, { model: model })
 
       inner_block = ->(file) {
-        query = model
-        query = query
-
+        query = model.all
+        query = _resolve_model_query(query, table_def[:query])
+
+        if table_is_incremental?(table_def)
+          query = _resolve_model_query(
+            query,
+            table_def.dig(:incremental, :scope),
+            string: ->(query, column) { query.where("#{column} > ?", incremental_since) },
+            default: "updated_at",
+          )
+        end
+
+        query.find_each do |m|
           file << table_def[:columns].map do |c|
-
+            instance_exec(m, &c[:block])
           end
         end
       }
 
       upload_data(table_def, &inner_block)
     end
+
+    def _resolve_model_query(relation, query, string: nil, default: nil)
+      return relation if query == false
+      query = default if query.nil?
+      return relation if query.nil?
+
+      if query.is_a?(Symbol)
+        relation.send(query)
+      elsif query.is_a?(Proc)
+        instance_exec(relation, &query)
+      elsif query.is_a?(String) && string.present?
+        instance_exec(relation, query, &string)
+      else
+        raise "Invalid query: #{query.inspect}"
+      end
+    end
+
   end
  end
 end
```
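
`_resolve_model_query` gives table definitions a small vocabulary for shaping the export query. A sketch of the same resolution rules outside the gem (the model, scope name, and `since` are illustrative):

```ruby
def resolve_model_query(relation, query, string: nil, default: nil)
  return relation if query == false # explicitly disabled
  query = default if query.nil?     # fall back to the table's default
  return relation if query.nil?

  case query
  when Symbol then relation.send(query) # a named scope on the model
  when Proc   then query.call(relation) # arbitrary Relation -> Relation
  when String                           # meaning is supplied by the caller
    raise "Invalid query: #{query.inspect}" unless string
    string.call(relation, query)
  else
    raise "Invalid query: #{query.inspect}"
  end
end

# For an incremental local table, the String handler is a timestamp filter,
# mirroring the call site above (assumes an ActiveRecord model `User`):
#   resolve_model_query(User.all, "updated_at",
#     string: ->(rel, col) { rel.where("#{col} > ?", since) },
#     default: "updated_at")
```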

data/lib/inst_data_shipper/destinations/base.rb CHANGED

```diff
@@ -3,7 +3,7 @@ module InstDataShipper
     class Base
       attr_reader :dumper
 
-      delegate :tracker, :
+      delegate :tracker, :schema, :working_dir, to: :dumper
 
       def initialize(cache_key, config, dumper)
         @cache_key = cache_key
@@ -11,9 +11,13 @@ module InstDataShipper
         @dumper = dumper
       end
 
+      # This method is called before taking any actions.
+      # It should be used to make any necessary state assumptions (eg, the HostedData destination checks for a previous dump to determine if it can use incremental_since)
+      def preinitialize_dump(context); end
+
       # This method is called before processing any data.
       # It should be used to initialize any external resources needed for the dump.
-      def initialize_dump; end
+      def initialize_dump(context); end
 
       # Yields an object (can be anything) that will be passed to `upload_data_chunk` as `chunk`.
       #
@@ -50,7 +54,7 @@ module InstDataShipper
       end
 
       def user_config
-        config[:
+        config[:user_config]
       end
 
       def group_key
@@ -62,11 +66,11 @@ module InstDataShipper
       def parse_configuration(uri)
         if block_given?
           parsed = URI.parse(uri)
+          cparsed = ConfigURI.new(parsed)
           cfg = {
-
-            extra: (parsed.fragment.present? && parsed.fragment.match?(/^\w+=/) && Rack::Utils.parse_nested_query(parsed.fragment)).presence || parsed.fragment || nil,
+            user_config: cparsed.hash_params,
           }
-          yield
+          yield cparsed, cfg
           cfg
         else
           raise NotImplementedError
@@ -100,5 +104,28 @@ module InstDataShipper
       end
 
     end
+
+    class ConfigURI
+      def initialize(uri)
+        @uri = uri
+      end
+
+      # delegate_missing_to :uri
+      delegate :scheme, :user, :password, :host, :hostname, :port, :path, :query, :fragment, to: :uri
+
+      def params
+        @params ||= (query.present? ? Rack::Utils.parse_nested_query(query).with_indifferent_access : {}).freeze
+      end
+
+      def hash_params
+        @hash_params ||= ((fragment.present? && fragment.match?(/^\w+=/) && Rack::Utils.parse_nested_query(fragment).with_indifferent_access).presence || fragment || nil)&.freeze
+      end
+
+      private
+
+      def uri
+        @uri
+      end
+    end
   end
 end
```
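
A standalone approximation of what `ConfigURI#params` yields for a destination string (the host is made up); `HostedData#parse_configuration` below reads `table_prefix` out of exactly this Hash:

```ruby
require "uri"
require "rack"

# The destination's query string becomes an options Hash.
uri = URI.parse("hosted-data://JWT@hosted.example.com?table_prefix=example")
params = uri.query ? Rack::Utils.parse_nested_query(uri.query) : {}
params # => { "table_prefix" => "example" }
```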

data/lib/inst_data_shipper/destinations/hosted_data.rb CHANGED

```diff
@@ -1,13 +1,47 @@
+require "faraday_middleware"
+
 module InstDataShipper
   module Destinations
     class HostedData < Base
       include Concerns::Chunking
 
-      def
+      def preinitialize_dump(context)
+        if context[:incremental_since].present?
+          begin
+            last_dump = hosted_data_client.get("api/v1/custom_dumps/last", {
+              status: 'imported',
+              # schema_version: convert_schema[:version],
+              tags: [
+                "ids-schema=#{dumper.schema_digest}",
+                "ids-genre=#{dumper.export_genre}",
+              ],
+            }).body.with_indifferent_access
+
+            if last_dump[:created_at] < context[:incremental_since]
+              InstDataShipper.logger.info("Last successful HostedData dump is older than incremental_since - bumping back incremental_since")
+              context[:incremental_since] = last_dump[:created_at]
+            end
+          rescue Faraday::ResourceNotFound
+            # TODO It'd be nice to make this per-table
+            InstDataShipper.logger.info("No Last successful HostedData dump of the same schema - not using incremental_since")
+            context[:incremental_since] = nil
+          end
+        end
+      end
+
+      def initialize_dump(context)
+        tags = [
+          "ids-schema=#{dumper.schema_digest}",
+          "ids-genre=#{dumper.export_genre}",
+        ]
+        tags << "ids-app=#{Rails.application.class.name.gsub(/::Application$/, '')}" if defined?(Rails) && Rails.application
+        tags << "ids-schema-version=#{schema[:version]}" if schema[:version].present?
+
         dump = hosted_data_client.post(
           'api/v1/custom_dumps/',
           reference_id: tracker.id,
           schema: convert_schema,
+          tags: tags,
         ).body.with_indifferent_access
 
         redis.hset(rk(:state), :dump_id, dump[:id])
@@ -15,7 +49,7 @@ module InstDataShipper
       end
 
       def chunk_data(generator, table:, extra: nil)
-        warehouse_name =
+        warehouse_name = table[:warehouse_name]
 
         super(generator) do |batch, idx|
           bits = [warehouse_name, extra, idx].compact
@@ -36,18 +70,18 @@ module InstDataShipper
 
       def upload_data_chunk(table_def, chunk)
         hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", artifacts: {
-          table_def
+          table_name(table_def) => [Faraday::UploadIO.new(chunk, 'application/gzip')],
         })
       end
 
       def finalize_dump
         hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", start_import: true) if hd_dump_id.present?
-        redis.
+        redis.del(rk(:state))
       end
 
       def cleanup_fatal_error
         hosted_data_client.delete("api/v1/custom_dumps/#{hd_dump_id}/", reason: 'Failure during extraction or transformation') if hd_dump_id.present?
-        redis.
+        redis.del(rk(:state))
       end
 
       # TODO Support/allow single-table fatal errors?
@@ -59,39 +93,45 @@ module InstDataShipper
       end
 
       def convert_schema
-        table_prefix = config[:table_prefix]
-        table_prefix = table_prefix.present? ? "#{table_prefix}_" : nil
-
         definititions = {}
+        table_schemas = schema[:tables]
         table_schemas.each do |ts|
           ts = ts.dup
+          tname = table_name(ts)
 
-
-          table_name = table_prefix + table_name if table_prefix.present?
-
-          definititions[ts[:warehouse_name]] = {
+          definititions[tname] = {
             dw_type: 'dimension',
             description: ts[:description],
-            incremental:
-            incremental_on: ts
+            incremental: dumper.table_is_incremental?(ts),
+            incremental_on: ts.dig(:incremental, :on),
             # indexed_columns
-            tableName:
+            tableName: tname,
             columns: ts[:columns].map do |col|
+              coltype = col[:type]
+              coltype ||= ts[:model].column_for_attribute(col[:from]).sql_type if col[:from].is_a?(String)
               {
                 name: col[:warehouse_name],
                 description: col[:description],
-                type:
+                type: coltype,
               }
             end,
           }
         end
 
         {
-          version: "#{dumper.
+          version: "#{dumper.schema_digest}-#{Digest::MD5.hexdigest(definititions.to_json)[0...6]}",
           definition: definititions,
         }
       end
 
+      def table_name(table_def)
+        table_prefix = config[:table_prefix]
+        table_prefix = table_prefix.present? ? "#{table_prefix}_" : nil
+        table_name = table_def[:warehouse_name]
+        table_name = table_prefix + table_name if table_prefix.present?
+        table_name
+      end
+
       def hosted_data_client
         @hosted_data_client ||= begin
           token = config[:token]
@@ -102,6 +142,8 @@ module InstDataShipper
            host = tok_content['host']
          end
 
+          host = "https://#{host}" unless host.include?('://')
+
          Faraday.new(url: host) do |faraday|
            faraday.request :multipart
            faraday.request :json
@@ -117,14 +159,16 @@ module InstDataShipper
 
       def parse_configuration(uri)
         super do |parsed_uri, cfg|
-          if parsed_uri.
+          if parsed_uri.user.present?
             # hosted-data://<JWT>:<hosted_data_domain>
-            cfg[:token] = parsed_uri.
+            cfg[:token] = parsed_uri.user
             cfg[:host] = parsed_uri.host
           else
             # hosted-data://<JWT>
             cfg[:token] = parsed_uri.host
           end
+
+          cfg[:table_prefix] = parsed_uri.params[:table_prefix]
         end
       end
 
```
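
Since `table_prefix` now flows through the new `table_name` helper into both `convert_schema` and `upload_data_chunk`, its effect is easy to show in isolation (a sketch with invented names):

```ruby
# An optional prefix from the destination config is prepended to each
# warehouse table name; with no prefix, the name passes through unchanged.
def table_name(table_def, table_prefix: nil)
  name = table_def[:warehouse_name]
  table_prefix.to_s.empty? ? name : "#{table_prefix}_#{name}"
end

table_name({ warehouse_name: "submissions" })                          # => "submissions"
table_name({ warehouse_name: "submissions" }, table_prefix: "example") # => "example_submissions"
```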