inst_data_shipper 0.1.0.beta2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 146f5b93819d7950f9bd256a99eb690a63d453b86be4ac6ac7cf4c5901724cdd
- data.tar.gz: 2410298ebb3b1ddc565ca70d49a274a129e83087d461dcfae4d4981979795ea5
+ metadata.gz: f7909aa44e9dabd1d43d58a5a3c2c081891104d64336294dce287c06804804df
+ data.tar.gz: 5da874689ac1de3e016a7feefce5866b211e6f7595021b565564f796685ed104
  SHA512:
- metadata.gz: c6dc93902e0ef7a114d2434d3901c677021d57603b07c1122d72bd2f953184b207d0e57f5441fafb7035a00bf28f2e5035d34cd0edfb73da6d2f93d93874f344
- data.tar.gz: 2e7babf6a2ed86f9a2e5769bfb393549fb07b771b700513efd640ba2b98ebc3eeadec99578e4c828738903ef121f8e3bebe8490f6f75de0ce9afac43ac28b8fa
+ metadata.gz: cd81e6c26e2416ce1a32de588e04f560496cfb7cfdac3f4c837828a1c65798bec405d98197032b0d8935a1ba2b24a291aa25f1b73a469ac7a9c6ef8d2286103f
+ data.tar.gz: 66c5ccfd82128e8c5dc39c7c937ee7f4f9412743b7202e221e53c575d4b0e572f0b014b4f41ae5924b1d2d119a05cd5de2acbae4eb81022df844a1fea181faec
data/README.md CHANGED
@@ -1,6 +1,6 @@
  # InstDataShipper

- This gem is intended to facilitate fast and easy syncing of Canvas data.
+ This gem is intended to facilitate easy upload of LTI datasets to Instructure Hosted Data.

  ## Installation

@@ -16,6 +16,144 @@ Then run the migrations:
  bundle exec rake db:migrate
  ```

+ ## Usage
+
+ ### Dumper
+
+ The main tool provided by this Gem is the `InstDataShipper::Dumper` class. It is used to define a "Dump", which is a combination of tasks and schema.
+
+ Here is an example `Dumper` implementation, wrapped in an ActiveJob job:
+ ```ruby
+ class HostedDataPushJob < ApplicationJob
+   # The schema serves two purposes: defining the schema and mapping data
+   SCHEMA = InstDataShipper::SchemaBuilder.build do
+     # You can augment the Table-builder DSL with custom methods like so:
+     extend_table_builder do
+       # It may be useful to define custom column definition helpers:
+       def custom_column(*args, from: nil, **kwargs, &blk)
+         # In this example, the helper reads the value from a `data` jsonb column - without it, you'd need
+         # to define `from: ->(row) { row.data["<KEY>"] }` on each column that needs to read from the jsonb
+         from ||= args[0].to_s
+         from = ->(row) { row.data[from] } if from.is_a?(String)
+         column(*args, **kwargs, from: from, &blk)
+       end
+
+       # `extend_table_builder` uses `class_eval`, so you could alternatively write your helpers in a Concern or Module and include them like normal:
+       include SomeConcern
+     end
+
+     table(ALocalModel, "<TABLE DESCRIPTION>") do
+       # If you define a table as incremental, it'll only export changes made since the start of the last successful Dumper run.
+       # The first argument "scope" can be interpreted in different ways:
+       #   If exporting a local model it may be a: (default: `updated_at`)
+       #     Proc that will receive a Relation and return a Relation (use `incremental_since`)
+       #     String of a column to compare with `incremental_since`
+       #   If exporting a Canvas report it may be a: (default: `updated_after`)
+       #     Proc that will receive report params and return modified report params (use `incremental_since`)
+       #     String of a report param to set to `incremental_since`
+       # `on:` is passed to Hosted Data and is used as the unique key. It may be an array to form a composite key.
+       # `if:` may be a Proc or a Symbol (of a method on the Dumper)
+       incremental "updated_at", on: [:id], if: ->() {}
+
+       column :name_in_destinations, :maybe_optional_sql_type, "Optional description of column"
+
+       # The type may usually be omitted if the `table()` is passed a Model class, but strings are an exception to this
+       custom_column :name, :"varchar(128)"
+
+       # `from:` may be...
+       # A Symbol of a method to be called on the record
+       custom_column :sis_type, :"varchar(32)", from: :some_model_method
+       # A String of a column to read from the record
+       custom_column :sis_type, :"varchar(32)", from: "sis_source_type"
+       # A Proc to be called with each record
+       custom_column :sis_type, :"varchar(32)", from: ->(rec) { ... }
+       # Not specified - defaults to using the schema column name as a String ("sis_type" in this case)
+       custom_column :sis_type, :"varchar(32)"
+     end
+
+     table("my_table", model: ALocalModel) do
+       # ...
+     end
+
+     table("proserv_student_submissions_csv") do
+       column :canvas_id, :bigint, from: "canvas user id"
+       column :sis_id, :"varchar(64)", from: "sis user id"
+       column :name, :"varchar(64)", from: "user name"
+       column :submission_id, :bigint, from: "submission id"
+     end
+   end
+
+   Dumper = InstDataShipper::Dumper.define(schema: SCHEMA, include: [
+     InstDataShipper::DataSources::LocalTables,
+     InstDataShipper::DataSources::CanvasReports,
+   ]) do
+     import_local_table(ALocalModel)
+     import_canvas_report_by_terms("proserv_student_submissions_csv", terms: Term.all.pluck(:canvas_id))
+
+     # If the report_name/Model don't directly match the Schema, a schema_name: parameter may be passed:
+     import_local_table(SomeModel, schema_name: "my_table")
+     import_canvas_report_by_terms("some_report", terms: Term.all.pluck(:canvas_id), schema_name: "my_table")
+   end
+
+   def perform
+     Dumper.perform_dump([
+       "hosted-data://<JWT>@<HOSTED DATA SERVER>?table_prefix=example",
+       "s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>",
+     ])
+   end
+ end
+ ```
+
+ `Dumper`s may also be defined as a normal Ruby subclass:
+ ```ruby
+ class HostedDataPushJob < ApplicationJob
+   SCHEMA = InstDataShipper::SchemaBuilder.build do
+     # ...
+   end
+
+   class Dumper < InstDataShipper::Dumper
+     include InstDataShipper::DataSources::LocalTables
+     include InstDataShipper::DataSources::CanvasReports
+
+     def enqueue_tasks
+       import_local_table(ALocalModel)
+       import_canvas_report_by_terms("proserv_student_submissions_csv", terms: Term.all.pluck(:canvas_id))
+     end
+
+     def table_schemas
+       SCHEMA
+     end
+   end
+
+   def perform
+     Dumper.perform_dump([
+       "hosted-data://<JWT>@<HOSTED DATA SERVER>?table_prefix=example",
+       "s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>",
+     ])
+   end
+ end
+ ```
+
+ ### Destinations
+
+ This Gem is mainly designed for use with Hosted Data, but it tries to abstract that a little to allow for other destinations/backends. Out of the box, support for Hosted Data and S3 is included.
+
+ Destinations are passed as URI-formatted strings. Passing Hashes is also supported, but the format/keys are destination-specific.
+
+ Destinations blindly accept URI Fragments (the `#` chunk at the end of the URI). These options are not used internally but will be made available as `dest.user_config`. Ideally these are in the same format as query parameters (`x=1&y=2`, which it will try to parse into a Hash), but they can be any string.
+
+ #### Hosted Data
+ `hosted-data://<JWT>@<HOSTED DATA SERVER>`
+
+ ##### Optional Parameters:
+ - `table_prefix`: An optional string to prefix onto each table name in the schema when declaring the schema in Hosted Data
+
+ #### S3
+ `s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<optional path>`
+
+ ##### Optional Parameters:
+ _None_

  ## Development

  When adding to or updating this gem, make sure you do the following:
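As an editorial aside on the Destinations format documented in the README above: the fragment portion of a destination URI is what surfaces as `dest.user_config`. The snippet below is a hedged illustration of that convention using only the Ruby standard library - the URI, credentials, and fragment values are invented placeholders, and this is not code from the gem itself.

```ruby
require "uri"

# Hypothetical destination URI; the layout follows the README's S3 example.
dest = "s3://AKIAEXAMPLE:secret@us-east-1/my-bucket/exports#compress=1&format=csv"

uri = URI.parse(dest)
uri.fragment                            # => "compress=1&format=csv"

# A query-parameter style fragment parses cleanly into a Hash, which is the shape
# the README says is exposed as dest.user_config; any other string is kept as-is.
URI.decode_www_form(uri.fragment).to_h  # => {"compress"=>"1", "format"=>"csv"}
```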
@@ -7,11 +7,12 @@ class CreateInstDataShipperDumpBatches < CanvasSync::MiscHelper::MigrationClass

  t.string :job_class
  t.string :genre
+ t.string :batch_id

  t.string :exception
  t.text :backtrace
- t.text :metadata
- t.text :job_arguments
+ # t.text :metadata
+ # t.text :job_arguments

  t.timestamps
  end
@@ -19,7 +19,7 @@ module InstDataShipper
  instance_exec(&@body_block)
  end

- def table_schemas
+ def schema
  pointer = @schema_pointer || batch_context[:schema_pointer]
  pointer.constantize
  end
@@ -22,7 +22,12 @@ module InstDataShipper
  query = _resolve_model_query(query, table_def[:query])

  if table_is_incremental?(table_def)
- query = _resolve_model_query(query, table_def.dig(:incremental, :scope), string: ->(r, c) { r.where("? > ?", c, incremental_since) })
+ query = _resolve_model_query(
+ query,
+ table_def.dig(:incremental, :scope),
+ string: ->(query, column) { query.where("#{column} > ?", incremental_since) },
+ default: "updated_at",
+ )
  end

  query.find_each do |m|
@@ -35,7 +40,9 @@ module InstDataShipper
  upload_data(table_def, &inner_block)
  end

- def _resolve_model_query(relation, query, string: nil)
+ def _resolve_model_query(relation, query, string: nil, default: nil)
+ return relation if query == false
+ query = default if query.nil?
  return relation if query.nil?

  if query.is_a?(Symbol)
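The two hunks above rework `_resolve_model_query` so an incremental table scope can be disabled with `false`, fall back to a default column, or be given as a String or Proc (per the README's Dumper docs). The sketch below is a simplified, hypothetical restatement of that dispatch for local tables - the method name and structure are illustrative only, and the gem's real method also handles Symbols and other cases not shown here.

```ruby
# Hedged sketch of the incremental scope resolution implied by the diff above.
def resolve_incremental_scope(relation, scope, incremental_since, default: "updated_at")
  return relation if scope == false   # incremental scoping explicitly disabled for this table

  scope = default if scope.nil?       # fall back to the default column, e.g. "updated_at"
  return relation if scope.nil?       # nothing to scope by: leave the relation untouched

  case scope
  when String then relation.where("#{scope} > ?", incremental_since) # column compared against incremental_since
  when Proc   then scope.call(relation)                              # Proc receives a Relation, returns a Relation
  else relation
  end
end
```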
@@ -3,7 +3,7 @@ module InstDataShipper
  class Base
  attr_reader :dumper

- delegate :tracker, :table_schemas, :working_dir, to: :dumper
+ delegate :tracker, :schema, :working_dir, to: :dumper

  def initialize(cache_key, config, dumper)
  @cache_key = cache_key
@@ -11,9 +11,13 @@ module InstDataShipper
  @dumper = dumper
  end

+ # This method is called before taking any actions.
+ # It should be used to make any necessarry state assumptions (eg, the HostedData destination checks for a previous dump to determine if it can use incremental_since)
+ def preinitialize_dump(context); end
+
  # This method is called before processing any data.
  # It should be used to initialize any external resources needed for the dump.
- def initialize_dump; end
+ def initialize_dump(context); end

  # Yields an object (can be anything) that will be passed to `upload_data_chunk` as `chunk`.
  #
@@ -5,11 +5,43 @@ module InstDataShipper
  class HostedData < Base
  include Concerns::Chunking

- def initialize_dump
+ def preinitialize_dump(context)
+ if context[:incremental_since].present?
+ begin
+ last_dump = hosted_data_client.get("api/v1/custom_dumps/last", {
+ status: 'imported',
+ # schema_version: convert_schema[:version],
+ tags: [
+ "ids-schema=#{dumper.schema_digest}",
+ "ids-genre=#{dumper.export_genre}",
+ ],
+ }).body.with_indifferent_access
+
+ if last_dump[:created_at] < context[:incremental_since]
+ InstDataShipper.logger.info("Last successful HostedData dump is older than incremental_since - bumping back incremental_since")
+ context[:incremental_since] = last_dump[:created_at]
+ end
+ rescue Faraday::ResourceNotFound
+ # TODO It'd be nice to make this per-table
+ InstDataShipper.logger.info("No Last successful HostedData dump of the same schema - not using incremental_since")
+ context[:incremental_since] = nil
+ end
+ end
+ end
+
+ def initialize_dump(context)
+ tags = [
+ "ids-schema=#{dumper.schema_digest}",
+ "ids-genre=#{dumper.export_genre}",
+ ]
+ tags << "ids-app=#{Rails.application.class.name.gsub(/::Application$/, '')}" if defined?(Rails) && Rails.application
+ tags << "ids-schema-version=#{schema[:version]}" if schema[:version].present?
+
  dump = hosted_data_client.post(
  'api/v1/custom_dumps/',
  reference_id: tracker.id,
  schema: convert_schema,
+ tags: tags,
  ).body.with_indifferent_access

  redis.hset(rk(:state), :dump_id, dump[:id])
@@ -62,6 +94,7 @@ module InstDataShipper

  def convert_schema
  definititions = {}
+ table_schemas = schema[:tables]
  table_schemas.each do |ts|
  ts = ts.dup
  tname = table_name(ts)
@@ -86,7 +119,7 @@ module InstDataShipper
  end

  {
- version: "#{dumper.export_genre.downcase}-#{Digest::MD5.hexdigest(definititions.to_json)[0...6]}",
+ version: "#{dumper.schema_digest}-#{Digest::MD5.hexdigest(definititions.to_json)[0...6]}",
  definition: definititions,
  }
  end
@@ -19,10 +19,20 @@ module InstDataShipper
  include(*include)

  define_method(:enqueue_tasks, &blk)
- define_method(:table_schemas) { schema }
+ define_method(:schema) { schema }
  end
  end

+ def self.current(executor: nil)
+ cur_batch = Thread.current[CanvasSync::JobBatches::CURRENT_BATCH_THREAD_KEY]
+ ctx = cur_batch&.context || {}
+ return nil unless ctx[:origin_class].present? && ctx[:tracker_id].present?
+
+ clazz = ctx[:origin_class]
+ clazz = clazz.constantize if clazz.is_a?(String)
+ clazz.new(executor: executor)
+ end
+
  public

  def begin_dump
@@ -31,15 +41,18 @@ module InstDataShipper
  @tracker = tracker = DumpBatch.create(job_class: self.class.to_s, genre: export_genre, status: 'in_progress')

  @batch_context = context = {
- # TODO Allow to be hooked by Destination, likely via initialize_dump_batch and batch_context, so that if an earlier destination fails we can resend data
  # TODO Consider behavior if last is still running
- incremental_since: DumpBatch.where(genre: export_genre, status: 'completed').order(created_at: :desc).first&.created_at,
+ incremental_since: last_successful_tracker&.created_at,
  }

+ destinations.each do |dest|
+ dest.preinitialize_dump(context)
+ end
+
  begin
  begin
  destinations.each do |dest|
- dest.initialize_dump()
+ dest.initialize_dump(context)
  end

  run_hook(:initialize_dump_batch, context)
@@ -52,6 +65,7 @@ module InstDataShipper

  Sidekiq::Batch.new.tap do |batch|
  context[:root_bid] = batch.bid
+ tracker.update(batch_id: batch.bid)

  batch.description = "HD #{export_genre} Export #{tracker.id} Root"
  batch.context = context
@@ -62,6 +76,7 @@ module InstDataShipper
  rescue => ex
  delayed :cleanup_fatal_error!
  InstDataShipper.handle_suppressed_error(ex)
+ tracker.update(status: 'failed', exception: ex.message, backtrace: ex.backtrace.join("\n"))
  end
  end
  rescue => ex
@@ -74,6 +89,7 @@ module InstDataShipper
  end
  end
  end
+ tracker.update(status: 'failed', exception: ex.message, backtrace: ex.backtrace.join("\n"))
  raise ex
  end
  end
@@ -82,15 +98,31 @@ module InstDataShipper
  @tracker ||= batch_context[:tracker_id].present? ? DumpBatch.find(batch_context[:tracker_id]) : nil
  end

+ def last_successful_tracker
+ @last_successful_tracker ||= DumpBatch.where(job_class: self.class.to_s, genre: export_genre, status: 'completed').order(created_at: :desc).first
+ end
+
  def export_genre
- self.class.to_s.gsub(/HD|ExportJob/, '')
+ self.class.to_s
  end

  def origin_class
  batch_context[:origin_class]&.constantize || self.class
  end

+ def schema
+ return origin_class::SCHEMA if defined?(origin_class::SCHEMA)
+ raise NotImplementedError
+ end
+
+ def schema_digest
+ Digest::MD5.hexdigest(schema.to_json)[0...8]
+ end
+
  def table_is_incremental?(table_def)
+ return false unless incremental_since.present?
+
+ # TODO Return false if table's schema changes
  if (inc = table_def[:incremental]).present?
  differ = inc[:if]
  return !!incremental_since if differ.nil?
@@ -119,7 +151,7 @@ module InstDataShipper

  value = Array(value).compact

- table_schemas.each do |ts|
+ schema[:tables].each do |ts|
  return ts if value.include?(ts[key])
  end
  end
@@ -207,11 +239,6 @@ module InstDataShipper

  # Helper Methods

- def table_schemas
- return origin_class::TABLE_SCHEMAS if defined?(origin_class::TABLE_SCHEMAS)
- raise NotImplementedError
- end
-
  def delayed(mthd, *args, **kwargs)
  Jobs::AsyncCaller.perform_later(self.class.to_s, mthd.to_s, *args, **kwargs)
  end
@@ -2,16 +2,22 @@ module InstDataShipper
  # This class ends up fill two roles - Schema and Mapping.
  # It makes for a clean API, but it's a little less canonical since, (eg) the S3 destination doesn't need column type annotations.
  class SchemaBuilder
- attr_reader :tables
+ attr_reader :schema

  def initialize
- @tables = []
+ @schema = {
+ tables: [],
+ }
  end

  def self.build(&block)
  builder = new
  builder.instance_exec(&block)
- builder.tables
+ builder.schema
+ end
+
+ def version(version)
+ @schema[:version] = version
  end

  def extend_table_builder(&block)
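As a usage note on the `version` setter added to `SchemaBuilder` above: the builder now returns a Hash (`tables:` plus an optional `version:`), and the HostedData destination hunk earlier in this diff tags dumps with `ids-schema-version` when that key is present. The snippet below is a speculative illustration - the model, column, and version values are placeholders, not taken from the gem's documentation.

```ruby
# Hypothetical schema declaring an explicit version via the new DSL method.
SCHEMA = InstDataShipper::SchemaBuilder.build do
  version "1.2.0"            # stored on the returned Hash as schema[:version]

  table(SomeModel, "Example table") do
    column :id, :bigint
  end
end

SCHEMA[:version]  # => "1.2.0"
SCHEMA[:tables]   # => array of table definitions (build now returns a Hash, not a bare Array)
```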
@@ -45,7 +51,7 @@ module InstDataShipper

  @table_builder_class.build(tdef, &block)

- @tables << tdef
+ @schema[:tables] << tdef

  tdef
  end
@@ -68,7 +74,11 @@ module InstDataShipper
  # options[key] = value
  # end

- def incremental(scope="updated_at", **kwargs)
+ def version(version)
+ options[:version] = version
+ end
+
+ def incremental(scope=nil, **kwargs)
  if (extras = kwargs.keys - %i[on if]).present?
  raise ArgumentError, "Unsuppored options: #{extras.inspect}"
  end
@@ -1,3 +1,3 @@
  module InstDataShipper
- VERSION = "0.1.0.beta2".freeze
+ VERSION = "0.2.0".freeze
  end
@@ -39,6 +39,7 @@ module InstDataShipper

  def logger
  return @logger if defined? @logger
+ # TODO Annotate logs with DumpBatch ID
  @logger = Logger.new(STDOUT)
  @logger.level = Logger::DEBUG
  @logger
@@ -49,7 +50,7 @@ module InstDataShipper
  end

  def redis_prefix
- pfx = "hdd"
+ pfx = "ids"
  pfx = "#{Apartment::Tenant.current}:#{pfx}" if defined?(Apartment)
  pfx
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: inst_data_shipper
  version: !ruby/object:Gem::Version
- version: 0.1.0.beta2
+ version: 0.2.0
  platform: ruby
  authors:
  - Instructure CustomDev
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-03-08 00:00:00.000000000 Z
+ date: 2024-03-11 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rails
@@ -399,7 +399,6 @@ files:
  - lib/inst_data_shipper/engine.rb
  - lib/inst_data_shipper/jobs/async_caller.rb
  - lib/inst_data_shipper/jobs/base.rb
- - lib/inst_data_shipper/jobs/basic_dump_job.rb
  - lib/inst_data_shipper/record.rb
  - lib/inst_data_shipper/schema_builder.rb
  - lib/inst_data_shipper/version.rb
@@ -436,9 +435,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
- - - ">"
+ - - ">="
  - !ruby/object:Gem::Version
- version: 1.3.1
+ version: '0'
  requirements: []
  rubygems_version: 3.1.6
  signing_key:
@@ -1,15 +0,0 @@
- module InstDataShipper
- module Jobs
- class BasicDumpJob < InstDataShipper::Jobs::Base
- sidekiq_options retry: 3 if defined?(sidekiq_options)
-
- def perform(endpoints)
-
- end
-
- protected
-
-
- end
- end
- end