inst_data_shipper 0.1.0.beta2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 146f5b93819d7950f9bd256a99eb690a63d453b86be4ac6ac7cf4c5901724cdd
- data.tar.gz: 2410298ebb3b1ddc565ca70d49a274a129e83087d461dcfae4d4981979795ea5
+ metadata.gz: f7909aa44e9dabd1d43d58a5a3c2c081891104d64336294dce287c06804804df
+ data.tar.gz: 5da874689ac1de3e016a7feefce5866b211e6f7595021b565564f796685ed104
  SHA512:
- metadata.gz: c6dc93902e0ef7a114d2434d3901c677021d57603b07c1122d72bd2f953184b207d0e57f5441fafb7035a00bf28f2e5035d34cd0edfb73da6d2f93d93874f344
- data.tar.gz: 2e7babf6a2ed86f9a2e5769bfb393549fb07b771b700513efd640ba2b98ebc3eeadec99578e4c828738903ef121f8e3bebe8490f6f75de0ce9afac43ac28b8fa
+ metadata.gz: cd81e6c26e2416ce1a32de588e04f560496cfb7cfdac3f4c837828a1c65798bec405d98197032b0d8935a1ba2b24a291aa25f1b73a469ac7a9c6ef8d2286103f
+ data.tar.gz: 66c5ccfd82128e8c5dc39c7c937ee7f4f9412743b7202e221e53c575d4b0e572f0b014b4f41ae5924b1d2d119a05cd5de2acbae4eb81022df844a1fea181faec
data/README.md CHANGED
@@ -1,6 +1,6 @@
  # InstDataShipper

- This gem is intended to facilitate fast and easy syncing of Canvas data.
+ This gem is intended to facilitate easy upload of LTI datasets to Instructure Hosted Data.

  ## Installation

@@ -16,6 +16,144 @@ Then run the migrations:
  bundle exec rake db:migrate
  ```

+ ## Usage
+
+ ### Dumper
+
+ The main tool provided by this Gem is the `InstDataShipper::Dumper` class. It is used to define a "Dump", which is a combination of tasks and schema.
+
+ Here is an example `Dumper` implementation, wrapped in an ActiveJob job:
+ ```ruby
+ class HostedDataPushJob < ApplicationJob
+   # The schema serves two purposes: defining the schema and mapping data
+   SCHEMA = InstDataShipper::SchemaBuilder.build do
+     # You can augment the Table-builder DSL with custom methods like so:
+     extend_table_builder do
+       # It may be useful to define custom column definition helpers:
+       def custom_column(*args, from: nil, **kwargs, &blk)
+         # In this example, the helper reads the value from a `data` jsonb column - without it, you'd need
+         # to define `from: ->(row) { row.data["<KEY>"] }` on each column that needs to read from the jsonb
+         from ||= args[0].to_s
+         from = ->(row) { row.data[from] } if from.is_a?(String)
+         column(*args, **kwargs, from: from, &blk)
+       end
+
+       # `extend_table_builder` uses `class_eval`, so you could alternatively write your helpers in a Concern or Module and include them like normal:
+       include SomeConcern
+     end
+
+     table(ALocalModel, "<TABLE DESCRIPTION>") do
+       # If you define a table as incremental, it'll only export changes made since the start of the last successful Dumper run.
+       # The first argument "scope" can be interpreted in different ways:
+       #   If exporting a local model it may be a: (default: `updated_at`)
+       #     Proc that will receive a Relation and return a Relation (use `incremental_since`)
+       #     String of a column to compare with `incremental_since`
+       #   If exporting a Canvas report it may be a: (default: `updated_after`)
+       #     Proc that will receive report params and return modified report params (use `incremental_since`)
+       #     String of a report param to set to `incremental_since`
+       # `on:` is passed to Hosted Data and is used as the unique key. It may be an array to form a composite key.
+       # `if:` may be a Proc or a Symbol (of a method on the Dumper)
+       incremental "updated_at", on: [:id], if: ->() {}
+
+       column :name_in_destinations, :maybe_optional_sql_type, "Optional description of column"
+
+       # The type may usually be omitted if the `table()` is passed a Model class, but strings are an exception to this
+       custom_column :name, :"varchar(128)"
+
+       # `from:` may be...
+       # A Symbol of a method to be called on the record
+       custom_column :sis_type, :"varchar(32)", from: :some_model_method
+       # A String of a column to read from the record
+       custom_column :sis_type, :"varchar(32)", from: "sis_source_type"
+       # A Proc to be called with each record
+       custom_column :sis_type, :"varchar(32)", from: ->(rec) { ... }
+       # Not specified - defaults to using the schema column name as a String ("sis_type" in this case)
+       custom_column :sis_type, :"varchar(32)"
+     end
+
+     table("my_table", model: ALocalModel) do
+       # ...
+     end
+
+     table("proserv_student_submissions_csv") do
+       column :canvas_id, :bigint, from: "canvas user id"
+       column :sis_id, :"varchar(64)", from: "sis user id"
+       column :name, :"varchar(64)", from: "user name"
+       column :submission_id, :bigint, from: "submission id"
+     end
+   end
+
+   Dumper = InstDataShipper::Dumper.define(schema: SCHEMA, include: [
+     InstDataShipper::DataSources::LocalTables,
+     InstDataShipper::DataSources::CanvasReports,
+   ]) do
+     import_local_table(ALocalModel)
+     import_canvas_report_by_terms("proserv_student_submissions_csv", terms: Term.all.pluck(:canvas_id))
+
+     # If the report_name/Model don't directly match the Schema, a schema_name: parameter may be passed:
+     import_local_table(SomeModel, schema_name: "my_table")
+     import_canvas_report_by_terms("some_report", terms: Term.all.pluck(:canvas_id), schema_name: "my_table")
+   end
+
+   def perform
+     Dumper.perform_dump([
+       "hosted-data://<JWT>@<HOSTED DATA SERVER>?table_prefix=example",
+       "s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>",
+     ])
+   end
+ end
+ ```
+
+ `Dumper`s may also be defined as a normal Ruby subclass:
+ ```ruby
+ class HostedDataPushJob < ApplicationJob
+   SCHEMA = InstDataShipper::SchemaBuilder.build do
+     # ...
+   end
+
+   class Dumper < InstDataShipper::Dumper
+     include InstDataShipper::DataSources::LocalTables
+     include InstDataShipper::DataSources::CanvasReports
+
+     def enqueue_tasks
+       import_local_table(ALocalModel)
+       import_canvas_report_by_terms("proserv_student_submissions_csv", terms: Term.all.pluck(:canvas_id))
+     end
+
+     def table_schemas
+       SCHEMA
+     end
+   end
+
+   def perform
+     Dumper.perform_dump([
+       "hosted-data://<JWT>@<HOSTED DATA SERVER>?table_prefix=example",
+       "s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>",
+     ])
+   end
+ end
+ ```
+
+ ### Destinations
+
+ This Gem is mainly designed for use with Hosted Data, but it tries to abstract that a little to allow for other destinations/backends. Out of the box, support for Hosted Data and S3 is included.
+
+ Destinations are passed as URI-formatted strings. Passing Hashes is also supported, but the format/keys are destination-specific.
+
+ Destinations blindly accept URI Fragments (the `#` chunk at the end of the URI). These options are not used internally but will be made available as `dest.user_config`. Ideally these are in the same format as query parameters (`x=1&y=2`, which it will try to parse into a Hash), but they can be any string.
+
+ #### Hosted Data
+ `hosted-data://<JWT>@<HOSTED DATA SERVER>`
+
+ ##### Optional Parameters:
+ - `table_prefix`: An optional string to prefix onto each table name in the schema when declaring the schema in Hosted Data
+
+ #### S3
+ `s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<optional path>`
+
+ ##### Optional Parameters:
+ _None_

  ## Development

  When adding to or updating this gem, make sure you do the following:
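As an editorial aside on the Destinations format documented in the README above: the fragment portion of a destination URI is what surfaces as `dest.user_config`. The snippet below is a hedged illustration of that convention using only the Ruby standard library - the URI, credentials, and fragment values are invented placeholders, and this is not code from the gem itself.

```ruby
require "uri"

# Hypothetical destination URI; the layout follows the README's S3 example.
dest = "s3://AKIAEXAMPLE:secret@us-east-1/my-bucket/exports#compress=1&format=csv"

uri = URI.parse(dest)
uri.fragment                            # => "compress=1&format=csv"

# A query-parameter style fragment parses cleanly into a Hash, which is the shape
# the README says is exposed as dest.user_config; any other string is kept as-is.
URI.decode_www_form(uri.fragment).to_h  # => {"compress"=>"1", "format"=>"csv"}
```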
@@ -7,11 +7,12 @@ class CreateInstDataShipperDumpBatches < CanvasSync::MiscHelper::MigrationClass

  t.string :job_class
  t.string :genre
+ t.string :batch_id

  t.string :exception
  t.text :backtrace
- t.text :metadata
- t.text :job_arguments
+ # t.text :metadata
+ # t.text :job_arguments

  t.timestamps
  end
@@ -19,7 +19,7 @@ module InstDataShipper
  instance_exec(&@body_block)
  end

- def table_schemas
+ def schema
  pointer = @schema_pointer || batch_context[:schema_pointer]
  pointer.constantize
  end
@@ -22,7 +22,12 @@ module InstDataShipper
  query = _resolve_model_query(query, table_def[:query])

  if table_is_incremental?(table_def)
- query = _resolve_model_query(query, table_def.dig(:incremental, :scope), string: ->(r, c) { r.where("? > ?", c, incremental_since) })
+ query = _resolve_model_query(
+ query,
+ table_def.dig(:incremental, :scope),
+ string: ->(query, column) { query.where("#{column} > ?", incremental_since) },
+ default: "updated_at",
+ )
  end

  query.find_each do |m|
@@ -35,7 +40,9 @@ module InstDataShipper
  upload_data(table_def, &inner_block)
  end

- def _resolve_model_query(relation, query, string: nil)
+ def _resolve_model_query(relation, query, string: nil, default: nil)
+ return relation if query == false
+ query = default if query.nil?
  return relation if query.nil?

  if query.is_a?(Symbol)
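The two hunks above rework `_resolve_model_query` so an incremental table scope can be disabled with `false`, fall back to a default column, or be given as a String or Proc (per the README's Dumper docs). The sketch below is a simplified, hypothetical restatement of that dispatch for local tables - the method name and structure are illustrative only, and the gem's real method also handles Symbols and other cases not shown here.

```ruby
# Hedged sketch of the incremental scope resolution implied by the diff above.
def resolve_incremental_scope(relation, scope, incremental_since, default: "updated_at")
  return relation if scope == false   # incremental scoping explicitly disabled for this table

  scope = default if scope.nil?       # fall back to the default column, e.g. "updated_at"
  return relation if scope.nil?       # nothing to scope by: leave the relation untouched

  case scope
  when String then relation.where("#{scope} > ?", incremental_since) # column compared against incremental_since
  when Proc   then scope.call(relation)                              # Proc receives a Relation, returns a Relation
  else relation
  end
end
```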
@@ -3,7 +3,7 @@ module InstDataShipper
  class Base
  attr_reader :dumper

- delegate :tracker, :table_schemas, :working_dir, to: :dumper
+ delegate :tracker, :schema, :working_dir, to: :dumper

  def initialize(cache_key, config, dumper)
  @cache_key = cache_key
@@ -11,9 +11,13 @@ module InstDataShipper
  @dumper = dumper
  end

+ # This method is called before taking any actions.
+ # It should be used to make any necessarry state assumptions (eg, the HostedData destination checks for a previous dump to determine if it can use incremental_since)
+ def preinitialize_dump(context); end
+
  # This method is called before processing any data.
  # It should be used to initialize any external resources needed for the dump.
- def initialize_dump; end
+ def initialize_dump(context); end

  # Yields an object (can be anything) that will be passed to `upload_data_chunk` as `chunk`.
  #
@@ -5,11 +5,43 @@ module InstDataShipper
  class HostedData < Base
  include Concerns::Chunking

- def initialize_dump
+ def preinitialize_dump(context)
+ if context[:incremental_since].present?
+ begin
+ last_dump = hosted_data_client.get("api/v1/custom_dumps/last", {
+ status: 'imported',
+ # schema_version: convert_schema[:version],
+ tags: [
+ "ids-schema=#{dumper.schema_digest}",
+ "ids-genre=#{dumper.export_genre}",
+ ],
+ }).body.with_indifferent_access
+
+ if last_dump[:created_at] < context[:incremental_since]
+ InstDataShipper.logger.info("Last successful HostedData dump is older than incremental_since - bumping back incremental_since")
+ context[:incremental_since] = last_dump[:created_at]
+ end
+ rescue Faraday::ResourceNotFound
+ # TODO It'd be nice to make this per-table
+ InstDataShipper.logger.info("No Last successful HostedData dump of the same schema - not using incremental_since")
+ context[:incremental_since] = nil
+ end
+ end
+ end
+
+ def initialize_dump(context)
+ tags = [
+ "ids-schema=#{dumper.schema_digest}",
+ "ids-genre=#{dumper.export_genre}",
+ ]
+ tags << "ids-app=#{Rails.application.class.name.gsub(/::Application$/, '')}" if defined?(Rails) && Rails.application
+ tags << "ids-schema-version=#{schema[:version]}" if schema[:version].present?
+
  dump = hosted_data_client.post(
  'api/v1/custom_dumps/',
  reference_id: tracker.id,
  schema: convert_schema,
+ tags: tags,
  ).body.with_indifferent_access

  redis.hset(rk(:state), :dump_id, dump[:id])
@@ -62,6 +94,7 @@ module InstDataShipper

  def convert_schema
  definititions = {}
+ table_schemas = schema[:tables]
  table_schemas.each do |ts|
  ts = ts.dup
  tname = table_name(ts)
@@ -86,7 +119,7 @@ module InstDataShipper
  end

  {
- version: "#{dumper.export_genre.downcase}-#{Digest::MD5.hexdigest(definititions.to_json)[0...6]}",
+ version: "#{dumper.schema_digest}-#{Digest::MD5.hexdigest(definititions.to_json)[0...6]}",
  definition: definititions,
  }
  end
@@ -19,10 +19,20 @@ module InstDataShipper
  include(*include)

  define_method(:enqueue_tasks, &blk)
- define_method(:table_schemas) { schema }
+ define_method(:schema) { schema }
  end
  end

+ def self.current(executor: nil)
+ cur_batch = Thread.current[CanvasSync::JobBatches::CURRENT_BATCH_THREAD_KEY]
+ ctx = cur_batch&.context || {}
+ return nil unless ctx[:origin_class].present? && ctx[:tracker_id].present?
+
+ clazz = ctx[:origin_class]
+ clazz = clazz.constantize if clazz.is_a?(String)
+ clazz.new(executor: executor)
+ end
+
  public

  def begin_dump
@@ -31,15 +41,18 @@ module InstDataShipper
  @tracker = tracker = DumpBatch.create(job_class: self.class.to_s, genre: export_genre, status: 'in_progress')

  @batch_context = context = {
- # TODO Allow to be hooked by Destination, likely via initialize_dump_batch and batch_context, so that if an earlier destination fails we can resend data
  # TODO Consider behavior if last is still running
- incremental_since: DumpBatch.where(genre: export_genre, status: 'completed').order(created_at: :desc).first&.created_at,
+ incremental_since: last_successful_tracker&.created_at,
  }

+ destinations.each do |dest|
+ dest.preinitialize_dump(context)
+ end
+
  begin
  begin
  destinations.each do |dest|
- dest.initialize_dump()
+ dest.initialize_dump(context)
  end

  run_hook(:initialize_dump_batch, context)
@@ -52,6 +65,7 @@ module InstDataShipper

  Sidekiq::Batch.new.tap do |batch|
  context[:root_bid] = batch.bid
+ tracker.update(batch_id: batch.bid)

  batch.description = "HD #{export_genre} Export #{tracker.id} Root"
  batch.context = context
@@ -62,6 +76,7 @@ module InstDataShipper
  rescue => ex
  delayed :cleanup_fatal_error!
  InstDataShipper.handle_suppressed_error(ex)
+ tracker.update(status: 'failed', exception: ex.message, backtrace: ex.backtrace.join("\n"))
  end
  end
  rescue => ex
@@ -74,6 +89,7 @@ module InstDataShipper
  end
  end
  end
+ tracker.update(status: 'failed', exception: ex.message, backtrace: ex.backtrace.join("\n"))
  raise ex
  end
  end
@@ -82,15 +98,31 @@ module InstDataShipper
  @tracker ||= batch_context[:tracker_id].present? ? DumpBatch.find(batch_context[:tracker_id]) : nil
  end

+ def last_successful_tracker
+ @last_successful_tracker ||= DumpBatch.where(job_class: self.class.to_s, genre: export_genre, status: 'completed').order(created_at: :desc).first
+ end
+
  def export_genre
- self.class.to_s.gsub(/HD|ExportJob/, '')
+ self.class.to_s
  end

  def origin_class
  batch_context[:origin_class]&.constantize || self.class
  end

+ def schema
+ return origin_class::SCHEMA if defined?(origin_class::SCHEMA)
+ raise NotImplementedError
+ end
+
+ def schema_digest
+ Digest::MD5.hexdigest(schema.to_json)[0...8]
+ end
+
  def table_is_incremental?(table_def)
+ return false unless incremental_since.present?
+
+ # TODO Return false if table's schema changes
  if (inc = table_def[:incremental]).present?
  differ = inc[:if]
  return !!incremental_since if differ.nil?
@@ -119,7 +151,7 @@ module InstDataShipper

  value = Array(value).compact

- table_schemas.each do |ts|
+ schema[:tables].each do |ts|
  return ts if value.include?(ts[key])
  end
  end
@@ -207,11 +239,6 @@ module InstDataShipper

  # Helper Methods

- def table_schemas
- return origin_class::TABLE_SCHEMAS if defined?(origin_class::TABLE_SCHEMAS)
- raise NotImplementedError
- end
-
  def delayed(mthd, *args, **kwargs)
  Jobs::AsyncCaller.perform_later(self.class.to_s, mthd.to_s, *args, **kwargs)
  end
@@ -2,16 +2,22 @@ module InstDataShipper
  # This class ends up fill two roles - Schema and Mapping.
  # It makes for a clean API, but it's a little less canonical since, (eg) the S3 destination doesn't need column type annotations.
  class SchemaBuilder
- attr_reader :tables
+ attr_reader :schema

  def initialize
- @tables = []
+ @schema = {
+ tables: [],
+ }
  end

  def self.build(&block)
  builder = new
  builder.instance_exec(&block)
- builder.tables
+ builder.schema
+ end
+
+ def version(version)
+ @schema[:version] = version
  end

  def extend_table_builder(&block)
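As a usage note on the `version` setter added to `SchemaBuilder` above: the builder now returns a Hash (`tables:` plus an optional `version:`), and the HostedData destination hunk earlier in this diff tags dumps with `ids-schema-version` when that key is present. The snippet below is a speculative illustration - the model, column, and version values are placeholders, not taken from the gem's documentation.

```ruby
# Hypothetical schema declaring an explicit version via the new DSL method.
SCHEMA = InstDataShipper::SchemaBuilder.build do
  version "1.2.0"            # stored on the returned Hash as schema[:version]

  table(SomeModel, "Example table") do
    column :id, :bigint
  end
end

SCHEMA[:version]  # => "1.2.0"
SCHEMA[:tables]   # => array of table definitions (build now returns a Hash, not a bare Array)
```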
@@ -45,7 +51,7 @@ module InstDataShipper

  @table_builder_class.build(tdef, &block)

- @tables << tdef
+ @schema[:tables] << tdef

  tdef
  end
@@ -68,7 +74,11 @@ module InstDataShipper
  # options[key] = value
  # end

- def incremental(scope="updated_at", **kwargs)
+ def version(version)
+ options[:version] = version
+ end
+
+ def incremental(scope=nil, **kwargs)
  if (extras = kwargs.keys - %i[on if]).present?
  raise ArgumentError, "Unsuppored options: #{extras.inspect}"
  end
@@ -1,3 +1,3 @@
  module InstDataShipper
- VERSION = "0.1.0.beta2".freeze
+ VERSION = "0.2.0".freeze
  end
@@ -39,6 +39,7 @@ module InstDataShipper

  def logger
  return @logger if defined? @logger
+ # TODO Annotate logs with DumpBatch ID
  @logger = Logger.new(STDOUT)
  @logger.level = Logger::DEBUG
  @logger
@@ -49,7 +50,7 @@ module InstDataShipper
  end

  def redis_prefix
- pfx = "hdd"
+ pfx = "ids"
  pfx = "#{Apartment::Tenant.current}:#{pfx}" if defined?(Apartment)
  pfx
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: inst_data_shipper
  version: !ruby/object:Gem::Version
- version: 0.1.0.beta2
+ version: 0.2.0
  platform: ruby
  authors:
  - Instructure CustomDev
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-03-08 00:00:00.000000000 Z
+ date: 2024-03-11 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rails
@@ -399,7 +399,6 @@ files:
  - lib/inst_data_shipper/engine.rb
  - lib/inst_data_shipper/jobs/async_caller.rb
  - lib/inst_data_shipper/jobs/base.rb
- - lib/inst_data_shipper/jobs/basic_dump_job.rb
  - lib/inst_data_shipper/record.rb
  - lib/inst_data_shipper/schema_builder.rb
  - lib/inst_data_shipper/version.rb
@@ -436,9 +435,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
- - - ">"
+ - - ">="
  - !ruby/object:Gem::Version
- version: 1.3.1
+ version: '0'
  requirements: []
  rubygems_version: 3.1.6
  signing_key:
@@ -1,15 +0,0 @@
- module InstDataShipper
- module Jobs
- class BasicDumpJob < InstDataShipper::Jobs::Base
- sidekiq_options retry: 3 if defined?(sidekiq_options)
-
- def perform(endpoints)
-
- end
-
- protected
-
-
- end
- end
- end