inst_data_shipper 0.1.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/README.md +35 -0
  3. data/Rakefile +21 -0
  4. data/app/models/hosted_data_dumper/dump_batch.rb +10 -0
  5. data/db/migrate/20240301090836_create_canvas_sync_sync_batches.rb +17 -0
  6. data/lib/inst_data_shipper/basic_dumper.rb +27 -0
  7. data/lib/inst_data_shipper/concerns/hooks.rb +32 -0
  8. data/lib/inst_data_shipper/data_sources/base.rb +7 -0
  9. data/lib/inst_data_shipper/data_sources/canvas_reports.rb +113 -0
  10. data/lib/inst_data_shipper/data_sources/local_tables.rb +33 -0
  11. data/lib/inst_data_shipper/destinations/base.rb +104 -0
  12. data/lib/inst_data_shipper/destinations/concerns/chunking.rb +34 -0
  13. data/lib/inst_data_shipper/destinations/hosted_data.rb +133 -0
  14. data/lib/inst_data_shipper/destinations/s3.rb +72 -0
  15. data/lib/inst_data_shipper/dumper.rb +159 -0
  16. data/lib/inst_data_shipper/engine.rb +8 -0
  17. data/lib/inst_data_shipper/jobs/async_caller.rb +19 -0
  18. data/lib/inst_data_shipper/jobs/base.rb +27 -0
  19. data/lib/inst_data_shipper/jobs/basic_dump_job.rb +11 -0
  20. data/lib/inst_data_shipper/record.rb +6 -0
  21. data/lib/inst_data_shipper/schema_builder.rb +93 -0
  22. data/lib/inst_data_shipper/version.rb +3 -0
  23. data/lib/inst_data_shipper.rb +71 -0
  24. data/spec/dummy/README.rdoc +1 -0
  25. data/spec/dummy/Rakefile +6 -0
  26. data/spec/dummy/bin/rails +4 -0
  27. data/spec/dummy/config/application.rb +37 -0
  28. data/spec/dummy/config/boot.rb +5 -0
  29. data/spec/dummy/config/database.yml +25 -0
  30. data/spec/dummy/config/environment.rb +5 -0
  31. data/spec/dummy/config/environments/development.rb +41 -0
  32. data/spec/dummy/config/environments/test.rb +44 -0
  33. data/spec/dummy/config/initializers/assets.rb +11 -0
  34. data/spec/dummy/config/initializers/session_store.rb +3 -0
  35. data/spec/dummy/config/initializers/wrap_parameters.rb +14 -0
  36. data/spec/dummy/config/routes.rb +2 -0
  37. data/spec/dummy/config/secrets.yml +22 -0
  38. data/spec/dummy/config.ru +4 -0
  39. data/spec/dummy/db/schema.rb +45 -0
  40. data/spec/spec_helper.rb +70 -0
  41. data/spec/support/fixtures/reports/provisioning_csv_unzipped/courses.csv +3 -0
  42. data/spec/support/fixtures/reports/provisioning_csv_unzipped/users.csv +4 -0
  43. metadata +452 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: bf2f1cdd4b4181e945c5f36e7680ed0a054429dc191197fafbee60de9598305b
+   data.tar.gz: 5fb781dc8aa17bf7d672fdfc8942d70365edb68fd324d8aeab70297270af0b1e
+ SHA512:
+   metadata.gz: 9212cd9c647193aa7256f15f6da12cd4ee3c56a12e011ac269a1d801d15e0cb7182a71c2fa8d8e2d2ea808aff73ff4f7c974c3720db54414eb43c24658ca554f
+   data.tar.gz: c4cb69ad7ea635833aa5051dec5a8c14f3aa13e2b11dd3e8fbdd4d12c2a9d63ac9dbb5b235915da0d80898d0b048d5677fb807d6947911e3e10a87956b8ee1fc
data/README.md ADDED
@@ -0,0 +1,35 @@
+ # InstDataShipper
+
+ This gem is intended to facilitate fast and easy syncing of Canvas data.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'inst_data_shipper'
+ ```
+
+ Then run the migrations:
+
+ ```
+ bundle exec rake db:migrate
+ ```
+
+ ## Development
+
+ When adding to or updating this gem, make sure you do the following:
+
+ - Update the yardoc comments where necessary, and confirm the changes by running `yard server`
+ - Write specs
+ - If you modify the model or migration templates, run `bundle exec rake update_test_schema` to update them in the Rails Dummy application (and commit those changes)
+
+ ## Docs
+
+ Docs can be generated using [yard](https://yardoc.org/). To view the docs:
+
+ - Clone this gem's repository
+ - `bundle install`
+ - `yard server --reload`
+
+ The yard server will give you a URL you can visit to view the docs.
data/Rakefile ADDED
@@ -0,0 +1,21 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+ require "open3"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task default: :spec
+
+ desc 'This updates the migrations used by the testing Rails Dummy app and should be run whenever those are updated.'
+ task :update_test_schema do
+   puts "Updating the test database and schema..."
+   stream_command("cd spec/dummy; bundle exec rake db:drop; bundle exec rake db:create; bundle exec rake db:migrate")
+ end
+
+ def stream_command(cmd)
+   Open3.popen2e(cmd) do |stdin, stdout_stderr, wait_thr|
+     while line = stdout_stderr.gets
+       puts line
+     end
+   end
+ end
data/app/models/hosted_data_dumper/dump_batch.rb ADDED
@@ -0,0 +1,10 @@
+ module InstDataShipper
+   class DumpBatch < ApplicationRecord
+     serialize :job_arguments, Array
+
+     ERROR_STATUS = "error".freeze
+     SUCCESS_STATUS = "success".freeze
+     ENQUEUED_STATUS = "enqueued".freeze
+     RUNNING_STATUS = "running".freeze
+   end
+ end
data/db/migrate/20240301090836_create_canvas_sync_sync_batches.rb ADDED
@@ -0,0 +1,17 @@
+ class InstDataShipperDumpBatches < CanvasSync::MiscHelper::MigrationClass
+   def change
+     create_table :inst_data_shipper_dump_batches do |t|
+       t.datetime :started_at
+       t.datetime :completed_at
+       t.string :status
+
+       t.string :job_class
+       t.string :exception
+       t.text :backtrace
+       t.text :metadata
+       t.text :job_arguments
+
+       t.timestamps
+     end
+   end
+ end
data/lib/inst_data_shipper/basic_dumper.rb ADDED
@@ -0,0 +1,27 @@
+ module InstDataShipper
+   class BasicDumper < Dumper
+     def self.perform_dump(destinations:, schema:, &block)
+       raise "Schema must be a constantizable string" unless schema.is_a?(String)
+
+       dumper = new(destinations)
+       dumper.instance_variable_set(:@schema_pointer, schema)
+       dumper.instance_variable_set(:@body_block, block)
+       dumper.begin_dump
+
+       dumper.tracker
+     end
+
+     hook :initialize_dump_batch do |context|
+       context[:schema_pointer] = @schema_pointer
+     end
+
+     def enqueue_tasks
+       instance_exec(&@body_block)
+     end
+
+     def table_schemas
+       pointer = @schema_pointer || batch_context[:schema_pointer]
+       safe_constantize(pointer)
+     end
+   end
+ end
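
For orientation, the following is a rough sketch of how `BasicDumper.perform_dump` appears intended to be invoked, based only on the signature above. The schema constant and destination string are hypothetical placeholders, not values defined by this gem.

```ruby
# Hypothetical usage sketch — "HD::TableSchemas" and the destination URI are placeholders.
InstDataShipper::BasicDumper.perform_dump(
  destinations: ["hosted-data://SOME_JWT@hosted-data.example.com"],
  schema: "HD::TableSchemas",
) do
  # enqueue_tasks instance_execs this block, so DataSources helpers are available here.
  import_local_table("User")
end
```
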
data/lib/inst_data_shipper/concerns/hooks.rb ADDED
@@ -0,0 +1,32 @@
+ module InstDataShipper
+   module Hooks
+     extend ActiveSupport::Concern
+
+     class_methods do
+       def define_hook(name)
+         @hooks ||= {}
+         @hooks[name] ||= []
+       end
+
+       def hook(name, prepend: false, &block)
+         hooks = @hooks[name]
+         prepend ? hooks.unshift(block) : hooks << block
+       end
+     end
+
+     def run_hook(name, *args, **kwargs)
+       hooks = @hooks[name]
+       hooks.each do |blk|
+         instance_exec(*args, **kwargs, &blk)
+       end
+     end
+
+     def run_hook_safe(name, *args, **kwargs)
+       hooks = @hooks[name]
+       hooks.each do |blk|
+         instance_exec(*args, **kwargs, &blk)
+       rescue StandardError
+       end
+     end
+   end
+ end
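
A rough sketch of how this concern appears to be used, inferred from `BasicDumper` above: `define_hook` registers a hook name at the class level, and `hook` appends (or prepends) blocks to it. The class and hook names below are illustrative only.

```ruby
# Illustrative only — class and hook names are placeholders, not part of the gem.
class ExamplePipeline
  include InstDataShipper::Hooks

  define_hook :before_run

  hook :before_run do |context|
    context[:started_at] = Time.now
  end

  # prepend: true runs this block ahead of previously registered blocks.
  hook :before_run, prepend: true do |context|
    context[:prepared] = true
  end
end
```
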
data/lib/inst_data_shipper/data_sources/base.rb ADDED
@@ -0,0 +1,7 @@
+ module InstDataShipper
+   module DataSources
+     module Base
+       extend ActiveSupport::Concern
+     end
+   end
+ end
data/lib/inst_data_shipper/data_sources/canvas_reports.rb ADDED
@@ -0,0 +1,113 @@
+ module InstDataShipper
+   module DataSources
+     # This module contains the logic for processing Canvas reports
+     module CanvasReports
+       extend ActiveSupport::Concern
+
+       included do
+         hook :initialize_dump_batch do |context|
+           report_processor_pool = CanvasSync::JobBatches::Pool.new(
+             description: "HD #{export_genre} Export #{tracker.id} Canvas Report Pool",
+             concurrency: 4,
+             clean_when_empty: false,
+           )
+           context[:report_processor_pool] = report_processor_pool.pid
+         end
+
+         hook :finalize_dump_batch do
+           if batch_context[:report_processor_pool]
+             CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool]).cleanup_redis
+           end
+         end
+       end
+
+       public
+
+       def import_canvas_report(*args, **kwargs)
+         _in_canvas_report_pool(:_import_canvas_report, *args, **kwargs)
+       end
+
+       def import_canvas_report_by_terms(target_table, report_name, terms: [], params: {}, **kwargs)
+         term_ids = (terms || []).map do |term|
+           term.is_a?(Term) ? term.canvas_id : term
+         end
+
+         Sidekiq::Batch.new.tap do |b|
+           b.description = "Term Scoped #{report_name} Runners"
+           b.context = {
+             report_bid: b.bid,
+           }
+           b.jobs do
+             terms_query = term_ids.present? ? Term.where(canvas_id: term_ids) : Term
+             terms_query.find_each do |t|
+               import_canvas_report(target_table, report_name, params: { **params, enrollment_term_id: t.canvas_id }, **kwargs)
+             end
+           end
+         end
+       end
+
+       def import_existing_report(table, report)
+         delayed(:_process_canvas_report, table, report: report)
+       end
+
+       private
+
+       def _import_canvas_report(target_table, report_name, retry_count: 3, params: {}, **kwargs)
+         report = canvas_sync_client.start_report(
+           'self', report_name,
+           parameters: params,
+         )
+
+         CanvasSync::Jobs::CanvasProcessWaiter.perform_later(
+           "/api/v1/accounts/self/reports/#{report_name}/#{report[:id]}",
+           {
+             instance_of: origin_class,
+             method: :_process_canvas_report,
+             args: [target_table],
+             kwargs: kwargs,
+           },
+           on_failure: {
+             instance_of: origin_class,
+             method: :_handle_failed_canvas_report,
+             args: [target_table, report_name, kwargs],
+             kwargs: { retry_count: retry_count },
+           },
+           status_key: :status,
+           progress_as: :report,
+         )
+       end
+
+       def _in_canvas_report_pool(mthd, *args, **kwargs)
+         pool = CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool])
+         AsyncCaller.call_from_pool(pool, self.class, mthd, *args, **kwargs)
+       end
+
+       def _process_canvas_report(table, report:)
+         table_def = table_schemas.find { |t| t[:warehouse_name].to_s == table }
+
+         IO.copy_stream(URI.parse(report['attachment']['url']).open, "#{working_dir}/#{table}.csv")
+
+         inner_block = ->(file) {
+           CSV.foreach("#{working_dir}/#{table}.csv", headers: true) do |m|
+             file << table_def[:columns].map do |c|
+               c[:transformer].present? ? m.instance_exec(&c[:transformer]) : m[c[:local_name].to_s]
+             end
+           end
+         }
+
+         upload_data(table_def, extra: report['id'], &inner_block)
+       end
+
+       def _handle_failed_canvas_report(table, report_name, kwargs, retry_count:, report:) # rubocop:disable Lint/UnusedMethodArgument
+         if retry_count.positive?
+           tbid = batch_context[:report_bid] || batch_context[:root_bid]
+           Sidekiq::Batch.new(tbid).jobs do
+             import_canvas_report(table, report_name, retry_count: retry_count - 1, **kwargs.symbolize_keys)
+           end
+         else
+           cleanup_fatal_error!
+         end
+       end
+     end
+   end
+ end
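
Inside a dump body, these helpers would presumably be called roughly as follows; the table names, report name, parameters, and the `already_fetched_report` variable are placeholders, not values defined by this gem.

```ruby
# Hypothetical dump body — table/report names and params are placeholders.
import_canvas_report("users", "provisioning_csv", params: { "parameters[users]" => true })

# Scope a report to specific enrollment terms (Term records or raw Canvas term ids):
import_canvas_report_by_terms("courses", "provisioning_csv", terms: [42, 43])

# Reuse a report that has already been run elsewhere:
import_existing_report("users", already_fetched_report)
```
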
data/lib/inst_data_shipper/data_sources/local_tables.rb ADDED
@@ -0,0 +1,33 @@
+ module InstDataShipper
+   module DataSources
+     # This module contains the logic for processing local AR tables
+     module LocalTables
+       extend ActiveSupport::Concern
+
+       public
+
+       def import_local_table(*args, **kwargs)
+         delayed(:_import_local_table, *args, **kwargs)
+       end
+
+       private
+
+       def _import_local_table(table_name)
+         table_def = table_schemas.find { |t| t[:model].to_s == table_name }
+         model = table_def[:model]
+
+         inner_block = ->(file) {
+           query = model
+           query = query.includes(table_def[:includes]) if table_def[:includes].present?
+           query.find_each do |m|
+             file << table_def[:columns].map do |c|
+               c[:transformer].present? ? m.instance_exec(&c[:transformer]) : m[c[:local_name].to_s]
+             end
+           end
+         }
+
+         upload_data(table_def, &inner_block)
+       end
+     end
+   end
+ end
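
The `table_schemas` entries used by both data sources appear to be plain hashes. The shape below is inferred from the keys read in this file and in `HostedData#convert_schema`; all values are illustrative.

```ruby
# Inferred schema-entry shape — values are illustrative, not part of the gem.
{
  model: User,                       # AR class; matched by import_local_table("User")
  warehouse_name: "users",           # table name at the destination
  description: "Local user records",
  incremental: false,
  columns: [
    { local_name: :id, warehouse_name: "canvas_id", type: "bigint" },
    { local_name: :name, warehouse_name: "name" },
    # When present, :transformer is instance_exec'd against each record/CSV row:
    { local_name: :email, warehouse_name: "email_domain", transformer: -> { email.to_s.split("@").last } },
  ],
}
```
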
data/lib/inst_data_shipper/destinations/base.rb ADDED
@@ -0,0 +1,104 @@
+ module InstDataShipper
+   module Destinations
+     class Base
+       attr_reader :dumper
+
+       delegate :tracker, :table_schemas, :working_dir, to: :dumper
+
+       def initialize(cache_key, config, dumper)
+         @cache_key = cache_key
+         @_config = config
+         @dumper = dumper
+       end
+
+       # This method is called before processing any data.
+       # It should be used to initialize any external resources needed for the dump.
+       def initialize_dump; end
+
+       # Yields an object (can be anything) that will be passed to `upload_data_chunk` as `chunk`.
+       #
+       # If multiple Destinations have the same `group_key`, `chunk_data` will only be called on the first and the chunk will be passed to each destination.
+       # Thus, if chunking is config-dependent, your Destination must modify the `group_key` to be unique for each configuration.
+       #
+       # This must be overridden, but you may call super with a block to iterate individual rows. Manually batch the rows, or include Concerns::Chunking to pre-batch them.
+       def chunk_data(generator, **kwargs)
+         raise NotImplementedError if method(__method__).owner == Base
+
+         enum = Enumerator.new(&generator)
+         enum.each do |row|
+           yield format_row(row)
+         end
+       end
+
+       # Called with any values yielded from chunk_data.
+       # This method should upload the chunk to the destination.
+       def upload_data_chunk(table_def, chunk)
+         raise NotImplementedError
+       end
+
+       # This method is called after processing all data.
+       # It should be used to finalize any external resources created by the dump.
+       def finalize_dump; end
+
+       # This method is called if a fatal error occurs.
+       # It should cleanup any external resources created by the dump.
+       def cleanup_fatal_error; end
+
+       def config
+         return @_config if @_config.is_a?(Hash)
+         @config ||= parse_configuration(@_config)
+       end
+
+       def user_config
+         config[:extra]
+       end
+
+       def group_key
+         { class: self.class }
+       end
+
+       protected
+
+       def parse_configuration(uri)
+         if block_given?
+           parsed = URI.parse(uri)
+           cfg = {
+             params: parsed.query.present? ? Rack::Utils.parse_nested_query(parsed.query) : {},
+             extra: (parsed.fragment.present? && parsed.fragment.match?(/^\w+=/) && Rack::Utils.parse_nested_query(parsed.fragment)).presence || parsed.fragment || nil,
+           }
+           yield parsed, cfg
+           cfg
+         else
+           raise NotImplementedError
+         end
+       end
+
+       def rk(key)
+         "#{@cache_key}:#{key}"
+       end
+
+       def redis(*args, &blk)
+         InstDataShipper.redis(*args, &blk)
+       end
+
+       # This is a base/generic implementation and may need to be overridden
+       def format_row(row, override_nils: true)
+         if row.is_a?(Array)
+           row = row.map do |v|
+             v = '\N' if v.nil? && override_nils
+             v = v.utc.strftime('%Y-%m-%d %H:%M:%S') if v.is_a?(DateTime) || v.is_a?(Time)
+             v = v.strftime('%Y-%m-%d') if v.is_a?(Date)
+             v = JSON.dump(v) if v.is_a?(Hash) || v.is_a?(Array)
+             if v.is_a?(String)
+               v = v.gsub("\t", '\t')
+               v = v.gsub("\n", '\n')
+             end
+             v
+           end
+         end
+         row
+       end
+
+     end
+   end
+ end
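
To illustrate the contract, a minimal custom destination might look like the sketch below. It is not part of this gem; it simply batches rows with `Concerns::Chunking`, writes each batch to a CSV, and "uploads" it by copying the file to a local directory.

```ruby
require "csv"
require "fileutils"

# Minimal illustrative Destination — a sketch of the contract, not part of the gem.
class LocalFileDestination < InstDataShipper::Destinations::Base
  include InstDataShipper::Destinations::Concerns::Chunking

  # Pre-batch rows via Chunking, write each batch to a CSV, and yield the file path as the chunk.
  def chunk_data(generator, table:, extra: nil)
    super(generator) do |batch, idx|
      path = File.join(working_dir, "#{table[:warehouse_name]}.#{idx}.csv")
      CSV.open(path, "w") { |csv| batch.each { |row| csv << row } }
      yield path
      File.delete(path)
    end
  end

  # Receives whatever chunk_data yielded — here, a file path.
  def upload_data_chunk(table_def, chunk)
    FileUtils.cp(chunk, File.join("/tmp", File.basename(chunk)))
  end
end
```
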
data/lib/inst_data_shipper/destinations/concerns/chunking.rb ADDED
@@ -0,0 +1,34 @@
+ module InstDataShipper
+   module Destinations
+     module Concerns
+       module Chunking
+         extend ActiveSupport::Concern
+
+         DEFAULT_CHUNK_SIZE = 100_000
+
+         def chunk_data(generator, chunk_size: nil, **kwargs)
+           chunk_size ||= config.dig(:params, :chunk_size) || DEFAULT_CHUNK_SIZE
+           slice = 1
+
+           btchr = CanvasSync::BatchProcessor.new(of: chunk_size) do |batch|
+             yield batch, slice
+             slice += 1
+           end
+
+           super(generator, **kwargs) do |row|
+             btchr << row
+           end
+
+           btchr.flush
+         end
+
+         def group_key
+           super.tap do |k|
+             k[:chunk_size] = config.dig(:params, :chunk_size) || DEFAULT_CHUNK_SIZE
+           end
+         end
+
+       end
+     end
+   end
+ end
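
Since `chunk_size` is read from `config.dig(:params, :chunk_size)`, and `params` comes from the destination URI's query string (see `Base#parse_configuration`), the chunk size can presumably be tuned per destination via the URI, for example:

```ruby
# Presumed example — query-string params become config[:params] via Rack::Utils.parse_nested_query.
destination = "hosted-data://SOME_JWT@hosted-data.example.com?chunk_size=50000"
```

Note that the concern looks the value up with a symbol key, so this presumably relies on the parsed config being indifferent-access (or on the default being used otherwise).
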
data/lib/inst_data_shipper/destinations/hosted_data.rb ADDED
@@ -0,0 +1,133 @@
+ module InstDataShipper
+   module Destinations
+     class HostedData < Base
+       include Concerns::Chunking
+
+       def initialize_dump
+         dump = hosted_data_client.post(
+           'api/v1/custom_dumps/',
+           reference_id: tracker.id,
+           schema: convert_schema,
+         ).body.with_indifferent_access
+
+         redis.hset(rk(:state), :dump_id, dump[:id])
+         redis.expire(rk(:state), 30.days.to_i)
+       end
+
+       def chunk_data(generator, table:, extra: nil)
+         warehouse_name = table[:warehouse_name]
+
+         super(generator) do |batch, idx|
+           bits = [warehouse_name, extra, idx].compact
+           temp_file = "#{working_dir}/#{bits.join('.')}.tsv.gz"
+
+           Zlib::GzipWriter.open(temp_file) do |gz|
+             batch.each do |row|
+               row = row.join("\t") if row.is_a?(Array)
+               gz.puts(row)
+             end
+           end
+
+           yield temp_file
+
+           File.delete(temp_file)
+         end
+       end
+
+       def upload_data_chunk(table_def, chunk)
+         hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", artifacts: {
+           table_def[:warehouse_name] => [Faraday::UploadIO.new(chunk, 'application/gzip')],
+         })
+       end
+
+       def finalize_dump
+         hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", start_import: true) if hd_dump_id.present?
+         redis.delete(rk(:state))
+       end
+
+       def cleanup_fatal_error
+         hosted_data_client.delete("api/v1/custom_dumps/#{hd_dump_id}/", reason: 'Failure during extraction or transformation') if hd_dump_id.present?
+         redis.delete(rk(:state))
+       end
+
+       # TODO Support/allow single-table fatal errors?
+
+       protected
+
+       def hd_dump_id
+         @hd_dump_id ||= redis.hget(rk(:state), :dump_id)
+       end
+
+       def convert_schema
+         table_prefix = config[:table_prefix]
+         table_prefix = table_prefix.present? ? "#{table_prefix}_" : nil
+
+         definitions = {}
+         table_schemas.each do |ts|
+           ts = ts.dup
+
+           table_name = ts[:warehouse_name]
+           table_name = table_prefix + table_name if table_prefix.present?
+
+           definitions[ts[:warehouse_name]] = {
+             dw_type: 'dimension',
+             description: ts[:description],
+             incremental: !!ts[:incremental],
+             incremental_on: ts[:incremental] && ts[:incremental] != true ? ts[:incremental] : nil,
+             # indexed_columns
+             tableName: table_name,
+             columns: ts[:columns].map do |col|
+               {
+                 name: col[:warehouse_name],
+                 description: col[:description],
+                 type: col[:type] || ts[:model].column_for_attribute(col[:local_name]).sql_type,
+               }
+             end,
+           }
+         end
+
+         {
+           version: "#{dumper.export_genre.downcase}-#{Digest::MD5.hexdigest(definitions.to_json)[0...6]}",
+           definition: definitions,
+         }
+       end
+
+       def hosted_data_client
+         @hosted_data_client ||= begin
+           token = config[:token]
+
+           host = config[:host]
+           unless host.present?
+             tok_content = JWT.decode(token, nil, false).first
+             host = tok_content['host']
+           end
+
+           Faraday.new(url: host) do |faraday|
+             faraday.request :multipart
+             faraday.request :json
+             faraday.response :raise_error
+             faraday.response :follow_redirects
+             faraday.response :json, :content_type => /\bjson$/
+             faraday.headers[:accept] = 'application/json'
+             faraday.headers[:authorization] = "Bearer #{token}"
+             faraday.adapter Faraday.default_adapter
+           end
+         end
+       end
+
+       def parse_configuration(uri)
+         super do |parsed_uri, cfg|
+           if parsed_uri.username.present?
+             # hosted-data://<JWT>@<hosted_data_domain>
+             cfg[:token] = parsed_uri.username
+             cfg[:host] = parsed_uri.host
+           else
+             # hosted-data://<JWT>
+             cfg[:token] = parsed_uri.host
+           end
+         end
+       end
+
+     end
+   end
+ end
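
For reference, the two configuration string forms accepted by `parse_configuration` above would presumably look like the following; the token and host values are placeholders.

```ruby
# Placeholder values — forms inferred from HostedData#parse_configuration.
"hosted-data://SOME_JWT"                               # host is read from the JWT's `host` claim
"hosted-data://SOME_JWT@hosted-data.example.com"       # token plus an explicit Hosted Data host
```
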
data/lib/inst_data_shipper/destinations/s3.rb ADDED
@@ -0,0 +1,72 @@
+ module InstDataShipper
+   module Destinations
+     class S3 < Base
+       include Concerns::Chunking
+
+       def chunk_data(generator, table:, extra: nil)
+         warehouse_name = table[:warehouse_name]
+
+         super(generator) do |batch, idx|
+           bits = [warehouse_name, extra, idx].compact
+           temp_file = "#{working_dir}/#{bits.join('.')}.csv"
+
+           CSV.open(temp_file, 'w', headers: false) do |csv|
+             csv << table[:columns].map { |c| c[:warehouse_name] }
+             batch.each do |batch_row|
+               csv << batch_row
+             end
+           end
+
+           yield temp_file
+
+           File.delete(temp_file)
+         end
+       end
+
+       def upload_data_chunk(table_def, chunk)
+         s3 = Aws::S3::Resource.new(client: aws_client)
+         dir_key = tracker.created_at.strftime("%Y-%m-%dT%H:%M") + "_#{tracker.id}"
+         bucket = s3.bucket(config[:bucket])
+
+         subpath = config[:path].presence || "/"
+         subpath = subpath[1..-1] if subpath.starts_with?("/")
+         subpath = "instructure" unless subpath.present?
+
+         obj_path = File.join(subpath, dir_key, File.basename(chunk))
+         object = bucket.object(obj_path)
+
+         File.open(chunk, 'rb') do |file|
+           object.put(body: file)
+         end
+       end
+
+       protected
+
+       def aws_client
+         @aws_client ||= Aws::S3::Client.new(
+           region: config[:region],
+           credentials: Aws::Credentials.new(
+             config[:access_key_id],
+             config[:access_key_secret],
+           )
+         )
+       end
+
+       def parse_configuration(uri)
+         # s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>
+         super do |parsed_uri, cfg|
+           split_path = parsed_uri.path.split('/')
+
+           cfg.merge!({
+             region: parsed_uri.host,
+             bucket: split_path[0],
+             access_key_id: parsed_uri.user,
+             access_key_secret: parsed_uri.password,
+             path: split_path[1..-1].join('/').presence,
+           })
+         end
+       end
+
+     end
+   end
+ end
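
Putting the pieces together, a dump targeting both destinations might be kicked off roughly like this; every credential, bucket, schema constant, and table name below is a placeholder.

```ruby
# All values are placeholders; the URI formats follow the parse_configuration comments above.
destinations = [
  "hosted-data://SOME_JWT@hosted-data.example.com",
  "s3://ACCESS_KEY_ID:ACCESS_KEY_SECRET@us-east-1/my-bucket/exports/canvas",
]

InstDataShipper::BasicDumper.perform_dump(destinations: destinations, schema: "HD::TableSchemas") do
  import_local_table("User")
  import_canvas_report("courses", "provisioning_csv")
end
```
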