inst_data_shipper 0.1.0.beta1
- checksums.yaml +7 -0
- data/README.md +35 -0
- data/Rakefile +21 -0
- data/app/models/hosted_data_dumper/dump_batch.rb +10 -0
- data/db/migrate/20240301090836_create_canvas_sync_sync_batches.rb +17 -0
- data/lib/inst_data_shipper/basic_dumper.rb +27 -0
- data/lib/inst_data_shipper/concerns/hooks.rb +32 -0
- data/lib/inst_data_shipper/data_sources/base.rb +7 -0
- data/lib/inst_data_shipper/data_sources/canvas_reports.rb +113 -0
- data/lib/inst_data_shipper/data_sources/local_tables.rb +33 -0
- data/lib/inst_data_shipper/destinations/base.rb +104 -0
- data/lib/inst_data_shipper/destinations/concerns/chunking.rb +34 -0
- data/lib/inst_data_shipper/destinations/hosted_data.rb +133 -0
- data/lib/inst_data_shipper/destinations/s3.rb +72 -0
- data/lib/inst_data_shipper/dumper.rb +159 -0
- data/lib/inst_data_shipper/engine.rb +8 -0
- data/lib/inst_data_shipper/jobs/async_caller.rb +19 -0
- data/lib/inst_data_shipper/jobs/base.rb +27 -0
- data/lib/inst_data_shipper/jobs/basic_dump_job.rb +11 -0
- data/lib/inst_data_shipper/record.rb +6 -0
- data/lib/inst_data_shipper/schema_builder.rb +93 -0
- data/lib/inst_data_shipper/version.rb +3 -0
- data/lib/inst_data_shipper.rb +71 -0
- data/spec/dummy/README.rdoc +1 -0
- data/spec/dummy/Rakefile +6 -0
- data/spec/dummy/bin/rails +4 -0
- data/spec/dummy/config/application.rb +37 -0
- data/spec/dummy/config/boot.rb +5 -0
- data/spec/dummy/config/database.yml +25 -0
- data/spec/dummy/config/environment.rb +5 -0
- data/spec/dummy/config/environments/development.rb +41 -0
- data/spec/dummy/config/environments/test.rb +44 -0
- data/spec/dummy/config/initializers/assets.rb +11 -0
- data/spec/dummy/config/initializers/session_store.rb +3 -0
- data/spec/dummy/config/initializers/wrap_parameters.rb +14 -0
- data/spec/dummy/config/routes.rb +2 -0
- data/spec/dummy/config/secrets.yml +22 -0
- data/spec/dummy/config.ru +4 -0
- data/spec/dummy/db/schema.rb +45 -0
- data/spec/spec_helper.rb +70 -0
- data/spec/support/fixtures/reports/provisioning_csv_unzipped/courses.csv +3 -0
- data/spec/support/fixtures/reports/provisioning_csv_unzipped/users.csv +4 -0
- metadata +452 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: bf2f1cdd4b4181e945c5f36e7680ed0a054429dc191197fafbee60de9598305b
+  data.tar.gz: 5fb781dc8aa17bf7d672fdfc8942d70365edb68fd324d8aeab70297270af0b1e
+SHA512:
+  metadata.gz: 9212cd9c647193aa7256f15f6da12cd4ee3c56a12e011ac269a1d801d15e0cb7182a71c2fa8d8e2d2ea808aff73ff4f7c974c3720db54414eb43c24658ca554f
+  data.tar.gz: c4cb69ad7ea635833aa5051dec5a8c14f3aa13e2b11dd3e8fbdd4d12c2a9d63ac9dbb5b235915da0d80898d0b048d5677fb807d6947911e3e10a87956b8ee1fc
data/README.md
ADDED
@@ -0,0 +1,35 @@
+# InstDataShipper
+
+This gem facilitates fast and easy syncing of Canvas data.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'inst_data_shipper'
+```
+
+Then run the migrations:
+
+```
+bundle exec rake db:migrate
+```
+
+## Development
+
+When adding to or updating this gem, make sure you do the following:
+
+- Update the yardoc comments where necessary, and confirm the changes by running `yard server --reload`
+- Write specs
+- If you modify the model or migration templates, run `bundle exec rake update_test_schema` to update them in the Rails Dummy application (and commit those changes)
+
+## Docs
+
+Docs can be generated using [yard](https://yardoc.org/). To view the docs:
+
+- Clone this gem's repository
+- `bundle install`
+- `yard server --reload`
+
+The yard server will give you a URL you can visit to view the docs.
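For orientation, here is a minimal sketch of how the pieces in this diff appear to fit together, based on the `BasicDumper.perform_dump` signature and the `DataSources` helpers shown below. The schema constant, destination URI, table name, and report name are all placeholders:

```ruby
# Hypothetical usage — every name here is illustrative.
# `schema:` must be a constantizable String (see basic_dumper.rb below);
# it should resolve to the table-schema definition built with SchemaBuilder.
InstDataShipper::BasicDumper.perform_dump(
  destinations: ["s3://access_key:secret@us-east-1/my-bucket/exports"],
  schema: "MyApp::WAREHOUSE_SCHEMA",
) do
  # This block runs via instance_exec inside the dumper, so the DataSources
  # helpers (import_local_table, import_canvas_report, ...) are in scope.
  import_local_table("Course")
  import_canvas_report("users", "provisioning_csv")
end
```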
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+require "open3"
+
+RSpec::Core::RakeTask.new(:spec)
+
+task default: :spec
+
+desc "Rebuilds the test database and schema used by the Rails Dummy app; run this whenever the gem's migrations change."
+task :update_test_schema do
+  puts "Updating the test database and schema..."
+  stream_command("cd spec/dummy; bundle exec rake db:drop; bundle exec rake db:create; bundle exec rake db:migrate")
+end
+
+def stream_command(cmd)
+  Open3.popen2e(cmd) do |_stdin, stdout_stderr, _wait_thr|
+    while line = stdout_stderr.gets
+      puts line
+    end
+  end
+end
data/db/migrate/20240301090836_create_canvas_sync_sync_batches.rb
ADDED
@@ -0,0 +1,17 @@
+class InstDataShipperDumpBatches < CanvasSync::MiscHelper::MigrationClass
+  def change
+    create_table :inst_data_shipper_dump_batches do |t|
+      t.datetime :started_at
+      t.datetime :completed_at
+      t.string :status
+
+      t.string :job_class
+      t.string :exception
+      t.text :backtrace
+      t.text :metadata
+      t.text :job_arguments
+
+      t.timestamps
+    end
+  end
+end
data/lib/inst_data_shipper/basic_dumper.rb
ADDED
@@ -0,0 +1,27 @@
+module InstDataShipper
+  class BasicDumper < Dumper
+    def self.perform_dump(destinations:, schema:, &block)
+      raise "Schema must be a constantizable string" unless schema.is_a?(String)
+
+      dumper = new(destinations)
+      dumper.instance_variable_set(:@schema_pointer, schema)
+      dumper.instance_variable_set(:@body_block, block)
+      dumper.begin_dump
+
+      dumper.tracker
+    end
+
+    hook :initialize_dump_batch do |context|
+      context[:schema_pointer] = @schema_pointer
+    end
+
+    def enqueue_tasks
+      instance_exec(&@body_block)
+    end
+
+    def table_schemas
+      pointer = @schema_pointer || batch_context[:schema_pointer]
+      safe_constantize(pointer)
+    end
+  end
+end
data/lib/inst_data_shipper/concerns/hooks.rb
ADDED
@@ -0,0 +1,32 @@
+module InstDataShipper
+  module Hooks
+    extend ActiveSupport::Concern
+
+    class_methods do
+      def define_hook(name)
+        @hooks ||= {}
+        @hooks[name] ||= []
+      end
+
+      def hook(name, prepend: false, &block)
+        hooks = @hooks[name]
+        prepend ? hooks.unshift(block) : hooks << block
+      end
+    end
+
+    def run_hook(name, *args, **kwargs)
+      hooks = (self.class.instance_variable_get(:@hooks) || {}).fetch(name, []) # hook registrations live on the class
+      hooks.each do |blk|
+        instance_exec(*args, **kwargs, &blk)
+      end
+    end
+
+    def run_hook_safe(name, *args, **kwargs)
+      hooks = (self.class.instance_variable_get(:@hooks) || {}).fetch(name, []) # hook registrations live on the class
+      hooks.each do |blk|
+        instance_exec(*args, **kwargs, &blk)
+      rescue StandardError
+      end
+    end
+  end
+end
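A quick sketch of the intended flow of this concern, as far as it can be read from this diff (`BasicDumper` above registers `hook :initialize_dump_batch`, which the dump presumably fires via `run_hook`). The class and hook names here are illustrative:

```ruby
class HypotheticalDumper
  include InstDataShipper::Hooks

  define_hook :before_upload          # register an (empty) handler list

  hook :before_upload do |context|    # append a handler; prepend: true would unshift it
    context[:prepared] = true         # runs via instance_exec, so instance state is in scope
  end
end

dumper = HypotheticalDumper.new
dumper.run_hook(:before_upload, {})       # propagates handler errors
dumper.run_hook_safe(:before_upload, {})  # swallows StandardError per handler
```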
data/lib/inst_data_shipper/data_sources/canvas_reports.rb
ADDED
@@ -0,0 +1,113 @@
+module InstDataShipper
+  module DataSources
+    # This module contains the logic for processing Canvas reports
+    module CanvasReports
+      extend ActiveSupport::Concern
+
+      included do
+        hook :initialize_dump_batch do |context|
+          report_processor_pool = CanvasSync::JobBatches::Pool.new(
+            description: "HD #{export_genre} Export #{tracker.id} Canvas Report Pool",
+            concurrency: 4,
+            clean_when_empty: false,
+          )
+          context[:report_processor_pool] = report_processor_pool.pid
+        end
+
+        hook :finalize_dump_batch do
+          if batch_context[:report_processor_pool]
+            CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool]).cleanup_redis
+          end
+        end
+      end
+
+      public
+
+      def import_canvas_report(*args, **kwargs)
+        _in_canvas_report_pool(:_import_canvas_report, *args, **kwargs)
+      end
+
+      def import_canvas_report_by_terms(target_table, report_name, terms: [], params: {}, **kwargs)
+        term_ids = (terms || []).map do |term|
+          term.is_a?(Term) ? term.canvas_id : term
+        end
+
+        Sidekiq::Batch.new.tap do |b|
+          b.description = "Term Scoped #{report_name} Runners"
+          b.context = {
+            report_bid: b.bid,
+          }
+          b.jobs do
+            terms_query = term_ids.present? ? Term.where(canvas_id: term_ids) : Term
+            terms_query.find_each do |t|
+              import_canvas_report(target_table, report_name, params: { **params, enrollment_term_id: t.canvas_id }, **kwargs)
+            end
+          end
+        end
+      end
+
+      def import_existing_report(table, report)
+        delayed(:_process_canvas_report, table, report: report)
+      end
+
+      private
+
+      def _import_canvas_report(target_table, report_name, retry_count: 3, params: {}, **kwargs)
+        report = canvas_sync_client.start_report(
+          'self', report_name,
+          parameters: params,
+        )
+
+        CanvasSync::Jobs::CanvasProcessWaiter.perform_later(
+          "/api/v1/accounts/self/reports/#{report_name}/#{report[:id]}",
+          {
+            instance_of: origin_class,
+            method: :_process_canvas_report,
+            args: [target_table],
+            kwargs: kwargs,
+          },
+          on_failure: {
+            instance_of: origin_class,
+            method: :_handle_failed_canvas_report,
+            args: [target_table, report_name, kwargs],
+            kwargs: { retry_count: retry_count },
+          },
+          status_key: :status,
+          progress_as: :report,
+        )
+      end
+
+      def _in_canvas_report_pool(mthd, *args, **kwargs)
+        pool = CanvasSync::JobBatches::Pool.from_pid(batch_context[:report_processor_pool])
+        AsyncCaller.call_from_pool(pool, self.class, mthd, *args, **kwargs)
+      end
+
+      def _process_canvas_report(table, report:)
+        table_def = table_schemas.find { |t| t[:warehouse_name].to_s == table }
+
+        IO.copy_stream(URI.parse(report['attachment']['url']).open, "#{working_dir}/#{table}.csv")
+
+        inner_block = ->(file) {
+          CSV.foreach("#{working_dir}/#{table}.csv", headers: true) do |m|
+            file << table_def[:columns].map do |c|
+              c[:transformer].present? ? m.instance_exec(&c[:transformer]) : m[c[:local_name].to_s]
+            end
+          end
+        }
+
+        upload_data(table_def, extra: report['id'], &inner_block)
+      end
+
+      def _handle_failed_canvas_report(table, report_name, kwargs, retry_count:, report:) # rubocop:disable Lint/UnusedMethodArgument
+        if retry_count.positive?
+          tbid = batch_context[:report_bid] || batch_context[:root_bid]
+          Sidekiq::Batch.new(tbid).jobs do
+            import_canvas_report(table, report_name, retry_count: retry_count - 1, **kwargs.symbolize_keys)
+          end
+        else
+          cleanup_fatal_error!
+        end
+      end
+    end
+  end
+end
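To make the fan-out concrete, a hedged example of calling the term-scoped import above from within a dumper; the table, report name, and params are placeholders. Each matching term gets its own report run inside one Sidekiq batch:

```ruby
# Illustrative — runs one provisioning report per listed enrollment term.
import_canvas_report_by_terms(
  "enrollments",                      # target warehouse table
  "provisioning_csv",                 # Canvas report to start
  terms: [42, 43],                    # Term records or raw canvas_ids both work
  params: { "enrollments" => true },  # merged with enrollment_term_id per term
)
```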
data/lib/inst_data_shipper/data_sources/local_tables.rb
ADDED
@@ -0,0 +1,33 @@
+module InstDataShipper
+  module DataSources
+    # This module contains the logic for processing local AR tables
+    module LocalTables
+      extend ActiveSupport::Concern
+
+      public
+
+      def import_local_table(*args, **kwargs)
+        delayed(:_import_local_table, *args, **kwargs)
+      end
+
+      private
+
+      def _import_local_table(table_name)
+        table_def = table_schemas.find { |t| t[:model].to_s == table_name }
+        model = table_def[:model]
+
+        inner_block = ->(file) {
+          query = model
+          query = query.includes(table_def[:includes]) if table_def[:includes].present?
+          query.find_each do |m|
+            file << table_def[:columns].map do |c|
+              c[:transformer].present? ? m.instance_exec(&c[:transformer]) : m[c[:local_name].to_s]
+            end
+          end
+        }
+
+        upload_data(table_def, &inner_block)
+      end
+    end
+  end
+end
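Note that `_import_local_table` looks the table up by `t[:model].to_s`, so the public helper takes the model's class name as a string; a one-line sketch (model name illustrative):

```ruby
import_local_table("Course")  # matches the schema entry whose :model is Course
```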
data/lib/inst_data_shipper/destinations/base.rb
ADDED
@@ -0,0 +1,104 @@
+module InstDataShipper
+  module Destinations
+    class Base
+      attr_reader :dumper
+
+      delegate :tracker, :table_schemas, :working_dir, to: :dumper
+
+      def initialize(cache_key, config, dumper)
+        @cache_key = cache_key
+        @_config = config
+        @dumper = dumper
+      end
+
+      # This method is called before processing any data.
+      # It should be used to initialize any external resources needed for the dump.
+      def initialize_dump; end
+
+      # Yields an object (can be anything) that will be passed to `upload_data_chunk` as `chunk`.
+      #
+      # If multiple Destinations have the same `group_key`, `chunk_data` will only be called on the first and the chunk will be passed to each destination.
+      # Thus, if chunking is config-dependent, your Destination must modify the `group_key` to be unique for each configuration.
+      #
+      # This must be overridden, but you may call super with a block to iterate individual rows. Manually batch the rows, or include Concerns::Chunking to pre-batch them.
+      def chunk_data(generator, **kwargs)
+        raise NotImplementedError if method(__method__).owner == Base
+
+        enum = Enumerator.new(&generator)
+        enum.each do |row|
+          yield format_row(row)
+        end
+      end
+
+      # Called with any values yielded from chunk_data.
+      # This method should upload the chunk to the destination.
+      def upload_data_chunk(table_def, chunk)
+        raise NotImplementedError
+      end
+
+      # This method is called after processing all data.
+      # It should be used to finalize any external resources created by the dump.
+      def finalize_dump; end
+
+      # This method is called if a fatal error occurs.
+      # It should clean up any external resources created by the dump.
+      def cleanup_fatal_error; end
+
+      def config
+        return @_config if @_config.is_a?(Hash)
+        @config ||= parse_configuration(@_config)
+      end
+
+      def user_config
+        config[:extra]
+      end
+
+      def group_key
+        { class: self.class }
+      end
+
+      protected
+
+      def parse_configuration(uri)
+        if block_given?
+          parsed = URI.parse(uri)
+          cfg = {
+            params: parsed.query.present? ? Rack::Utils.parse_nested_query(parsed.query) : {},
+            extra: (parsed.fragment.present? && parsed.fragment.match?(/^\w+=/) && Rack::Utils.parse_nested_query(parsed.fragment)).presence || parsed.fragment || nil,
+          }
+          yield parsed, cfg
+          cfg
+        else
+          raise NotImplementedError
+        end
+      end
+
+      def rk(key)
+        "#{@cache_key}:#{key}"
+      end
+
+      def redis(*args, &blk)
+        InstDataShipper.redis(*args, &blk)
+      end
+
+      # This is a base/generic implementation and may need to be overridden
+      def format_row(row, override_nils: true)
+        if row.is_a?(Array)
+          row = row.map do |v|
+            v = '\N' if v.nil? && override_nils
+            v = v.utc.strftime('%Y-%m-%d %H:%M:%S') if v.is_a?(DateTime) || v.is_a?(Time)
+            v = v.strftime('%Y-%m-%d') if v.is_a?(Date)
+            v = JSON.dump(v) if v.is_a?(Hash) || v.is_a?(Array)
+            if v.is_a?(String)
+              v = v.gsub("\t", '\t')
+              v = v.gsub("\n", '\n')
+            end
+            v
+          end
+        end
+        row
+      end
+
+    end
+  end
+end
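To make the `Base` contract concrete, a minimal hypothetical destination written against the API above (the real destinations in this diff also include `Concerns::Chunking`, as this sketch does; the scheme and class are invented):

```ruby
require 'fileutils'

module InstDataShipper
  module Destinations
    # Hypothetical destination that copies each chunk file into a local directory.
    class LocalFile < Base
      include Concerns::Chunking

      # Receives pre-batched rows via Chunking, writes each batch to a TSV
      # file, and yields the path as the "chunk" for upload_data_chunk.
      def chunk_data(generator, table:, extra: nil)
        super(generator) do |batch, idx|
          path = "#{working_dir}/#{table[:warehouse_name]}.#{idx}.tsv"
          File.write(path, batch.map { |r| r.join("\t") }.join("\n"))
          yield path
          File.delete(path)  # mirrors HostedData: upload happens during yield
        end
      end

      def upload_data_chunk(_table_def, chunk)
        FileUtils.cp(chunk, config[:directory])
      end

      def parse_configuration(uri)
        # e.g. local-file:///var/dumps — the path becomes the target directory
        super do |parsed_uri, cfg|
          cfg[:directory] = parsed_uri.path
        end
      end
    end
  end
end
```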
data/lib/inst_data_shipper/destinations/concerns/chunking.rb
ADDED
@@ -0,0 +1,34 @@
+module InstDataShipper
+  module Destinations
+    module Concerns
+      module Chunking
+        extend ActiveSupport::Concern
+
+        DEFAULT_CHUNK_SIZE = 100_000
+
+        def chunk_data(generator, chunk_size: nil, **kwargs)
+          chunk_size ||= config.dig(:params, :chunk_size) || DEFAULT_CHUNK_SIZE
+          slice = 1
+
+          btchr = CanvasSync::BatchProcessor.new(of: chunk_size) do |batch|
+            yield batch, slice
+            slice += 1
+          end
+
+          super(generator, **kwargs) do |row|
+            btchr << row
+          end
+
+          btchr.flush
+        end
+
+        def group_key
+          super.tap do |k|
+            k[:chunk_size] = config.dig(:params, :chunk_size) || DEFAULT_CHUNK_SIZE
+          end
+        end
+
+      end
+    end
+  end
+end
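Since this reads `config.dig(:params, :chunk_size)`, and `Base#parse_configuration` fills `:params` from the destination URI's query string, the chunk size looks tunable per destination. An illustrative URI (placeholder credentials, and assuming the params hash is treated with indifferent access and the value is cast to an Integer upstream):

```ruby
# 50k-row chunks instead of the 100_000 default:
"s3://access_key:secret@us-east-1/my-bucket/exports?chunk_size=50000"
```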
data/lib/inst_data_shipper/destinations/hosted_data.rb
ADDED
@@ -0,0 +1,133 @@
+module InstDataShipper
+  module Destinations
+    class HostedData < Base
+      include Concerns::Chunking
+
+      def initialize_dump
+        dump = hosted_data_client.post(
+          'api/v1/custom_dumps/',
+          reference_id: tracker.id,
+          schema: convert_schema,
+        ).body.with_indifferent_access
+
+        redis.hset(rk(:state), :dump_id, dump[:id])
+        redis.expire(rk(:state), 30.days.to_i)
+      end
+
+      def chunk_data(generator, table:, extra: nil)
+        warehouse_name = table[:warehouse_name]
+
+        super(generator) do |batch, idx|
+          bits = [warehouse_name, extra, idx].compact
+          temp_file = "#{working_dir}/#{bits.join('.')}.tsv.gz"
+
+          Zlib::GzipWriter.open(temp_file) do |gz|
+            batch.each do |row|
+              row = row.join("\t") if row.is_a?(Array)
+              gz.puts(row)
+            end
+          end
+
+          yield temp_file
+
+          File.delete(temp_file)
+        end
+      end
+
+      def upload_data_chunk(table_def, chunk)
+        hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", artifacts: {
+          table_def[:warehouse_name] => [Faraday::UploadIO.new(chunk, 'application/gzip')],
+        })
+      end
+
+      def finalize_dump
+        hosted_data_client.put("api/v1/custom_dumps/#{hd_dump_id}/", start_import: true) if hd_dump_id.present?
+        redis.del(rk(:state))
+      end
+
+      def cleanup_fatal_error
+        hosted_data_client.delete("api/v1/custom_dumps/#{hd_dump_id}/", reason: 'Failure during extraction or transformation') if hd_dump_id.present?
+        redis.del(rk(:state))
+      end
+
+      # TODO: Support/allow single-table fatal errors?
+
+      protected
+
+      def hd_dump_id
+        @hd_dump_id ||= redis.hget(rk(:state), :dump_id)
+      end
+
+      def convert_schema
+        table_prefix = config[:table_prefix]
+        table_prefix = table_prefix.present? ? "#{table_prefix}_" : nil
+
+        definitions = {}
+        table_schemas.each do |ts|
+          ts = ts.dup
+
+          table_name = ts[:warehouse_name]
+          table_name = table_prefix + table_name if table_prefix.present?
+
+          definitions[ts[:warehouse_name]] = {
+            dw_type: 'dimension',
+            description: ts[:description],
+            incremental: !!ts[:incremental],
+            incremental_on: ts[:incremental] && ts[:incremental] != true ? ts[:incremental] : nil,
+            # indexed_columns
+            tableName: table_name,
+            columns: ts[:columns].map do |col|
+              {
+                name: col[:warehouse_name],
+                description: col[:description],
+                type: col[:type] || ts[:model].column_for_attribute(col[:local_name]).sql_type,
+              }
+            end,
+          }
+        end
+
+        {
+          version: "#{dumper.export_genre.downcase}-#{Digest::MD5.hexdigest(definitions.to_json)[0...6]}",
+          definition: definitions,
+        }
+      end
+
+      def hosted_data_client
+        @hosted_data_client ||= begin
+          token = config[:token]
+
+          host = config[:host]
+          unless host.present?
+            tok_content = JWT.decode(token, nil, false).first
+            host = tok_content['host']
+          end
+
+          Faraday.new(url: host) do |faraday|
+            faraday.request :multipart
+            faraday.request :json
+            faraday.response :raise_error
+            faraday.response :follow_redirects
+            faraday.response :json, content_type: /\bjson$/
+            faraday.headers[:accept] = 'application/json'
+            faraday.headers[:authorization] = "Bearer #{token}"
+            faraday.adapter Faraday.default_adapter
+          end
+        end
+      end
+
+      def parse_configuration(uri)
+        super do |parsed_uri, cfg|
+          if parsed_uri.username.present?
+            # hosted-data://<JWT>@<hosted_data_domain>
+            cfg[:token] = parsed_uri.username
+            cfg[:host] = parsed_uri.host
+          else
+            # hosted-data://<JWT>
+            cfg[:token] = parsed_uri.host
+          end
+        end
+      end
+
+    end
+  end
+end
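For reference, concrete (placeholder) forms of the HostedData destination string implied by `parse_configuration` above; note that `parsed_uri.username` is only populated when the authority contains an `@`:

```ruby
# Token only — the host is read from the JWT's 'host' claim:
"hosted-data://eyJhbGciOiJIUzI1NiJ9.payload.sig"

# Token plus explicit host (token in the userinfo position):
"hosted-data://eyJhbGciOiJIUzI1NiJ9.payload.sig@hosted-data.example.com"
```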
data/lib/inst_data_shipper/destinations/s3.rb
ADDED
@@ -0,0 +1,72 @@
+module InstDataShipper
+  module Destinations
+    class S3 < Base
+      include Concerns::Chunking
+
+      def chunk_data(generator, table:, extra: nil)
+        warehouse_name = table[:warehouse_name]
+
+        super(generator) do |batch, idx|
+          bits = [warehouse_name, extra, idx].compact
+          temp_file = "#{working_dir}/#{bits.join('.')}.csv"
+
+          CSV.open(temp_file, 'w', headers: false) do |csv|
+            csv << table[:columns].map { |c| c[:warehouse_name] }
+            batch.each do |batch_row|
+              csv << batch_row
+            end
+          end
+
+          yield temp_file
+
+          File.delete(temp_file)
+        end
+      end
+
+      def upload_data_chunk(table_def, chunk)
+        s3 = Aws::S3::Resource.new(client: aws_client)
+        dir_key = tracker.created_at.strftime("%Y-%m-%dT%H:%M") + "_#{tracker.id}"
+        bucket = s3.bucket(config[:bucket])
+
+        subpath = config[:path].presence || "/"
+        subpath = subpath[1..-1] if subpath.starts_with?("/")
+        subpath = "instructure" unless subpath.present?
+
+        obj_path = File.join(subpath, dir_key, File.basename(chunk))
+        object = bucket.object(obj_path)
+
+        File.open(chunk, 'rb') do |file|
+          object.put(body: file)
+        end
+      end
+
+      protected
+
+      def aws_client
+        @aws_client ||= Aws::S3::Client.new(
+          region: config[:region],
+          credentials: Aws::Credentials.new(
+            config[:access_key_id],
+            config[:access_key_secret],
+          )
+        )
+      end
+
+      def parse_configuration(uri)
+        # s3://<access_key_id>:<access_key_secret>@<region>/<bucket>/<path>
+        super do |parsed_uri, cfg|
+          split_path = parsed_uri.path.delete_prefix('/').split('/')
+
+          cfg.merge!({
+            region: parsed_uri.host,
+            bucket: split_path[0],
+            access_key_id: parsed_uri.user,
+            access_key_secret: parsed_uri.password,
+            path: split_path[1..-1].join('/').presence,
+          })
+        end
+      end
+
+    end
+  end
+end
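Likewise for S3, an example matching the comment in `parse_configuration` (all values are placeholders). The AWS region travels in the URI's host position, the first path segment is the bucket, the remainder becomes the key prefix, and an optional `#fragment` would surface as `user_config`:

```ruby
"s3://AKIAEXAMPLEKEY:secretEXAMPLE@us-east-1/my-bucket/exports/nightly"
```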