RubyGems - chronicle-etl - Versions diffs - 0.3.1 → 0.4.2 - Mend

chronicle-etl 0.3.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +35 -0
data/.rubocop.yml +31 -1
data/Guardfile +7 -0
data/README.md +157 -82
data/Rakefile +4 -2
data/chronicle-etl.gemspec +11 -3
data/exe/chronicle-etl +1 -1
data/lib/chronicle/etl/cli/connectors.rb +34 -5
data/lib/chronicle/etl/cli/jobs.rb +90 -24
data/lib/chronicle/etl/cli/main.rb +41 -19
data/lib/chronicle/etl/cli/plugins.rb +62 -0
data/lib/chronicle/etl/cli/subcommand_base.rb +2 -2
data/lib/chronicle/etl/cli.rb +9 -0
data/lib/chronicle/etl/config.rb +7 -4
data/lib/chronicle/etl/configurable.rb +163 -0
data/lib/chronicle/etl/exceptions.rb +29 -1
data/lib/chronicle/etl/extractors/csv_extractor.rb +24 -23
data/lib/chronicle/etl/extractors/extractor.rb +16 -15
data/lib/chronicle/etl/extractors/file_extractor.rb +34 -11
data/lib/chronicle/etl/extractors/helpers/input_reader.rb +76 -0
data/lib/chronicle/etl/extractors/json_extractor.rb +19 -18
data/lib/chronicle/etl/job.rb +8 -2
data/lib/chronicle/etl/job_definition.rb +20 -5
data/lib/chronicle/etl/loaders/csv_loader.rb +36 -9
data/lib/chronicle/etl/loaders/helpers/encoding_helper.rb +18 -0
data/lib/chronicle/etl/loaders/json_loader.rb +44 -0
data/lib/chronicle/etl/loaders/loader.rb +28 -2
data/lib/chronicle/etl/loaders/rest_loader.rb +5 -5
data/lib/chronicle/etl/loaders/table_loader.rb +18 -37
data/lib/chronicle/etl/logger.rb +6 -2
data/lib/chronicle/etl/models/base.rb +3 -0
data/lib/chronicle/etl/models/entity.rb +8 -2
data/lib/chronicle/etl/models/raw.rb +26 -0
data/lib/chronicle/etl/registry/connector_registration.rb +6 -0
data/lib/chronicle/etl/registry/plugin_registry.rb +70 -0
data/lib/chronicle/etl/registry/registry.rb +27 -14
data/lib/chronicle/etl/runner.rb +35 -17
data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +6 -0
data/lib/chronicle/etl/serializers/raw_serializer.rb +10 -0
data/lib/chronicle/etl/serializers/serializer.rb +2 -1
data/lib/chronicle/etl/transformers/image_file_transformer.rb +22 -28
data/lib/chronicle/etl/transformers/null_transformer.rb +1 -1
data/lib/chronicle/etl/transformers/transformer.rb +3 -2
data/lib/chronicle/etl/version.rb +1 -1
data/lib/chronicle/etl.rb +12 -4
metadata +123 -18
data/.ruby-version +0 -1
data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +0 -104
data/lib/chronicle/etl/loaders/stdout_loader.rb +0 -14
data/lib/chronicle/etl/models/generic.rb +0 -23

data/lib/chronicle/etl/extractors/csv_extractor.rb CHANGED Viewed

@@ -2,46 +2,47 @@ require 'csv'
 module Chronicle
   module ETL
-    class CsvExtractor < Chronicle::ETL::Extractor
-      include Extractors::Helpers::FilesystemReader
+    class CSVExtractor < Chronicle::ETL::Extractor
+      include Extractors::Helpers::InputReader
       register_connector do |r|
-        r.description = 'input as CSV'
+        r.description = 'CSV'
       end
-      DEFAULT_OPTIONS = {
-        headers: true,
-        filename: $stdin
-      }.freeze
+      setting :headers, default: true
-      def initialize(options = {})
-        super(DEFAULT_OPTIONS.merge(options))
+      def prepare
+        @csvs = prepare_sources
       end
       def extract
-        csv = initialize_csv
-        csv.each do |row|
-          yield Chronicle::ETL::Extraction.new(data: row.to_h)
+        @csvs.each do |csv|
+          csv.read.each do |row|
+            yield Chronicle::ETL::Extraction.new(data: row.to_h)
+          end
         end
       end
       def results_count
-        CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
+        @csvs.reduce(0) do |total_rows, csv|
+          row_count = csv.readlines.size
+          csv.rewind
+          total_rows + row_count
+        end
       end
       private
-      def initialize_csv
-        headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
-        csv_options = {
-          headers: headers,
-          converters: :all
-        }
-        open_from_filesystem(filename: @options[:filename]) do |file|
-          return CSV.new(file, **csv_options)
+      def prepare_sources
+        @csvs = []
+        read_input do |csv_data|
+          csv_options = {
+            headers: @config.headers.is_a?(String) ? @config.headers.split(',') : @config.headers,
+            converters: :all
+          }
+          @csvs << CSV.new(csv_data, **csv_options)
         end
+        @csvs
       end
     end
   end

data/lib/chronicle/etl/extractors/extractor.rb CHANGED Viewed

@@ -5,15 +5,20 @@ module Chronicle
     # Abstract class representing an Extractor for an ETL job
     class Extractor
       extend Chronicle::ETL::Registry::SelfRegistering
+      include Chronicle::ETL::Configurable
+      setting :since, type: :time
+      setting :until, type: :time
+      setting :limit, type: :numeric
+      setting :load_after_id
+      setting :input
       # Construct a new instance of this extractor. Options are passed in from a Runner
-      # == Paramters:
+      # == Parameters:
       # options::
       #   Options for configuring this Extractor
       def initialize(options = {})
-        @options = options.transform_keys!(&:to_sym)
-        sanitize_options
-        handle_continuation
+        apply_options(options)
       end
       # Hook called before #extract. Useful for gathering data, initializing proxies, etc.
@@ -30,22 +35,18 @@ module Chronicle
       private
-      def sanitize_options
-        @options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
-        @options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
-      end
-      def handle_continuation
-        return unless @options[:continuation]
+      # TODO: reimplement this
+      # def handle_continuation
+      #   return unless @config.continuation
-        @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
-        @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
-      end
+      #   @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
+      #   @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
+      # end
     end
   end
 end
-require_relative 'helpers/filesystem_reader'
+require_relative 'helpers/input_reader'
 require_relative 'csv_extractor'
 require_relative 'file_extractor'
 require_relative 'json_extractor'

data/lib/chronicle/etl/extractors/file_extractor.rb CHANGED Viewed

@@ -2,32 +2,55 @@ require 'pathname'
 module Chronicle
   module ETL
+    # Return filenames that match a pattern in a directory
     class FileExtractor < Chronicle::ETL::Extractor
-      include Extractors::Helpers::FilesystemReader
       register_connector do |r|
         r.description = 'file or directory of files'
       end
+      setting :input, default: ['.']
+      setting :dir_glob_pattern, default: "**/*"
+      setting :larger_than
+      setting :smaller_than
+      def prepare
+        @pathnames = gather_files
+      end
       def extract
-        filenames.each do |filename|
-          yield Chronicle::ETL::Extraction.new(data: filename)
+        @pathnames.each do |pathname|
+          yield Chronicle::ETL::Extraction.new(data: pathname.to_path)
         end
       end
       def results_count
-        filenames.count
+        @pathnames.count
       end
       private
-      def filenames
-        @filenames ||= filenames_in_directory(
-          path: @options[:filename],
-          dir_glob_pattern: @options[:dir_glob_pattern],
-          load_since: @options[:load_since],
-          load_until: @options[:load_until]
-        )
+      def gather_files
+        roots = [@config.input].flatten.map { |filename| Pathname.new(filename) }
+        raise(ExtractionError, "Input must exist") unless roots.all?(&:exist?)
+        directories, files = roots.partition(&:directory?)
+        directories.each do |directory|
+          files += Dir.glob(File.join(directory, @config.dir_glob_pattern)).map { |filename| Pathname.new(filename) }
+        end
+        files = files.uniq
+        files = files.keep_if { |f| (f.mtime > @config.since) } if @config.since
+        files = files.keep_if { |f| (f.mtime < @config.until) } if @config.until
+        # pass in file sizes in bytes
+        files = files.keep_if { |f| (f.size < @config.smaller_than) } if @config.smaller_than
+        files = files.keep_if { |f| (f.size > @config.larger_than) } if @config.larger_than
+        # # TODO: incorporate sort argument
+        files.sort_by(&:mtime)
       end
     end
   end

data/lib/chronicle/etl/extractors/helpers/input_reader.rb ADDED Viewed

@@ -0,0 +1,76 @@
+require 'pathname'
+module Chronicle
+  module ETL
+    module Extractors
+      module Helpers
+        module InputReader
+          # Return an array of input filenames; converts a single string
+          # to an array if necessary
+          def filenames
+            [@config.input].flatten.map
+          end
+          # Filenames as an array of pathnames
+          def pathnames
+            filenames.map { |filename| Pathname.new(filename) }
+          end
+          # Whether we're reading from files
+          def read_from_files?
+            filenames.any?
+          end
+          # Whether we're reading input from stdin
+          def read_from_stdin?
+            !read_from_files? && $stdin.stat.pipe?
+          end
+          # Read input sources and yield each content
+          def read_input
+            if read_from_files?
+              pathnames.each do |pathname|
+                File.open(pathname) do |file|
+                  yield file.read, pathname.to_path
+                end
+              end
+            elsif read_from_stdin?
+              yield $stdin.read, $stdin
+            else
+              raise ExtractionError, "No input files or stdin provided"
+            end
+          end
+          # Read input sources line by line
+          def read_input_as_lines(&block)
+            if read_from_files?
+              lines_from_files(&block)
+            elsif read_from_stdin?
+              lines_from_stdin(&block)
+            else
+              raise ExtractionError, "No input files or stdin provided"
+            end
+          end
+          private
+          def lines_from_files(&block)
+            pathnames.each do |pathname|
+              File.open(pathname) do |file|
+                lines_from_io(file, &block)
+              end
+            end
+          end
+          def lines_from_stdin(&block)
+            lines_from_io($stdin, &block)
+          end
+          def lines_from_io(io, &block)
+            io.each_line(&block)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/chronicle/etl/extractors/json_extractor.rb CHANGED Viewed

@@ -1,43 +1,44 @@
 module Chronicle
   module ETL
-    class JsonExtractor < Chronicle::ETL::Extractor
-      include Extractors::Helpers::FilesystemReader
+    class JSONExtractor < Chronicle::ETL::Extractor
+      include Extractors::Helpers::InputReader
       register_connector do |r|
-        r.description = 'input as JSON'
+        r.description = 'JSON'
       end
-      DEFAULT_OPTIONS = {
-        filename: $stdin,
+      setting :jsonl, default: true, type: :boolean
-        # We're expecting line-separated json objects
-        jsonl: true
-      }.freeze
-      def initialize(options = {})
-        super(DEFAULT_OPTIONS.merge(options))
+      def prepare
+        @jsons = []
+        load_input do |input|
+          @jsons << parse_data(input)
+        end
       end
       def extract
-        load_input do |input|
-          parsed_data = parse_data(input)
-          yield Chronicle::ETL::Extraction.new(data: parsed_data) if parsed_data
+        @jsons.each do |json|
+          yield Chronicle::ETL::Extraction.new(data: json)
         end
       end
       def results_count
+        @jsons.count
       end
       private
       def parse_data data
         JSON.parse(data)
-      rescue JSON::ParserError => e
+      rescue JSON::ParserError
+        raise Chronicle::ETL::ExtractionError, "Could not parse JSON"
       end
-      def load_input
-        read_from_filesystem(filename: @options[:filename]) do |data|
-          yield data
+      def load_input(&block)
+        if @config.jsonl
+          read_input_as_lines(&block)
+        else
+          read_input(&block)
         end
       end
     end

data/lib/chronicle/etl/job.rb CHANGED Viewed

@@ -1,6 +1,11 @@
 require 'forwardable'
 module Chronicle
   module ETL
+    # A runner job
+    #
+    # TODO: this can probably be merged with JobDefinition. Not clear
+    # where the boundaries are
     class Job
       extend Forwardable
@@ -12,7 +17,8 @@ module Chronicle
                     :transformer_klass,
                     :transformer_options,
                     :loader_klass,
-                    :loader_options
+                    :loader_options,
+                    :job_definition
       # TODO: build a proper id system
       alias id name
@@ -35,7 +41,7 @@ module Chronicle
       def instantiate_transformer(extraction)
         @transformer_klass = @job_definition.transformer_klass
-        @transformer_klass.new(@transformer_options, extraction)
+        @transformer_klass.new(extraction, @transformer_options)
       end
       def instantiate_loader

data/lib/chronicle/etl/job_definition.rb CHANGED Viewed

@@ -14,17 +14,36 @@ module Chronicle
           options: {}
         },
         loader: {
-          name: 'stdout',
+          name: 'table',
           options: {}
         }
       }.freeze
+      attr_reader :errors
       attr_accessor :definition
       def initialize()
         @definition = SKELETON_DEFINITION
       end
+      def validate
+        @errors = []
+        Chronicle::ETL::Registry::PHASES.each do |phase|
+          __send__("#{phase}_klass".to_sym)
+        rescue Chronicle::ETL::PluginError => e
+          @errors << e
+        end
+        @errors.empty?
+      end
+      def validate!
+        raise(Chronicle::ETL::JobDefinitionError.new(self), "Job definition is invalid") unless validate
+        true
+      end
       # Add config hash to this definition
       def add_config(config = {})
         @definition = @definition.deep_merge(config)
@@ -80,10 +99,6 @@ module Chronicle
           end
         end
       end
-      def validate
-        return true   # TODO
-      end
     end
   end
 end

data/lib/chronicle/etl/loaders/csv_loader.rb CHANGED Viewed

@@ -2,27 +2,54 @@ require 'csv'
 module Chronicle
   module ETL
-    class CsvLoader < Chronicle::ETL::Loader
+    class CSVLoader < Chronicle::ETL::Loader
       register_connector do |r|
         r.description = 'CSV'
       end
-      def initialize(options={})
-        super(options)
-        @rows = []
+      setting :output, default: $stdout
+      setting :headers, default: true
+      setting :header_row, default: true
+      def records
+        @records ||= []
       end
       def load(record)
-        @rows << record.to_h_flattened.values
+        records << record.to_h_flattened
       end
       def finish
-        z = $stdout
-        CSV(z) do |csv|
-          @rows.each do |row|
-            csv << row
+        return unless records.any?
+        headers = build_headers(records)
+        csv_options = {}
+        if @config.headers
+          csv_options[:write_headers] = @config.header_row
+          csv_options[:headers] = headers
+        end
+        if @config.output.is_a?(IO)
+          # This might seem like a duplication of the default value ($stdout)
+          # but it's because rspec overwrites $stdout (in helper #capture) to
+          # capture output.
+          io = $stdout.dup
+        else
+          io = File.open(@config.output, "w+")
+        end
+        output = CSV.generate(**csv_options) do |csv|
+          records.each do |record|
+            csv << record
+              .transform_keys(&:to_sym)
+              .values_at(*headers)
+              .map { |value| force_utf8(value) }
           end
         end
+        io.write(output)
+        io.close
       end
     end
   end

data/lib/chronicle/etl/loaders/helpers/encoding_helper.rb ADDED Viewed

@@ -0,0 +1,18 @@
+require 'pathname'
+module Chronicle
+  module ETL
+    module Loaders
+      module Helpers
+        module EncodingHelper
+          # Mostly useful for handling loading with binary data from a raw extraction
+          def force_utf8(value)
+            return value unless value.is_a?(String)
+            value.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
+          end
+        end
+      end
+    end
+  end
+end

data/lib/chronicle/etl/loaders/json_loader.rb ADDED Viewed

@@ -0,0 +1,44 @@
+module Chronicle
+  module ETL
+    class JSONLoader < Chronicle::ETL::Loader
+      register_connector do |r|
+        r.description = 'json'
+      end
+      setting :serializer
+      setting :output, default: $stdout
+      def start
+        if @config.output == $stdout
+          @output = @config.output
+        else
+          @output = File.open(@config.output, "w")
+        end
+      end
+      def load(record)
+        serialized = serializer.serialize(record)
+        # When dealing with raw data, we can get improperly encoded strings
+        # (eg from sqlite database columns). We force conversion to UTF-8
+        # before converting into JSON
+        encoded = serialized.transform_values do |value|
+          next value unless value.is_a?(String)
+          force_utf8(value)
+        end
+        @output.puts encoded.to_json
+      end
+      def finish
+        @output.close
+      end
+      private
+      def serializer
+        @config.serializer || Chronicle::ETL::RawSerializer
+      end
+    end
+  end
+end

data/lib/chronicle/etl/loaders/loader.rb CHANGED Viewed

@@ -1,15 +1,24 @@
+require_relative 'helpers/encoding_helper'
 module Chronicle
   module ETL
     # Abstract class representing a Loader for an ETL job
     class Loader
       extend Chronicle::ETL::Registry::SelfRegistering
+      include Chronicle::ETL::Configurable
+      include Chronicle::ETL::Loaders::Helpers::EncodingHelper
+      setting :output
+      setting :fields
+      setting :fields_limit, default: nil
+      setting :fields_exclude
       # Construct a new instance of this loader. Options are passed in from a Runner
       # == Parameters:
       # options::
       #   Options for configuring this Loader
       def initialize(options = {})
-        @options = options
+        apply_options(options)
       end
       # Called once before processing records
@@ -22,11 +31,28 @@ module Chronicle
       # Called once there are no more records to process
       def finish; end
+      private
+      def build_headers(records)
+        headers =
+          if @config.fields && @config.fields.any?
+            Set[*@config.fields]
+          else
+            # use all the keys of the flattened record hash
+            Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
+          end
+        headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) }
+        headers = headers.first(@config.fields_limit) if @config.fields_limit
+        headers.to_a.map(&:to_sym)
+      end
     end
   end
 end
 require_relative 'csv_loader'
+require_relative 'json_loader'
 require_relative 'rest_loader'
-require_relative 'stdout_loader'
 require_relative 'table_loader'

data/lib/chronicle/etl/loaders/rest_loader.rb CHANGED Viewed

@@ -9,19 +9,19 @@ module Chronicle
         r.description = 'a REST endpoint'
       end
-      def initialize( options={} )
-        super(options)
-      end
+      setting :hostname, required: true
+      setting :endpoint, required: true
+      setting :access_token
       def load(record)
         payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
         # have the outer data key that json-api expects
         payload = { data: payload } unless payload[:data]
-        uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
+        uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
         header = {
-          "Authorization" => "Bearer #{@options[:access_token]}",
+          "Authorization" => "Bearer #{@config.access_token}",
           "Content-Type": 'application/json'
         }
         use_ssl = uri.scheme == 'https'