chronicle-etl 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b74c4a7782c1ab31173e628b3e5ccb8743fe21f29d6f48d739b0e3cc2dfda22e
4
- data.tar.gz: 7ea44638b08f6da12c0a5386f3d852600f50336ce0bb57347114804770f75691
3
+ metadata.gz: 5fd411a9a41a645b85780230c79b09f361e121d0e8ca7f3270ca8eba55a76ca8
4
+ data.tar.gz: c09053715910ab4f027fbdc3a5b7d10c042eee962f7fa93c6571ce8359f51009
5
5
  SHA512:
6
- metadata.gz: efb23677c731a54b0382c3095dc9bb5f98a97365c1daf031bbc8c20335e7bd146b76b3a50486971e48192e7540bc0ae1b09f232590a590257203ae3560396767
7
- data.tar.gz: cba40b71a7e8c0b17a286ecd3db3724bff290fdd79b3fdf55ab89967f6af14228911c0e6928a949b8dd899acd6ad396b8a21fb03162a8561247c97b1200bac29
6
+ metadata.gz: 2c9ec14b6c0a51f1c5ec77ee8d9a7f016d16bdc35db5634f9fa5d38aabc30dec201cd4b8bef06a31b86773a0c1cda2d271d7008dcb247a86d956c094919f3c0f
7
+ data.tar.gz: 0dca41e1654e5b2b98a148f853492a67126cdac767000b3c5f97c5c8ff88b77464e17a2fab38b72c1f014f3515c911e5f3f391eaf68d64e73dcfcff5d8e6cb6a
@@ -0,0 +1,35 @@
1
+ # This workflow uses actions that are not certified by GitHub.
2
+ # They are provided by a third-party and are governed by
3
+ # separate terms of service, privacy policy, and support
4
+ # documentation.
5
+ # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
6
+ # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
7
+
8
+ name: Ruby
9
+
10
+ on:
11
+ push:
12
+ branches: [ master ]
13
+ pull_request:
14
+ branches: [ master ]
15
+
16
+ jobs:
17
+ test:
18
+
19
+ runs-on: ubuntu-latest
20
+ strategy:
21
+ matrix:
22
+ ruby-version: ['2.7', '3.0']
23
+
24
+ steps:
25
+ - uses: actions/checkout@v2
26
+ - name: Set up Ruby
27
+ # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
28
+ # change this to (see https://github.com/ruby/setup-ruby#versioning):
29
+ # uses: ruby/setup-ruby@v1
30
+ uses: ruby/setup-ruby@473e4d8fe5dd94ee328fdfca9f8c9c7afc9dae5e
31
+ with:
32
+ ruby-version: ${{ matrix.ruby-version }}
33
+ bundler-cache: true # runs 'bundle install' and caches installed gems automatically
34
+ - name: Run tests
35
+ run: bundle exec rake
data/.rubocop.yml CHANGED
@@ -1,11 +1,38 @@
1
1
  AllCops:
2
2
  EnabledByDefault: true
3
+ TargetRubyVersion: 2.7
4
+
5
+ Style/FrozenStringLiteralComment:
6
+ SafeAutoCorrect: true
3
7
 
4
8
  Style/StringLiterals:
5
9
  Enabled: false
6
10
 
11
+ Layout/MultilineAssignmentLayout:
12
+ Enabled: false
13
+
14
+ Layout/RedundantLineBreak:
15
+ Enabled: false
16
+
7
17
  Style/MethodCallWithArgsParentheses:
8
18
  Enabled: false
9
19
 
20
+ Style/MethodCalledOnDoEndBlock:
21
+ Exclude:
22
+ - 'spec/**/*'
23
+
24
+ Style/OpenStructUse:
25
+ Enabled: false
26
+
27
+ Style/Copyright:
28
+ Enabled: false
29
+
30
+ Style/SymbolArray:
31
+ EnforcedStyle: brackets
32
+
33
+ Style/WordArray:
34
+ EnforcedStyle: brackets
35
+
10
36
  Lint/ConstantResolution:
11
- Enabled: false
37
+ Enabled: false
38
+
data/Guardfile ADDED
@@ -0,0 +1,7 @@
1
+ guard :rspec, cmd: "bundle exec rspec" do
2
+ require "guard/rspec/dsl"
3
+
4
+ watch(%r{^spec/.+_spec\.rb$})
5
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
6
+ watch('spec/spec_helper.rb') { "spec" }
7
+ end
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Chronicle::ETL
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl)
3
+ [![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl) [![Ruby](https://github.com/chronicle-app/chronicle-etl/actions/workflows/ruby.yml/badge.svg)](https://github.com/chronicle-app/chronicle-etl/actions/workflows/ruby.yml)
4
4
 
5
5
  Chronicle ETL is a utility that helps you archive and process personal data. You can *extract* it from a variety of sources, *transform* it, and *load* it to an external API, file, or stdout.
6
6
 
@@ -57,7 +57,7 @@ Built in connectors:
57
57
  In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
58
58
 
59
59
  - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
60
- - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`
60
+ - [shell](https://github.com/chronicle-app/chronicle-shell). Extract shell history from Bash or Zsh
61
61
  - [imessage](https://github.com/chronicle-app/chronicle-imessage). Extract iMessage messages from a local macOS installation
62
62
 
63
63
  To install any of these, run `gem install chronicle-PROVIDER`.
data/Rakefile CHANGED
@@ -1,6 +1,8 @@
1
1
  require "bundler/gem_tasks"
2
2
  require "rspec/core/rake_task"
3
-
4
3
  RSpec::Core::RakeTask.new(:spec)
5
4
 
6
- task :default => :spec
5
+ require 'yard'
6
+ YARD::Rake::YardocTask.new
7
+
8
+ task default: :spec
@@ -35,17 +35,18 @@ Gem::Specification.new do |spec|
35
35
  spec.bindir = "exe"
36
36
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
37
37
  spec.require_paths = ["lib"]
38
+ spec.required_ruby_version = ">= 2.7"
38
39
 
39
- spec.add_dependency "activesupport"
40
+ spec.add_dependency "activesupport", "~> 7.0"
40
41
  spec.add_dependency "chronic_duration", "~> 0.10.6"
41
42
  spec.add_dependency "colorize", "~> 0.8.1"
42
43
  spec.add_dependency "marcel", "~> 1.0.2"
43
44
  spec.add_dependency "mini_exiftool", "~> 2.10"
44
45
  spec.add_dependency "nokogiri", "~> 1.13"
45
- spec.add_dependency "runcom", "~> 6.2"
46
+ spec.add_dependency "runcom", ">= 6.0"
46
47
  spec.add_dependency "sequel", "~> 5.35"
47
48
  spec.add_dependency "sqlite3", "~> 1.4"
48
- spec.add_dependency "thor", "~> 0.20"
49
+ spec.add_dependency "thor", "~> 1.2"
49
50
  spec.add_dependency "tty-progressbar", "~> 0.17"
50
51
  spec.add_dependency "tty-table", "~> 0.11"
51
52
 
@@ -53,4 +54,8 @@ Gem::Specification.new do |spec|
53
54
  spec.add_development_dependency "pry-byebug", "~> 3.9"
54
55
  spec.add_development_dependency "rake", "~> 13.0"
55
56
  spec.add_development_dependency "rspec", "~> 3.9"
57
+ spec.add_development_dependency "simplecov", "~> 0.21"
58
+ spec.add_development_dependency "guard-rspec", "~> 4.7.3"
59
+ spec.add_development_dependency "yard", "~> 0.9.7"
60
+ spec.add_development_dependency "rubocop", "~> 1.25.1"
56
61
  end
data/exe/chronicle-etl CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "chronicle/etl/cli/main"
3
+ require "chronicle/etl/cli"
4
4
 
5
5
  Chronicle::ETL::CLI::Main.start(ARGV)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  module CLI
@@ -38,6 +40,38 @@ module Chronicle
38
40
  table = TTY::Table.new(headers, connector_info.map(&:values))
39
41
  puts table.render(indent: 0, padding: [0, 2])
40
42
  end
43
+
44
+ desc "show PHASE IDENTIFIER", "Show information about a connector"
45
+ def show(phase, identifier)
46
+ unless ['extractor', 'transformer', 'loader'].include?(phase)
47
+ puts "phase argument must be one of: [extractor, transformer, loader]"
48
+ return
49
+ end
50
+
51
+ begin
52
+ connector = Chronicle::ETL::Registry.find_by_phase_and_identifier(phase.to_sym, identifier)
53
+ rescue Chronicle::ETL::ConnectorNotAvailableError
54
+ puts "Could not find #{phase} #{identifier}"
55
+ return
56
+ end
57
+
58
+ puts connector.klass.to_s.bold
59
+ puts " #{connector.descriptive_phrase}"
60
+ puts
61
+ puts "OPTIONS"
62
+
63
+ headers = ['name', 'default', 'required'].map{ |h| h.to_s.upcase.bold }
64
+
65
+ settings = connector.klass.settings.map do |name, setting|
66
+ [
67
+ name,
68
+ setting.default,
69
+ setting.required ? 'yes' : 'no'
70
+ ]
71
+ end
72
+ table = TTY::Table.new(headers, settings)
73
+ puts table.render(indent: 0, padding: [0, 2])
74
+ end
41
75
  end
42
76
  end
43
77
  end
@@ -1,4 +1,5 @@
1
1
  require 'pp'
2
+
2
3
  module Chronicle
3
4
  module ETL
4
5
  module CLI
@@ -7,15 +8,29 @@ module Chronicle
7
8
  default_task "start"
8
9
  namespace :jobs
9
10
 
11
+ class_option :name, aliases: '-j', desc: 'Job configuration name'
12
+
10
13
  class_option :extractor, aliases: '-e', desc: "Extractor class. Default: stdin", banner: 'extractor-name'
11
14
  class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
12
15
  class_option :transformer, aliases: '-t', desc: 'Transformer class. Default: null', banner: 'transformer-name'
13
16
  class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
14
17
  class_option :loader, aliases: '-l', desc: 'Loader class. Default: stdout', banner: 'loader-name'
15
18
  class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
16
- class_option :name, aliases: '-j', desc: 'Job configuration name'
17
19
 
18
- map run: :start # Thor doesn't like `run` as a command name
20
+ # This is an array to deal with shell globbing
21
+ class_option :input, aliases: '-i', desc: 'Input filename or directory', default: [], type: 'array', banner: 'FILENAME'
22
+ class_option :since, desc: "Load records SINCE this date. Overrides job's `load_since` configuration option in extractor's options", banner: 'DATE'
23
+ class_option :until, desc: "Load records UNTIL this date", banner: 'DATE'
24
+ class_option :limit, desc: "Only extract the first LIMIT records", banner: 'N'
25
+
26
+ class_option :output, aliases: '-o', desc: 'Output filename', type: 'string'
27
+ class_option :fields, desc: 'Output only these fields', type: 'array', banner: 'field1 field2 ...'
28
+
29
+ class_option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
30
+ class_option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
31
+
32
+ # Thor doesn't like `run` as a command name
33
+ map run: :start
19
34
  desc "run", "Start a job"
20
35
  option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
21
36
  option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
@@ -69,7 +84,7 @@ LONG_DESC
69
84
  [job, extractor, transformer, loader]
70
85
  end
71
86
 
72
- headers = ['name', 'extractor', 'transformer', 'loader'].map{|h| h.upcase.bold }
87
+ headers = ['name', 'extractor', 'transformer', 'loader'].map { |h| h.upcase.bold }
73
88
 
74
89
  table = TTY::Table.new(headers, job_details)
75
90
  puts table.render(indent: 0, padding: [0, 2])
@@ -90,7 +105,7 @@ LONG_DESC
90
105
  def build_job_definition(options)
91
106
  definition = Chronicle::ETL::JobDefinition.new
92
107
  definition.add_config(load_job_config(options[:name]))
93
- definition.add_config(process_flag_options(options))
108
+ definition.add_config(process_flag_options(options).transform_keys(&:to_sym))
94
109
  definition
95
110
  end
96
111
 
@@ -100,19 +115,33 @@ LONG_DESC
100
115
 
101
116
  # Takes flag options and turns them into a runner config
102
117
  def process_flag_options options
118
+ extractor_options = options[:'extractor-opts'].merge({
119
+ filename: (options[:input] if options[:input].any?),
120
+ since: options[:since],
121
+ until: options[:until],
122
+ limit: options[:limit],
123
+ }.compact)
124
+
125
+ transformer_options = options[:'transformer-opts']
126
+
127
+ loader_options = options[:'loader-opts'].merge({
128
+ output: options[:output],
129
+ fields: options[:fields]
130
+ }.compact)
131
+
103
132
  {
104
133
  dry_run: options[:dry_run],
105
134
  extractor: {
106
135
  name: options[:extractor],
107
- options: options[:'extractor-opts']
136
+ options: extractor_options
108
137
  }.compact,
109
138
  transformer: {
110
139
  name: options[:transformer],
111
- options: options[:'transformer-opts']
140
+ options: transformer_options
112
141
  }.compact,
113
142
  loader: {
114
143
  name: options[:loader],
115
- options: options[:'loader-opts']
144
+ options: loader_options
116
145
  }.compact
117
146
  }
118
147
  end
@@ -1,17 +1,10 @@
1
- require 'thor'
2
- require 'chronicle/etl'
3
1
  require 'colorize'
4
2
 
5
- require 'chronicle/etl/cli/subcommand_base'
6
- require 'chronicle/etl/cli/connectors'
7
- require 'chronicle/etl/cli/jobs'
8
-
9
3
  module Chronicle
10
4
  module ETL
11
5
  module CLI
12
6
  # Main entrypoint for CLI app
13
- class Main < Thor
14
- class_option "verbose", type: :boolean, default: false
7
+ class Main < ::Thor
15
8
  default_task "jobs"
16
9
 
17
10
  desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
@@ -22,15 +15,6 @@ module Chronicle
22
15
 
23
16
  # Entrypoint for the CLI
24
17
  def self.start(given_args = ARGV, config = {})
25
- if given_args[0] == "--version"
26
- puts "#{Chronicle::ETL::VERSION}"
27
- exit
28
- end
29
-
30
- if given_args.none?
31
- abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
32
- end
33
-
34
18
  # take a subcommand:command and splits them so Thor knows how to hand off to the subcommand class
35
19
  if given_args.any? && given_args[0].include?(':')
36
20
  commands = given_args.shift.split(':')
@@ -40,10 +24,20 @@ module Chronicle
40
24
  super(given_args, config)
41
25
  end
42
26
 
27
+ def self.exit_on_failure?
28
+ true
29
+ end
30
+
31
+ desc "version", "Show version"
32
+ map %w(--version -v) => :version
33
+ def version
34
+ shell.say "chronicle-etl #{Chronicle::ETL::VERSION}"
35
+ end
36
+
43
37
  # Displays help options for chronicle-etl
44
38
  def help(meth = nil, subcommand = false)
45
39
  if meth && !respond_to?(meth)
46
- klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
40
+ klass, task = ::Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
47
41
  klass.start(['-h', task].compact, shell: shell)
48
42
  else
49
43
  shell.say "ABOUT".bold
@@ -64,7 +58,7 @@ module Chronicle
64
58
 
65
59
  list = []
66
60
 
67
- Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
61
+ ::Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
68
62
  list += thor_class.printable_tasks(false)
69
63
  end
70
64
  list.sort! { |a, b| a[0] <=> b[0] }
@@ -2,11 +2,11 @@ module Chronicle
2
2
  module ETL
3
3
  module CLI
4
4
  # Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
5
- class SubcommandBase < Thor
5
+ class SubcommandBase < ::Thor
6
6
  # Print usage instructions for a subcommand
7
7
  def self.help(shell, subcommand = false)
8
8
  list = printable_commands(true, subcommand)
9
- Thor::Util.thor_classes_in(self).each do |klass|
9
+ ::Thor::Util.thor_classes_in(self).each do |klass|
10
10
  list += klass.printable_commands(false)
11
11
  end
12
12
  list.sort! { |a, b| a[0] <=> b[0] }
@@ -0,0 +1,7 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+
4
+ require 'chronicle/etl/cli/subcommand_base'
5
+ require 'chronicle/etl/cli/connectors'
6
+ require 'chronicle/etl/cli/jobs'
7
+ require 'chronicle/etl/cli/main'
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ostruct"
4
+
5
+ module Chronicle
6
+ module ETL
7
+ # A mixin that gives a class
8
+ # a {Chronicle::ETL::Configurable::ClassMethods#setting} macro to define
9
+ # settings and their properties (require, type, etc)
10
+ #
11
+ # @example Basic usage
12
+ # class Test < Chronicle::ETL::Extractor
13
+ # include Chronicle::ETL::Configurable
14
+ # setting :when, type: :date, required: true
15
+ # end
16
+ #
17
+ # t = Test.new(when: '2022-02-24')
18
+ # t.config.when
19
+ module Configurable
20
+ # An individual setting for this Configurable
21
+ Setting = Struct.new(:default, :required, :type)
22
+ private_constant :Setting
23
+
24
+ # Collection of user-supplied options for this Configurable
25
+ class Config < OpenStruct
26
+ # Config values that aren't nil, as a hash
27
+ def compacted_h
28
+ to_h.compact
29
+ end
30
+ end
31
+
32
+ # @private
33
+ def self.included(klass)
34
+ klass.extend(ClassMethods)
35
+ klass.include(InstanceMethods)
36
+ klass.prepend(Initializer)
37
+ end
38
+
39
+ # Initializer method for classes that have Configurable mixed in
40
+ module Initializer
41
+ # Make sure this class has a default @config ready to use
42
+ def initialize(*args)
43
+ @config = initialize_default_config
44
+ super
45
+ end
46
+ end
47
+
48
+ # Instance methods for classes that have Configurable mixed in
49
+ module InstanceMethods
50
+ attr_reader :config
51
+
52
+ # Take given options and apply them to this class's settings
53
+ # and make them available in @config and validates that they
54
+ # conform to setting rules
55
+ def apply_options(options)
56
+ options.transform_keys!(&:to_sym)
57
+
58
+ options.each do |name, value|
59
+ setting = self.class.all_settings[name]
60
+ raise(Chronicle::ETL::ConfigurationError, "Unrecognized setting: #{name}") unless setting
61
+
62
+ @config[name] = coerced_value(setting, value)
63
+ end
64
+ validate_config
65
+ options
66
+ end
67
+
68
+ # Name of all settings available to this class
69
+ def self.settings
70
+ self.class.all_settings.keys
71
+ end
72
+
73
+ private
74
+
75
+ def initialize_default_config
76
+ self.class.config_with_defaults
77
+ end
78
+
79
+ def validate_config
80
+ missing = (self.class.all_required_settings.keys - @config.compacted_h.keys)
81
+ raise Chronicle::ETL::ConfigurationError, "Missing options: #{missing}" if missing.count.positive?
82
+ end
83
+
84
+ def coerced_value(setting, value)
85
+ setting.type ? __send__("coerce_#{setting.type}", value) : value
86
+ end
87
+
88
+ def coerce_string(value)
89
+ value.to_s
90
+ end
91
+
92
+ def coerce_time(value)
93
+ # TODO: handle durations like '3h'
94
+ if value.is_a?(String)
95
+ Time.parse(value)
96
+ else
97
+ value
98
+ end
99
+ end
100
+ end
101
+
102
+ # Class methods for classes that have Configurable mixed in
103
+ module ClassMethods
104
+ # Macro for creating a setting on a class {::Chronicle::ETL::Configurable}
105
+ #
106
+ # @param [String] name Name of the setting
107
+ # @param [Boolean] required whether setting is required
108
+ # @param [Object] default Default value
109
+ # @param [Symbol] type Type
110
+ #
111
+ # @example Basic usage
112
+ # setting :when, type: :date, required: true
113
+ #
114
+ # @see ::Chronicle::ETL::Configurable
115
+ def setting(name, default: nil, required: false, type: nil)
116
+ s = Setting.new(default, required, type)
117
+ settings[name] = s
118
+ end
119
+
120
+ # Collect all settings defined on this class and its ancestors (that
121
+ # have Configurable mixin included)
122
+ def all_settings
123
+ if superclass.include?(Chronicle::ETL::Configurable)
124
+ superclass.all_settings.merge(settings)
125
+ else
126
+ settings
127
+ end
128
+ end
129
+
130
+ # Filters settings to those that are required.
131
+ def all_required_settings
132
+ all_settings.select { |_name, setting| setting.required } || {}
133
+ end
134
+
135
+ def settings
136
+ @settings ||= {}
137
+ end
138
+
139
+ def setting_exists?(name)
140
+ all_settings.keys.include? name
141
+ end
142
+
143
+ def config_with_defaults
144
+ s = all_settings.transform_values(&:default)
145
+ Config.new(s)
146
+ end
147
+ end
148
+ end
149
+ end
150
+ end
@@ -2,6 +2,8 @@ module Chronicle
2
2
  module ETL
3
3
  class Error < StandardError; end;
4
4
 
5
+ class ConfigurationError < Error; end;
6
+
5
7
  class RunnerTypeError < Error; end
6
8
 
7
9
  class ConnectorNotAvailableError < Error
@@ -2,21 +2,15 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvExtractor < Chronicle::ETL::Extractor
5
+ class CSVExtractor < Chronicle::ETL::Extractor
6
6
  include Extractors::Helpers::FilesystemReader
7
7
 
8
8
  register_connector do |r|
9
9
  r.description = 'input as CSV'
10
10
  end
11
11
 
12
- DEFAULT_OPTIONS = {
13
- headers: true,
14
- filename: $stdin
15
- }.freeze
16
-
17
- def initialize(options = {})
18
- super(DEFAULT_OPTIONS.merge(options))
19
- end
12
+ setting :headers, default: true
13
+ setting :filename, default: $stdin
20
14
 
21
15
  def extract
22
16
  csv = initialize_csv
@@ -26,20 +20,20 @@ module Chronicle
26
20
  end
27
21
 
28
22
  def results_count
29
- CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
23
+ CSV.read(@config.filename, headers: @config.headers).count unless stdin?(@config.filename)
30
24
  end
31
25
 
32
26
  private
33
27
 
34
28
  def initialize_csv
35
- headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
29
+ headers = @config.headers.is_a?(String) ? @config.headers.split(',') : @config.headers
36
30
 
37
31
  csv_options = {
38
32
  headers: headers,
39
33
  converters: :all
40
34
  }
41
35
 
42
- open_from_filesystem(filename: @options[:filename]) do |file|
36
+ open_from_filesystem(filename: @config.filename) do |file|
43
37
  return CSV.new(file, **csv_options)
44
38
  end
45
39
  end
@@ -5,15 +5,20 @@ module Chronicle
5
5
  # Abstract class representing an Extractor for an ETL job
6
6
  class Extractor
7
7
  extend Chronicle::ETL::Registry::SelfRegistering
8
+ include Chronicle::ETL::Configurable
9
+
10
+ setting :since, type: :date
11
+ setting :until, type: :date
12
+ setting :limit
13
+ setting :load_after_id
14
+ setting :filename
8
15
 
9
16
  # Construct a new instance of this extractor. Options are passed in from a Runner
10
- # == Paramters:
17
+ # == Parameters:
11
18
  # options::
12
19
  # Options for configuring this Extractor
13
20
  def initialize(options = {})
14
- @options = options.transform_keys!(&:to_sym)
15
- sanitize_options
16
- handle_continuation
21
+ apply_options(options)
17
22
  end
18
23
 
19
24
  # Hook called before #extract. Useful for gathering data, initializing proxies, etc
@@ -30,17 +35,13 @@ module Chronicle
30
35
 
31
36
  private
32
37
 
33
- def sanitize_options
34
- @options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
35
- @options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
36
- end
37
-
38
- def handle_continuation
39
- return unless @options[:continuation]
38
+ # TODO: reimplement this
39
+ # def handle_continuation
40
+ # return unless @config.continuation
40
41
 
41
- @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
42
- @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
43
- end
42
+ # @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
43
+ # @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
44
+ # end
44
45
  end
45
46
  end
46
47
  end
@@ -9,6 +9,9 @@ module Chronicle
9
9
  r.description = 'file or directory of files'
10
10
  end
11
11
 
12
+ # TODO: consolidate this with @config.filename
13
+ setting :dir_glob_pattern
14
+
12
15
  def extract
13
16
  filenames.each do |filename|
14
17
  yield Chronicle::ETL::Extraction.new(data: filename)
@@ -23,10 +26,10 @@ module Chronicle
23
26
 
24
27
  def filenames
25
28
  @filenames ||= filenames_in_directory(
26
- path: @options[:filename],
27
- dir_glob_pattern: @options[:dir_glob_pattern],
28
- load_since: @options[:load_since],
29
- load_until: @options[:load_until]
29
+ path: @config.filename,
30
+ dir_glob_pattern: @config.dir_glob_pattern,
31
+ load_since: @config.since,
32
+ load_until: @config.until
30
33
  )
31
34
  end
32
35
  end
@@ -7,16 +7,8 @@ module Chronicle
7
7
  r.description = 'input as JSON'
8
8
  end
9
9
 
10
- DEFAULT_OPTIONS = {
11
- filename: $stdin,
12
-
13
- # We're expecting line-separated json objects
14
- jsonl: true
15
- }.freeze
16
-
17
- def initialize(options = {})
18
- super(DEFAULT_OPTIONS.merge(options))
19
- end
10
+ setting :filename, default: $stdin
11
+ setting :jsonl, default: true
20
12
 
21
13
  def extract
22
14
  load_input do |input|
@@ -35,7 +35,7 @@ module Chronicle
35
35
 
36
36
  def instantiate_transformer(extraction)
37
37
  @transformer_klass = @job_definition.transformer_klass
38
- @transformer_klass.new(@transformer_options, extraction)
38
+ @transformer_klass.new(extraction, @transformer_options)
39
39
  end
40
40
 
41
41
  def instantiate_loader
@@ -2,7 +2,7 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvLoader < Chronicle::ETL::Loader
5
+ class CSVLoader < Chronicle::ETL::Loader
6
6
  register_connector do |r|
7
7
  r.description = 'CSV'
8
8
  end
@@ -3,13 +3,16 @@ module Chronicle
3
3
  # Abstract class representing a Loader for an ETL job
4
4
  class Loader
5
5
  extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
7
+
8
+ setting :output
6
9
 
7
10
  # Construct a new instance of this loader. Options are passed in from a Runner
8
11
  # == Parameters:
9
12
  # options::
10
13
  # Options for configuring this Loader
11
14
  def initialize(options = {})
12
- @options = options
15
+ apply_options(options)
13
16
  end
14
17
 
15
18
  # Called once before processing records
@@ -9,19 +9,19 @@ module Chronicle
9
9
  r.description = 'a REST endpoint'
10
10
  end
11
11
 
12
- def initialize( options={} )
13
- super(options)
14
- end
12
+ setting :hostname, required: true
13
+ setting :endpoint, required: true
14
+ setting :access_token
15
15
 
16
16
  def load(record)
17
17
  payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
18
18
  # have the outer data key that json-api expects
19
19
  payload = { data: payload } unless payload[:data]
20
20
 
21
- uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
21
+ uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
22
22
 
23
23
  header = {
24
- "Authorization" => "Bearer #{@options[:access_token]}",
24
+ "Authorization" => "Bearer #{@config.access_token}",
25
25
  "Content-Type": 'application/json'
26
26
  }
27
27
  use_ssl = uri.scheme == 'https'
@@ -9,20 +9,14 @@ module Chronicle
9
9
  r.description = 'an ASCII table'
10
10
  end
11
11
 
12
- DEFAULT_OPTIONS = {
13
- fields_limit: nil,
14
- fields_exclude: ['lids', 'type'],
15
- fields_include: [],
16
- truncate_values_at: nil,
17
- table_renderer: :basic
18
- }.freeze
19
-
20
- def initialize(options={})
21
- @options = options.reverse_merge(DEFAULT_OPTIONS)
22
- @records = []
23
- end
12
+ setting :fields_limit, default: nil
13
+ setting :fields_exclude, default: ['lids', 'type']
14
+ setting :fields_include, default: []
15
+ setting :truncate_values_at, default: 40
16
+ setting :table_renderer, default: :basic
24
17
 
25
18
  def load(record)
19
+ @records ||= []
26
20
  @records << record.to_h_flattened
27
21
  end
28
22
 
@@ -34,7 +28,7 @@ module Chronicle
34
28
 
35
29
  @table = TTY::Table.new(header: headers, rows: rows)
36
30
  puts @table.render(
37
- @options[:table_renderer].to_sym,
31
+ @config.table_renderer.to_sym,
38
32
  padding: [0, 2, 0, 0]
39
33
  )
40
34
  end
@@ -43,15 +37,15 @@ module Chronicle
43
37
 
44
38
  def build_headers(records)
45
39
  headers =
46
- if @options[:fields_include].any?
47
- Set[*@options[:fields_include]]
40
+ if @config.fields_include.any?
41
+ Set[*@config.fields_include]
48
42
  else
49
43
  # use all the keys of the flattened record hash
50
44
  Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
51
45
  end
52
46
 
53
- headers = headers.delete_if { |header| header.end_with?(*@options[:fields_exclude]) } if @options[:fields_exclude].any?
54
- headers = headers.first(@options[:fields_limit]) if @options[:fields_limit]
47
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
48
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
55
49
 
56
50
  headers.to_a.map(&:to_sym)
57
51
  end
@@ -60,8 +54,8 @@ module Chronicle
60
54
  records.map do |record|
61
55
  values = record.values_at(*headers).map{|value| value.to_s }
62
56
 
63
- if @options[:truncate_values_at]
64
- values = values.map{ |value| value.truncate(@options[:truncate_values_at]) }
57
+ if @config.truncate_values_at
58
+ values = values.map{ |value| value.truncate(@config.truncate_values_at) }
65
59
  end
66
60
 
67
61
  values
@@ -3,6 +3,7 @@ module Chronicle
3
3
  module Registry
4
4
  # Records details about a connector such as its provider and a description
5
5
  class ConnectorRegistration
6
+ # FIXME: refactor custom accessor methods later in file
6
7
  attr_accessor :identifier, :provider, :klass, :description
7
8
 
8
9
  def initialize(klass)
@@ -19,20 +19,14 @@ module Chronicle
19
19
  r.description = 'an image file'
20
20
  end
21
21
 
22
- DEFAULT_OPTIONS = {
23
- timestamp_strategy: 'file_mtime',
24
- id_strategy: 'file_hash',
25
- verb: 'photographed',
26
-
27
- # EXIF tags often don't have timezones
28
- timezone_default: 'Eastern Time (US & Canada)',
29
- include_image_data: true
30
- }.freeze
31
-
32
- def initialize(*args)
33
- super(*args)
34
- @options = @options.reverse_merge(DEFAULT_OPTIONS)
35
- end
22
+ setting :timestamp_strategy, default: 'file_mtime'
23
+ setting :id_strategy, default: 'file_hash'
24
+ setting :verb, default: 'photographed'
25
+ # EXIF tags often don't have timezones
26
+ setting :timezone_default, default: 'Eastern Time (US & Canada)'
27
+ setting :include_image_data, default: true
28
+ setting :actor
29
+ setting :involved
36
30
 
37
31
  def transform
38
32
  # FIXME: set @filename; use block for reading file when necessary
@@ -48,7 +42,7 @@ module Chronicle
48
42
 
49
43
  def id
50
44
  @id ||= begin
51
- id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
45
+ id = build_with_strategy(field: :id, strategy: @config.id_strategy)
52
46
  raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
53
47
 
54
48
  id
@@ -57,7 +51,7 @@ module Chronicle
57
51
 
58
52
  def timestamp
59
53
  @timestamp ||= begin
60
- ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
54
+ ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
61
55
  raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
62
56
 
63
57
  ts
@@ -68,8 +62,8 @@ module Chronicle
68
62
 
69
63
  def build_created(file)
70
64
  record = ::Chronicle::ETL::Models::Activity.new
71
- record.verb = @options[:verb]
72
- record.provider = @options[:provider]
65
+ record.verb = @config.verb
66
+ record.provider = @config.provider
73
67
  record.provider_id = id
74
68
  record.end_at = timestamp
75
69
  record.dedupe_on = [[:provider_id, :verb, :provider]]
@@ -84,24 +78,24 @@ module Chronicle
84
78
  def build_actor
85
79
  actor = ::Chronicle::ETL::Models::Entity.new
86
80
  actor.represents = 'identity'
87
- actor.provider = @options[:actor][:provider]
88
- actor.slug = @options[:actor][:slug]
81
+ actor.provider = @config.actor[:provider]
82
+ actor.slug = @config.actor[:slug]
89
83
  actor.dedupe_on = [[:provider, :slug, :represents]]
90
84
  actor
91
85
  end
92
86
 
93
87
  def build_image
94
88
  image = ::Chronicle::ETL::Models::Entity.new
95
- image.represents = @options[:involved][:represents]
89
+ image.represents = @config.involved[:represents]
96
90
  image.title = build_title
97
91
  image.body = exif['Description']
98
- image.provider = @options[:involved][:provider]
92
+ image.provider = @config.involved[:provider]
99
93
  image.provider_id = id
100
94
  image.assign_attributes(build_gps)
101
95
  image.dedupe_on = [[:provider, :provider_id, :represents]]
102
96
 
103
- if @options[:ocr_strategy]
104
- ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
97
+ if @config.ocr_strategy
98
+ ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
105
99
  image.metadata[:ocr_text] = ocr_text if ocr_text
106
100
  end
107
101
 
@@ -111,7 +105,7 @@ module Chronicle
111
105
  image.depicts = build_people_depicted(names)
112
106
  image.abouts = build_keywords(tags)
113
107
 
114
- if @options[:include_image_data]
108
+ if @config.include_image_data
115
109
  attachment = ::Chronicle::ETL::Models::Attachment.new
116
110
  attachment.data = build_image_data
117
111
  image.attachments = [attachment]
@@ -124,7 +118,7 @@ module Chronicle
124
118
  topics.map do |topic|
125
119
  t = ::Chronicle::ETL::Models::Entity.new
126
120
  t.represents = 'topic'
127
- t.provider = @options[:involved][:provider]
121
+ t.provider = @config.involved[:provider]
128
122
  t.title = topic
129
123
  t.slug = topic.parameterize
130
124
  t.dedupe_on = [[:provider, :represents, :slug]]
@@ -136,7 +130,7 @@ module Chronicle
136
130
  names.map do |name|
137
131
  identity = ::Chronicle::ETL::Models::Entity.new
138
132
  identity.represents = 'identity'
139
- identity.provider = @options[:involved][:provider]
133
+ identity.provider = @config.involved[:provider]
140
134
  identity.slug = name.parameterize
141
135
  identity.title = name
142
136
  identity.dedupe_on = [[:provider, :represents, :slug]]
@@ -199,7 +193,7 @@ module Chronicle
199
193
  elsif false
200
194
  # TODO: support option of using GPS coordinates to determine timezone
201
195
  else
202
- zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
196
+ zone = ActiveSupport::TimeZone.new(@config.timezone_default)
203
197
  timestamp = zone.parse(timestamp.asctime)
204
198
  end
205
199
 
@@ -3,14 +3,15 @@ module Chronicle
3
3
  # Abstract class representing an Transformer for an ETL job
4
4
  class Transformer
5
5
  extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
6
7
 
7
8
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
9
  # == Parameters:
9
10
  # options::
10
11
  # Options for configuring this Transformer
11
- def initialize(options = {}, extraction)
12
- @options = options
12
+ def initialize(extraction, options = {})
13
13
  @extraction = extraction
14
+ apply_options(options)
14
15
  end
15
16
 
16
17
  # @abstract Subclass is expected to implement #transform
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.3.1"
3
+ VERSION = "0.4.0"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
+ require_relative 'etl/configurable'
3
4
  require_relative 'etl/exceptions'
4
5
  require_relative 'etl/extraction'
5
6
  require_relative 'etl/extractors/extractor'
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chronicle-etl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Louis
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-02-10 00:00:00.000000000 Z
11
+ date: 2022-02-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '7.0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '7.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: chronic_duration
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -98,16 +98,16 @@ dependencies:
98
98
  name: runcom
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - "~>"
101
+ - - ">="
102
102
  - !ruby/object:Gem::Version
103
- version: '6.2'
103
+ version: '6.0'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - "~>"
108
+ - - ">="
109
109
  - !ruby/object:Gem::Version
110
- version: '6.2'
110
+ version: '6.0'
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: sequel
113
113
  requirement: !ruby/object:Gem::Requirement
@@ -142,14 +142,14 @@ dependencies:
142
142
  requirements:
143
143
  - - "~>"
144
144
  - !ruby/object:Gem::Version
145
- version: '0.20'
145
+ version: '1.2'
146
146
  type: :runtime
147
147
  prerelease: false
148
148
  version_requirements: !ruby/object:Gem::Requirement
149
149
  requirements:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
- version: '0.20'
152
+ version: '1.2'
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: tty-progressbar
155
155
  requirement: !ruby/object:Gem::Requirement
@@ -234,6 +234,62 @@ dependencies:
234
234
  - - "~>"
235
235
  - !ruby/object:Gem::Version
236
236
  version: '3.9'
237
+ - !ruby/object:Gem::Dependency
238
+ name: simplecov
239
+ requirement: !ruby/object:Gem::Requirement
240
+ requirements:
241
+ - - "~>"
242
+ - !ruby/object:Gem::Version
243
+ version: '0.21'
244
+ type: :development
245
+ prerelease: false
246
+ version_requirements: !ruby/object:Gem::Requirement
247
+ requirements:
248
+ - - "~>"
249
+ - !ruby/object:Gem::Version
250
+ version: '0.21'
251
+ - !ruby/object:Gem::Dependency
252
+ name: guard-rspec
253
+ requirement: !ruby/object:Gem::Requirement
254
+ requirements:
255
+ - - "~>"
256
+ - !ruby/object:Gem::Version
257
+ version: 4.7.3
258
+ type: :development
259
+ prerelease: false
260
+ version_requirements: !ruby/object:Gem::Requirement
261
+ requirements:
262
+ - - "~>"
263
+ - !ruby/object:Gem::Version
264
+ version: 4.7.3
265
+ - !ruby/object:Gem::Dependency
266
+ name: yard
267
+ requirement: !ruby/object:Gem::Requirement
268
+ requirements:
269
+ - - "~>"
270
+ - !ruby/object:Gem::Version
271
+ version: 0.9.7
272
+ type: :development
273
+ prerelease: false
274
+ version_requirements: !ruby/object:Gem::Requirement
275
+ requirements:
276
+ - - "~>"
277
+ - !ruby/object:Gem::Version
278
+ version: 0.9.7
279
+ - !ruby/object:Gem::Dependency
280
+ name: rubocop
281
+ requirement: !ruby/object:Gem::Requirement
282
+ requirements:
283
+ - - "~>"
284
+ - !ruby/object:Gem::Version
285
+ version: 1.25.1
286
+ type: :development
287
+ prerelease: false
288
+ version_requirements: !ruby/object:Gem::Requirement
289
+ requirements:
290
+ - - "~>"
291
+ - !ruby/object:Gem::Version
292
+ version: 1.25.1
237
293
  description: Chronicle-ETL allows you to extract personal data from a variety of services,
238
294
  transformer it, and load it.
239
295
  email:
@@ -243,14 +299,15 @@ executables:
243
299
  extensions: []
244
300
  extra_rdoc_files: []
245
301
  files:
302
+ - ".github/workflows/ruby.yml"
246
303
  - ".gitignore"
247
304
  - ".rspec"
248
305
  - ".rubocop.yml"
249
- - ".ruby-version"
250
306
  - ".travis.yml"
251
307
  - ".yardopts"
252
308
  - CODE_OF_CONDUCT.md
253
309
  - Gemfile
310
+ - Guardfile
254
311
  - LICENSE.txt
255
312
  - README.md
256
313
  - Rakefile
@@ -259,11 +316,13 @@ files:
259
316
  - chronicle-etl.gemspec
260
317
  - exe/chronicle-etl
261
318
  - lib/chronicle/etl.rb
319
+ - lib/chronicle/etl/cli.rb
262
320
  - lib/chronicle/etl/cli/connectors.rb
263
321
  - lib/chronicle/etl/cli/jobs.rb
264
322
  - lib/chronicle/etl/cli/main.rb
265
323
  - lib/chronicle/etl/cli/subcommand_base.rb
266
324
  - lib/chronicle/etl/config.rb
325
+ - lib/chronicle/etl/configurable.rb
267
326
  - lib/chronicle/etl/exceptions.rb
268
327
  - lib/chronicle/etl/extraction.rb
269
328
  - lib/chronicle/etl/extractors/csv_extractor.rb
@@ -317,14 +376,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
317
376
  requirements:
318
377
  - - ">="
319
378
  - !ruby/object:Gem::Version
320
- version: '0'
379
+ version: '2.7'
321
380
  required_rubygems_version: !ruby/object:Gem::Requirement
322
381
  requirements:
323
382
  - - ">="
324
383
  - !ruby/object:Gem::Version
325
384
  version: '0'
326
385
  requirements: []
327
- rubygems_version: 3.1.2
386
+ rubygems_version: 3.1.6
328
387
  signing_key:
329
388
  specification_version: 4
330
389
  summary: ETL tool for personal data
data/.ruby-version DELETED
@@ -1 +0,0 @@
1
- 2.7.1