chronicle-etl 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b74c4a7782c1ab31173e628b3e5ccb8743fe21f29d6f48d739b0e3cc2dfda22e
4
- data.tar.gz: 7ea44638b08f6da12c0a5386f3d852600f50336ce0bb57347114804770f75691
3
+ metadata.gz: 5fd411a9a41a645b85780230c79b09f361e121d0e8ca7f3270ca8eba55a76ca8
4
+ data.tar.gz: c09053715910ab4f027fbdc3a5b7d10c042eee962f7fa93c6571ce8359f51009
5
5
  SHA512:
6
- metadata.gz: efb23677c731a54b0382c3095dc9bb5f98a97365c1daf031bbc8c20335e7bd146b76b3a50486971e48192e7540bc0ae1b09f232590a590257203ae3560396767
7
- data.tar.gz: cba40b71a7e8c0b17a286ecd3db3724bff290fdd79b3fdf55ab89967f6af14228911c0e6928a949b8dd899acd6ad396b8a21fb03162a8561247c97b1200bac29
6
+ metadata.gz: 2c9ec14b6c0a51f1c5ec77ee8d9a7f016d16bdc35db5634f9fa5d38aabc30dec201cd4b8bef06a31b86773a0c1cda2d271d7008dcb247a86d956c094919f3c0f
7
+ data.tar.gz: 0dca41e1654e5b2b98a148f853492a67126cdac767000b3c5f97c5c8ff88b77464e17a2fab38b72c1f014f3515c911e5f3f391eaf68d64e73dcfcff5d8e6cb6a
@@ -0,0 +1,35 @@
1
+ # This workflow uses actions that are not certified by GitHub.
2
+ # They are provided by a third-party and are governed by
3
+ # separate terms of service, privacy policy, and support
4
+ # documentation.
5
+ # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
6
+ # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
7
+
8
+ name: Ruby
9
+
10
+ on:
11
+ push:
12
+ branches: [ master ]
13
+ pull_request:
14
+ branches: [ master ]
15
+
16
+ jobs:
17
+ test:
18
+
19
+ runs-on: ubuntu-latest
20
+ strategy:
21
+ matrix:
22
+ ruby-version: ['2.7', '3.0']
23
+
24
+ steps:
25
+ - uses: actions/checkout@v2
26
+ - name: Set up Ruby
27
+ # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
28
+ # change this to (see https://github.com/ruby/setup-ruby#versioning):
29
+ # uses: ruby/setup-ruby@v1
30
+ uses: ruby/setup-ruby@473e4d8fe5dd94ee328fdfca9f8c9c7afc9dae5e
31
+ with:
32
+ ruby-version: ${{ matrix.ruby-version }}
33
+ bundler-cache: true # runs 'bundle install' and caches installed gems automatically
34
+ - name: Run tests
35
+ run: bundle exec rake
data/.rubocop.yml CHANGED
@@ -1,11 +1,38 @@
1
1
  AllCops:
2
2
  EnabledByDefault: true
3
+ TargetRubyVersion: 2.7
4
+
5
+ Style/FrozenStringLiteralComment:
6
+ SafeAutoCorrect: true
3
7
 
4
8
  Style/StringLiterals:
5
9
  Enabled: false
6
10
 
11
+ Layout/MultilineAssignmentLayout:
12
+ Enabled: false
13
+
14
+ Layout/RedundantLineBreak:
15
+ Enabled: false
16
+
7
17
  Style/MethodCallWithArgsParentheses:
8
18
  Enabled: false
9
19
 
20
+ Style/MethodCalledOnDoEndBlock:
21
+ Exclude:
22
+ - 'spec/**/*'
23
+
24
+ Style/OpenStructUse:
25
+ Enabled: false
26
+
27
+ Style/Copyright:
28
+ Enabled: false
29
+
30
+ Style/SymbolArray:
31
+ EnforcedStyle: brackets
32
+
33
+ Style/WordArray:
34
+ EnforcedStyle: brackets
35
+
10
36
  Lint/ConstantResolution:
11
- Enabled: false
37
+ Enabled: false
38
+
data/Guardfile ADDED
@@ -0,0 +1,7 @@
1
+ guard :rspec, cmd: "bundle exec rspec" do
2
+ require "guard/rspec/dsl"
3
+
4
+ watch(%r{^spec/.+_spec\.rb$})
5
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
6
+ watch('spec/spec_helper.rb') { "spec" }
7
+ end
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Chronicle::ETL
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl)
3
+ [![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl) [![Ruby](https://github.com/chronicle-app/chronicle-etl/actions/workflows/ruby.yml/badge.svg)](https://github.com/chronicle-app/chronicle-etl/actions/workflows/ruby.yml)
4
4
 
5
5
  Chronicle ETL is a utility that helps you archive and process personal data. You can *extract* it from a variety of sources, *transform* it, and *load* it to an external API, file, or stdout.
6
6
 
@@ -57,7 +57,7 @@ Built in connectors:
57
57
  In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
58
58
 
59
59
  - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
60
- - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`
60
+ - [shell](https://github.com/chronicle-app/chronicle-shell). Extract shell history from Bash or Zsh
61
61
  - [imessage](https://github.com/chronicle-app/chronicle-imessage). Extract iMessage messages from a local macOS installation
62
62
 
63
63
  To install any of these, run `gem install chronicle-PROVIDER`.
data/Rakefile CHANGED
@@ -1,6 +1,8 @@
1
1
  require "bundler/gem_tasks"
2
2
  require "rspec/core/rake_task"
3
-
4
3
  RSpec::Core::RakeTask.new(:spec)
5
4
 
6
- task :default => :spec
5
+ require 'yard'
6
+ YARD::Rake::YardocTask.new
7
+
8
+ task default: :spec
@@ -35,17 +35,18 @@ Gem::Specification.new do |spec|
35
35
  spec.bindir = "exe"
36
36
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
37
37
  spec.require_paths = ["lib"]
38
+ spec.required_ruby_version = ">= 2.7"
38
39
 
39
- spec.add_dependency "activesupport"
40
+ spec.add_dependency "activesupport", "~> 7.0"
40
41
  spec.add_dependency "chronic_duration", "~> 0.10.6"
41
42
  spec.add_dependency "colorize", "~> 0.8.1"
42
43
  spec.add_dependency "marcel", "~> 1.0.2"
43
44
  spec.add_dependency "mini_exiftool", "~> 2.10"
44
45
  spec.add_dependency "nokogiri", "~> 1.13"
45
- spec.add_dependency "runcom", "~> 6.2"
46
+ spec.add_dependency "runcom", ">= 6.0"
46
47
  spec.add_dependency "sequel", "~> 5.35"
47
48
  spec.add_dependency "sqlite3", "~> 1.4"
48
- spec.add_dependency "thor", "~> 0.20"
49
+ spec.add_dependency "thor", "~> 1.2"
49
50
  spec.add_dependency "tty-progressbar", "~> 0.17"
50
51
  spec.add_dependency "tty-table", "~> 0.11"
51
52
 
@@ -53,4 +54,8 @@ Gem::Specification.new do |spec|
53
54
  spec.add_development_dependency "pry-byebug", "~> 3.9"
54
55
  spec.add_development_dependency "rake", "~> 13.0"
55
56
  spec.add_development_dependency "rspec", "~> 3.9"
57
+ spec.add_development_dependency "simplecov", "~> 0.21"
58
+ spec.add_development_dependency "guard-rspec", "~> 4.7.3"
59
+ spec.add_development_dependency "yard", "~> 0.9.7"
60
+ spec.add_development_dependency "rubocop", "~> 1.25.1"
56
61
  end
data/exe/chronicle-etl CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "chronicle/etl/cli/main"
3
+ require "chronicle/etl/cli"
4
4
 
5
5
  Chronicle::ETL::CLI::Main.start(ARGV)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Chronicle
2
4
  module ETL
3
5
  module CLI
@@ -38,6 +40,38 @@ module Chronicle
38
40
  table = TTY::Table.new(headers, connector_info.map(&:values))
39
41
  puts table.render(indent: 0, padding: [0, 2])
40
42
  end
43
+
44
+ desc "show PHASE IDENTIFIER", "Show information about a connector"
45
+ def show(phase, identifier)
46
+ unless ['extractor', 'transformer', 'loader'].include?(phase)
47
+ puts "phase argument must be one of: [extractor, transformer, loader]"
48
+ return
49
+ end
50
+
51
+ begin
52
+ connector = Chronicle::ETL::Registry.find_by_phase_and_identifier(phase.to_sym, identifier)
53
+ rescue Chronicle::ETL::ConnectorNotAvailableError
54
+ puts "Could not find #{phase} #{identifier}"
55
+ return
56
+ end
57
+
58
+ puts connector.klass.to_s.bold
59
+ puts " #{connector.descriptive_phrase}"
60
+ puts
61
+ puts "OPTIONS"
62
+
63
+ headers = ['name', 'default', 'required'].map{ |h| h.to_s.upcase.bold }
64
+
65
+ settings = connector.klass.settings.map do |name, setting|
66
+ [
67
+ name,
68
+ setting.default,
69
+ setting.required ? 'yes' : 'no'
70
+ ]
71
+ end
72
+ table = TTY::Table.new(headers, settings)
73
+ puts table.render(indent: 0, padding: [0, 2])
74
+ end
41
75
  end
42
76
  end
43
77
  end
@@ -1,4 +1,5 @@
1
1
  require 'pp'
2
+
2
3
  module Chronicle
3
4
  module ETL
4
5
  module CLI
@@ -7,15 +8,29 @@ module Chronicle
7
8
  default_task "start"
8
9
  namespace :jobs
9
10
 
11
+ class_option :name, aliases: '-j', desc: 'Job configuration name'
12
+
10
13
  class_option :extractor, aliases: '-e', desc: "Extractor class. Default: stdin", banner: 'extractor-name'
11
14
  class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
12
15
  class_option :transformer, aliases: '-t', desc: 'Transformer class. Default: null', banner: 'transformer-name'
13
16
  class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
14
17
  class_option :loader, aliases: '-l', desc: 'Loader class. Default: stdout', banner: 'loader-name'
15
18
  class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
16
- class_option :name, aliases: '-j', desc: 'Job configuration name'
17
19
 
18
- map run: :start # Thor doesn't like `run` as a command name
20
+ # This is an array to deal with shell globbing
21
+ class_option :input, aliases: '-i', desc: 'Input filename or directory', default: [], type: 'array', banner: 'FILENAME'
22
+ class_option :since, desc: "Load records SINCE this date. Overrides job's `load_since` configuration option in extractor's options", banner: 'DATE'
23
+ class_option :until, desc: "Load records UNTIL this date", banner: 'DATE'
24
+ class_option :limit, desc: "Only extract the first LIMIT records", banner: 'N'
25
+
26
+ class_option :output, aliases: '-o', desc: 'Output filename', type: 'string'
27
+ class_option :fields, desc: 'Output only these fields', type: 'array', banner: 'field1 field2 ...'
28
+
29
+ class_option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
30
+ class_option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
31
+
32
+ # Thor doesn't like `run` as a command name
33
+ map run: :start
19
34
  desc "run", "Start a job"
20
35
  option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
21
36
  option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
@@ -69,7 +84,7 @@ LONG_DESC
69
84
  [job, extractor, transformer, loader]
70
85
  end
71
86
 
72
- headers = ['name', 'extractor', 'transformer', 'loader'].map{|h| h.upcase.bold }
87
+ headers = ['name', 'extractor', 'transformer', 'loader'].map { |h| h.upcase.bold }
73
88
 
74
89
  table = TTY::Table.new(headers, job_details)
75
90
  puts table.render(indent: 0, padding: [0, 2])
@@ -90,7 +105,7 @@ LONG_DESC
90
105
  def build_job_definition(options)
91
106
  definition = Chronicle::ETL::JobDefinition.new
92
107
  definition.add_config(load_job_config(options[:name]))
93
- definition.add_config(process_flag_options(options))
108
+ definition.add_config(process_flag_options(options).transform_keys(&:to_sym))
94
109
  definition
95
110
  end
96
111
 
@@ -100,19 +115,33 @@ LONG_DESC
100
115
 
101
116
  # Takes flag options and turns them into a runner config
102
117
  def process_flag_options options
118
+ extractor_options = options[:'extractor-opts'].merge({
119
+ filename: (options[:input] if options[:input].any?),
120
+ since: options[:since],
121
+ until: options[:until],
122
+ limit: options[:limit],
123
+ }.compact)
124
+
125
+ transformer_options = options[:'transformer-opts']
126
+
127
+ loader_options = options[:'loader-opts'].merge({
128
+ output: options[:output],
129
+ fields: options[:fields]
130
+ }.compact)
131
+
103
132
  {
104
133
  dry_run: options[:dry_run],
105
134
  extractor: {
106
135
  name: options[:extractor],
107
- options: options[:'extractor-opts']
136
+ options: extractor_options
108
137
  }.compact,
109
138
  transformer: {
110
139
  name: options[:transformer],
111
- options: options[:'transformer-opts']
140
+ options: transformer_options
112
141
  }.compact,
113
142
  loader: {
114
143
  name: options[:loader],
115
- options: options[:'loader-opts']
144
+ options: loader_options
116
145
  }.compact
117
146
  }
118
147
  end
@@ -1,17 +1,10 @@
1
- require 'thor'
2
- require 'chronicle/etl'
3
1
  require 'colorize'
4
2
 
5
- require 'chronicle/etl/cli/subcommand_base'
6
- require 'chronicle/etl/cli/connectors'
7
- require 'chronicle/etl/cli/jobs'
8
-
9
3
  module Chronicle
10
4
  module ETL
11
5
  module CLI
12
6
  # Main entrypoint for CLI app
13
- class Main < Thor
14
- class_option "verbose", type: :boolean, default: false
7
+ class Main < ::Thor
15
8
  default_task "jobs"
16
9
 
17
10
  desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
@@ -22,15 +15,6 @@ module Chronicle
22
15
 
23
16
  # Entrypoint for the CLI
24
17
  def self.start(given_args = ARGV, config = {})
25
- if given_args[0] == "--version"
26
- puts "#{Chronicle::ETL::VERSION}"
27
- exit
28
- end
29
-
30
- if given_args.none?
31
- abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
32
- end
33
-
34
18
  # take a subcommand:command and splits them so Thor knows how to hand off to the subcommand class
35
19
  if given_args.any? && given_args[0].include?(':')
36
20
  commands = given_args.shift.split(':')
@@ -40,10 +24,20 @@ module Chronicle
40
24
  super(given_args, config)
41
25
  end
42
26
 
27
+ def self.exit_on_failure?
28
+ true
29
+ end
30
+
31
+ desc "version", "Show version"
32
+ map %w(--version -v) => :version
33
+ def version
34
+ shell.say "chronicle-etl #{Chronicle::ETL::VERSION}"
35
+ end
36
+
43
37
  # Displays help options for chronicle-etl
44
38
  def help(meth = nil, subcommand = false)
45
39
  if meth && !respond_to?(meth)
46
- klass, task = Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
40
+ klass, task = ::Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
47
41
  klass.start(['-h', task].compact, shell: shell)
48
42
  else
49
43
  shell.say "ABOUT".bold
@@ -64,7 +58,7 @@ module Chronicle
64
58
 
65
59
  list = []
66
60
 
67
- Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
61
+ ::Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
68
62
  list += thor_class.printable_tasks(false)
69
63
  end
70
64
  list.sort! { |a, b| a[0] <=> b[0] }
@@ -2,11 +2,11 @@ module Chronicle
2
2
  module ETL
3
3
  module CLI
4
4
  # Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
5
- class SubcommandBase < Thor
5
+ class SubcommandBase < ::Thor
6
6
  # Print usage instructions for a subcommand
7
7
  def self.help(shell, subcommand = false)
8
8
  list = printable_commands(true, subcommand)
9
- Thor::Util.thor_classes_in(self).each do |klass|
9
+ ::Thor::Util.thor_classes_in(self).each do |klass|
10
10
  list += klass.printable_commands(false)
11
11
  end
12
12
  list.sort! { |a, b| a[0] <=> b[0] }
@@ -0,0 +1,7 @@
1
+ require 'thor'
2
+ require 'chronicle/etl'
3
+
4
+ require 'chronicle/etl/cli/subcommand_base'
5
+ require 'chronicle/etl/cli/connectors'
6
+ require 'chronicle/etl/cli/jobs'
7
+ require 'chronicle/etl/cli/main'
@@ -0,0 +1,150 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ostruct"
4
+
5
+ module Chronicle
6
+ module ETL
7
+ # A mixin that gives a class
8
+ # a {Chronicle::ETL::Configurable::ClassMethods#setting} macro to define
9
+ # settings and their properties (default, required, type)
10
+ #
11
+ # @example Basic usage
12
+ # class Test < Chronicle::ETL::Extractor
13
+ # include Chronicle::ETL::Configurable
14
+ # setting :when, type: :date, required: true
15
+ # end
16
+ #
17
+ # t = Test.new(when: '2022-02-24')
18
+ # t.config.when
19
+ module Configurable
20
+ # An individual setting for this Configurable
21
+ Setting = Struct.new(:default, :required, :type)
22
+ private_constant :Setting
23
+
24
+ # Collection of user-supplied options for this Configurable
25
+ class Config < OpenStruct
26
+ # Config values that aren't nil, as a hash
27
+ def compacted_h
28
+ to_h.compact
29
+ end
30
+ end
31
+
32
+ # @private
33
+ def self.included(klass)
34
+ klass.extend(ClassMethods)
35
+ klass.include(InstanceMethods)
36
+ klass.prepend(Initializer)
37
+ end
38
+
39
+ # Initializer method for classes that have Configurable mixed in
40
+ module Initializer
41
+ # Make sure this class has a default @config ready to use
42
+ def initialize(*args)
43
+ @config = initialize_default_config
44
+ super
45
+ end
46
+ end
47
+
48
+ # Instance methods for classes that have Configurable mixed in
49
+ module InstanceMethods
50
+ attr_reader :config
51
+
52
+ # Take given options and apply them to this class's settings
53
+ # and make them available in @config and validates that they
54
+ # conform to setting rules
55
+ def apply_options(options)
56
+ options.transform_keys!(&:to_sym)
57
+
58
+ options.each do |name, value|
59
+ setting = self.class.all_settings[name]
60
+ raise(Chronicle::ETL::ConfigurationError, "Unrecognized setting: #{name}") unless setting
61
+
62
+ @config[name] = coerced_value(setting, value)
63
+ end
64
+ validate_config
65
+ options
66
+ end
67
+
68
+ # Name of all settings available to this class
69
+ def self.settings
70
+ self.class.all_settings.keys
71
+ end
72
+
73
+ private
74
+
75
+ def initialize_default_config
76
+ self.class.config_with_defaults
77
+ end
78
+
79
+ def validate_config
80
+ missing = (self.class.all_required_settings.keys - @config.compacted_h.keys)
81
+ raise Chronicle::ETL::ConfigurationError, "Missing options: #{missing}" if missing.count.positive?
82
+ end
83
+
84
+ def coerced_value(setting, value)
85
+ setting.type ? __send__("coerce_#{setting.type}", value) : value
86
+ end
87
+
88
+ def coerce_string(value)
89
+ value.to_s
90
+ end
91
+
92
+ def coerce_time(value)
93
+ # TODO: handle durations like '3h'
94
+ if value.is_a?(String)
95
+ Time.parse(value)
96
+ else
97
+ value
98
+ end
99
+ end
100
+ end
101
+
102
+ # Class methods for classes that have Configurable mixed in
103
+ module ClassMethods
104
+ # Macro for creating a setting on a class {::Chronicle::ETL::Configurable}
105
+ #
106
+ # @param [Symbol] name Name of the setting
107
+ # @param [Boolean] required whether setting is required
108
+ # @param [Object] default Default value
109
+ # @param [Symbol] type Type
110
+ #
111
+ # @example Basic usage
112
+ # setting :when, type: :date, required: true
113
+ #
114
+ # @see ::Chronicle::ETL::Configurable
115
+ def setting(name, default: nil, required: false, type: nil)
116
+ s = Setting.new(default, required, type)
117
+ settings[name] = s
118
+ end
119
+
120
+ # Collect all settings defined on this class and its ancestors (that
121
+ # have Configurable mixin included)
122
+ def all_settings
123
+ if superclass.include?(Chronicle::ETL::Configurable)
124
+ superclass.all_settings.merge(settings)
125
+ else
126
+ settings
127
+ end
128
+ end
129
+
130
+ # Filters settings to those that are required.
131
+ def all_required_settings
132
+ all_settings.select { |_name, setting| setting.required } || {}
133
+ end
134
+
135
+ def settings
136
+ @settings ||= {}
137
+ end
138
+
139
+ def setting_exists?(name)
140
+ all_settings.keys.include? name
141
+ end
142
+
143
+ def config_with_defaults
144
+ s = all_settings.transform_values(&:default)
145
+ Config.new(s)
146
+ end
147
+ end
148
+ end
149
+ end
150
+ end
@@ -2,6 +2,8 @@ module Chronicle
2
2
  module ETL
3
3
  class Error < StandardError; end;
4
4
 
5
+ class ConfigurationError < Error; end;
6
+
5
7
  class RunnerTypeError < Error; end
6
8
 
7
9
  class ConnectorNotAvailableError < Error
@@ -2,21 +2,15 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvExtractor < Chronicle::ETL::Extractor
5
+ class CSVExtractor < Chronicle::ETL::Extractor
6
6
  include Extractors::Helpers::FilesystemReader
7
7
 
8
8
  register_connector do |r|
9
9
  r.description = 'input as CSV'
10
10
  end
11
11
 
12
- DEFAULT_OPTIONS = {
13
- headers: true,
14
- filename: $stdin
15
- }.freeze
16
-
17
- def initialize(options = {})
18
- super(DEFAULT_OPTIONS.merge(options))
19
- end
12
+ setting :headers, default: true
13
+ setting :filename, default: $stdin
20
14
 
21
15
  def extract
22
16
  csv = initialize_csv
@@ -26,20 +20,20 @@ module Chronicle
26
20
  end
27
21
 
28
22
  def results_count
29
- CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
23
+ CSV.read(@config.filename, headers: @config.headers).count unless stdin?(@config.filename)
30
24
  end
31
25
 
32
26
  private
33
27
 
34
28
  def initialize_csv
35
- headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
29
+ headers = @config.headers.is_a?(String) ? @config.headers.split(',') : @config.headers
36
30
 
37
31
  csv_options = {
38
32
  headers: headers,
39
33
  converters: :all
40
34
  }
41
35
 
42
- open_from_filesystem(filename: @options[:filename]) do |file|
36
+ open_from_filesystem(filename: @config.filename) do |file|
43
37
  return CSV.new(file, **csv_options)
44
38
  end
45
39
  end
@@ -5,15 +5,20 @@ module Chronicle
5
5
  # Abstract class representing an Extractor for an ETL job
6
6
  class Extractor
7
7
  extend Chronicle::ETL::Registry::SelfRegistering
8
+ include Chronicle::ETL::Configurable
9
+
10
+ setting :since, type: :date
11
+ setting :until, type: :date
12
+ setting :limit
13
+ setting :load_after_id
14
+ setting :filename
8
15
 
9
16
  # Construct a new instance of this extractor. Options are passed in from a Runner
10
- # == Paramters:
17
+ # == Parameters:
11
18
  # options::
12
19
  # Options for configuring this Extractor
13
20
  def initialize(options = {})
14
- @options = options.transform_keys!(&:to_sym)
15
- sanitize_options
16
- handle_continuation
21
+ apply_options(options)
17
22
  end
18
23
 
19
24
  # Hook called before #extract. Useful for gathering data, initializing proxies, etc
@@ -30,17 +35,13 @@ module Chronicle
30
35
 
31
36
  private
32
37
 
33
- def sanitize_options
34
- @options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
35
- @options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
36
- end
37
-
38
- def handle_continuation
39
- return unless @options[:continuation]
38
+ # TODO: reimplement this
39
+ # def handle_continuation
40
+ # return unless @config.continuation
40
41
 
41
- @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
42
- @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
43
- end
42
+ # @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
43
+ # @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
44
+ # end
44
45
  end
45
46
  end
46
47
  end
@@ -9,6 +9,9 @@ module Chronicle
9
9
  r.description = 'file or directory of files'
10
10
  end
11
11
 
12
+ # TODO: consolidate this with @config.filename
13
+ setting :dir_glob_pattern
14
+
12
15
  def extract
13
16
  filenames.each do |filename|
14
17
  yield Chronicle::ETL::Extraction.new(data: filename)
@@ -23,10 +26,10 @@ module Chronicle
23
26
 
24
27
  def filenames
25
28
  @filenames ||= filenames_in_directory(
26
- path: @options[:filename],
27
- dir_glob_pattern: @options[:dir_glob_pattern],
28
- load_since: @options[:load_since],
29
- load_until: @options[:load_until]
29
+ path: @config.filename,
30
+ dir_glob_pattern: @config.dir_glob_pattern,
31
+ load_since: @config.since,
32
+ load_until: @config.until
30
33
  )
31
34
  end
32
35
  end
@@ -7,16 +7,8 @@ module Chronicle
7
7
  r.description = 'input as JSON'
8
8
  end
9
9
 
10
- DEFAULT_OPTIONS = {
11
- filename: $stdin,
12
-
13
- # We're expecting line-separated json objects
14
- jsonl: true
15
- }.freeze
16
-
17
- def initialize(options = {})
18
- super(DEFAULT_OPTIONS.merge(options))
19
- end
10
+ setting :filename, default: $stdin
11
+ setting :jsonl, default: true
20
12
 
21
13
  def extract
22
14
  load_input do |input|
@@ -35,7 +35,7 @@ module Chronicle
35
35
 
36
36
  def instantiate_transformer(extraction)
37
37
  @transformer_klass = @job_definition.transformer_klass
38
- @transformer_klass.new(@transformer_options, extraction)
38
+ @transformer_klass.new(extraction, @transformer_options)
39
39
  end
40
40
 
41
41
  def instantiate_loader
@@ -2,7 +2,7 @@ require 'csv'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
5
- class CsvLoader < Chronicle::ETL::Loader
5
+ class CSVLoader < Chronicle::ETL::Loader
6
6
  register_connector do |r|
7
7
  r.description = 'CSV'
8
8
  end
@@ -3,13 +3,16 @@ module Chronicle
3
3
  # Abstract class representing a Loader for an ETL job
4
4
  class Loader
5
5
  extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
7
+
8
+ setting :output
6
9
 
7
10
  # Construct a new instance of this loader. Options are passed in from a Runner
8
11
  # == Parameters:
9
12
  # options::
10
13
  # Options for configuring this Loader
11
14
  def initialize(options = {})
12
- @options = options
15
+ apply_options(options)
13
16
  end
14
17
 
15
18
  # Called once before processing records
@@ -9,19 +9,19 @@ module Chronicle
9
9
  r.description = 'a REST endpoint'
10
10
  end
11
11
 
12
- def initialize( options={} )
13
- super(options)
14
- end
12
+ setting :hostname, required: true
13
+ setting :endpoint, required: true
14
+ setting :access_token
15
15
 
16
16
  def load(record)
17
17
  payload = Chronicle::ETL::JSONAPISerializer.serialize(record)
18
18
  # have the outer data key that json-api expects
19
19
  payload = { data: payload } unless payload[:data]
20
20
 
21
- uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
21
+ uri = URI.parse("#{@config.hostname}#{@config.endpoint}")
22
22
 
23
23
  header = {
24
- "Authorization" => "Bearer #{@options[:access_token]}",
24
+ "Authorization" => "Bearer #{@config.access_token}",
25
25
  "Content-Type": 'application/json'
26
26
  }
27
27
  use_ssl = uri.scheme == 'https'
@@ -9,20 +9,14 @@ module Chronicle
9
9
  r.description = 'an ASCII table'
10
10
  end
11
11
 
12
- DEFAULT_OPTIONS = {
13
- fields_limit: nil,
14
- fields_exclude: ['lids', 'type'],
15
- fields_include: [],
16
- truncate_values_at: nil,
17
- table_renderer: :basic
18
- }.freeze
19
-
20
- def initialize(options={})
21
- @options = options.reverse_merge(DEFAULT_OPTIONS)
22
- @records = []
23
- end
12
+ setting :fields_limit, default: nil
13
+ setting :fields_exclude, default: ['lids', 'type']
14
+ setting :fields_include, default: []
15
+ setting :truncate_values_at, default: 40
16
+ setting :table_renderer, default: :basic
24
17
 
25
18
  def load(record)
19
+ @records ||= []
26
20
  @records << record.to_h_flattened
27
21
  end
28
22
 
@@ -34,7 +28,7 @@ module Chronicle
34
28
 
35
29
  @table = TTY::Table.new(header: headers, rows: rows)
36
30
  puts @table.render(
37
- @options[:table_renderer].to_sym,
31
+ @config.table_renderer.to_sym,
38
32
  padding: [0, 2, 0, 0]
39
33
  )
40
34
  end
@@ -43,15 +37,15 @@ module Chronicle
43
37
 
44
38
  def build_headers(records)
45
39
  headers =
46
- if @options[:fields_include].any?
47
- Set[*@options[:fields_include]]
40
+ if @config.fields_include.any?
41
+ Set[*@config.fields_include]
48
42
  else
49
43
  # use all the keys of the flattened record hash
50
44
  Set[*records.map(&:keys).flatten.map(&:to_s).uniq]
51
45
  end
52
46
 
53
- headers = headers.delete_if { |header| header.end_with?(*@options[:fields_exclude]) } if @options[:fields_exclude].any?
54
- headers = headers.first(@options[:fields_limit]) if @options[:fields_limit]
47
+ headers = headers.delete_if { |header| header.end_with?(*@config.fields_exclude) } if @config.fields_exclude.any?
48
+ headers = headers.first(@config.fields_limit) if @config.fields_limit
55
49
 
56
50
  headers.to_a.map(&:to_sym)
57
51
  end
@@ -60,8 +54,8 @@ module Chronicle
60
54
  records.map do |record|
61
55
  values = record.values_at(*headers).map{|value| value.to_s }
62
56
 
63
- if @options[:truncate_values_at]
64
- values = values.map{ |value| value.truncate(@options[:truncate_values_at]) }
57
+ if @config.truncate_values_at
58
+ values = values.map{ |value| value.truncate(@config.truncate_values_at) }
65
59
  end
66
60
 
67
61
  values
@@ -3,6 +3,7 @@ module Chronicle
3
3
  module Registry
4
4
  # Records details about a connector such as its provider and a description
5
5
  class ConnectorRegistration
6
+ # FIXME: refactor custom accessor methods later in file
6
7
  attr_accessor :identifier, :provider, :klass, :description
7
8
 
8
9
  def initialize(klass)
@@ -19,20 +19,14 @@ module Chronicle
19
19
  r.description = 'an image file'
20
20
  end
21
21
 
22
- DEFAULT_OPTIONS = {
23
- timestamp_strategy: 'file_mtime',
24
- id_strategy: 'file_hash',
25
- verb: 'photographed',
26
-
27
- # EXIF tags often don't have timezones
28
- timezone_default: 'Eastern Time (US & Canada)',
29
- include_image_data: true
30
- }.freeze
31
-
32
- def initialize(*args)
33
- super(*args)
34
- @options = @options.reverse_merge(DEFAULT_OPTIONS)
35
- end
22
+ setting :timestamp_strategy, default: 'file_mtime'
23
+ setting :id_strategy, default: 'file_hash'
24
+ setting :verb, default: 'photographed'
25
+ # EXIF tags often don't have timezones
26
+ setting :timezone_default, default: 'Eastern Time (US & Canada)'
27
+ setting :include_image_data, default: true
28
+ setting :actor
29
+ setting :involved
36
30
 
37
31
  def transform
38
32
  # FIXME: set @filename; use block for reading file when necessary
@@ -48,7 +42,7 @@ module Chronicle
48
42
 
49
43
  def id
50
44
  @id ||= begin
51
- id = build_with_strategy(field: :id, strategy: @options[:id_strategy])
45
+ id = build_with_strategy(field: :id, strategy: @config.id_strategy)
52
46
  raise UntransformableRecordError.new("Could not build id", transformation: self) unless id
53
47
 
54
48
  id
@@ -57,7 +51,7 @@ module Chronicle
57
51
 
58
52
  def timestamp
59
53
  @timestamp ||= begin
60
- ts = build_with_strategy(field: :timestamp, strategy: @options[:timestamp_strategy])
54
+ ts = build_with_strategy(field: :timestamp, strategy: @config.timestamp_strategy)
61
55
  raise UntransformableRecordError.new("Could not build timestamp", transformation: self) unless ts
62
56
 
63
57
  ts
@@ -68,8 +62,8 @@ module Chronicle
68
62
 
69
63
  def build_created(file)
70
64
  record = ::Chronicle::ETL::Models::Activity.new
71
- record.verb = @options[:verb]
72
- record.provider = @options[:provider]
65
+ record.verb = @config.verb
66
+ record.provider = @config.provider
73
67
  record.provider_id = id
74
68
  record.end_at = timestamp
75
69
  record.dedupe_on = [[:provider_id, :verb, :provider]]
@@ -84,24 +78,24 @@ module Chronicle
84
78
  def build_actor
85
79
  actor = ::Chronicle::ETL::Models::Entity.new
86
80
  actor.represents = 'identity'
87
- actor.provider = @options[:actor][:provider]
88
- actor.slug = @options[:actor][:slug]
81
+ actor.provider = @config.actor[:provider]
82
+ actor.slug = @config.actor[:slug]
89
83
  actor.dedupe_on = [[:provider, :slug, :represents]]
90
84
  actor
91
85
  end
92
86
 
93
87
  def build_image
94
88
  image = ::Chronicle::ETL::Models::Entity.new
95
- image.represents = @options[:involved][:represents]
89
+ image.represents = @config.involved[:represents]
96
90
  image.title = build_title
97
91
  image.body = exif['Description']
98
- image.provider = @options[:involved][:provider]
92
+ image.provider = @config.involved[:provider]
99
93
  image.provider_id = id
100
94
  image.assign_attributes(build_gps)
101
95
  image.dedupe_on = [[:provider, :provider_id, :represents]]
102
96
 
103
- if @options[:ocr_strategy]
104
- ocr_text = build_with_strategy(field: :ocr, strategy: @options[:ocr_strategy])
97
+ if @config.ocr_strategy
98
+ ocr_text = build_with_strategy(field: :ocr, strategy: @config.ocr_strategy)
105
99
  image.metadata[:ocr_text] = ocr_text if ocr_text
106
100
  end
107
101
 
@@ -111,7 +105,7 @@ module Chronicle
111
105
  image.depicts = build_people_depicted(names)
112
106
  image.abouts = build_keywords(tags)
113
107
 
114
- if @options[:include_image_data]
108
+ if @config.include_image_data
115
109
  attachment = ::Chronicle::ETL::Models::Attachment.new
116
110
  attachment.data = build_image_data
117
111
  image.attachments = [attachment]
@@ -124,7 +118,7 @@ module Chronicle
124
118
  topics.map do |topic|
125
119
  t = ::Chronicle::ETL::Models::Entity.new
126
120
  t.represents = 'topic'
127
- t.provider = @options[:involved][:provider]
121
+ t.provider = @config.involved[:provider]
128
122
  t.title = topic
129
123
  t.slug = topic.parameterize
130
124
  t.dedupe_on = [[:provider, :represents, :slug]]
@@ -136,7 +130,7 @@ module Chronicle
136
130
  names.map do |name|
137
131
  identity = ::Chronicle::ETL::Models::Entity.new
138
132
  identity.represents = 'identity'
139
- identity.provider = @options[:involved][:provider]
133
+ identity.provider = @config.involved[:provider]
140
134
  identity.slug = name.parameterize
141
135
  identity.title = name
142
136
  identity.dedupe_on = [[:provider, :represents, :slug]]
@@ -199,7 +193,7 @@ module Chronicle
199
193
  elsif false
200
194
  # TODO: support option of using GPS coordinates to determine timezone
201
195
  else
202
- zone = ActiveSupport::TimeZone.new(@options[:timezone_default])
196
+ zone = ActiveSupport::TimeZone.new(@config.timezone_default)
203
197
  timestamp = zone.parse(timestamp.asctime)
204
198
  end
205
199
 
@@ -3,14 +3,15 @@ module Chronicle
3
3
  # Abstract class representing an Transformer for an ETL job
4
4
  class Transformer
5
5
  extend Chronicle::ETL::Registry::SelfRegistering
6
+ include Chronicle::ETL::Configurable
6
7
 
7
8
  # Construct a new instance of this transformer. Options are passed in from a Runner
8
9
  # == Parameters:
9
10
  # options::
10
11
  # Options for configuring this Transformer
11
- def initialize(options = {}, extraction)
12
- @options = options
12
+ def initialize(extraction, options = {})
13
13
  @extraction = extraction
14
+ apply_options(options)
14
15
  end
15
16
 
16
17
  # @abstract Subclass is expected to implement #transform
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.3.1"
3
+ VERSION = "0.4.0"
4
4
  end
5
5
  end
data/lib/chronicle/etl.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require_relative 'etl/registry/registry'
2
2
  require_relative 'etl/config'
3
+ require_relative 'etl/configurable'
3
4
  require_relative 'etl/exceptions'
4
5
  require_relative 'etl/extraction'
5
6
  require_relative 'etl/extractors/extractor'
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chronicle-etl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Louis
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-02-10 00:00:00.000000000 Z
11
+ date: 2022-02-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '7.0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '7.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: chronic_duration
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -98,16 +98,16 @@ dependencies:
98
98
  name: runcom
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - "~>"
101
+ - - ">="
102
102
  - !ruby/object:Gem::Version
103
- version: '6.2'
103
+ version: '6.0'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - "~>"
108
+ - - ">="
109
109
  - !ruby/object:Gem::Version
110
- version: '6.2'
110
+ version: '6.0'
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: sequel
113
113
  requirement: !ruby/object:Gem::Requirement
@@ -142,14 +142,14 @@ dependencies:
142
142
  requirements:
143
143
  - - "~>"
144
144
  - !ruby/object:Gem::Version
145
- version: '0.20'
145
+ version: '1.2'
146
146
  type: :runtime
147
147
  prerelease: false
148
148
  version_requirements: !ruby/object:Gem::Requirement
149
149
  requirements:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
- version: '0.20'
152
+ version: '1.2'
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: tty-progressbar
155
155
  requirement: !ruby/object:Gem::Requirement
@@ -234,6 +234,62 @@ dependencies:
234
234
  - - "~>"
235
235
  - !ruby/object:Gem::Version
236
236
  version: '3.9'
237
+ - !ruby/object:Gem::Dependency
238
+ name: simplecov
239
+ requirement: !ruby/object:Gem::Requirement
240
+ requirements:
241
+ - - "~>"
242
+ - !ruby/object:Gem::Version
243
+ version: '0.21'
244
+ type: :development
245
+ prerelease: false
246
+ version_requirements: !ruby/object:Gem::Requirement
247
+ requirements:
248
+ - - "~>"
249
+ - !ruby/object:Gem::Version
250
+ version: '0.21'
251
+ - !ruby/object:Gem::Dependency
252
+ name: guard-rspec
253
+ requirement: !ruby/object:Gem::Requirement
254
+ requirements:
255
+ - - "~>"
256
+ - !ruby/object:Gem::Version
257
+ version: 4.7.3
258
+ type: :development
259
+ prerelease: false
260
+ version_requirements: !ruby/object:Gem::Requirement
261
+ requirements:
262
+ - - "~>"
263
+ - !ruby/object:Gem::Version
264
+ version: 4.7.3
265
+ - !ruby/object:Gem::Dependency
266
+ name: yard
267
+ requirement: !ruby/object:Gem::Requirement
268
+ requirements:
269
+ - - "~>"
270
+ - !ruby/object:Gem::Version
271
+ version: 0.9.7
272
+ type: :development
273
+ prerelease: false
274
+ version_requirements: !ruby/object:Gem::Requirement
275
+ requirements:
276
+ - - "~>"
277
+ - !ruby/object:Gem::Version
278
+ version: 0.9.7
279
+ - !ruby/object:Gem::Dependency
280
+ name: rubocop
281
+ requirement: !ruby/object:Gem::Requirement
282
+ requirements:
283
+ - - "~>"
284
+ - !ruby/object:Gem::Version
285
+ version: 1.25.1
286
+ type: :development
287
+ prerelease: false
288
+ version_requirements: !ruby/object:Gem::Requirement
289
+ requirements:
290
+ - - "~>"
291
+ - !ruby/object:Gem::Version
292
+ version: 1.25.1
237
293
  description: Chronicle-ETL allows you to extract personal data from a variety of services,
238
294
  transformer it, and load it.
239
295
  email:
@@ -243,14 +299,15 @@ executables:
243
299
  extensions: []
244
300
  extra_rdoc_files: []
245
301
  files:
302
+ - ".github/workflows/ruby.yml"
246
303
  - ".gitignore"
247
304
  - ".rspec"
248
305
  - ".rubocop.yml"
249
- - ".ruby-version"
250
306
  - ".travis.yml"
251
307
  - ".yardopts"
252
308
  - CODE_OF_CONDUCT.md
253
309
  - Gemfile
310
+ - Guardfile
254
311
  - LICENSE.txt
255
312
  - README.md
256
313
  - Rakefile
@@ -259,11 +316,13 @@ files:
259
316
  - chronicle-etl.gemspec
260
317
  - exe/chronicle-etl
261
318
  - lib/chronicle/etl.rb
319
+ - lib/chronicle/etl/cli.rb
262
320
  - lib/chronicle/etl/cli/connectors.rb
263
321
  - lib/chronicle/etl/cli/jobs.rb
264
322
  - lib/chronicle/etl/cli/main.rb
265
323
  - lib/chronicle/etl/cli/subcommand_base.rb
266
324
  - lib/chronicle/etl/config.rb
325
+ - lib/chronicle/etl/configurable.rb
267
326
  - lib/chronicle/etl/exceptions.rb
268
327
  - lib/chronicle/etl/extraction.rb
269
328
  - lib/chronicle/etl/extractors/csv_extractor.rb
@@ -317,14 +376,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
317
376
  requirements:
318
377
  - - ">="
319
378
  - !ruby/object:Gem::Version
320
- version: '0'
379
+ version: '2.7'
321
380
  required_rubygems_version: !ruby/object:Gem::Requirement
322
381
  requirements:
323
382
  - - ">="
324
383
  - !ruby/object:Gem::Version
325
384
  version: '0'
326
385
  requirements: []
327
- rubygems_version: 3.1.2
386
+ rubygems_version: 3.1.6
328
387
  signing_key:
329
388
  specification_version: 4
330
389
  summary: ETL tool for personal data
data/.ruby-version DELETED
@@ -1 +0,0 @@
1
- 2.7.1