chronicle-etl 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +22 -15
  5. data/chronicle-etl.gemspec +11 -5
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +38 -27
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +24 -3
  10. data/lib/chronicle/etl/exceptions.rb +30 -0
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
  13. data/lib/chronicle/etl/extractors/extractor.rb +19 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +72 -0
  19. data/lib/chronicle/etl/job_definition.rb +89 -0
  20. data/lib/chronicle/etl/job_log.rb +95 -0
  21. data/lib/chronicle/etl/job_logger.rb +81 -0
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
  23. data/lib/chronicle/etl/loaders/loader.rb +2 -2
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
  26. data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/activity.rb +15 -0
  29. data/lib/chronicle/etl/models/attachment.rb +14 -0
  30. data/lib/chronicle/etl/models/base.rb +119 -0
  31. data/lib/chronicle/etl/models/entity.rb +21 -0
  32. data/lib/chronicle/etl/models/generic.rb +23 -0
  33. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  34. data/lib/chronicle/etl/registry/registry.rb +52 -0
  35. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  36. data/lib/chronicle/etl/runner.rb +70 -42
  37. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  38. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  39. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  40. data/lib/chronicle/etl/transformers/null_transformer.rb +12 -4
  41. data/lib/chronicle/etl/transformers/transformer.rb +42 -12
  42. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  43. data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
  44. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  45. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  46. data/lib/chronicle/etl/version.rb +1 -1
  47. data/lib/chronicle/etl.rb +17 -1
  48. metadata +138 -35
  49. data/CHANGELOG.md +0 -23
  50. data/Gemfile.lock +0 -85
  51. data/lib/chronicle/etl/catalog.rb +0 -62
  52. data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3d9be4b073385d9758a8b709bb6726d8dae648b8c4dbef474343840c011d1178
4
- data.tar.gz: 0d95395407d37f7e322287a0920bba60e1b4f81eb8649190d021e13580604a2d
3
+ metadata.gz: bfd4e081bfeda7d097a5a5eee6ccf28baf0a9b3878968d74c9a604013d0b55a6
4
+ data.tar.gz: 003ebd2ffe2b1220c7f43a4875043cb5500aa1b7a6327b84c9be10f04e0e8d40
5
5
  SHA512:
6
- metadata.gz: 386c96518aa2d2810ae2a93bbe3af5bb08e26e132608b4e6ed8a278da076783e453854a2120c5016b6d02cd5dea406146d10ef3c7c1e77d854acd8ff2608eaf7
7
- data.tar.gz: eb14402be5d6db44a6f06e6ec930acc5103b36e5c2e5a13e89137c9ee45f5f11c1e9e6ab13d4d44e6ee06bd9b02309e02ac33e81256645ef71d5b431c97eb199
6
+ metadata.gz: 3d786fb4acf8d0b03e65262209def310ca25b92646847f6e96791e6491e9b159ab11db7fa35f785f6782fbc0b9e3daebb625e2353fce2422f7fc79aed7a4d6bc
7
+ data.tar.gz: 87771745b9df2160966299f1d73eb568b46080ed217a1952e1dd938fff7758432f2cc6f449036d24951660b405e18083f1adac26d00243a0fab9003a96eb569d
data/.gitignore CHANGED
@@ -7,6 +7,9 @@
7
7
  /spec/reports/
8
8
  /tmp/
9
9
 
10
+ # https://yehudakatz.com/2010/12/16/clarifying-the-roles-of-the-gemspec-and-gemfile/
11
+ Gemfile.lock
12
+
10
13
  # rspec failure tracking
11
14
  .rspec_status
12
15
  .DS_Store
data/.rubocop.yml CHANGED
@@ -5,4 +5,7 @@ Style/StringLiterals:
5
5
  Enabled: false
6
6
 
7
7
  Style/MethodCallWithArgsParentheses:
8
+ Enabled: false
9
+
10
+ Lint/ConstantResolution:
8
11
  Enabled: false
data/README.md CHANGED
@@ -2,9 +2,9 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl)
4
4
 
5
- Chronicle ETL is a utility tool for archiving and processing personal data. You can extract it from a variety of source, transform it, and load it to different APIs or file formats.
5
+ Chronicle ETL is a utility that helps you archive and process personal data. You can *extract* it from a variety of sources, *transform* it, and *load* it to an external API, file, or stdout.
6
6
 
7
- This project is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex).
7
+ This tool is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex) and the dozens of existing importers are being migrated to Chronicle.
8
8
 
9
9
  ## Installation
10
10
 
@@ -31,6 +31,9 @@ Connectors are available to read, process, and load data from different formats
31
31
  ```bash
32
32
  # List all available connectors
33
33
  $ chronicle-etl connectors:list
34
+
35
+ # Install a connector
36
+ $ chronicle-etl connectors:install imessage
34
37
  ```
35
38
 
36
39
  Built in connectors:
@@ -44,16 +47,18 @@ Built in connectors:
44
47
  - `null` - (default) Don't do anything
45
48
 
46
49
  ### Loaders
47
- - `stdout` - (default) output transformed records to stdount
50
+ - `stdout` - (default) output records to stdout serialized as JSON
48
51
  - `csv` - Load records to a csv file
52
+ - `rest` - Serialize records with [JSONAPI](https://jsonapi.org/) and send to a REST API
49
53
  - `table` - Output an ascii table of records. Useful for debugging.
50
54
 
51
55
  ### Provider-specific importers
52
56
 
53
57
  In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
54
58
 
55
- - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` files. Transformers for chronicle schema
56
- - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`. Transform it for chronicle schema
59
+ - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
60
+ - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`
61
+ - [imessage](https://github.com/chronicle-app/chronicle-imessage). Extract iMessage messages from a local macOS installation
57
62
 
58
63
  To install any of these, run `gem install chronicle-PROVIDER`.
59
64
 
@@ -61,7 +66,7 @@ If you don't want to use the available rubygem importers, `chronicle-etl` can us
61
66
 
62
67
  I'll be open-sourcing more importers. Please [contact me](mailto:andrew@hyfen.net) to chat about what will be available!
63
68
 
64
- ### Full commands
69
+ ## Full commands
65
70
 
66
71
  ```
67
72
  $ chronicle-etl help
@@ -75,26 +80,28 @@ ALL COMMANDS
75
80
  jobs:create # Create a job
76
81
  jobs:list # List all available jobs
77
82
  jobs:run # Start a job
78
- jobs:show # Show a job
83
+ jobs:show # Show details about a job
79
84
  ```
80
85
 
81
- ### Job options
86
+ ### Running a job
82
87
 
83
88
  ```
84
89
  Usage:
85
90
  chronicle-etl jobs:run
86
91
 
87
92
  Options:
88
- -e, [--extractor=extractor-name] # Extractor class (available: stdin, csv, file)
89
- # Default: stdin
93
+ [--log-level=LOG_LEVEL] # Log level (debug, info, warn, error, fatal)
94
+ # Default: info
95
+ -v, [--verbose], [--no-verbose] # Set log level to verbose
96
+ [--dry-run], [--no-dry-run] # Only run the extraction and transform steps, not the loading
97
+ -e, [--extractor=extractor-name] # Extractor class. Default: stdin
90
98
  [--extractor-opts=key:value] # Extractor options
91
- -t, [--transformer=transformer-name] # Transformer class (available: null)
92
- # Default: null
99
+ -t, [--transformer=transformer-name] # Transformer class. Default: null
93
100
  [--transformer-opts=key:value] # Transformer options
94
- -l, [--loader=loader-name] # Loader class (available: stdout, csv, table)
95
- # Default: stdout
101
+ -l, [--loader=loader-name] # Loader class. Default: stdout
96
102
  [--loader-opts=key:value] # Loader options
97
- -j, [--job=JOB] # Job configuration file
103
+ -j, [--name=NAME] # Job configuration name
104
+
98
105
 
99
106
  Runs an ETL job
100
107
  ```
@@ -36,15 +36,21 @@ Gem::Specification.new do |spec|
36
36
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
37
37
  spec.require_paths = ["lib"]
38
38
 
39
- spec.add_dependency "thor", "~> 0.20"
39
+ spec.add_dependency "activesupport"
40
+ spec.add_dependency "chronic_duration", "~> 0.10.6"
40
41
  spec.add_dependency "colorize", "~> 0.8.1"
41
- spec.add_dependency "tty-table", "~> 0.11"
42
+ spec.add_dependency "marcel", "~> 1.0.2"
43
+ spec.add_dependency "mini_exiftool", "~> 2.10"
44
+ spec.add_dependency "nokogiri", "~> 1.13"
45
+ spec.add_dependency "runcom", "~> 6.2"
46
+ spec.add_dependency "sequel", "~> 5.35"
47
+ spec.add_dependency "sqlite3", "~> 1.4"
48
+ spec.add_dependency "thor", "~> 0.20"
42
49
  spec.add_dependency "tty-progressbar", "~> 0.17"
50
+ spec.add_dependency "tty-table", "~> 0.11"
43
51
 
44
52
  spec.add_development_dependency "bundler", "~> 2.1"
53
+ spec.add_development_dependency "pry-byebug", "~> 3.9"
45
54
  spec.add_development_dependency "rake", "~> 13.0"
46
55
  spec.add_development_dependency "rspec", "~> 3.9"
47
- spec.add_development_dependency "pry-byebug", "~> 3.9"
48
- spec.add_development_dependency 'runcom', '~> 6.2'
49
- spec.add_development_dependency 'redcarpet', '~> 3.5'
50
56
  end
@@ -7,23 +7,35 @@ module Chronicle
7
7
  namespace :connectors
8
8
 
9
9
  desc "install NAME", "Installs connector NAME"
10
- def install
11
- puts "Installing"
10
+ def install(name)
11
+ Chronicle::ETL::Registry.install_connector(name)
12
12
  end
13
13
 
14
14
  desc "list", "Lists available connectors"
15
15
  # Display all available connectors that chronicle-etl has access to
16
16
  def list
17
- klasses = Chronicle::ETL::Catalog.available_classes
18
- klasses = klasses.sort_by do |a|
19
- [a[:built_in].to_s, a[:provider], a[:phase]]
17
+ Chronicle::ETL::Registry.load_all!
18
+
19
+ connector_info = Chronicle::ETL::Registry.connectors.map do |connector_registration|
20
+ {
21
+ identifier: connector_registration.identifier,
22
+ phase: connector_registration.phase,
23
+ description: connector_registration.descriptive_phrase,
24
+ provider: connector_registration.provider,
25
+ core: connector_registration.built_in? ? '✓' : '',
26
+ class: connector_registration.klass_name
27
+ }
28
+ end
29
+
30
+ connector_info = connector_info.sort_by do |a|
31
+ [a[:core].to_s, a[:provider], a[:phase], a[:identifier]]
20
32
  end
21
33
 
22
- headers = klasses.first.keys.map do |key|
34
+ headers = connector_info.first.keys.map do |key|
23
35
  key.to_s.upcase.bold
24
36
  end
25
37
 
26
- table = TTY::Table.new(headers, klasses.map(&:values))
38
+ table = TTY::Table.new(headers, connector_info.map(&:values))
27
39
  puts table.render(indent: 0, padding: [0, 2])
28
40
  end
29
41
  end
@@ -1,6 +1,4 @@
1
1
  require 'pp'
2
- require 'pry'
3
-
4
2
  module Chronicle
5
3
  module ETL
6
4
  module CLI
@@ -9,16 +7,19 @@ module Chronicle
9
7
  default_task "start"
10
8
  namespace :jobs
11
9
 
12
- class_option :extractor, aliases: '-e', desc: 'Extractor class (available: stdin, csv, file)', default: 'stdin', banner: 'extractor-name'
10
+ class_option :extractor, aliases: '-e', desc: "Extractor class. Default: stdin", banner: 'extractor-name'
13
11
  class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
14
- class_option :transformer, aliases: '-t', desc: 'Transformer class (available: null)', default: 'null', banner: 'transformer-name'
12
+ class_option :transformer, aliases: '-t', desc: 'Transformer class. Default: null', banner: 'transformer-name'
15
13
  class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
16
- class_option :loader, aliases: '-l', desc: 'Loader class (available: stdout, csv, table)', default: 'stdout', banner: 'loader-name'
14
+ class_option :loader, aliases: '-l', desc: 'Loader class. Default: stdout', banner: 'loader-name'
17
15
  class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
18
- class_option :job, aliases: '-j', desc: 'Job configuration name (or filename)'
16
+ class_option :name, aliases: '-j', desc: 'Job configuration name'
19
17
 
20
18
  map run: :start # Thor doesn't like `run` as a command name
21
19
  desc "run", "Start a job"
20
+ option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
21
+ option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
22
+ option :dry_run, desc: 'Only run the extraction and transform steps, not the loading', type: :boolean
22
23
  long_desc <<-LONG_DESC
23
24
  This will run an ETL job. Each job needs three parts:
24
25
 
@@ -26,36 +27,37 @@ module Chronicle
26
27
 
27
28
  2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.
28
29
 
29
- 3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.
30
+ 3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout. With the --dry-run option, this step won't be run.
30
31
 
31
32
  If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
32
33
  LONG_DESC
33
34
  # Run an ETL job
34
35
  def start
35
- runner_options = build_runner_options(options)
36
- runner = Chronicle::ETL::Runner.new(runner_options)
36
+ setup_log_level
37
+ job_definition = build_job_definition(options)
38
+ job = Chronicle::ETL::Job.new(job_definition)
39
+ runner = Chronicle::ETL::Runner.new(job)
37
40
  runner.run!
38
41
  end
39
42
 
40
43
  desc "create", "Create a job"
41
44
  # Create an ETL job
42
45
  def create
43
- runner_options = build_runner_options(options)
44
- path = File.join('chronicle', 'etl', 'jobs', options[:job])
45
- Chronicle::ETL::Config.write(path, runner_options)
46
+ job_definition = build_job_definition(options)
47
+ path = File.join('chronicle', 'etl', 'jobs', options[:name])
48
+ Chronicle::ETL::Config.write(path, job_definition.definition)
46
49
  end
47
50
 
48
51
  desc "show", "Show details about a job"
49
52
  # Show an ETL job
50
53
  def show
51
- runner_options = build_runner_options(options)
52
- pp runner_options
54
+ puts Chronicle::ETL::Job.new(build_job_definition(options))
53
55
  end
54
56
 
55
57
  desc "list", "List all available jobs"
56
58
  # List available ETL jobs
57
59
  def list
58
- jobs = Chronicle::ETL::Config.jobs
60
+ jobs = Chronicle::ETL::Config.available_jobs
59
61
 
60
62
  job_details = jobs.map do |job|
61
63
  r = Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job}.yml")
@@ -75,34 +77,43 @@ LONG_DESC
75
77
 
76
78
  private
77
79
 
78
- # Create runner options by reading config file and then overwriting with flag options
79
- def build_runner_options options
80
- flag_options = process_flag_options(options)
81
- job_options = load_job(options[:job])
82
- flag_options.merge(job_options)
80
+ def setup_log_level
81
+ if options[:verbose]
82
+ Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::DEBUG
83
+ elsif options[:log_level]
84
+ level = Chronicle::ETL::Logger.const_get(options[:log_level].upcase)
85
+ Chronicle::ETL::Logger.log_level = level
86
+ end
87
+ end
88
+
89
+ # Create job definition by reading config file and then overwriting with flag options
90
+ def build_job_definition(options)
91
+ definition = Chronicle::ETL::JobDefinition.new
92
+ definition.add_config(load_job_config(options[:name]))
93
+ definition.add_config(process_flag_options(options))
94
+ definition
83
95
  end
84
96
 
85
- def load_job job
86
- yml_config = Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job}.yml")
87
- # FIXME: use better trick to depely symbolize keys
88
- JSON.parse(yml_config.to_json, symbolize_names: true)
97
+ def load_job_config name
98
+ Chronicle::ETL::Config.load_job_from_config(name)
89
99
  end
90
100
 
91
101
  # Takes flag options and turns them into a runner config
92
102
  def process_flag_options options
93
103
  {
104
+ dry_run: options[:dry_run],
94
105
  extractor: {
95
106
  name: options[:extractor],
96
107
  options: options[:'extractor-opts']
97
- },
108
+ }.compact,
98
109
  transformer: {
99
110
  name: options[:transformer],
100
111
  options: options[:'transformer-opts']
101
- },
112
+ }.compact,
102
113
  loader: {
103
114
  name: options[:loader],
104
115
  options: options[:'loader-opts']
105
- }
116
+ }.compact
106
117
  }
107
118
  end
108
119
  end
@@ -22,6 +22,11 @@ module Chronicle
22
22
 
23
23
  # Entrypoint for the CLI
24
24
  def self.start(given_args = ARGV, config = {})
25
+ if given_args[0] == "--version"
26
+ puts "#{Chronicle::ETL::VERSION}"
27
+ exit
28
+ end
29
+
25
30
  if given_args.none?
26
31
  abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
27
32
  end
@@ -52,10 +57,10 @@ module Chronicle
52
57
  shell.say " $ chronicle-etl connectors:list"
53
58
  shell.say
54
59
  shell.say " Run a simple job:".italic.light_black
55
- shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
60
+ shell.say " $ chronicle-etl jobs:run --extractor stdin --transformer null --loader stdout"
56
61
  shell.say
57
62
  shell.say " Show full job options:".italic.light_black
58
- shell.say " $ chronicle-etl jobs help start"
63
+ shell.say " $ chronicle-etl jobs help run"
59
64
 
60
65
  list = []
61
66
 
@@ -72,6 +77,9 @@ module Chronicle
72
77
  shell.say "VERSION".bold
73
78
  shell.say " #{Chronicle::ETL::VERSION}"
74
79
  shell.say
80
+ shell.say " Display current version:".italic.light_black
81
+ shell.say " $ chronicle-etl --version"
82
+ shell.say
75
83
  shell.say "FULL DOCUMENTATION".bold
76
84
  shell.say " https://github.com/chronicle-app/chronicle-etl".blue
77
85
  shell.say
@@ -4,15 +4,17 @@ module Chronicle
4
4
  module ETL
5
5
  # Utility methods to read, write, and access config files
6
6
  module Config
7
+ module_function
8
+
7
9
  # Loads a yml config file
8
- def self.load(path)
10
+ def load(path)
9
11
  config = Runcom::Config.new(path)
10
12
  # FIXME: hack to deeply symbolize keys
11
13
  JSON.parse(config.to_h.to_json, symbolize_names: true)
12
14
  end
13
15
 
14
16
  # Writes a hash as a yml config file
15
- def self.write(path, data)
17
+ def write(path, data)
16
18
  config = Runcom::Config.new(path)
17
19
  filename = config.all[0].to_s + '.yml'
18
20
  File.open(filename, 'w') do |f|
@@ -21,12 +23,31 @@ module Chronicle
21
23
  end
22
24
 
23
25
  # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
24
- def self.jobs
26
+ def available_jobs
25
27
  job_directory = Runcom::Config.new('chronicle/etl/jobs').current
26
28
  Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
27
29
  File.basename(filename, ".*")
28
30
  end
29
31
  end
32
+
33
+ # Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
34
+ def available_credentials
35
+ job_directory = Runcom::Config.new('chronicle/etl/credentials').current
36
+ Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
37
+ File.basename(filename, ".*")
38
+ end
39
+ end
40
+
41
+ # Load a job definition from job config directory
42
+ def load_job_from_config(job_name)
43
+ definition = self.load("chronicle/etl/jobs/#{job_name}.yml")
44
+ definition[:name] = job_name
45
+ definition
46
+ end
47
+
48
+ def load_credentials(name)
49
+ config = self.load("chronicle/etl/credentials/#{name}.yml")
50
+ end
30
51
  end
31
52
  end
32
53
  end
@@ -0,0 +1,30 @@
1
+ module Chronicle
2
+ module ETL
3
+ class Error < StandardError; end;
4
+
5
+ class RunnerTypeError < Error; end
6
+
7
+ class ConnectorNotAvailableError < Error
8
+ def initialize(message, provider: nil, name: nil)
9
+ super(message)
10
+ @provider = provider
11
+ @name = name
12
+ end
13
+ attr_reader :name, :provider
14
+ end
15
+
16
+ class ProviderNotAvailableError < ConnectorNotAvailableError; end
17
+ class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
18
+
19
+ class TransformationError < Error
20
+ attr_reader :transformation
21
+
22
+ def initialize(message=nil, transformation:)
23
+ super(message)
24
+ @transformation = transformation
25
+ end
26
+ end
27
+
28
+ class UntransformableRecordError < TransformationError; end
29
+ end
30
+ end
@@ -0,0 +1,12 @@
1
+ module Chronicle
2
+ module ETL
3
+ class Extraction
4
+ attr_accessor :data, :meta
5
+
6
+ def initialize(data: {}, meta: {})
7
+ @data = data
8
+ @meta = meta
9
+ end
10
+ end
11
+ end
12
+ end
@@ -1,42 +1,48 @@
1
1
  require 'csv'
2
- class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
- DEFAULT_OPTIONS = {
4
- headers: true,
5
- filename: $stdin
6
- }.freeze
7
-
8
- def initialize(options = {})
9
- super(DEFAULT_OPTIONS.merge(options))
10
- end
11
2
 
12
- def extract
13
- csv = initialize_csv
14
- csv.each do |row|
15
- result = row.to_h
16
- yield result
3
+ module Chronicle
4
+ module ETL
5
+ class CsvExtractor < Chronicle::ETL::Extractor
6
+ include Extractors::Helpers::FilesystemReader
7
+
8
+ register_connector do |r|
9
+ r.description = 'input as CSV'
10
+ end
11
+
12
+ DEFAULT_OPTIONS = {
13
+ headers: true,
14
+ filename: $stdin
15
+ }.freeze
16
+
17
+ def initialize(options = {})
18
+ super(DEFAULT_OPTIONS.merge(options))
19
+ end
20
+
21
+ def extract
22
+ csv = initialize_csv
23
+ csv.each do |row|
24
+ yield Chronicle::ETL::Extraction.new(data: row.to_h)
25
+ end
26
+ end
27
+
28
+ def results_count
29
+ CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
30
+ end
31
+
32
+ private
33
+
34
+ def initialize_csv
35
+ headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
36
+
37
+ csv_options = {
38
+ headers: headers,
39
+ converters: :all
40
+ }
41
+
42
+ open_from_filesystem(filename: @options[:filename]) do |file|
43
+ return CSV.new(file, **csv_options)
44
+ end
45
+ end
17
46
  end
18
47
  end
19
-
20
- def results_count
21
- CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
22
- end
23
-
24
- private
25
-
26
- def initialize_csv
27
- headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
28
-
29
- csv_options = {
30
- headers: headers,
31
- header_converters: :symbol,
32
- converters: [:all]
33
- }
34
-
35
- stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
36
- CSV.new(stream, **csv_options)
37
- end
38
-
39
- def read_from_file?
40
- @options[:filename] != $stdin
41
- end
42
48
  end
@@ -4,7 +4,7 @@ module Chronicle
4
4
  module ETL
5
5
  # Abstract class representing an Extractor for an ETL job
6
6
  class Extractor
7
- extend Chronicle::ETL::Catalog
7
+ extend Chronicle::ETL::Registry::SelfRegistering
8
8
 
9
9
  # Construct a new instance of this extractor. Options are passed in from a Runner
10
10
  # == Parameters:
@@ -12,6 +12,8 @@ module Chronicle
12
12
  # Options for configuring this Extractor
13
13
  def initialize(options = {})
14
14
  @options = options.transform_keys!(&:to_sym)
15
+ sanitize_options
16
+ handle_continuation
15
17
  end
16
18
 
17
19
  # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
@@ -22,10 +24,26 @@ module Chronicle
22
24
  # An optional method to calculate how many records there are to extract. Used primarily for
23
25
  # building the progress bar
24
26
  def results_count; end
27
+
28
+ private
29
+
30
+ def sanitize_options
31
+ @options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
32
+ @options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
33
+ end
34
+
35
+ def handle_continuation
36
+ return unless @options[:continuation]
37
+
38
+ @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
39
+ @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
40
+ end
25
41
  end
26
42
  end
27
43
  end
28
44
 
45
+ require_relative 'helpers/filesystem_reader'
29
46
  require_relative 'csv_extractor'
30
47
  require_relative 'file_extractor'
48
+ require_relative 'json_extractor'
31
49
  require_relative 'stdin_extractor'
@@ -3,49 +3,31 @@ require 'pathname'
3
3
  module Chronicle
4
4
  module ETL
5
5
  class FileExtractor < Chronicle::ETL::Extractor
6
- def extract
7
- if file?
8
- extract_file do |data, metadata|
9
- yield(data, metadata)
10
- end
11
- elsif directory?
12
- extract_from_directory do |data, metadata|
13
- yield(data, metadata)
14
- end
15
- end
16
- end
6
+ include Extractors::Helpers::FilesystemReader
17
7
 
18
- def results_count
19
- if file?
20
- return 1
21
- else
22
- search_pattern = File.join(@options[:filename], '**/*.eml')
23
- Dir.glob(search_pattern).count
24
- end
8
+ register_connector do |r|
9
+ r.description = 'file or directory of files'
25
10
  end
26
11
 
27
- private
28
-
29
- def extract_from_directory
30
- search_pattern = File.join(@options[:filename], '**/*.eml')
31
- filenames = Dir.glob(search_pattern)
12
+ def extract
32
13
  filenames.each do |filename|
33
- file = File.open(filename)
34
- yield(file.read, {filename: file})
14
+ yield Chronicle::ETL::Extraction.new(data: filename)
35
15
  end
36
16
  end
37
17
 
38
- def extract_file
39
- file = File.open(@options[:filename])
40
- yield(file.read, {filename: @options[:filename]})
18
+ def results_count
19
+ filenames.count
41
20
  end
42
21
 
43
- def directory?
44
- Pathname.new(@options[:filename]).directory?
45
- end
22
+ private
46
23
 
47
- def file?
48
- Pathname.new(@options[:filename]).file?
24
+ def filenames
25
+ @filenames ||= filenames_in_directory(
26
+ path: @options[:filename],
27
+ dir_glob_pattern: @options[:dir_glob_pattern],
28
+ load_since: @options[:load_since],
29
+ load_until: @options[:load_until]
30
+ )
49
31
  end
50
32
  end
51
33
  end