chronicle-etl 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +3 -0
- data/README.md +22 -15
- data/chronicle-etl.gemspec +11 -5
- data/lib/chronicle/etl/cli/connectors.rb +19 -7
- data/lib/chronicle/etl/cli/jobs.rb +38 -27
- data/lib/chronicle/etl/cli/main.rb +10 -2
- data/lib/chronicle/etl/config.rb +24 -3
- data/lib/chronicle/etl/exceptions.rb +30 -0
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -37
- data/lib/chronicle/etl/extractors/extractor.rb +19 -1
- data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +72 -0
- data/lib/chronicle/etl/job_definition.rb +89 -0
- data/lib/chronicle/etl/job_log.rb +95 -0
- data/lib/chronicle/etl/job_logger.rb +81 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +6 -6
- data/lib/chronicle/etl/loaders/loader.rb +2 -2
- data/lib/chronicle/etl/loaders/rest_loader.rb +16 -9
- data/lib/chronicle/etl/loaders/stdout_loader.rb +8 -3
- data/lib/chronicle/etl/loaders/table_loader.rb +58 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/activity.rb +15 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +119 -0
- data/lib/chronicle/etl/models/entity.rb +21 -0
- data/lib/chronicle/etl/models/generic.rb +23 -0
- data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +70 -42
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +12 -4
- data/lib/chronicle/etl/transformers/transformer.rb +42 -12
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +17 -1
- metadata +138 -35
- data/CHANGELOG.md +0 -23
- data/Gemfile.lock +0 -85
- data/lib/chronicle/etl/catalog.rb +0 -62
- data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bfd4e081bfeda7d097a5a5eee6ccf28baf0a9b3878968d74c9a604013d0b55a6
|
4
|
+
data.tar.gz: 003ebd2ffe2b1220c7f43a4875043cb5500aa1b7a6327b84c9be10f04e0e8d40
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d786fb4acf8d0b03e65262209def310ca25b92646847f6e96791e6491e9b159ab11db7fa35f785f6782fbc0b9e3daebb625e2353fce2422f7fc79aed7a4d6bc
|
7
|
+
data.tar.gz: 87771745b9df2160966299f1d73eb568b46080ed217a1952e1dd938fff7758432f2cc6f449036d24951660b405e18083f1adac26d00243a0fab9003a96eb569d
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/README.md
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl)
|
4
4
|
|
5
|
-
Chronicle ETL is a utility
|
5
|
+
Chronicle ETL is a utility that helps you archive and process personal data. You can *extract* it from a variety of sources, *transform* it, and *load* it to an external API, file, or stdout.
|
6
6
|
|
7
|
-
This
|
7
|
+
This tool is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex), and the dozens of existing importers are being migrated to Chronicle.
|
8
8
|
|
9
9
|
## Installation
|
10
10
|
|
@@ -31,6 +31,9 @@ Connectors are available to read, process, and load data from different formats
|
|
31
31
|
```bash
|
32
32
|
# List all available connectors
|
33
33
|
$ chronicle-etl connectors:list
|
34
|
+
|
35
|
+
# Install a connector
|
36
|
+
$ chronicle-etl connectors:install imessage
|
34
37
|
```
|
35
38
|
|
36
39
|
Built in connectors:
|
@@ -44,16 +47,18 @@ Built in connectors:
|
|
44
47
|
- `null` - (default) Don't do anything
|
45
48
|
|
46
49
|
### Loaders
|
47
|
-
- `stdout` - (default) output
|
50
|
+
- `stdout` - (default) output records to stdout serialized as JSON
|
48
51
|
- `csv` - Load records to a csv file
|
52
|
+
- `rest` - Serialize records with [JSONAPI](https://jsonapi.org/) and send to a REST API
|
49
53
|
- `table` - Output an ascii table of records. Useful for debugging.
|
50
54
|
|
51
55
|
### Provider-specific importers
|
52
56
|
|
53
57
|
In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
|
54
58
|
|
55
|
-
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox`
|
56
|
-
- [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history
|
59
|
+
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
|
60
|
+
- [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`
|
61
|
+
- [imessage](https://github.com/chronicle-app/chronicle-imessage). Extract iMessage messages from a local macOS installation
|
57
62
|
|
58
63
|
To install any of these, run `gem install chronicle-PROVIDER`.
|
59
64
|
|
@@ -61,7 +66,7 @@ If you don't want to use the available rubygem importers, `chronicle-etl` can us
|
|
61
66
|
|
62
67
|
I'll be open-sourcing more importers. Please [contact me](mailto:andrew@hyfen.net) to chat about what will be available!
|
63
68
|
|
64
|
-
|
69
|
+
## Full commands
|
65
70
|
|
66
71
|
```
|
67
72
|
$ chronicle-etl help
|
@@ -75,26 +80,28 @@ ALL COMMANDS
|
|
75
80
|
jobs:create # Create a job
|
76
81
|
jobs:list # List all available jobs
|
77
82
|
jobs:run # Start a job
|
78
|
-
jobs:show # Show a job
|
83
|
+
jobs:show # Show details about a job
|
79
84
|
```
|
80
85
|
|
81
|
-
###
|
86
|
+
### Running a job
|
82
87
|
|
83
88
|
```
|
84
89
|
Usage:
|
85
90
|
chronicle-etl jobs:run
|
86
91
|
|
87
92
|
Options:
|
88
|
-
|
89
|
-
# Default:
|
93
|
+
[--log-level=LOG_LEVEL] # Log level (debug, info, warn, error, fatal)
|
94
|
+
# Default: info
|
95
|
+
-v, [--verbose], [--no-verbose] # Set log level to verbose
|
96
|
+
[--dry-run], [--no-dry-run] # Only run the extraction and transform steps, not the loading
|
97
|
+
-e, [--extractor=extractor-name] # Extractor class. Default: stdin
|
90
98
|
[--extractor-opts=key:value] # Extractor options
|
91
|
-
-t, [--transformer=transformer-name] # Transformer class
|
92
|
-
# Default: null
|
99
|
+
-t, [--transformer=transformer-name] # Transformer class. Default: null
|
93
100
|
[--transformer-opts=key:value] # Transformer options
|
94
|
-
-l, [--loader=loader-name] # Loader class
|
95
|
-
# Default: stdout
|
101
|
+
-l, [--loader=loader-name] # Loader class. Default: stdout
|
96
102
|
[--loader-opts=key:value] # Loader options
|
97
|
-
-j, [--
|
103
|
+
-j, [--name=NAME] # Job configuration name
|
104
|
+
|
98
105
|
|
99
106
|
Runs an ETL job
|
100
107
|
```
|
data/chronicle-etl.gemspec
CHANGED
@@ -36,15 +36,21 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
37
37
|
spec.require_paths = ["lib"]
|
38
38
|
|
39
|
-
spec.add_dependency "
|
39
|
+
spec.add_dependency "activesupport"
|
40
|
+
spec.add_dependency "chronic_duration", "~> 0.10.6"
|
40
41
|
spec.add_dependency "colorize", "~> 0.8.1"
|
41
|
-
spec.add_dependency "
|
42
|
+
spec.add_dependency "marcel", "~> 1.0.2"
|
43
|
+
spec.add_dependency "mini_exiftool", "~> 2.10"
|
44
|
+
spec.add_dependency "nokogiri", "~> 1.13"
|
45
|
+
spec.add_dependency "runcom", "~> 6.2"
|
46
|
+
spec.add_dependency "sequel", "~> 5.35"
|
47
|
+
spec.add_dependency "sqlite3", "~> 1.4"
|
48
|
+
spec.add_dependency "thor", "~> 0.20"
|
42
49
|
spec.add_dependency "tty-progressbar", "~> 0.17"
|
50
|
+
spec.add_dependency "tty-table", "~> 0.11"
|
43
51
|
|
44
52
|
spec.add_development_dependency "bundler", "~> 2.1"
|
53
|
+
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
45
54
|
spec.add_development_dependency "rake", "~> 13.0"
|
46
55
|
spec.add_development_dependency "rspec", "~> 3.9"
|
47
|
-
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
48
|
-
spec.add_development_dependency 'runcom', '~> 6.2'
|
49
|
-
spec.add_development_dependency 'redcarpet', '~> 3.5'
|
50
56
|
end
|
@@ -7,23 +7,35 @@ module Chronicle
|
|
7
7
|
namespace :connectors
|
8
8
|
|
9
9
|
desc "install NAME", "Installs connector NAME"
|
10
|
-
def install
|
11
|
-
|
10
|
+
def install(name)
|
11
|
+
Chronicle::ETL::Registry.install_connector(name)
|
12
12
|
end
|
13
13
|
|
14
14
|
desc "list", "Lists available connectors"
|
15
15
|
# Display all available connectors that chronicle-etl has access to
|
16
16
|
def list
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
Chronicle::ETL::Registry.load_all!
|
18
|
+
|
19
|
+
connector_info = Chronicle::ETL::Registry.connectors.map do |connector_registration|
|
20
|
+
{
|
21
|
+
identifier: connector_registration.identifier,
|
22
|
+
phase: connector_registration.phase,
|
23
|
+
description: connector_registration.descriptive_phrase,
|
24
|
+
provider: connector_registration.provider,
|
25
|
+
core: connector_registration.built_in? ? '✓' : '',
|
26
|
+
class: connector_registration.klass_name
|
27
|
+
}
|
28
|
+
end
|
29
|
+
|
30
|
+
connector_info = connector_info.sort_by do |a|
|
31
|
+
[a[:core].to_s, a[:provider], a[:phase], a[:identifier]]
|
20
32
|
end
|
21
33
|
|
22
|
-
headers =
|
34
|
+
headers = connector_info.first.keys.map do |key|
|
23
35
|
key.to_s.upcase.bold
|
24
36
|
end
|
25
37
|
|
26
|
-
table = TTY::Table.new(headers,
|
38
|
+
table = TTY::Table.new(headers, connector_info.map(&:values))
|
27
39
|
puts table.render(indent: 0, padding: [0, 2])
|
28
40
|
end
|
29
41
|
end
|
@@ -1,6 +1,4 @@
|
|
1
1
|
require 'pp'
|
2
|
-
require 'pry'
|
3
|
-
|
4
2
|
module Chronicle
|
5
3
|
module ETL
|
6
4
|
module CLI
|
@@ -9,16 +7,19 @@ module Chronicle
|
|
9
7
|
default_task "start"
|
10
8
|
namespace :jobs
|
11
9
|
|
12
|
-
class_option :extractor, aliases: '-e', desc:
|
10
|
+
class_option :extractor, aliases: '-e', desc: "Extractor class. Default: stdin", banner: 'extractor-name'
|
13
11
|
class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
|
14
|
-
class_option :transformer, aliases: '-t', desc: 'Transformer class
|
12
|
+
class_option :transformer, aliases: '-t', desc: 'Transformer class. Default: null', banner: 'transformer-name'
|
15
13
|
class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
|
16
|
-
class_option :loader, aliases: '-l', desc: 'Loader class
|
14
|
+
class_option :loader, aliases: '-l', desc: 'Loader class. Default: stdout', banner: 'loader-name'
|
17
15
|
class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
|
18
|
-
class_option :
|
16
|
+
class_option :name, aliases: '-j', desc: 'Job configuration name'
|
19
17
|
|
20
18
|
map run: :start # Thor doesn't like `run` as a command name
|
21
19
|
desc "run", "Start a job"
|
20
|
+
option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
|
21
|
+
option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
|
22
|
+
option :dry_run, desc: 'Only run the extraction and transform steps, not the loading', type: :boolean
|
22
23
|
long_desc <<-LONG_DESC
|
23
24
|
This will run an ETL job. Each job needs three parts:
|
24
25
|
|
@@ -26,36 +27,37 @@ module Chronicle
|
|
26
27
|
|
27
28
|
2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.
|
28
29
|
|
29
|
-
3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.
|
30
|
+
3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout. With the --dry-run option, this step won't be run.
|
30
31
|
|
31
32
|
If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
|
32
33
|
LONG_DESC
|
33
34
|
# Run an ETL job
|
34
35
|
def start
|
35
|
-
|
36
|
-
|
36
|
+
setup_log_level
|
37
|
+
job_definition = build_job_definition(options)
|
38
|
+
job = Chronicle::ETL::Job.new(job_definition)
|
39
|
+
runner = Chronicle::ETL::Runner.new(job)
|
37
40
|
runner.run!
|
38
41
|
end
|
39
42
|
|
40
43
|
desc "create", "Create a job"
|
41
44
|
# Create an ETL job
|
42
45
|
def create
|
43
|
-
|
44
|
-
path = File.join('chronicle', 'etl', 'jobs', options[:
|
45
|
-
Chronicle::ETL::Config.write(path,
|
46
|
+
job_definition = build_job_definition(options)
|
47
|
+
path = File.join('chronicle', 'etl', 'jobs', options[:name])
|
48
|
+
Chronicle::ETL::Config.write(path, job_definition.definition)
|
46
49
|
end
|
47
50
|
|
48
51
|
desc "show", "Show details about a job"
|
49
52
|
# Show an ETL job
|
50
53
|
def show
|
51
|
-
|
52
|
-
pp runner_options
|
54
|
+
puts Chronicle::ETL::Job.new(build_job_definition(options))
|
53
55
|
end
|
54
56
|
|
55
57
|
desc "list", "List all available jobs"
|
56
58
|
# List available ETL jobs
|
57
59
|
def list
|
58
|
-
jobs = Chronicle::ETL::Config.
|
60
|
+
jobs = Chronicle::ETL::Config.available_jobs
|
59
61
|
|
60
62
|
job_details = jobs.map do |job|
|
61
63
|
r = Chronicle::ETL::Config.load("chronicle/etl/jobs/#{job}.yml")
|
@@ -75,34 +77,43 @@ LONG_DESC
|
|
75
77
|
|
76
78
|
private
|
77
79
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
80
|
+
def setup_log_level
|
81
|
+
if options[:verbose]
|
82
|
+
Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::DEBUG
|
83
|
+
elsif options[:log_level]
|
84
|
+
level = Chronicle::ETL::Logger.const_get(options[:log_level].upcase)
|
85
|
+
Chronicle::ETL::Logger.log_level = level
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Create job definition by reading config file and then overwriting with flag options
|
90
|
+
def build_job_definition(options)
|
91
|
+
definition = Chronicle::ETL::JobDefinition.new
|
92
|
+
definition.add_config(load_job_config(options[:name]))
|
93
|
+
definition.add_config(process_flag_options(options))
|
94
|
+
definition
|
83
95
|
end
|
84
96
|
|
85
|
-
def
|
86
|
-
|
87
|
-
# FIXME: use better trick to depely symbolize keys
|
88
|
-
JSON.parse(yml_config.to_json, symbolize_names: true)
|
97
|
+
def load_job_config name
|
98
|
+
Chronicle::ETL::Config.load_job_from_config(name)
|
89
99
|
end
|
90
100
|
|
91
101
|
# Takes flag options and turns them into a runner config
|
92
102
|
def process_flag_options options
|
93
103
|
{
|
104
|
+
dry_run: options[:dry_run],
|
94
105
|
extractor: {
|
95
106
|
name: options[:extractor],
|
96
107
|
options: options[:'extractor-opts']
|
97
|
-
},
|
108
|
+
}.compact,
|
98
109
|
transformer: {
|
99
110
|
name: options[:transformer],
|
100
111
|
options: options[:'transformer-opts']
|
101
|
-
},
|
112
|
+
}.compact,
|
102
113
|
loader: {
|
103
114
|
name: options[:loader],
|
104
115
|
options: options[:'loader-opts']
|
105
|
-
}
|
116
|
+
}.compact
|
106
117
|
}
|
107
118
|
end
|
108
119
|
end
|
@@ -22,6 +22,11 @@ module Chronicle
|
|
22
22
|
|
23
23
|
# Entrypoint for the CLI
|
24
24
|
def self.start(given_args = ARGV, config = {})
|
25
|
+
if given_args[0] == "--version"
|
26
|
+
puts "#{Chronicle::ETL::VERSION}"
|
27
|
+
exit
|
28
|
+
end
|
29
|
+
|
25
30
|
if given_args.none?
|
26
31
|
abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
|
27
32
|
end
|
@@ -52,10 +57,10 @@ module Chronicle
|
|
52
57
|
shell.say " $ chronicle-etl connectors:list"
|
53
58
|
shell.say
|
54
59
|
shell.say " Run a simple job:".italic.light_black
|
55
|
-
shell.say " $ chronicle-etl jobs:
|
60
|
+
shell.say " $ chronicle-etl jobs:run --extractor stdin --transformer null --loader stdout"
|
56
61
|
shell.say
|
57
62
|
shell.say " Show full job options:".italic.light_black
|
58
|
-
shell.say " $ chronicle-etl jobs help
|
63
|
+
shell.say " $ chronicle-etl jobs help run"
|
59
64
|
|
60
65
|
list = []
|
61
66
|
|
@@ -72,6 +77,9 @@ module Chronicle
|
|
72
77
|
shell.say "VERSION".bold
|
73
78
|
shell.say " #{Chronicle::ETL::VERSION}"
|
74
79
|
shell.say
|
80
|
+
shell.say " Display current version:".italic.light_black
|
81
|
+
shell.say " $ chronicle-etl --version"
|
82
|
+
shell.say
|
75
83
|
shell.say "FULL DOCUMENTATION".bold
|
76
84
|
shell.say " https://github.com/chronicle-app/chronicle-etl".blue
|
77
85
|
shell.say
|
data/lib/chronicle/etl/config.rb
CHANGED
@@ -4,15 +4,17 @@ module Chronicle
|
|
4
4
|
module ETL
|
5
5
|
# Utility methods to read, write, and access config files
|
6
6
|
module Config
|
7
|
+
module_function
|
8
|
+
|
7
9
|
# Loads a yml config file
|
8
|
-
def
|
10
|
+
def load(path)
|
9
11
|
config = Runcom::Config.new(path)
|
10
12
|
# FIXME: hack to deeply symbolize keys
|
11
13
|
JSON.parse(config.to_h.to_json, symbolize_names: true)
|
12
14
|
end
|
13
15
|
|
14
16
|
# Writes a hash as a yml config file
|
15
|
-
def
|
17
|
+
def write(path, data)
|
16
18
|
config = Runcom::Config.new(path)
|
17
19
|
filename = config.all[0].to_s + '.yml'
|
18
20
|
File.open(filename, 'w') do |f|
|
@@ -21,12 +23,31 @@ module Chronicle
|
|
21
23
|
end
|
22
24
|
|
23
25
|
# Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml
|
24
|
-
def
|
26
|
+
def available_jobs
|
25
27
|
job_directory = Runcom::Config.new('chronicle/etl/jobs').current
|
26
28
|
Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
|
27
29
|
File.basename(filename, ".*")
|
28
30
|
end
|
29
31
|
end
|
32
|
+
|
33
|
+
# Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
|
34
|
+
def available_credentials
|
35
|
+
job_directory = Runcom::Config.new('chronicle/etl/credentials').current
|
36
|
+
Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
|
37
|
+
File.basename(filename, ".*")
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
# Load a job definition from job config directory
|
42
|
+
def load_job_from_config(job_name)
|
43
|
+
definition = self.load("chronicle/etl/jobs/#{job_name}.yml")
|
44
|
+
definition[:name] = job_name
|
45
|
+
definition
|
46
|
+
end
|
47
|
+
|
48
|
+
def load_credentials(name)
|
49
|
+
config = self.load("chronicle/etl/credentials/#{name}.yml")
|
50
|
+
end
|
30
51
|
end
|
31
52
|
end
|
32
53
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
class Error < StandardError; end;
|
4
|
+
|
5
|
+
class RunnerTypeError < Error; end
|
6
|
+
|
7
|
+
class ConnectorNotAvailableError < Error
|
8
|
+
def initialize(message, provider: nil, name: nil)
|
9
|
+
super(message)
|
10
|
+
@provider = provider
|
11
|
+
@name = name
|
12
|
+
end
|
13
|
+
attr_reader :name, :provider
|
14
|
+
end
|
15
|
+
|
16
|
+
class ProviderNotAvailableError < ConnectorNotAvailableError; end
|
17
|
+
class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
|
18
|
+
|
19
|
+
class TransformationError < Error
|
20
|
+
attr_reader :transformation
|
21
|
+
|
22
|
+
def initialize(message=nil, transformation:)
|
23
|
+
super(message)
|
24
|
+
@transformation = transformation
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
class UntransformableRecordError < TransformationError; end
|
29
|
+
end
|
30
|
+
end
|
@@ -1,42 +1,48 @@
|
|
1
1
|
require 'csv'
|
2
|
-
class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
3
|
-
DEFAULT_OPTIONS = {
|
4
|
-
headers: true,
|
5
|
-
filename: $stdin
|
6
|
-
}.freeze
|
7
|
-
|
8
|
-
def initialize(options = {})
|
9
|
-
super(DEFAULT_OPTIONS.merge(options))
|
10
|
-
end
|
11
2
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
class CsvExtractor < Chronicle::ETL::Extractor
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
7
|
+
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'input as CSV'
|
10
|
+
end
|
11
|
+
|
12
|
+
DEFAULT_OPTIONS = {
|
13
|
+
headers: true,
|
14
|
+
filename: $stdin
|
15
|
+
}.freeze
|
16
|
+
|
17
|
+
def initialize(options = {})
|
18
|
+
super(DEFAULT_OPTIONS.merge(options))
|
19
|
+
end
|
20
|
+
|
21
|
+
def extract
|
22
|
+
csv = initialize_csv
|
23
|
+
csv.each do |row|
|
24
|
+
yield Chronicle::ETL::Extraction.new(data: row.to_h)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def results_count
|
29
|
+
CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def initialize_csv
|
35
|
+
headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
|
36
|
+
|
37
|
+
csv_options = {
|
38
|
+
headers: headers,
|
39
|
+
converters: :all
|
40
|
+
}
|
41
|
+
|
42
|
+
open_from_filesystem(filename: @options[:filename]) do |file|
|
43
|
+
return CSV.new(file, **csv_options)
|
44
|
+
end
|
45
|
+
end
|
17
46
|
end
|
18
47
|
end
|
19
|
-
|
20
|
-
def results_count
|
21
|
-
CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
|
22
|
-
end
|
23
|
-
|
24
|
-
private
|
25
|
-
|
26
|
-
def initialize_csv
|
27
|
-
headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
|
28
|
-
|
29
|
-
csv_options = {
|
30
|
-
headers: headers,
|
31
|
-
header_converters: :symbol,
|
32
|
-
converters: [:all]
|
33
|
-
}
|
34
|
-
|
35
|
-
stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
|
36
|
-
CSV.new(stream, **csv_options)
|
37
|
-
end
|
38
|
-
|
39
|
-
def read_from_file?
|
40
|
-
@options[:filename] != $stdin
|
41
|
-
end
|
42
48
|
end
|
@@ -4,7 +4,7 @@ module Chronicle
|
|
4
4
|
module ETL
|
5
5
|
# Abstract class representing an Extractor for an ETL job
|
6
6
|
class Extractor
|
7
|
-
extend Chronicle::ETL::
|
7
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
8
8
|
|
9
9
|
# Construct a new instance of this extractor. Options are passed in from a Runner
|
10
10
|
# == Parameters:
|
@@ -12,6 +12,8 @@ module Chronicle
|
|
12
12
|
# Options for configuring this Extractor
|
13
13
|
def initialize(options = {})
|
14
14
|
@options = options.transform_keys!(&:to_sym)
|
15
|
+
sanitize_options
|
16
|
+
handle_continuation
|
15
17
|
end
|
16
18
|
|
17
19
|
# Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
|
@@ -22,10 +24,26 @@ module Chronicle
|
|
22
24
|
# An optional method to calculate how many records there are to extract. Used primarily for
|
23
25
|
# building the progress bar
|
24
26
|
def results_count; end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
def sanitize_options
|
31
|
+
@options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
|
32
|
+
@options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
|
33
|
+
end
|
34
|
+
|
35
|
+
def handle_continuation
|
36
|
+
return unless @options[:continuation]
|
37
|
+
|
38
|
+
@options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
|
39
|
+
@options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
|
40
|
+
end
|
25
41
|
end
|
26
42
|
end
|
27
43
|
end
|
28
44
|
|
45
|
+
require_relative 'helpers/filesystem_reader'
|
29
46
|
require_relative 'csv_extractor'
|
30
47
|
require_relative 'file_extractor'
|
48
|
+
require_relative 'json_extractor'
|
31
49
|
require_relative 'stdin_extractor'
|
@@ -3,49 +3,31 @@ require 'pathname'
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
5
|
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
-
|
7
|
-
if file?
|
8
|
-
extract_file do |data, metadata|
|
9
|
-
yield(data, metadata)
|
10
|
-
end
|
11
|
-
elsif directory?
|
12
|
-
extract_from_directory do |data, metadata|
|
13
|
-
yield(data, metadata)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
17
7
|
|
18
|
-
|
19
|
-
|
20
|
-
return 1
|
21
|
-
else
|
22
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
23
|
-
Dir.glob(search_pattern).count
|
24
|
-
end
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'file or directory of files'
|
25
10
|
end
|
26
11
|
|
27
|
-
|
28
|
-
|
29
|
-
def extract_from_directory
|
30
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
31
|
-
filenames = Dir.glob(search_pattern)
|
12
|
+
def extract
|
32
13
|
filenames.each do |filename|
|
33
|
-
|
34
|
-
yield(file.read, {filename: file})
|
14
|
+
yield Chronicle::ETL::Extraction.new(data: filename)
|
35
15
|
end
|
36
16
|
end
|
37
17
|
|
38
|
-
def
|
39
|
-
|
40
|
-
yield(file.read, {filename: @options[:filename]})
|
18
|
+
def results_count
|
19
|
+
filenames.count
|
41
20
|
end
|
42
21
|
|
43
|
-
|
44
|
-
Pathname.new(@options[:filename]).directory?
|
45
|
-
end
|
22
|
+
private
|
46
23
|
|
47
|
-
def
|
48
|
-
|
24
|
+
def filenames
|
25
|
+
@filenames ||= filenames_in_directory(
|
26
|
+
path: @options[:filename],
|
27
|
+
dir_glob_pattern: @options[:dir_glob_pattern],
|
28
|
+
load_since: @options[:load_since],
|
29
|
+
load_until: @options[:load_until]
|
30
|
+
)
|
49
31
|
end
|
50
32
|
end
|
51
33
|
end
|