chronicle-etl 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +3 -0
- data/README.md +20 -13
- data/chronicle-etl.gemspec +11 -8
- data/lib/chronicle/etl/cli/connectors.rb +19 -7
- data/lib/chronicle/etl/cli/jobs.rb +24 -18
- data/lib/chronicle/etl/cli/main.rb +10 -2
- data/lib/chronicle/etl/config.rb +1 -1
- data/lib/chronicle/etl/exceptions.rb +12 -1
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -36
- data/lib/chronicle/etl/extractors/extractor.rb +9 -1
- data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +30 -29
- data/lib/chronicle/etl/job_definition.rb +45 -7
- data/lib/chronicle/etl/job_log.rb +10 -0
- data/lib/chronicle/etl/job_logger.rb +23 -20
- data/lib/chronicle/etl/loaders/csv_loader.rb +4 -0
- data/lib/chronicle/etl/loaders/loader.rb +1 -1
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -1
- data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
- data/lib/chronicle/etl/loaders/table_loader.rb +57 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +23 -7
- data/lib/chronicle/etl/models/entity.rb +9 -3
- data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +57 -7
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
- data/lib/chronicle/etl/transformers/transformer.rb +39 -9
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +7 -2
- metadata +96 -44
- data/Gemfile.lock +0 -91
- data/lib/chronicle/etl/catalog.rb +0 -108
- data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bfd4e081bfeda7d097a5a5eee6ccf28baf0a9b3878968d74c9a604013d0b55a6
|
4
|
+
data.tar.gz: 003ebd2ffe2b1220c7f43a4875043cb5500aa1b7a6327b84c9be10f04e0e8d40
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d786fb4acf8d0b03e65262209def310ca25b92646847f6e96791e6491e9b159ab11db7fa35f785f6782fbc0b9e3daebb625e2353fce2422f7fc79aed7a4d6bc
|
7
|
+
data.tar.gz: 87771745b9df2160966299f1d73eb568b46080ed217a1952e1dd938fff7758432f2cc6f449036d24951660b405e18083f1adac26d00243a0fab9003a96eb569d
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/README.md
CHANGED
@@ -31,6 +31,9 @@ Connectors are available to read, process, and load data from different formats
|
|
31
31
|
```bash
|
32
32
|
# List all available connectors
|
33
33
|
$ chronicle-etl connectors:list
|
34
|
+
|
35
|
+
# Install a connector
|
36
|
+
$ chronicle-etl connectors:install imessage
|
34
37
|
```
|
35
38
|
|
36
39
|
Built in connectors:
|
@@ -44,16 +47,18 @@ Built in connectors:
|
|
44
47
|
- `null` - (default) Don't do anything
|
45
48
|
|
46
49
|
### Loaders
|
47
|
-
- `stdout` - (default) output
|
50
|
+
- `stdout` - (default) output records to stdout serialized as JSON
|
48
51
|
- `csv` - Load records to a csv file
|
52
|
+
- `rest` - Serialize records with [JSONAPI](https://jsonapi.org/) and send to a REST API
|
49
53
|
- `table` - Output an ascii table of records. Useful for debugging.
|
50
54
|
|
51
55
|
### Provider-specific importers
|
52
56
|
|
53
57
|
In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
|
54
58
|
|
55
|
-
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
|
56
|
-
- [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history
|
59
|
+
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
|
60
|
+
- [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`
|
61
|
+
- [imessage](https://github.com/chronicle-app/chronicle-imessage). Extract iMessage messages from a local macOS installation
|
57
62
|
|
58
63
|
To install any of these, run `gem install chronicle-PROVIDER`.
|
59
64
|
|
@@ -61,7 +66,7 @@ If you don't want to use the available rubygem importers, `chronicle-etl` can us
|
|
61
66
|
|
62
67
|
I'll be open-sourcing more importers. Please [contact me](mailto:andrew@hyfen.net) to chat about what will be available!
|
63
68
|
|
64
|
-
|
69
|
+
## Full commands
|
65
70
|
|
66
71
|
```
|
67
72
|
$ chronicle-etl help
|
@@ -75,26 +80,28 @@ ALL COMMANDS
|
|
75
80
|
jobs:create # Create a job
|
76
81
|
jobs:list # List all available jobs
|
77
82
|
jobs:run # Start a job
|
78
|
-
jobs:show # Show a job
|
83
|
+
jobs:show # Show details about a job
|
79
84
|
```
|
80
85
|
|
81
|
-
###
|
86
|
+
### Running a job
|
82
87
|
|
83
88
|
```
|
84
89
|
Usage:
|
85
90
|
chronicle-etl jobs:run
|
86
91
|
|
87
92
|
Options:
|
88
|
-
|
89
|
-
# Default:
|
93
|
+
[--log-level=LOG_LEVEL] # Log level (debug, info, warn, error, fatal)
|
94
|
+
# Default: info
|
95
|
+
-v, [--verbose], [--no-verbose] # Set log level to verbose
|
96
|
+
[--dry-run], [--no-dry-run] # Only run the extraction and transform steps, not the loading
|
97
|
+
-e, [--extractor=extractor-name] # Extractor class. Default: stdin
|
90
98
|
[--extractor-opts=key:value] # Extractor options
|
91
|
-
-t, [--transformer=transformer-name] # Transformer class
|
92
|
-
# Default: null
|
99
|
+
-t, [--transformer=transformer-name] # Transformer class. Default: null
|
93
100
|
[--transformer-opts=key:value] # Transformer options
|
94
|
-
-l, [--loader=loader-name] # Loader class
|
95
|
-
# Default: stdout
|
101
|
+
-l, [--loader=loader-name] # Loader class. Default: stdout
|
96
102
|
[--loader-opts=key:value] # Loader options
|
97
|
-
-j, [--
|
103
|
+
-j, [--name=NAME] # Job configuration name
|
104
|
+
|
98
105
|
|
99
106
|
Runs an ETL job
|
100
107
|
```
|
data/chronicle-etl.gemspec
CHANGED
@@ -36,18 +36,21 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
37
37
|
spec.require_paths = ["lib"]
|
38
38
|
|
39
|
-
spec.add_dependency "
|
39
|
+
spec.add_dependency "activesupport"
|
40
|
+
spec.add_dependency "chronic_duration", "~> 0.10.6"
|
40
41
|
spec.add_dependency "colorize", "~> 0.8.1"
|
41
|
-
spec.add_dependency "
|
42
|
+
spec.add_dependency "marcel", "~> 1.0.2"
|
43
|
+
spec.add_dependency "mini_exiftool", "~> 2.10"
|
44
|
+
spec.add_dependency "nokogiri", "~> 1.13"
|
45
|
+
spec.add_dependency "runcom", "~> 6.2"
|
46
|
+
spec.add_dependency "sequel", "~> 5.35"
|
47
|
+
spec.add_dependency "sqlite3", "~> 1.4"
|
48
|
+
spec.add_dependency "thor", "~> 0.20"
|
42
49
|
spec.add_dependency "tty-progressbar", "~> 0.17"
|
43
|
-
spec.add_dependency
|
44
|
-
spec.add_dependency 'deep_merge', '~> 1.2'
|
50
|
+
spec.add_dependency "tty-table", "~> 0.11"
|
45
51
|
|
46
52
|
spec.add_development_dependency "bundler", "~> 2.1"
|
53
|
+
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
47
54
|
spec.add_development_dependency "rake", "~> 13.0"
|
48
55
|
spec.add_development_dependency "rspec", "~> 3.9"
|
49
|
-
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
50
|
-
spec.add_development_dependency 'runcom', '~> 6.2'
|
51
|
-
spec.add_development_dependency 'redcarpet', '~> 3.5'
|
52
|
-
spec.add_development_dependency 'sqlite3', '~> 1.4'
|
53
56
|
end
|
@@ -7,23 +7,35 @@ module Chronicle
|
|
7
7
|
namespace :connectors
|
8
8
|
|
9
9
|
desc "install NAME", "Installs connector NAME"
|
10
|
-
def install
|
11
|
-
|
10
|
+
def install(name)
|
11
|
+
Chronicle::ETL::Registry.install_connector(name)
|
12
12
|
end
|
13
13
|
|
14
14
|
desc "list", "Lists available connectors"
|
15
15
|
# Display all available connectors that chronicle-etl has access to
|
16
16
|
def list
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
Chronicle::ETL::Registry.load_all!
|
18
|
+
|
19
|
+
connector_info = Chronicle::ETL::Registry.connectors.map do |connector_registration|
|
20
|
+
{
|
21
|
+
identifier: connector_registration.identifier,
|
22
|
+
phase: connector_registration.phase,
|
23
|
+
description: connector_registration.descriptive_phrase,
|
24
|
+
provider: connector_registration.provider,
|
25
|
+
core: connector_registration.built_in? ? '✓' : '',
|
26
|
+
class: connector_registration.klass_name
|
27
|
+
}
|
28
|
+
end
|
29
|
+
|
30
|
+
connector_info = connector_info.sort_by do |a|
|
31
|
+
[a[:core].to_s, a[:provider], a[:phase], a[:identifier]]
|
20
32
|
end
|
21
33
|
|
22
|
-
headers =
|
34
|
+
headers = connector_info.first.keys.map do |key|
|
23
35
|
key.to_s.upcase.bold
|
24
36
|
end
|
25
37
|
|
26
|
-
table = TTY::Table.new(headers,
|
38
|
+
table = TTY::Table.new(headers, connector_info.map(&:values))
|
27
39
|
puts table.render(indent: 0, padding: [0, 2])
|
28
40
|
end
|
29
41
|
end
|
@@ -7,16 +7,19 @@ module Chronicle
|
|
7
7
|
default_task "start"
|
8
8
|
namespace :jobs
|
9
9
|
|
10
|
-
class_option :extractor, aliases: '-e', desc:
|
10
|
+
class_option :extractor, aliases: '-e', desc: "Extractor class. Default: stdin", banner: 'extractor-name'
|
11
11
|
class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
|
12
|
-
class_option :transformer, aliases: '-t', desc: 'Transformer class
|
12
|
+
class_option :transformer, aliases: '-t', desc: 'Transformer class. Default: null', banner: 'transformer-name'
|
13
13
|
class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
|
14
|
-
class_option :loader, aliases: '-l', desc: 'Loader class
|
14
|
+
class_option :loader, aliases: '-l', desc: 'Loader class. Default: stdout', banner: 'loader-name'
|
15
15
|
class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
|
16
16
|
class_option :name, aliases: '-j', desc: 'Job configuration name'
|
17
17
|
|
18
18
|
map run: :start # Thor doesn't like `run` as a command name
|
19
19
|
desc "run", "Start a job"
|
20
|
+
option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
|
21
|
+
option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
|
22
|
+
option :dry_run, desc: 'Only run the extraction and transform steps, not the loading', type: :boolean
|
20
23
|
long_desc <<-LONG_DESC
|
21
24
|
This will run an ETL job. Each job needs three parts:
|
22
25
|
|
@@ -24,23 +27,17 @@ module Chronicle
|
|
24
27
|
|
25
28
|
2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.
|
26
29
|
|
27
|
-
3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.
|
30
|
+
3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout. With the --dry-run option, this step won't be run.
|
28
31
|
|
29
32
|
If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
|
30
33
|
LONG_DESC
|
31
34
|
# Run an ETL job
|
32
35
|
def start
|
36
|
+
setup_log_level
|
33
37
|
job_definition = build_job_definition(options)
|
34
38
|
job = Chronicle::ETL::Job.new(job_definition)
|
35
39
|
runner = Chronicle::ETL::Runner.new(job)
|
36
40
|
runner.run!
|
37
|
-
rescue Chronicle::ETL::ProviderNotAvailableError => e
|
38
|
-
warn(e.message.red)
|
39
|
-
warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{e.provider}`")
|
40
|
-
exit(false)
|
41
|
-
rescue Chronicle::ETL::ConnectorNotAvailableError => e
|
42
|
-
warn(e.message.red)
|
43
|
-
exit(false)
|
44
41
|
end
|
45
42
|
|
46
43
|
desc "create", "Create a job"
|
@@ -48,14 +45,13 @@ LONG_DESC
|
|
48
45
|
def create
|
49
46
|
job_definition = build_job_definition(options)
|
50
47
|
path = File.join('chronicle', 'etl', 'jobs', options[:name])
|
51
|
-
Chronicle::ETL::Config.write(path, job_definition)
|
48
|
+
Chronicle::ETL::Config.write(path, job_definition.definition)
|
52
49
|
end
|
53
50
|
|
54
51
|
desc "show", "Show details about a job"
|
55
52
|
# Show an ETL job
|
56
53
|
def show
|
57
|
-
|
58
|
-
pp job_config
|
54
|
+
puts Chronicle::ETL::Job.new(build_job_definition(options))
|
59
55
|
end
|
60
56
|
|
61
57
|
desc "list", "List all available jobs"
|
@@ -81,11 +77,20 @@ LONG_DESC
|
|
81
77
|
|
82
78
|
private
|
83
79
|
|
80
|
+
def setup_log_level
|
81
|
+
if options[:verbose]
|
82
|
+
Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::DEBUG
|
83
|
+
elsif options[:log_level]
|
84
|
+
level = Chronicle::ETL::Logger.const_get(options[:log_level].upcase)
|
85
|
+
Chronicle::ETL::Logger.log_level = level
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
84
89
|
# Create job definition by reading config file and then overwriting with flag options
|
85
90
|
def build_job_definition(options)
|
86
91
|
definition = Chronicle::ETL::JobDefinition.new
|
87
|
-
definition.add_config(process_flag_options(options))
|
88
92
|
definition.add_config(load_job_config(options[:name]))
|
93
|
+
definition.add_config(process_flag_options(options))
|
89
94
|
definition
|
90
95
|
end
|
91
96
|
|
@@ -96,18 +101,19 @@ LONG_DESC
|
|
96
101
|
# Takes flag options and turns them into a runner config
|
97
102
|
def process_flag_options options
|
98
103
|
{
|
104
|
+
dry_run: options[:dry_run],
|
99
105
|
extractor: {
|
100
106
|
name: options[:extractor],
|
101
107
|
options: options[:'extractor-opts']
|
102
|
-
},
|
108
|
+
}.compact,
|
103
109
|
transformer: {
|
104
110
|
name: options[:transformer],
|
105
111
|
options: options[:'transformer-opts']
|
106
|
-
},
|
112
|
+
}.compact,
|
107
113
|
loader: {
|
108
114
|
name: options[:loader],
|
109
115
|
options: options[:'loader-opts']
|
110
|
-
}
|
116
|
+
}.compact
|
111
117
|
}
|
112
118
|
end
|
113
119
|
end
|
@@ -22,6 +22,11 @@ module Chronicle
|
|
22
22
|
|
23
23
|
# Entrypoint for the CLI
|
24
24
|
def self.start(given_args = ARGV, config = {})
|
25
|
+
if given_args[0] == "--version"
|
26
|
+
puts "#{Chronicle::ETL::VERSION}"
|
27
|
+
exit
|
28
|
+
end
|
29
|
+
|
25
30
|
if given_args.none?
|
26
31
|
abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
|
27
32
|
end
|
@@ -52,10 +57,10 @@ module Chronicle
|
|
52
57
|
shell.say " $ chronicle-etl connectors:list"
|
53
58
|
shell.say
|
54
59
|
shell.say " Run a simple job:".italic.light_black
|
55
|
-
shell.say " $ chronicle-etl jobs:
|
60
|
+
shell.say " $ chronicle-etl jobs:run --extractor stdin --transformer null --loader stdout"
|
56
61
|
shell.say
|
57
62
|
shell.say " Show full job options:".italic.light_black
|
58
|
-
shell.say " $ chronicle-etl jobs help
|
63
|
+
shell.say " $ chronicle-etl jobs help run"
|
59
64
|
|
60
65
|
list = []
|
61
66
|
|
@@ -72,6 +77,9 @@ module Chronicle
|
|
72
77
|
shell.say "VERSION".bold
|
73
78
|
shell.say " #{Chronicle::ETL::VERSION}"
|
74
79
|
shell.say
|
80
|
+
shell.say " Display current version:".italic.light_black
|
81
|
+
shell.say " $ chronicle-etl --version"
|
82
|
+
shell.say
|
75
83
|
shell.say "FULL DOCUMENTATION".bold
|
76
84
|
shell.say " https://github.com/chronicle-app/chronicle-etl".blue
|
77
85
|
shell.say
|
data/lib/chronicle/etl/config.rb
CHANGED
@@ -30,7 +30,7 @@ module Chronicle
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
# Returns all available credentials available in ~/.config/
|
33
|
+
# Returns all available credentials available in ~/.config/chronicle/etl/credentials/*.yml
|
34
34
|
def available_credentials
|
35
35
|
job_directory = Runcom::Config.new('chronicle/etl/credentials').current
|
36
36
|
Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
|
@@ -2,7 +2,7 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
class Error < StandardError; end;
|
4
4
|
|
5
|
-
class
|
5
|
+
class RunnerTypeError < Error; end
|
6
6
|
|
7
7
|
class ConnectorNotAvailableError < Error
|
8
8
|
def initialize(message, provider: nil, name: nil)
|
@@ -15,5 +15,16 @@ module Chronicle
|
|
15
15
|
|
16
16
|
class ProviderNotAvailableError < ConnectorNotAvailableError; end
|
17
17
|
class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
|
18
|
+
|
19
|
+
class TransformationError < Error
|
20
|
+
attr_reader :transformation
|
21
|
+
|
22
|
+
def initialize(message=nil, transformation:)
|
23
|
+
super(message)
|
24
|
+
@transformation = transformation
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
class UntransformableRecordError < TransformationError; end
|
18
29
|
end
|
19
30
|
end
|
@@ -1,41 +1,48 @@
|
|
1
1
|
require 'csv'
|
2
|
-
class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
3
|
-
DEFAULT_OPTIONS = {
|
4
|
-
headers: true,
|
5
|
-
filename: $stdin
|
6
|
-
}.freeze
|
7
|
-
|
8
|
-
def initialize(options = {})
|
9
|
-
super(DEFAULT_OPTIONS.merge(options))
|
10
|
-
end
|
11
2
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
class CsvExtractor < Chronicle::ETL::Extractor
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
7
|
+
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'input as CSV'
|
10
|
+
end
|
11
|
+
|
12
|
+
DEFAULT_OPTIONS = {
|
13
|
+
headers: true,
|
14
|
+
filename: $stdin
|
15
|
+
}.freeze
|
16
|
+
|
17
|
+
def initialize(options = {})
|
18
|
+
super(DEFAULT_OPTIONS.merge(options))
|
19
|
+
end
|
20
|
+
|
21
|
+
def extract
|
22
|
+
csv = initialize_csv
|
23
|
+
csv.each do |row|
|
24
|
+
yield Chronicle::ETL::Extraction.new(data: row.to_h)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def results_count
|
29
|
+
CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def initialize_csv
|
35
|
+
headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
|
36
|
+
|
37
|
+
csv_options = {
|
38
|
+
headers: headers,
|
39
|
+
converters: :all
|
40
|
+
}
|
41
|
+
|
42
|
+
open_from_filesystem(filename: @options[:filename]) do |file|
|
43
|
+
return CSV.new(file, **csv_options)
|
44
|
+
end
|
45
|
+
end
|
17
46
|
end
|
18
47
|
end
|
19
|
-
|
20
|
-
def results_count
|
21
|
-
CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
|
22
|
-
end
|
23
|
-
|
24
|
-
private
|
25
|
-
|
26
|
-
def initialize_csv
|
27
|
-
headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
|
28
|
-
|
29
|
-
csv_options = {
|
30
|
-
headers: headers,
|
31
|
-
converters: :all
|
32
|
-
}
|
33
|
-
|
34
|
-
stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
|
35
|
-
CSV.new(stream, **csv_options)
|
36
|
-
end
|
37
|
-
|
38
|
-
def read_from_file?
|
39
|
-
@options[:filename] != $stdin
|
40
|
-
end
|
41
48
|
end
|
@@ -4,7 +4,7 @@ module Chronicle
|
|
4
4
|
module ETL
|
5
5
|
# Abstract class representing an Extractor for an ETL job
|
6
6
|
class Extractor
|
7
|
-
extend Chronicle::ETL::
|
7
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
8
8
|
|
9
9
|
# Construct a new instance of this extractor. Options are passed in from a Runner
|
10
10
|
# == Paramters:
|
@@ -12,6 +12,7 @@ module Chronicle
|
|
12
12
|
# Options for configuring this Extractor
|
13
13
|
def initialize(options = {})
|
14
14
|
@options = options.transform_keys!(&:to_sym)
|
15
|
+
sanitize_options
|
15
16
|
handle_continuation
|
16
17
|
end
|
17
18
|
|
@@ -26,6 +27,11 @@ module Chronicle
|
|
26
27
|
|
27
28
|
private
|
28
29
|
|
30
|
+
def sanitize_options
|
31
|
+
@options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
|
32
|
+
@options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
|
33
|
+
end
|
34
|
+
|
29
35
|
def handle_continuation
|
30
36
|
return unless @options[:continuation]
|
31
37
|
|
@@ -36,6 +42,8 @@ module Chronicle
|
|
36
42
|
end
|
37
43
|
end
|
38
44
|
|
45
|
+
require_relative 'helpers/filesystem_reader'
|
39
46
|
require_relative 'csv_extractor'
|
40
47
|
require_relative 'file_extractor'
|
48
|
+
require_relative 'json_extractor'
|
41
49
|
require_relative 'stdin_extractor'
|
@@ -3,49 +3,31 @@ require 'pathname'
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
5
|
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
-
|
7
|
-
if file?
|
8
|
-
extract_file do |data, metadata|
|
9
|
-
yield(data, metadata)
|
10
|
-
end
|
11
|
-
elsif directory?
|
12
|
-
extract_from_directory do |data, metadata|
|
13
|
-
yield(data, metadata)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
17
7
|
|
18
|
-
|
19
|
-
|
20
|
-
return 1
|
21
|
-
else
|
22
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
23
|
-
Dir.glob(search_pattern).count
|
24
|
-
end
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'file or directory of files'
|
25
10
|
end
|
26
11
|
|
27
|
-
|
28
|
-
|
29
|
-
def extract_from_directory
|
30
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
31
|
-
filenames = Dir.glob(search_pattern)
|
12
|
+
def extract
|
32
13
|
filenames.each do |filename|
|
33
|
-
|
34
|
-
yield(file.read, {filename: file})
|
14
|
+
yield Chronicle::ETL::Extraction.new(data: filename)
|
35
15
|
end
|
36
16
|
end
|
37
17
|
|
38
|
-
def
|
39
|
-
|
40
|
-
yield(file.read, {filename: @options[:filename]})
|
18
|
+
def results_count
|
19
|
+
filenames.count
|
41
20
|
end
|
42
21
|
|
43
|
-
|
44
|
-
Pathname.new(@options[:filename]).directory?
|
45
|
-
end
|
22
|
+
private
|
46
23
|
|
47
|
-
def
|
48
|
-
|
24
|
+
def filenames
|
25
|
+
@filenames ||= filenames_in_directory(
|
26
|
+
path: @options[:filename],
|
27
|
+
dir_glob_pattern: @options[:dir_glob_pattern],
|
28
|
+
load_since: @options[:load_since],
|
29
|
+
load_until: @options[:load_until]
|
30
|
+
)
|
49
31
|
end
|
50
32
|
end
|
51
33
|
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Extractors
|
6
|
+
module Helpers
|
7
|
+
module FilesystemReader
|
8
|
+
|
9
|
+
def filenames_in_directory(...)
|
10
|
+
filenames = gather_files(...)
|
11
|
+
if block_given?
|
12
|
+
filenames.each do |filename|
|
13
|
+
yield filename
|
14
|
+
end
|
15
|
+
else
|
16
|
+
filenames
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
|
21
|
+
open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
|
22
|
+
if yield_each_line
|
23
|
+
file.each_line do |line|
|
24
|
+
yield line
|
25
|
+
end
|
26
|
+
else
|
27
|
+
yield file.read
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
|
33
|
+
open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
|
34
|
+
yield file
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def results_count
|
39
|
+
raise NotImplementedError
|
40
|
+
# if file?
|
41
|
+
# return 1
|
42
|
+
# else
|
43
|
+
# search_pattern = File.join(@options[:filename], '**/*')
|
44
|
+
# Dir.glob(search_pattern).count
|
45
|
+
# end
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
|
50
|
+
def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil, smaller_than: nil, larger_than: nil, sort: :mtime)
|
51
|
+
search_pattern = File.join(path, '**', dir_glob_pattern)
|
52
|
+
files = Dir.glob(search_pattern)
|
53
|
+
|
54
|
+
files = files.keep_if {|f| (File.mtime(f) > load_since)} if load_since
|
55
|
+
files = files.keep_if {|f| (File.mtime(f) < load_until)} if load_until
|
56
|
+
|
57
|
+
# pass in file sizes in bytes
|
58
|
+
files = files.keep_if {|f| (File.size(f) < smaller_than)} if smaller_than
|
59
|
+
files = files.keep_if {|f| (File.size(f) > larger_than)} if larger_than
|
60
|
+
|
61
|
+
# TODO: incorporate sort argument
|
62
|
+
files.sort_by{ |f| File.mtime(f) }
|
63
|
+
end
|
64
|
+
|
65
|
+
def select_files_in_directory(path:, dir_glob_pattern: '**/*')
|
66
|
+
raise IOError.new("#{path} is not a directory.") unless directory?(path)
|
67
|
+
|
68
|
+
search_pattern = File.join(path, dir_glob_pattern)
|
69
|
+
Dir.glob(search_pattern).each do |filename|
|
70
|
+
yield(filename)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def open_files(filename:, dir_glob_pattern:)
|
75
|
+
if stdin?(filename)
|
76
|
+
yield $stdin
|
77
|
+
elsif directory?(filename)
|
78
|
+
search_pattern = File.join(filename, dir_glob_pattern)
|
79
|
+
filenames = Dir.glob(search_pattern)
|
80
|
+
filenames.each do |filename|
|
81
|
+
file = File.open(filename)
|
82
|
+
yield(file)
|
83
|
+
end
|
84
|
+
elsif file?(filename)
|
85
|
+
yield File.open(filename)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def stdin?(filename)
|
90
|
+
filename == $stdin
|
91
|
+
end
|
92
|
+
|
93
|
+
def directory?(filename)
|
94
|
+
Pathname.new(filename).directory?
|
95
|
+
end
|
96
|
+
|
97
|
+
def file?(filename)
|
98
|
+
Pathname.new(filename).file?
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|