chronicle-etl 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +3 -0
- data/README.md +20 -13
- data/chronicle-etl.gemspec +11 -8
- data/lib/chronicle/etl/cli/connectors.rb +19 -7
- data/lib/chronicle/etl/cli/jobs.rb +24 -18
- data/lib/chronicle/etl/cli/main.rb +10 -2
- data/lib/chronicle/etl/config.rb +1 -1
- data/lib/chronicle/etl/exceptions.rb +12 -1
- data/lib/chronicle/etl/extraction.rb +12 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -36
- data/lib/chronicle/etl/extractors/extractor.rb +9 -1
- data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
- data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
- data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
- data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
- data/lib/chronicle/etl/job.rb +30 -29
- data/lib/chronicle/etl/job_definition.rb +45 -7
- data/lib/chronicle/etl/job_log.rb +10 -0
- data/lib/chronicle/etl/job_logger.rb +23 -20
- data/lib/chronicle/etl/loaders/csv_loader.rb +4 -0
- data/lib/chronicle/etl/loaders/loader.rb +1 -1
- data/lib/chronicle/etl/loaders/rest_loader.rb +5 -1
- data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
- data/lib/chronicle/etl/loaders/table_loader.rb +57 -7
- data/lib/chronicle/etl/logger.rb +48 -0
- data/lib/chronicle/etl/models/attachment.rb +14 -0
- data/lib/chronicle/etl/models/base.rb +23 -7
- data/lib/chronicle/etl/models/entity.rb +9 -3
- data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
- data/lib/chronicle/etl/registry/registry.rb +52 -0
- data/lib/chronicle/etl/registry/self_registering.rb +25 -0
- data/lib/chronicle/etl/runner.rb +57 -7
- data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
- data/lib/chronicle/etl/serializers/serializer.rb +27 -0
- data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
- data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
- data/lib/chronicle/etl/transformers/transformer.rb +39 -9
- data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
- data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
- data/lib/chronicle/etl/version.rb +1 -1
- data/lib/chronicle/etl.rb +7 -2
- metadata +96 -44
- data/Gemfile.lock +0 -91
- data/lib/chronicle/etl/catalog.rb +0 -108
- data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bfd4e081bfeda7d097a5a5eee6ccf28baf0a9b3878968d74c9a604013d0b55a6
|
4
|
+
data.tar.gz: 003ebd2ffe2b1220c7f43a4875043cb5500aa1b7a6327b84c9be10f04e0e8d40
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d786fb4acf8d0b03e65262209def310ca25b92646847f6e96791e6491e9b159ab11db7fa35f785f6782fbc0b9e3daebb625e2353fce2422f7fc79aed7a4d6bc
|
7
|
+
data.tar.gz: 87771745b9df2160966299f1d73eb568b46080ed217a1952e1dd938fff7758432f2cc6f449036d24951660b405e18083f1adac26d00243a0fab9003a96eb569d
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/README.md
CHANGED
@@ -31,6 +31,9 @@ Connectors are available to read, process, and load data from different formats
|
|
31
31
|
```bash
|
32
32
|
# List all available connectors
|
33
33
|
$ chronicle-etl connectors:list
|
34
|
+
|
35
|
+
# Install a connector
|
36
|
+
$ chronicle-etl connectors:install imessage
|
34
37
|
```
|
35
38
|
|
36
39
|
Built in connectors:
|
@@ -44,16 +47,18 @@ Built in connectors:
|
|
44
47
|
- `null` - (default) Don't do anything
|
45
48
|
|
46
49
|
### Loaders
|
47
|
-
- `stdout` - (default) output
|
50
|
+
- `stdout` - (default) output records to stdout serialized as JSON
|
48
51
|
- `csv` - Load records to a csv file
|
52
|
+
- `rest` - Serialize records with [JSONAPI](https://jsonapi.org/) and send to a REST API
|
49
53
|
- `table` - Output an ascii table of records. Useful for debugging.
|
50
54
|
|
51
55
|
### Provider-specific importers
|
52
56
|
|
53
57
|
In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
|
54
58
|
|
55
|
-
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
|
56
|
-
- [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history
|
59
|
+
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
|
60
|
+
- [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`
|
61
|
+
- [imessage](https://github.com/chronicle-app/chronicle-imessage). Extract iMessage messages from a local macOS installation
|
57
62
|
|
58
63
|
To install any of these, run `gem install chronicle-PROVIDER`.
|
59
64
|
|
@@ -61,7 +66,7 @@ If you don't want to use the available rubygem importers, `chronicle-etl` can us
|
|
61
66
|
|
62
67
|
I'll be open-sourcing more importers. Please [contact me](mailto:andrew@hyfen.net) to chat about what will be available!
|
63
68
|
|
64
|
-
|
69
|
+
## Full commands
|
65
70
|
|
66
71
|
```
|
67
72
|
$ chronicle-etl help
|
@@ -75,26 +80,28 @@ ALL COMMANDS
|
|
75
80
|
jobs:create # Create a job
|
76
81
|
jobs:list # List all available jobs
|
77
82
|
jobs:run # Start a job
|
78
|
-
jobs:show # Show a job
|
83
|
+
jobs:show # Show details about a job
|
79
84
|
```
|
80
85
|
|
81
|
-
###
|
86
|
+
### Running a job
|
82
87
|
|
83
88
|
```
|
84
89
|
Usage:
|
85
90
|
chronicle-etl jobs:run
|
86
91
|
|
87
92
|
Options:
|
88
|
-
|
89
|
-
# Default:
|
93
|
+
[--log-level=LOG_LEVEL] # Log level (debug, info, warn, error, fatal)
|
94
|
+
# Default: info
|
95
|
+
-v, [--verbose], [--no-verbose] # Set log level to verbose
|
96
|
+
[--dry-run], [--no-dry-run] # Only run the extraction and transform steps, not the loading
|
97
|
+
-e, [--extractor=extractor-name] # Extractor class. Default: stdin
|
90
98
|
[--extractor-opts=key:value] # Extractor options
|
91
|
-
-t, [--transformer=transformer-name] # Transformer class
|
92
|
-
# Default: null
|
99
|
+
-t, [--transformer=transformer-name] # Transformer class. Default: null
|
93
100
|
[--transformer-opts=key:value] # Transformer options
|
94
|
-
-l, [--loader=loader-name] # Loader class
|
95
|
-
# Default: stdout
|
101
|
+
-l, [--loader=loader-name] # Loader class. Default: stdout
|
96
102
|
[--loader-opts=key:value] # Loader options
|
97
|
-
-j, [--
|
103
|
+
-j, [--name=NAME] # Job configuration name
|
104
|
+
|
98
105
|
|
99
106
|
Runs an ETL job
|
100
107
|
```
|
data/chronicle-etl.gemspec
CHANGED
@@ -36,18 +36,21 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
37
37
|
spec.require_paths = ["lib"]
|
38
38
|
|
39
|
-
spec.add_dependency "
|
39
|
+
spec.add_dependency "activesupport"
|
40
|
+
spec.add_dependency "chronic_duration", "~> 0.10.6"
|
40
41
|
spec.add_dependency "colorize", "~> 0.8.1"
|
41
|
-
spec.add_dependency "
|
42
|
+
spec.add_dependency "marcel", "~> 1.0.2"
|
43
|
+
spec.add_dependency "mini_exiftool", "~> 2.10"
|
44
|
+
spec.add_dependency "nokogiri", "~> 1.13"
|
45
|
+
spec.add_dependency "runcom", "~> 6.2"
|
46
|
+
spec.add_dependency "sequel", "~> 5.35"
|
47
|
+
spec.add_dependency "sqlite3", "~> 1.4"
|
48
|
+
spec.add_dependency "thor", "~> 0.20"
|
42
49
|
spec.add_dependency "tty-progressbar", "~> 0.17"
|
43
|
-
spec.add_dependency
|
44
|
-
spec.add_dependency 'deep_merge', '~> 1.2'
|
50
|
+
spec.add_dependency "tty-table", "~> 0.11"
|
45
51
|
|
46
52
|
spec.add_development_dependency "bundler", "~> 2.1"
|
53
|
+
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
47
54
|
spec.add_development_dependency "rake", "~> 13.0"
|
48
55
|
spec.add_development_dependency "rspec", "~> 3.9"
|
49
|
-
spec.add_development_dependency "pry-byebug", "~> 3.9"
|
50
|
-
spec.add_development_dependency 'runcom', '~> 6.2'
|
51
|
-
spec.add_development_dependency 'redcarpet', '~> 3.5'
|
52
|
-
spec.add_development_dependency 'sqlite3', '~> 1.4'
|
53
56
|
end
|
@@ -7,23 +7,35 @@ module Chronicle
|
|
7
7
|
namespace :connectors
|
8
8
|
|
9
9
|
desc "install NAME", "Installs connector NAME"
|
10
|
-
def install
|
11
|
-
|
10
|
+
def install(name)
|
11
|
+
Chronicle::ETL::Registry.install_connector(name)
|
12
12
|
end
|
13
13
|
|
14
14
|
desc "list", "Lists available connectors"
|
15
15
|
# Display all available connectors that chronicle-etl has access to
|
16
16
|
def list
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
Chronicle::ETL::Registry.load_all!
|
18
|
+
|
19
|
+
connector_info = Chronicle::ETL::Registry.connectors.map do |connector_registration|
|
20
|
+
{
|
21
|
+
identifier: connector_registration.identifier,
|
22
|
+
phase: connector_registration.phase,
|
23
|
+
description: connector_registration.descriptive_phrase,
|
24
|
+
provider: connector_registration.provider,
|
25
|
+
core: connector_registration.built_in? ? '✓' : '',
|
26
|
+
class: connector_registration.klass_name
|
27
|
+
}
|
28
|
+
end
|
29
|
+
|
30
|
+
connector_info = connector_info.sort_by do |a|
|
31
|
+
[a[:core].to_s, a[:provider], a[:phase], a[:identifier]]
|
20
32
|
end
|
21
33
|
|
22
|
-
headers =
|
34
|
+
headers = connector_info.first.keys.map do |key|
|
23
35
|
key.to_s.upcase.bold
|
24
36
|
end
|
25
37
|
|
26
|
-
table = TTY::Table.new(headers,
|
38
|
+
table = TTY::Table.new(headers, connector_info.map(&:values))
|
27
39
|
puts table.render(indent: 0, padding: [0, 2])
|
28
40
|
end
|
29
41
|
end
|
@@ -7,16 +7,19 @@ module Chronicle
|
|
7
7
|
default_task "start"
|
8
8
|
namespace :jobs
|
9
9
|
|
10
|
-
class_option :extractor, aliases: '-e', desc:
|
10
|
+
class_option :extractor, aliases: '-e', desc: "Extractor class. Default: stdin", banner: 'extractor-name'
|
11
11
|
class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
|
12
|
-
class_option :transformer, aliases: '-t', desc: 'Transformer class
|
12
|
+
class_option :transformer, aliases: '-t', desc: 'Transformer class. Default: null', banner: 'transformer-name'
|
13
13
|
class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
|
14
|
-
class_option :loader, aliases: '-l', desc: 'Loader class
|
14
|
+
class_option :loader, aliases: '-l', desc: 'Loader class. Default: stdout', banner: 'loader-name'
|
15
15
|
class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
|
16
16
|
class_option :name, aliases: '-j', desc: 'Job configuration name'
|
17
17
|
|
18
18
|
map run: :start # Thor doesn't like `run` as a command name
|
19
19
|
desc "run", "Start a job"
|
20
|
+
option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
|
21
|
+
option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
|
22
|
+
option :dry_run, desc: 'Only run the extraction and transform steps, not the loading', type: :boolean
|
20
23
|
long_desc <<-LONG_DESC
|
21
24
|
This will run an ETL job. Each job needs three parts:
|
22
25
|
|
@@ -24,23 +27,17 @@ module Chronicle
|
|
24
27
|
|
25
28
|
2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.
|
26
29
|
|
27
|
-
3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.
|
30
|
+
3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout. With the --dry-run option, this step won't be run.
|
28
31
|
|
29
32
|
If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
|
30
33
|
LONG_DESC
|
31
34
|
# Run an ETL job
|
32
35
|
def start
|
36
|
+
setup_log_level
|
33
37
|
job_definition = build_job_definition(options)
|
34
38
|
job = Chronicle::ETL::Job.new(job_definition)
|
35
39
|
runner = Chronicle::ETL::Runner.new(job)
|
36
40
|
runner.run!
|
37
|
-
rescue Chronicle::ETL::ProviderNotAvailableError => e
|
38
|
-
warn(e.message.red)
|
39
|
-
warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{e.provider}`")
|
40
|
-
exit(false)
|
41
|
-
rescue Chronicle::ETL::ConnectorNotAvailableError => e
|
42
|
-
warn(e.message.red)
|
43
|
-
exit(false)
|
44
41
|
end
|
45
42
|
|
46
43
|
desc "create", "Create a job"
|
@@ -48,14 +45,13 @@ LONG_DESC
|
|
48
45
|
def create
|
49
46
|
job_definition = build_job_definition(options)
|
50
47
|
path = File.join('chronicle', 'etl', 'jobs', options[:name])
|
51
|
-
Chronicle::ETL::Config.write(path, job_definition)
|
48
|
+
Chronicle::ETL::Config.write(path, job_definition.definition)
|
52
49
|
end
|
53
50
|
|
54
51
|
desc "show", "Show details about a job"
|
55
52
|
# Show an ETL job
|
56
53
|
def show
|
57
|
-
|
58
|
-
pp job_config
|
54
|
+
puts Chronicle::ETL::Job.new(build_job_definition(options))
|
59
55
|
end
|
60
56
|
|
61
57
|
desc "list", "List all available jobs"
|
@@ -81,11 +77,20 @@ LONG_DESC
|
|
81
77
|
|
82
78
|
private
|
83
79
|
|
80
|
+
def setup_log_level
|
81
|
+
if options[:verbose]
|
82
|
+
Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::DEBUG
|
83
|
+
elsif options[:log_level]
|
84
|
+
level = Chronicle::ETL::Logger.const_get(options[:log_level].upcase)
|
85
|
+
Chronicle::ETL::Logger.log_level = level
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
84
89
|
# Create job definition by reading config file and then overwriting with flag options
|
85
90
|
def build_job_definition(options)
|
86
91
|
definition = Chronicle::ETL::JobDefinition.new
|
87
|
-
definition.add_config(process_flag_options(options))
|
88
92
|
definition.add_config(load_job_config(options[:name]))
|
93
|
+
definition.add_config(process_flag_options(options))
|
89
94
|
definition
|
90
95
|
end
|
91
96
|
|
@@ -96,18 +101,19 @@ LONG_DESC
|
|
96
101
|
# Takes flag options and turns them into a runner config
|
97
102
|
def process_flag_options options
|
98
103
|
{
|
104
|
+
dry_run: options[:dry_run],
|
99
105
|
extractor: {
|
100
106
|
name: options[:extractor],
|
101
107
|
options: options[:'extractor-opts']
|
102
|
-
},
|
108
|
+
}.compact,
|
103
109
|
transformer: {
|
104
110
|
name: options[:transformer],
|
105
111
|
options: options[:'transformer-opts']
|
106
|
-
},
|
112
|
+
}.compact,
|
107
113
|
loader: {
|
108
114
|
name: options[:loader],
|
109
115
|
options: options[:'loader-opts']
|
110
|
-
}
|
116
|
+
}.compact
|
111
117
|
}
|
112
118
|
end
|
113
119
|
end
|
@@ -22,6 +22,11 @@ module Chronicle
|
|
22
22
|
|
23
23
|
# Entrypoint for the CLI
|
24
24
|
def self.start(given_args = ARGV, config = {})
|
25
|
+
if given_args[0] == "--version"
|
26
|
+
puts "#{Chronicle::ETL::VERSION}"
|
27
|
+
exit
|
28
|
+
end
|
29
|
+
|
25
30
|
if given_args.none?
|
26
31
|
abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
|
27
32
|
end
|
@@ -52,10 +57,10 @@ module Chronicle
|
|
52
57
|
shell.say " $ chronicle-etl connectors:list"
|
53
58
|
shell.say
|
54
59
|
shell.say " Run a simple job:".italic.light_black
|
55
|
-
shell.say " $ chronicle-etl jobs:
|
60
|
+
shell.say " $ chronicle-etl jobs:run --extractor stdin --transformer null --loader stdout"
|
56
61
|
shell.say
|
57
62
|
shell.say " Show full job options:".italic.light_black
|
58
|
-
shell.say " $ chronicle-etl jobs help
|
63
|
+
shell.say " $ chronicle-etl jobs help run"
|
59
64
|
|
60
65
|
list = []
|
61
66
|
|
@@ -72,6 +77,9 @@ module Chronicle
|
|
72
77
|
shell.say "VERSION".bold
|
73
78
|
shell.say " #{Chronicle::ETL::VERSION}"
|
74
79
|
shell.say
|
80
|
+
shell.say " Display current version:".italic.light_black
|
81
|
+
shell.say " $ chronicle-etl --version"
|
82
|
+
shell.say
|
75
83
|
shell.say "FULL DOCUMENTATION".bold
|
76
84
|
shell.say " https://github.com/chronicle-app/chronicle-etl".blue
|
77
85
|
shell.say
|
data/lib/chronicle/etl/config.rb
CHANGED
@@ -30,7 +30,7 @@ module Chronicle
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
# Returns all available credentials available in ~/.config/
|
33
|
+
# Returns all credentials available in ~/.config/chronicle/etl/credentials/*.yml
|
34
34
|
def available_credentials
|
35
35
|
job_directory = Runcom::Config.new('chronicle/etl/credentials').current
|
36
36
|
Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
|
@@ -2,7 +2,7 @@ module Chronicle
|
|
2
2
|
module ETL
|
3
3
|
class Error < StandardError; end;
|
4
4
|
|
5
|
-
class
|
5
|
+
class RunnerTypeError < Error; end
|
6
6
|
|
7
7
|
class ConnectorNotAvailableError < Error
|
8
8
|
def initialize(message, provider: nil, name: nil)
|
@@ -15,5 +15,16 @@ module Chronicle
|
|
15
15
|
|
16
16
|
class ProviderNotAvailableError < ConnectorNotAvailableError; end
|
17
17
|
class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
|
18
|
+
|
19
|
+
class TransformationError < Error
|
20
|
+
attr_reader :transformation
|
21
|
+
|
22
|
+
def initialize(message=nil, transformation:)
|
23
|
+
super(message)
|
24
|
+
@transformation = transformation
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
class UntransformableRecordError < TransformationError; end
|
18
29
|
end
|
19
30
|
end
|
@@ -1,41 +1,48 @@
|
|
1
1
|
require 'csv'
|
2
|
-
class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
3
|
-
DEFAULT_OPTIONS = {
|
4
|
-
headers: true,
|
5
|
-
filename: $stdin
|
6
|
-
}.freeze
|
7
|
-
|
8
|
-
def initialize(options = {})
|
9
|
-
super(DEFAULT_OPTIONS.merge(options))
|
10
|
-
end
|
11
2
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
class CsvExtractor < Chronicle::ETL::Extractor
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
7
|
+
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'input as CSV'
|
10
|
+
end
|
11
|
+
|
12
|
+
DEFAULT_OPTIONS = {
|
13
|
+
headers: true,
|
14
|
+
filename: $stdin
|
15
|
+
}.freeze
|
16
|
+
|
17
|
+
def initialize(options = {})
|
18
|
+
super(DEFAULT_OPTIONS.merge(options))
|
19
|
+
end
|
20
|
+
|
21
|
+
def extract
|
22
|
+
csv = initialize_csv
|
23
|
+
csv.each do |row|
|
24
|
+
yield Chronicle::ETL::Extraction.new(data: row.to_h)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def results_count
|
29
|
+
CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def initialize_csv
|
35
|
+
headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
|
36
|
+
|
37
|
+
csv_options = {
|
38
|
+
headers: headers,
|
39
|
+
converters: :all
|
40
|
+
}
|
41
|
+
|
42
|
+
open_from_filesystem(filename: @options[:filename]) do |file|
|
43
|
+
return CSV.new(file, **csv_options)
|
44
|
+
end
|
45
|
+
end
|
17
46
|
end
|
18
47
|
end
|
19
|
-
|
20
|
-
def results_count
|
21
|
-
CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
|
22
|
-
end
|
23
|
-
|
24
|
-
private
|
25
|
-
|
26
|
-
def initialize_csv
|
27
|
-
headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
|
28
|
-
|
29
|
-
csv_options = {
|
30
|
-
headers: headers,
|
31
|
-
converters: :all
|
32
|
-
}
|
33
|
-
|
34
|
-
stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
|
35
|
-
CSV.new(stream, **csv_options)
|
36
|
-
end
|
37
|
-
|
38
|
-
def read_from_file?
|
39
|
-
@options[:filename] != $stdin
|
40
|
-
end
|
41
48
|
end
|
@@ -4,7 +4,7 @@ module Chronicle
|
|
4
4
|
module ETL
|
5
5
|
# Abstract class representing an Extractor for an ETL job
|
6
6
|
class Extractor
|
7
|
-
extend Chronicle::ETL::
|
7
|
+
extend Chronicle::ETL::Registry::SelfRegistering
|
8
8
|
|
9
9
|
# Construct a new instance of this extractor. Options are passed in from a Runner
|
10
10
|
# == Parameters:
|
@@ -12,6 +12,7 @@ module Chronicle
|
|
12
12
|
# Options for configuring this Extractor
|
13
13
|
def initialize(options = {})
|
14
14
|
@options = options.transform_keys!(&:to_sym)
|
15
|
+
sanitize_options
|
15
16
|
handle_continuation
|
16
17
|
end
|
17
18
|
|
@@ -26,6 +27,11 @@ module Chronicle
|
|
26
27
|
|
27
28
|
private
|
28
29
|
|
30
|
+
def sanitize_options
|
31
|
+
@options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
|
32
|
+
@options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
|
33
|
+
end
|
34
|
+
|
29
35
|
def handle_continuation
|
30
36
|
return unless @options[:continuation]
|
31
37
|
|
@@ -36,6 +42,8 @@ module Chronicle
|
|
36
42
|
end
|
37
43
|
end
|
38
44
|
|
45
|
+
require_relative 'helpers/filesystem_reader'
|
39
46
|
require_relative 'csv_extractor'
|
40
47
|
require_relative 'file_extractor'
|
48
|
+
require_relative 'json_extractor'
|
41
49
|
require_relative 'stdin_extractor'
|
@@ -3,49 +3,31 @@ require 'pathname'
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
5
5
|
class FileExtractor < Chronicle::ETL::Extractor
|
6
|
-
|
7
|
-
if file?
|
8
|
-
extract_file do |data, metadata|
|
9
|
-
yield(data, metadata)
|
10
|
-
end
|
11
|
-
elsif directory?
|
12
|
-
extract_from_directory do |data, metadata|
|
13
|
-
yield(data, metadata)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end
|
6
|
+
include Extractors::Helpers::FilesystemReader
|
17
7
|
|
18
|
-
|
19
|
-
|
20
|
-
return 1
|
21
|
-
else
|
22
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
23
|
-
Dir.glob(search_pattern).count
|
24
|
-
end
|
8
|
+
register_connector do |r|
|
9
|
+
r.description = 'file or directory of files'
|
25
10
|
end
|
26
11
|
|
27
|
-
|
28
|
-
|
29
|
-
def extract_from_directory
|
30
|
-
search_pattern = File.join(@options[:filename], '**/*.eml')
|
31
|
-
filenames = Dir.glob(search_pattern)
|
12
|
+
def extract
|
32
13
|
filenames.each do |filename|
|
33
|
-
|
34
|
-
yield(file.read, {filename: file})
|
14
|
+
yield Chronicle::ETL::Extraction.new(data: filename)
|
35
15
|
end
|
36
16
|
end
|
37
17
|
|
38
|
-
def
|
39
|
-
|
40
|
-
yield(file.read, {filename: @options[:filename]})
|
18
|
+
def results_count
|
19
|
+
filenames.count
|
41
20
|
end
|
42
21
|
|
43
|
-
|
44
|
-
Pathname.new(@options[:filename]).directory?
|
45
|
-
end
|
22
|
+
private
|
46
23
|
|
47
|
-
def
|
48
|
-
|
24
|
+
def filenames
|
25
|
+
@filenames ||= filenames_in_directory(
|
26
|
+
path: @options[:filename],
|
27
|
+
dir_glob_pattern: @options[:dir_glob_pattern],
|
28
|
+
load_since: @options[:load_since],
|
29
|
+
load_until: @options[:load_until]
|
30
|
+
)
|
49
31
|
end
|
50
32
|
end
|
51
33
|
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require 'pathname'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Extractors
|
6
|
+
module Helpers
|
7
|
+
module FilesystemReader
|
8
|
+
|
9
|
+
def filenames_in_directory(...)
|
10
|
+
filenames = gather_files(...)
|
11
|
+
if block_given?
|
12
|
+
filenames.each do |filename|
|
13
|
+
yield filename
|
14
|
+
end
|
15
|
+
else
|
16
|
+
filenames
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
|
21
|
+
open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
|
22
|
+
if yield_each_line
|
23
|
+
file.each_line do |line|
|
24
|
+
yield line
|
25
|
+
end
|
26
|
+
else
|
27
|
+
yield file.read
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
|
33
|
+
open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
|
34
|
+
yield file
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def results_count
|
39
|
+
raise NotImplementedError
|
40
|
+
# if file?
|
41
|
+
# return 1
|
42
|
+
# else
|
43
|
+
# search_pattern = File.join(@options[:filename], '**/*')
|
44
|
+
# Dir.glob(search_pattern).count
|
45
|
+
# end
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
|
50
|
+
def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil, smaller_than: nil, larger_than: nil, sort: :mtime)
|
51
|
+
search_pattern = File.join(path, '**', dir_glob_pattern)
|
52
|
+
files = Dir.glob(search_pattern)
|
53
|
+
|
54
|
+
files = files.keep_if {|f| (File.mtime(f) > load_since)} if load_since
|
55
|
+
files = files.keep_if {|f| (File.mtime(f) < load_until)} if load_until
|
56
|
+
|
57
|
+
# pass in file sizes in bytes
|
58
|
+
files = files.keep_if {|f| (File.size(f) < smaller_than)} if smaller_than
|
59
|
+
files = files.keep_if {|f| (File.size(f) > larger_than)} if larger_than
|
60
|
+
|
61
|
+
# TODO: incorporate sort argument
|
62
|
+
files.sort_by{ |f| File.mtime(f) }
|
63
|
+
end
|
64
|
+
|
65
|
+
def select_files_in_directory(path:, dir_glob_pattern: '**/*')
|
66
|
+
raise IOError.new("#{path} is not a directory.") unless directory?(path)
|
67
|
+
|
68
|
+
search_pattern = File.join(path, dir_glob_pattern)
|
69
|
+
Dir.glob(search_pattern).each do |filename|
|
70
|
+
yield(filename)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def open_files(filename:, dir_glob_pattern:)
|
75
|
+
if stdin?(filename)
|
76
|
+
yield $stdin
|
77
|
+
elsif directory?(filename)
|
78
|
+
search_pattern = File.join(filename, dir_glob_pattern)
|
79
|
+
filenames = Dir.glob(search_pattern)
|
80
|
+
filenames.each do |filename|
|
81
|
+
file = File.open(filename)
|
82
|
+
yield(file)
|
83
|
+
end
|
84
|
+
elsif file?(filename)
|
85
|
+
yield File.open(filename)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def stdin?(filename)
|
90
|
+
filename == $stdin
|
91
|
+
end
|
92
|
+
|
93
|
+
def directory?(filename)
|
94
|
+
Pathname.new(filename).directory?
|
95
|
+
end
|
96
|
+
|
97
|
+
def file?(filename)
|
98
|
+
Pathname.new(filename).file?
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|