chronicle-etl 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +20 -13
  5. data/chronicle-etl.gemspec +11 -8
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +24 -18
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +1 -1
  10. data/lib/chronicle/etl/exceptions.rb +12 -1
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -36
  13. data/lib/chronicle/etl/extractors/extractor.rb +9 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +30 -29
  19. data/lib/chronicle/etl/job_definition.rb +45 -7
  20. data/lib/chronicle/etl/job_log.rb +10 -0
  21. data/lib/chronicle/etl/job_logger.rb +23 -20
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +4 -0
  23. data/lib/chronicle/etl/loaders/loader.rb +1 -1
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +5 -1
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
  26. data/lib/chronicle/etl/loaders/table_loader.rb +57 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/attachment.rb +14 -0
  29. data/lib/chronicle/etl/models/base.rb +23 -7
  30. data/lib/chronicle/etl/models/entity.rb +9 -3
  31. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  32. data/lib/chronicle/etl/registry/registry.rb +52 -0
  33. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  34. data/lib/chronicle/etl/runner.rb +57 -7
  35. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  36. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  37. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  38. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
  39. data/lib/chronicle/etl/transformers/transformer.rb +39 -9
  40. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  41. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  42. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  43. data/lib/chronicle/etl/version.rb +1 -1
  44. data/lib/chronicle/etl.rb +7 -2
  45. metadata +96 -44
  46. data/Gemfile.lock +0 -91
  47. data/lib/chronicle/etl/catalog.rb +0 -108
  48. data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a02a2377d0e8d4135f3b931bc73641eac28058d736d9c1dba0a97107c1d4c0e
4
- data.tar.gz: 810d5bff80e852fa08ef9824ed6b313aa309bb69e84228bc1fbb7595069e043b
3
+ metadata.gz: bfd4e081bfeda7d097a5a5eee6ccf28baf0a9b3878968d74c9a604013d0b55a6
4
+ data.tar.gz: 003ebd2ffe2b1220c7f43a4875043cb5500aa1b7a6327b84c9be10f04e0e8d40
5
5
  SHA512:
6
- metadata.gz: 0d5fbea3c63349bb3f566e6137755f6cc8a4060d0e401abf5a0e7d8b44a4c4278089c10ffb8bb9cf2d783a238449140e5e54d90f3ad158aa362c6335eedca5aa
7
- data.tar.gz: bf6fa83b1d5e55760e62d3cc090bf09bb69a7c761ae4a9358fb4d82192c7efc7500b6db361f39adac3581982862654aa4603a78dfbb3aed53b51d01137ffd736
6
+ metadata.gz: 3d786fb4acf8d0b03e65262209def310ca25b92646847f6e96791e6491e9b159ab11db7fa35f785f6782fbc0b9e3daebb625e2353fce2422f7fc79aed7a4d6bc
7
+ data.tar.gz: 87771745b9df2160966299f1d73eb568b46080ed217a1952e1dd938fff7758432f2cc6f449036d24951660b405e18083f1adac26d00243a0fab9003a96eb569d
data/.gitignore CHANGED
@@ -7,6 +7,9 @@
7
7
  /spec/reports/
8
8
  /tmp/
9
9
 
10
+ # https://yehudakatz.com/2010/12/16/clarifying-the-roles-of-the-gemspec-and-gemfile/
11
+ Gemfile.lock
12
+
10
13
  # rspec failure tracking
11
14
  .rspec_status
12
15
  .DS_Store
data/.rubocop.yml CHANGED
@@ -5,4 +5,7 @@ Style/StringLiterals:
5
5
  Enabled: false
6
6
 
7
7
  Style/MethodCallWithArgsParentheses:
8
+ Enabled: false
9
+
10
+ Lint/ConstantResolution:
8
11
  Enabled: false
data/README.md CHANGED
@@ -31,6 +31,9 @@ Connectors are available to read, process, and load data from different formats
31
31
  ```bash
32
32
  # List all available connectors
33
33
  $ chronicle-etl connectors:list
34
+
35
+ # Install a connector
36
+ $ chronicle-etl connectors:install imessage
34
37
  ```
35
38
 
36
39
  Built in connectors:
@@ -44,16 +47,18 @@ Built in connectors:
44
47
  - `null` - (default) Don't do anything
45
48
 
46
49
  ### Loaders
47
- - `stdout` - (default) output transformed records to stdount
50
+ - `stdout` - (default) output records to stdout serialized as JSON
48
51
  - `csv` - Load records to a csv file
52
+ - `rest` - Serialize records with [JSONAPI](https://jsonapi.org/) and send to a REST API
49
53
  - `table` - Output an ascii table of records. Useful for debugging.
50
54
 
51
55
  ### Provider-specific importers
52
56
 
53
57
  In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
54
58
 
55
- - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files. Transformers for chronicle schema
56
- - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`. Transform it for chronicle schema
59
+ - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
60
+ - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`
61
+ - [imessage](https://github.com/chronicle-app/chronicle-imessage). Extract iMessage messages from a local macOS installation
57
62
 
58
63
  To install any of these, run `gem install chronicle-PROVIDER`.
59
64
 
@@ -61,7 +66,7 @@ If you don't want to use the available rubygem importers, `chronicle-etl` can us
61
66
 
62
67
  I'll be open-sourcing more importers. Please [contact me](mailto:andrew@hyfen.net) to chat about what will be available!
63
68
 
64
- ### Full commands
69
+ ## Full commands
65
70
 
66
71
  ```
67
72
  $ chronicle-etl help
@@ -75,26 +80,28 @@ ALL COMMANDS
75
80
  jobs:create # Create a job
76
81
  jobs:list # List all available jobs
77
82
  jobs:run # Start a job
78
- jobs:show # Show a job
83
+ jobs:show # Show details about a job
79
84
  ```
80
85
 
81
- ### Job options
86
+ ### Running a job
82
87
 
83
88
  ```
84
89
  Usage:
85
90
  chronicle-etl jobs:run
86
91
 
87
92
  Options:
88
- -e, [--extractor=extractor-name] # Extractor class (available: stdin, csv, file)
89
- # Default: stdin
93
+ [--log-level=LOG_LEVEL] # Log level (debug, info, warn, error, fatal)
94
+ # Default: info
95
+ -v, [--verbose], [--no-verbose] # Set log level to verbose
96
+ [--dry-run], [--no-dry-run] # Only run the extraction and transform steps, not the loading
97
+ -e, [--extractor=extractor-name] # Extractor class. Default: stdin
90
98
  [--extractor-opts=key:value] # Extractor options
91
- -t, [--transformer=transformer-name] # Transformer class (available: null)
92
- # Default: null
99
+ -t, [--transformer=transformer-name] # Transformer class. Default: null
93
100
  [--transformer-opts=key:value] # Transformer options
94
- -l, [--loader=loader-name] # Loader class (available: stdout, csv, table)
95
- # Default: stdout
101
+ -l, [--loader=loader-name] # Loader class. Default: stdout
96
102
  [--loader-opts=key:value] # Loader options
97
- -j, [--job=JOB] # Job configuration file
103
+ -j, [--name=NAME] # Job configuration name
104
+
98
105
 
99
106
  Runs an ETL job
100
107
  ```
@@ -36,18 +36,21 @@ Gem::Specification.new do |spec|
36
36
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
37
37
  spec.require_paths = ["lib"]
38
38
 
39
- spec.add_dependency "thor", "~> 0.20"
39
+ spec.add_dependency "activesupport"
40
+ spec.add_dependency "chronic_duration", "~> 0.10.6"
40
41
  spec.add_dependency "colorize", "~> 0.8.1"
41
- spec.add_dependency "tty-table", "~> 0.11"
42
+ spec.add_dependency "marcel", "~> 1.0.2"
43
+ spec.add_dependency "mini_exiftool", "~> 2.10"
44
+ spec.add_dependency "nokogiri", "~> 1.13"
45
+ spec.add_dependency "runcom", "~> 6.2"
46
+ spec.add_dependency "sequel", "~> 5.35"
47
+ spec.add_dependency "sqlite3", "~> 1.4"
48
+ spec.add_dependency "thor", "~> 0.20"
42
49
  spec.add_dependency "tty-progressbar", "~> 0.17"
43
- spec.add_dependency 'sequel', '~> 5.35'
44
- spec.add_dependency 'deep_merge', '~> 1.2'
50
+ spec.add_dependency "tty-table", "~> 0.11"
45
51
 
46
52
  spec.add_development_dependency "bundler", "~> 2.1"
53
+ spec.add_development_dependency "pry-byebug", "~> 3.9"
47
54
  spec.add_development_dependency "rake", "~> 13.0"
48
55
  spec.add_development_dependency "rspec", "~> 3.9"
49
- spec.add_development_dependency "pry-byebug", "~> 3.9"
50
- spec.add_development_dependency 'runcom', '~> 6.2'
51
- spec.add_development_dependency 'redcarpet', '~> 3.5'
52
- spec.add_development_dependency 'sqlite3', '~> 1.4'
53
56
  end
@@ -7,23 +7,35 @@ module Chronicle
7
7
  namespace :connectors
8
8
 
9
9
  desc "install NAME", "Installs connector NAME"
10
- def install
11
- puts "Installing"
10
+ def install(name)
11
+ Chronicle::ETL::Registry.install_connector(name)
12
12
  end
13
13
 
14
14
  desc "list", "Lists available connectors"
15
15
  # Display all available connectors that chronicle-etl has access to
16
16
  def list
17
- klasses = Chronicle::ETL::Catalog.available_classes
18
- klasses = klasses.sort_by do |a|
19
- [a[:built_in].to_s, a[:provider], a[:phase]]
17
+ Chronicle::ETL::Registry.load_all!
18
+
19
+ connector_info = Chronicle::ETL::Registry.connectors.map do |connector_registration|
20
+ {
21
+ identifier: connector_registration.identifier,
22
+ phase: connector_registration.phase,
23
+ description: connector_registration.descriptive_phrase,
24
+ provider: connector_registration.provider,
25
+ core: connector_registration.built_in? ? '✓' : '',
26
+ class: connector_registration.klass_name
27
+ }
28
+ end
29
+
30
+ connector_info = connector_info.sort_by do |a|
31
+ [a[:core].to_s, a[:provider], a[:phase], a[:identifier]]
20
32
  end
21
33
 
22
- headers = klasses.first.keys.map do |key|
34
+ headers = connector_info.first.keys.map do |key|
23
35
  key.to_s.upcase.bold
24
36
  end
25
37
 
26
- table = TTY::Table.new(headers, klasses.map(&:values))
38
+ table = TTY::Table.new(headers, connector_info.map(&:values))
27
39
  puts table.render(indent: 0, padding: [0, 2])
28
40
  end
29
41
  end
@@ -7,16 +7,19 @@ module Chronicle
7
7
  default_task "start"
8
8
  namespace :jobs
9
9
 
10
- class_option :extractor, aliases: '-e', desc: 'Extractor class (available: stdin, csv, file)', default: 'stdin', banner: 'extractor-name'
10
+ class_option :extractor, aliases: '-e', desc: "Extractor class. Default: stdin", banner: 'extractor-name'
11
11
  class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
12
- class_option :transformer, aliases: '-t', desc: 'Transformer class (available: null)', default: 'null', banner: 'transformer-name'
12
+ class_option :transformer, aliases: '-t', desc: 'Transformer class. Default: null', banner: 'transformer-name'
13
13
  class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
14
- class_option :loader, aliases: '-l', desc: 'Loader class (available: stdout, csv, table)', default: 'stdout', banner: 'loader-name'
14
+ class_option :loader, aliases: '-l', desc: 'Loader class. Default: stdout', banner: 'loader-name'
15
15
  class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
16
16
  class_option :name, aliases: '-j', desc: 'Job configuration name'
17
17
 
18
18
  map run: :start # Thor doesn't like `run` as a command name
19
19
  desc "run", "Start a job"
20
+ option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
21
+ option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
22
+ option :dry_run, desc: 'Only run the extraction and transform steps, not the loading', type: :boolean
20
23
  long_desc <<-LONG_DESC
21
24
  This will run an ETL job. Each job needs three parts:
22
25
 
@@ -24,23 +27,17 @@ module Chronicle
24
27
 
25
28
  2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.
26
29
 
27
- 3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.
30
+ 3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout. With the --dry-run option, this step won't be run.
28
31
 
29
32
  If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
30
33
  LONG_DESC
31
34
  # Run an ETL job
32
35
  def start
36
+ setup_log_level
33
37
  job_definition = build_job_definition(options)
34
38
  job = Chronicle::ETL::Job.new(job_definition)
35
39
  runner = Chronicle::ETL::Runner.new(job)
36
40
  runner.run!
37
- rescue Chronicle::ETL::ProviderNotAvailableError => e
38
- warn(e.message.red)
39
- warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{e.provider}`")
40
- exit(false)
41
- rescue Chronicle::ETL::ConnectorNotAvailableError => e
42
- warn(e.message.red)
43
- exit(false)
44
41
  end
45
42
 
46
43
  desc "create", "Create a job"
@@ -48,14 +45,13 @@ LONG_DESC
48
45
  def create
49
46
  job_definition = build_job_definition(options)
50
47
  path = File.join('chronicle', 'etl', 'jobs', options[:name])
51
- Chronicle::ETL::Config.write(path, job_definition)
48
+ Chronicle::ETL::Config.write(path, job_definition.definition)
52
49
  end
53
50
 
54
51
  desc "show", "Show details about a job"
55
52
  # Show an ETL job
56
53
  def show
57
- job_config = build_job_definition(options)
58
- pp job_config
54
+ puts Chronicle::ETL::Job.new(build_job_definition(options))
59
55
  end
60
56
 
61
57
  desc "list", "List all available jobs"
@@ -81,11 +77,20 @@ LONG_DESC
81
77
 
82
78
  private
83
79
 
80
+ def setup_log_level
81
+ if options[:verbose]
82
+ Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::DEBUG
83
+ elsif options[:log_level]
84
+ level = Chronicle::ETL::Logger.const_get(options[:log_level].upcase)
85
+ Chronicle::ETL::Logger.log_level = level
86
+ end
87
+ end
88
+
84
89
  # Create job definition by reading config file and then overwriting with flag options
85
90
  def build_job_definition(options)
86
91
  definition = Chronicle::ETL::JobDefinition.new
87
- definition.add_config(process_flag_options(options))
88
92
  definition.add_config(load_job_config(options[:name]))
93
+ definition.add_config(process_flag_options(options))
89
94
  definition
90
95
  end
91
96
 
@@ -96,18 +101,19 @@ LONG_DESC
96
101
  # Takes flag options and turns them into a runner config
97
102
  def process_flag_options options
98
103
  {
104
+ dry_run: options[:dry_run],
99
105
  extractor: {
100
106
  name: options[:extractor],
101
107
  options: options[:'extractor-opts']
102
- },
108
+ }.compact,
103
109
  transformer: {
104
110
  name: options[:transformer],
105
111
  options: options[:'transformer-opts']
106
- },
112
+ }.compact,
107
113
  loader: {
108
114
  name: options[:loader],
109
115
  options: options[:'loader-opts']
110
- }
116
+ }.compact
111
117
  }
112
118
  end
113
119
  end
@@ -22,6 +22,11 @@ module Chronicle
22
22
 
23
23
  # Entrypoint for the CLI
24
24
  def self.start(given_args = ARGV, config = {})
25
+ if given_args[0] == "--version"
26
+ puts "#{Chronicle::ETL::VERSION}"
27
+ exit
28
+ end
29
+
25
30
  if given_args.none?
26
31
  abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
27
32
  end
@@ -52,10 +57,10 @@ module Chronicle
52
57
  shell.say " $ chronicle-etl connectors:list"
53
58
  shell.say
54
59
  shell.say " Run a simple job:".italic.light_black
55
- shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
60
+ shell.say " $ chronicle-etl jobs:run --extractor stdin --transformer null --loader stdout"
56
61
  shell.say
57
62
  shell.say " Show full job options:".italic.light_black
58
- shell.say " $ chronicle-etl jobs help start"
63
+ shell.say " $ chronicle-etl jobs help run"
59
64
 
60
65
  list = []
61
66
 
@@ -72,6 +77,9 @@ module Chronicle
72
77
  shell.say "VERSION".bold
73
78
  shell.say " #{Chronicle::ETL::VERSION}"
74
79
  shell.say
80
+ shell.say " Display current version:".italic.light_black
81
+ shell.say " $ chronicle-etl --version"
82
+ shell.say
75
83
  shell.say "FULL DOCUMENTATION".bold
76
84
  shell.say " https://github.com/chronicle-app/chronicle-etl".blue
77
85
  shell.say
@@ -30,7 +30,7 @@ module Chronicle
30
30
  end
31
31
  end
32
32
 
33
- # Returns all available credentials available in ~/.config/chronilce/etl/credenetials/*.yml
33
+ # Returns all available credentials available in ~/.config/chronicle/etl/credentials/*.yml
34
34
  def available_credentials
35
35
  job_directory = Runcom::Config.new('chronicle/etl/credentials').current
36
36
  Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
@@ -2,7 +2,7 @@ module Chronicle
2
2
  module ETL
3
3
  class Error < StandardError; end;
4
4
 
5
- class InvalidTransformedRecordError < Error; end
5
+ class RunnerTypeError < Error; end
6
6
 
7
7
  class ConnectorNotAvailableError < Error
8
8
  def initialize(message, provider: nil, name: nil)
@@ -15,5 +15,16 @@ module Chronicle
15
15
 
16
16
  class ProviderNotAvailableError < ConnectorNotAvailableError; end
17
17
  class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
18
+
19
+ class TransformationError < Error
20
+ attr_reader :transformation
21
+
22
+ def initialize(message=nil, transformation:)
23
+ super(message)
24
+ @transformation = transformation
25
+ end
26
+ end
27
+
28
+ class UntransformableRecordError < TransformationError; end
18
29
  end
19
30
  end
@@ -0,0 +1,12 @@
1
+ module Chronicle
2
+ module ETL
3
+ class Extraction
4
+ attr_accessor :data, :meta
5
+
6
+ def initialize(data: {}, meta: {})
7
+ @data = data
8
+ @meta = meta
9
+ end
10
+ end
11
+ end
12
+ end
@@ -1,41 +1,48 @@
1
1
  require 'csv'
2
- class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
- DEFAULT_OPTIONS = {
4
- headers: true,
5
- filename: $stdin
6
- }.freeze
7
-
8
- def initialize(options = {})
9
- super(DEFAULT_OPTIONS.merge(options))
10
- end
11
2
 
12
- def extract
13
- csv = initialize_csv
14
- csv.each do |row|
15
- result = row.to_h
16
- yield result
3
+ module Chronicle
4
+ module ETL
5
+ class CsvExtractor < Chronicle::ETL::Extractor
6
+ include Extractors::Helpers::FilesystemReader
7
+
8
+ register_connector do |r|
9
+ r.description = 'input as CSV'
10
+ end
11
+
12
+ DEFAULT_OPTIONS = {
13
+ headers: true,
14
+ filename: $stdin
15
+ }.freeze
16
+
17
+ def initialize(options = {})
18
+ super(DEFAULT_OPTIONS.merge(options))
19
+ end
20
+
21
+ def extract
22
+ csv = initialize_csv
23
+ csv.each do |row|
24
+ yield Chronicle::ETL::Extraction.new(data: row.to_h)
25
+ end
26
+ end
27
+
28
+ def results_count
29
+ CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
30
+ end
31
+
32
+ private
33
+
34
+ def initialize_csv
35
+ headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
36
+
37
+ csv_options = {
38
+ headers: headers,
39
+ converters: :all
40
+ }
41
+
42
+ open_from_filesystem(filename: @options[:filename]) do |file|
43
+ return CSV.new(file, **csv_options)
44
+ end
45
+ end
17
46
  end
18
47
  end
19
-
20
- def results_count
21
- CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
22
- end
23
-
24
- private
25
-
26
- def initialize_csv
27
- headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
28
-
29
- csv_options = {
30
- headers: headers,
31
- converters: :all
32
- }
33
-
34
- stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
35
- CSV.new(stream, **csv_options)
36
- end
37
-
38
- def read_from_file?
39
- @options[:filename] != $stdin
40
- end
41
48
  end
@@ -4,7 +4,7 @@ module Chronicle
4
4
  module ETL
5
5
  # Abstract class representing an Extractor for an ETL job
6
6
  class Extractor
7
- extend Chronicle::ETL::Catalog
7
+ extend Chronicle::ETL::Registry::SelfRegistering
8
8
 
9
9
  # Construct a new instance of this extractor. Options are passed in from a Runner
10
10
  # == Parameters:
@@ -12,6 +12,7 @@ module Chronicle
12
12
  # Options for configuring this Extractor
13
13
  def initialize(options = {})
14
14
  @options = options.transform_keys!(&:to_sym)
15
+ sanitize_options
15
16
  handle_continuation
16
17
  end
17
18
 
@@ -26,6 +27,11 @@ module Chronicle
26
27
 
27
28
  private
28
29
 
30
+ def sanitize_options
31
+ @options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
32
+ @options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
33
+ end
34
+
29
35
  def handle_continuation
30
36
  return unless @options[:continuation]
31
37
 
@@ -36,6 +42,8 @@ module Chronicle
36
42
  end
37
43
  end
38
44
 
45
+ require_relative 'helpers/filesystem_reader'
39
46
  require_relative 'csv_extractor'
40
47
  require_relative 'file_extractor'
48
+ require_relative 'json_extractor'
41
49
  require_relative 'stdin_extractor'
@@ -3,49 +3,31 @@ require 'pathname'
3
3
  module Chronicle
4
4
  module ETL
5
5
  class FileExtractor < Chronicle::ETL::Extractor
6
- def extract
7
- if file?
8
- extract_file do |data, metadata|
9
- yield(data, metadata)
10
- end
11
- elsif directory?
12
- extract_from_directory do |data, metadata|
13
- yield(data, metadata)
14
- end
15
- end
16
- end
6
+ include Extractors::Helpers::FilesystemReader
17
7
 
18
- def results_count
19
- if file?
20
- return 1
21
- else
22
- search_pattern = File.join(@options[:filename], '**/*.eml')
23
- Dir.glob(search_pattern).count
24
- end
8
+ register_connector do |r|
9
+ r.description = 'file or directory of files'
25
10
  end
26
11
 
27
- private
28
-
29
- def extract_from_directory
30
- search_pattern = File.join(@options[:filename], '**/*.eml')
31
- filenames = Dir.glob(search_pattern)
12
+ def extract
32
13
  filenames.each do |filename|
33
- file = File.open(filename)
34
- yield(file.read, {filename: file})
14
+ yield Chronicle::ETL::Extraction.new(data: filename)
35
15
  end
36
16
  end
37
17
 
38
- def extract_file
39
- file = File.open(@options[:filename])
40
- yield(file.read, {filename: @options[:filename]})
18
+ def results_count
19
+ filenames.count
41
20
  end
42
21
 
43
- def directory?
44
- Pathname.new(@options[:filename]).directory?
45
- end
22
+ private
46
23
 
47
- def file?
48
- Pathname.new(@options[:filename]).file?
24
+ def filenames
25
+ @filenames ||= filenames_in_directory(
26
+ path: @options[:filename],
27
+ dir_glob_pattern: @options[:dir_glob_pattern],
28
+ load_since: @options[:load_since],
29
+ load_until: @options[:load_until]
30
+ )
49
31
  end
50
32
  end
51
33
  end
@@ -0,0 +1,104 @@
1
+ require 'pathname'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Extractors
6
+ module Helpers
7
+ module FilesystemReader
8
+
9
+ def filenames_in_directory(...)
10
+ filenames = gather_files(...)
11
+ if block_given?
12
+ filenames.each do |filename|
13
+ yield filename
14
+ end
15
+ else
16
+ filenames
17
+ end
18
+ end
19
+
20
+ def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
21
+ open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
22
+ if yield_each_line
23
+ file.each_line do |line|
24
+ yield line
25
+ end
26
+ else
27
+ yield file.read
28
+ end
29
+ end
30
+ end
31
+
32
+ def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
33
+ open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
34
+ yield file
35
+ end
36
+ end
37
+
38
+ def results_count
39
+ raise NotImplementedError
40
+ # if file?
41
+ # return 1
42
+ # else
43
+ # search_pattern = File.join(@options[:filename], '**/*')
44
+ # Dir.glob(search_pattern).count
45
+ # end
46
+ end
47
+
48
+ private
49
+
50
+ def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil, smaller_than: nil, larger_than: nil, sort: :mtime)
51
+ search_pattern = File.join(path, '**', dir_glob_pattern)
52
+ files = Dir.glob(search_pattern)
53
+
54
+ files = files.keep_if {|f| (File.mtime(f) > load_since)} if load_since
55
+ files = files.keep_if {|f| (File.mtime(f) < load_until)} if load_until
56
+
57
+ # pass in file sizes in bytes
58
+ files = files.keep_if {|f| (File.size(f) < smaller_than)} if smaller_than
59
+ files = files.keep_if {|f| (File.size(f) > larger_than)} if larger_than
60
+
61
+ # TODO: incorporate sort argument
62
+ files.sort_by{ |f| File.mtime(f) }
63
+ end
64
+
65
+ def select_files_in_directory(path:, dir_glob_pattern: '**/*')
66
+ raise IOError.new("#{path} is not a directory.") unless directory?(path)
67
+
68
+ search_pattern = File.join(path, dir_glob_pattern)
69
+ Dir.glob(search_pattern).each do |filename|
70
+ yield(filename)
71
+ end
72
+ end
73
+
74
+ def open_files(filename:, dir_glob_pattern:)
75
+ if stdin?(filename)
76
+ yield $stdin
77
+ elsif directory?(filename)
78
+ search_pattern = File.join(filename, dir_glob_pattern)
79
+ filenames = Dir.glob(search_pattern)
80
+ filenames.each do |filename|
81
+ file = File.open(filename)
82
+ yield(file)
83
+ end
84
+ elsif file?(filename)
85
+ yield File.open(filename)
86
+ end
87
+ end
88
+
89
+ def stdin?(filename)
90
+ filename == $stdin
91
+ end
92
+
93
+ def directory?(filename)
94
+ Pathname.new(filename).directory?
95
+ end
96
+
97
+ def file?(filename)
98
+ Pathname.new(filename).file?
99
+ end
100
+ end
101
+ end
102
+ end
103
+ end
104
+ end