chronicle-etl 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/README.md +20 -13
  5. data/chronicle-etl.gemspec +11 -8
  6. data/lib/chronicle/etl/cli/connectors.rb +19 -7
  7. data/lib/chronicle/etl/cli/jobs.rb +24 -18
  8. data/lib/chronicle/etl/cli/main.rb +10 -2
  9. data/lib/chronicle/etl/config.rb +1 -1
  10. data/lib/chronicle/etl/exceptions.rb +12 -1
  11. data/lib/chronicle/etl/extraction.rb +12 -0
  12. data/lib/chronicle/etl/extractors/csv_extractor.rb +43 -36
  13. data/lib/chronicle/etl/extractors/extractor.rb +9 -1
  14. data/lib/chronicle/etl/extractors/file_extractor.rb +15 -33
  15. data/lib/chronicle/etl/extractors/helpers/filesystem_reader.rb +104 -0
  16. data/lib/chronicle/etl/extractors/json_extractor.rb +45 -0
  17. data/lib/chronicle/etl/extractors/stdin_extractor.rb +6 -1
  18. data/lib/chronicle/etl/job.rb +30 -29
  19. data/lib/chronicle/etl/job_definition.rb +45 -7
  20. data/lib/chronicle/etl/job_log.rb +10 -0
  21. data/lib/chronicle/etl/job_logger.rb +23 -20
  22. data/lib/chronicle/etl/loaders/csv_loader.rb +4 -0
  23. data/lib/chronicle/etl/loaders/loader.rb +1 -1
  24. data/lib/chronicle/etl/loaders/rest_loader.rb +5 -1
  25. data/lib/chronicle/etl/loaders/stdout_loader.rb +6 -1
  26. data/lib/chronicle/etl/loaders/table_loader.rb +57 -7
  27. data/lib/chronicle/etl/logger.rb +48 -0
  28. data/lib/chronicle/etl/models/attachment.rb +14 -0
  29. data/lib/chronicle/etl/models/base.rb +23 -7
  30. data/lib/chronicle/etl/models/entity.rb +9 -3
  31. data/lib/chronicle/etl/registry/connector_registration.rb +61 -0
  32. data/lib/chronicle/etl/registry/registry.rb +52 -0
  33. data/lib/chronicle/etl/registry/self_registering.rb +25 -0
  34. data/lib/chronicle/etl/runner.rb +57 -7
  35. data/lib/chronicle/etl/serializers/jsonapi_serializer.rb +25 -0
  36. data/lib/chronicle/etl/serializers/serializer.rb +27 -0
  37. data/lib/chronicle/etl/transformers/image_file_transformer.rb +253 -0
  38. data/lib/chronicle/etl/transformers/null_transformer.rb +10 -1
  39. data/lib/chronicle/etl/transformers/transformer.rb +39 -9
  40. data/lib/chronicle/etl/utils/binary_attachments.rb +21 -0
  41. data/lib/chronicle/etl/utils/progress_bar.rb +3 -1
  42. data/lib/chronicle/etl/utils/text_recognition.rb +15 -0
  43. data/lib/chronicle/etl/version.rb +1 -1
  44. data/lib/chronicle/etl.rb +7 -2
  45. metadata +96 -44
  46. data/Gemfile.lock +0 -91
  47. data/lib/chronicle/etl/catalog.rb +0 -108
  48. data/lib/chronicle/etl/utils/jsonapi.rb +0 -28
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a02a2377d0e8d4135f3b931bc73641eac28058d736d9c1dba0a97107c1d4c0e
4
- data.tar.gz: 810d5bff80e852fa08ef9824ed6b313aa309bb69e84228bc1fbb7595069e043b
3
+ metadata.gz: bfd4e081bfeda7d097a5a5eee6ccf28baf0a9b3878968d74c9a604013d0b55a6
4
+ data.tar.gz: 003ebd2ffe2b1220c7f43a4875043cb5500aa1b7a6327b84c9be10f04e0e8d40
5
5
  SHA512:
6
- metadata.gz: 0d5fbea3c63349bb3f566e6137755f6cc8a4060d0e401abf5a0e7d8b44a4c4278089c10ffb8bb9cf2d783a238449140e5e54d90f3ad158aa362c6335eedca5aa
7
- data.tar.gz: bf6fa83b1d5e55760e62d3cc090bf09bb69a7c761ae4a9358fb4d82192c7efc7500b6db361f39adac3581982862654aa4603a78dfbb3aed53b51d01137ffd736
6
+ metadata.gz: 3d786fb4acf8d0b03e65262209def310ca25b92646847f6e96791e6491e9b159ab11db7fa35f785f6782fbc0b9e3daebb625e2353fce2422f7fc79aed7a4d6bc
7
+ data.tar.gz: 87771745b9df2160966299f1d73eb568b46080ed217a1952e1dd938fff7758432f2cc6f449036d24951660b405e18083f1adac26d00243a0fab9003a96eb569d
data/.gitignore CHANGED
@@ -7,6 +7,9 @@
7
7
  /spec/reports/
8
8
  /tmp/
9
9
 
10
+ # https://yehudakatz.com/2010/12/16/clarifying-the-roles-of-the-gemspec-and-gemfile/
11
+ Gemfile.lock
12
+
10
13
  # rspec failure tracking
11
14
  .rspec_status
12
15
  .DS_Store
data/.rubocop.yml CHANGED
@@ -5,4 +5,7 @@ Style/StringLiterals:
5
5
  Enabled: false
6
6
 
7
7
  Style/MethodCallWithArgsParentheses:
8
+ Enabled: false
9
+
10
+ Lint/ConstantResolution:
8
11
  Enabled: false
data/README.md CHANGED
@@ -31,6 +31,9 @@ Connectors are available to read, process, and load data from different formats
31
31
  ```bash
32
32
  # List all available connectors
33
33
  $ chronicle-etl connectors:list
34
+
35
+ # Install a connector
36
+ $ chronicle-etl connectors:install imessage
34
37
  ```
35
38
 
36
39
  Built in connectors:
@@ -44,16 +47,18 @@ Built in connectors:
44
47
  - `null` - (default) Don't do anything
45
48
 
46
49
  ### Loaders
47
- - `stdout` - (default) output transformed records to stdount
50
+ - `stdout` - (default) output records to stdout serialized as JSON
48
51
  - `csv` - Load records to a csv file
52
+ - `rest` - Serialize records with [JSONAPI](https://jsonapi.org/) and send to a REST API
49
53
  - `table` - Output an ascii table of records. Useful for debugging.
50
54
 
51
55
  ### Provider-specific importers
52
56
 
53
57
  In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
54
58
 
55
- - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files. Transformers for chronicle schema
56
- - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`. Transform it for chronicle schema
59
+ - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files
60
+ - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`
61
+ - [imessage](https://github.com/chronicle-app/chronicle-imessage). Extract iMessage messages from a local macOS installation
57
62
 
58
63
  To install any of these, run `gem install chronicle-PROVIDER`.
59
64
 
@@ -61,7 +66,7 @@ If you don't want to use the available rubygem importers, `chronicle-etl` can us
61
66
 
62
67
  I'll be open-sourcing more importers. Please [contact me](mailto:andrew@hyfen.net) to chat about what will be available!
63
68
 
64
- ### Full commands
69
+ ## Full commands
65
70
 
66
71
  ```
67
72
  $ chronicle-etl help
@@ -75,26 +80,28 @@ ALL COMMANDS
75
80
  jobs:create # Create a job
76
81
  jobs:list # List all available jobs
77
82
  jobs:run # Start a job
78
- jobs:show # Show a job
83
+ jobs:show # Show details about a job
79
84
  ```
80
85
 
81
- ### Job options
86
+ ### Running a job
82
87
 
83
88
  ```
84
89
  Usage:
85
90
  chronicle-etl jobs:run
86
91
 
87
92
  Options:
88
- -e, [--extractor=extractor-name] # Extractor class (available: stdin, csv, file)
89
- # Default: stdin
93
+ [--log-level=LOG_LEVEL] # Log level (debug, info, warn, error, fatal)
94
+ # Default: info
95
+ -v, [--verbose], [--no-verbose] # Set log level to verbose
96
+ [--dry-run], [--no-dry-run] # Only run the extraction and transform steps, not the loading
97
+ -e, [--extractor=extractor-name] # Extractor class. Default: stdin
90
98
  [--extractor-opts=key:value] # Extractor options
91
- -t, [--transformer=transformer-name] # Transformer class (available: null)
92
- # Default: null
99
+ -t, [--transformer=transformer-name] # Transformer class. Default: null
93
100
  [--transformer-opts=key:value] # Transformer options
94
- -l, [--loader=loader-name] # Loader class (available: stdout, csv, table)
95
- # Default: stdout
101
+ -l, [--loader=loader-name] # Loader class. Default: stdout
96
102
  [--loader-opts=key:value] # Loader options
97
- -j, [--job=JOB] # Job configuration file
103
+ -j, [--name=NAME] # Job configuration name
104
+
98
105
 
99
106
  Runs an ETL job
100
107
  ```
@@ -36,18 +36,21 @@ Gem::Specification.new do |spec|
36
36
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
37
37
  spec.require_paths = ["lib"]
38
38
 
39
- spec.add_dependency "thor", "~> 0.20"
39
+ spec.add_dependency "activesupport"
40
+ spec.add_dependency "chronic_duration", "~> 0.10.6"
40
41
  spec.add_dependency "colorize", "~> 0.8.1"
41
- spec.add_dependency "tty-table", "~> 0.11"
42
+ spec.add_dependency "marcel", "~> 1.0.2"
43
+ spec.add_dependency "mini_exiftool", "~> 2.10"
44
+ spec.add_dependency "nokogiri", "~> 1.13"
45
+ spec.add_dependency "runcom", "~> 6.2"
46
+ spec.add_dependency "sequel", "~> 5.35"
47
+ spec.add_dependency "sqlite3", "~> 1.4"
48
+ spec.add_dependency "thor", "~> 0.20"
42
49
  spec.add_dependency "tty-progressbar", "~> 0.17"
43
- spec.add_dependency 'sequel', '~> 5.35'
44
- spec.add_dependency 'deep_merge', '~> 1.2'
50
+ spec.add_dependency "tty-table", "~> 0.11"
45
51
 
46
52
  spec.add_development_dependency "bundler", "~> 2.1"
53
+ spec.add_development_dependency "pry-byebug", "~> 3.9"
47
54
  spec.add_development_dependency "rake", "~> 13.0"
48
55
  spec.add_development_dependency "rspec", "~> 3.9"
49
- spec.add_development_dependency "pry-byebug", "~> 3.9"
50
- spec.add_development_dependency 'runcom', '~> 6.2'
51
- spec.add_development_dependency 'redcarpet', '~> 3.5'
52
- spec.add_development_dependency 'sqlite3', '~> 1.4'
53
56
  end
@@ -7,23 +7,35 @@ module Chronicle
7
7
  namespace :connectors
8
8
 
9
9
  desc "install NAME", "Installs connector NAME"
10
- def install
11
- puts "Installing"
10
+ def install(name)
11
+ Chronicle::ETL::Registry.install_connector(name)
12
12
  end
13
13
 
14
14
  desc "list", "Lists available connectors"
15
15
  # Display all available connectors that chronicle-etl has access to
16
16
  def list
17
- klasses = Chronicle::ETL::Catalog.available_classes
18
- klasses = klasses.sort_by do |a|
19
- [a[:built_in].to_s, a[:provider], a[:phase]]
17
+ Chronicle::ETL::Registry.load_all!
18
+
19
+ connector_info = Chronicle::ETL::Registry.connectors.map do |connector_registration|
20
+ {
21
+ identifier: connector_registration.identifier,
22
+ phase: connector_registration.phase,
23
+ description: connector_registration.descriptive_phrase,
24
+ provider: connector_registration.provider,
25
+ core: connector_registration.built_in? ? '✓' : '',
26
+ class: connector_registration.klass_name
27
+ }
28
+ end
29
+
30
+ connector_info = connector_info.sort_by do |a|
31
+ [a[:core].to_s, a[:provider], a[:phase], a[:identifier]]
20
32
  end
21
33
 
22
- headers = klasses.first.keys.map do |key|
34
+ headers = connector_info.first.keys.map do |key|
23
35
  key.to_s.upcase.bold
24
36
  end
25
37
 
26
- table = TTY::Table.new(headers, klasses.map(&:values))
38
+ table = TTY::Table.new(headers, connector_info.map(&:values))
27
39
  puts table.render(indent: 0, padding: [0, 2])
28
40
  end
29
41
  end
@@ -7,16 +7,19 @@ module Chronicle
7
7
  default_task "start"
8
8
  namespace :jobs
9
9
 
10
- class_option :extractor, aliases: '-e', desc: 'Extractor class (available: stdin, csv, file)', default: 'stdin', banner: 'extractor-name'
10
+ class_option :extractor, aliases: '-e', desc: "Extractor class. Default: stdin", banner: 'extractor-name'
11
11
  class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
12
- class_option :transformer, aliases: '-t', desc: 'Transformer class (available: null)', default: 'null', banner: 'transformer-name'
12
+ class_option :transformer, aliases: '-t', desc: 'Transformer class. Default: null', banner: 'transformer-name'
13
13
  class_option :'transformer-opts', desc: 'Transformer options', type: :hash, default: {}
14
- class_option :loader, aliases: '-l', desc: 'Loader class (available: stdout, csv, table)', default: 'stdout', banner: 'loader-name'
14
+ class_option :loader, aliases: '-l', desc: 'Loader class. Default: stdout', banner: 'loader-name'
15
15
  class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}
16
16
  class_option :name, aliases: '-j', desc: 'Job configuration name'
17
17
 
18
18
  map run: :start # Thor doesn't like `run` as a command name
19
19
  desc "run", "Start a job"
20
+ option :log_level, desc: 'Log level (debug, info, warn, error, fatal)', default: 'info'
21
+ option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
22
+ option :dry_run, desc: 'Only run the extraction and transform steps, not the loading', type: :boolean
20
23
  long_desc <<-LONG_DESC
21
24
  This will run an ETL job. Each job needs three parts:
22
25
 
@@ -24,23 +27,17 @@ module Chronicle
24
27
 
25
28
  2. #{'Transformer'.underline}: transforms data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.
26
29
 
27
- 3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout.
30
+ 3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout. With the --dry-run option, this step won't be run.
28
31
 
29
32
  If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
30
33
  LONG_DESC
31
34
  # Run an ETL job
32
35
  def start
36
+ setup_log_level
33
37
  job_definition = build_job_definition(options)
34
38
  job = Chronicle::ETL::Job.new(job_definition)
35
39
  runner = Chronicle::ETL::Runner.new(job)
36
40
  runner.run!
37
- rescue Chronicle::ETL::ProviderNotAvailableError => e
38
- warn(e.message.red)
39
- warn(" Perhaps you haven't installed it yet: `$ gem install chronicle-#{e.provider}`")
40
- exit(false)
41
- rescue Chronicle::ETL::ConnectorNotAvailableError => e
42
- warn(e.message.red)
43
- exit(false)
44
41
  end
45
42
 
46
43
  desc "create", "Create a job"
@@ -48,14 +45,13 @@ LONG_DESC
48
45
  def create
49
46
  job_definition = build_job_definition(options)
50
47
  path = File.join('chronicle', 'etl', 'jobs', options[:name])
51
- Chronicle::ETL::Config.write(path, job_definition)
48
+ Chronicle::ETL::Config.write(path, job_definition.definition)
52
49
  end
53
50
 
54
51
  desc "show", "Show details about a job"
55
52
  # Show an ETL job
56
53
  def show
57
- job_config = build_job_definition(options)
58
- pp job_config
54
+ puts Chronicle::ETL::Job.new(build_job_definition(options))
59
55
  end
60
56
 
61
57
  desc "list", "List all available jobs"
@@ -81,11 +77,20 @@ LONG_DESC
81
77
 
82
78
  private
83
79
 
80
+ def setup_log_level
81
+ if options[:verbose]
82
+ Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::DEBUG
83
+ elsif options[:log_level]
84
+ level = Chronicle::ETL::Logger.const_get(options[:log_level].upcase)
85
+ Chronicle::ETL::Logger.log_level = level
86
+ end
87
+ end
88
+
84
89
  # Create job definition by reading config file and then overwriting with flag options
85
90
  def build_job_definition(options)
86
91
  definition = Chronicle::ETL::JobDefinition.new
87
- definition.add_config(process_flag_options(options))
88
92
  definition.add_config(load_job_config(options[:name]))
93
+ definition.add_config(process_flag_options(options))
89
94
  definition
90
95
  end
91
96
 
@@ -96,18 +101,19 @@ LONG_DESC
96
101
  # Takes flag options and turns them into a runner config
97
102
  def process_flag_options options
98
103
  {
104
+ dry_run: options[:dry_run],
99
105
  extractor: {
100
106
  name: options[:extractor],
101
107
  options: options[:'extractor-opts']
102
- },
108
+ }.compact,
103
109
  transformer: {
104
110
  name: options[:transformer],
105
111
  options: options[:'transformer-opts']
106
- },
112
+ }.compact,
107
113
  loader: {
108
114
  name: options[:loader],
109
115
  options: options[:'loader-opts']
110
- }
116
+ }.compact
111
117
  }
112
118
  end
113
119
  end
@@ -22,6 +22,11 @@ module Chronicle
22
22
 
23
23
  # Entrypoint for the CLI
24
24
  def self.start(given_args = ARGV, config = {})
25
+ if given_args[0] == "--version"
26
+ puts "#{Chronicle::ETL::VERSION}"
27
+ exit
28
+ end
29
+
25
30
  if given_args.none?
26
31
  abort "No command entered or job specified. To see commands, run `chronicle-etl help`".red
27
32
  end
@@ -52,10 +57,10 @@ module Chronicle
52
57
  shell.say " $ chronicle-etl connectors:list"
53
58
  shell.say
54
59
  shell.say " Run a simple job:".italic.light_black
55
- shell.say " $ chronicle-etl jobs:start --extractor stdin --transformer null --loader stdout"
60
+ shell.say " $ chronicle-etl jobs:run --extractor stdin --transformer null --loader stdout"
56
61
  shell.say
57
62
  shell.say " Show full job options:".italic.light_black
58
- shell.say " $ chronicle-etl jobs help start"
63
+ shell.say " $ chronicle-etl jobs help run"
59
64
 
60
65
  list = []
61
66
 
@@ -72,6 +77,9 @@ module Chronicle
72
77
  shell.say "VERSION".bold
73
78
  shell.say " #{Chronicle::ETL::VERSION}"
74
79
  shell.say
80
+ shell.say " Display current version:".italic.light_black
81
+ shell.say " $ chronicle-etl --version"
82
+ shell.say
75
83
  shell.say "FULL DOCUMENTATION".bold
76
84
  shell.say " https://github.com/chronicle-app/chronicle-etl".blue
77
85
  shell.say
@@ -30,7 +30,7 @@ module Chronicle
30
30
  end
31
31
  end
32
32
 
33
- # Returns all available credentials available in ~/.config/chronilce/etl/credenetials/*.yml
33
+ # Returns all available credentials available in ~/.config/chronicle/etl/credentials/*.yml
34
34
  def available_credentials
35
35
  job_directory = Runcom::Config.new('chronicle/etl/credentials').current
36
36
  Dir.glob(File.join(job_directory, "*.yml")).map do |filename|
@@ -2,7 +2,7 @@ module Chronicle
2
2
  module ETL
3
3
  class Error < StandardError; end;
4
4
 
5
- class InvalidTransformedRecordError < Error; end
5
+ class RunnerTypeError < Error; end
6
6
 
7
7
  class ConnectorNotAvailableError < Error
8
8
  def initialize(message, provider: nil, name: nil)
@@ -15,5 +15,16 @@ module Chronicle
15
15
 
16
16
  class ProviderNotAvailableError < ConnectorNotAvailableError; end
17
17
  class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end
18
+
19
+ class TransformationError < Error
20
+ attr_reader :transformation
21
+
22
+ def initialize(message=nil, transformation:)
23
+ super(message)
24
+ @transformation = transformation
25
+ end
26
+ end
27
+
28
+ class UntransformableRecordError < TransformationError; end
18
29
  end
19
30
  end
@@ -0,0 +1,12 @@
1
+ module Chronicle
2
+ module ETL
3
+ class Extraction
4
+ attr_accessor :data, :meta
5
+
6
+ def initialize(data: {}, meta: {})
7
+ @data = data
8
+ @meta = meta
9
+ end
10
+ end
11
+ end
12
+ end
@@ -1,41 +1,48 @@
1
1
  require 'csv'
2
- class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
3
- DEFAULT_OPTIONS = {
4
- headers: true,
5
- filename: $stdin
6
- }.freeze
7
-
8
- def initialize(options = {})
9
- super(DEFAULT_OPTIONS.merge(options))
10
- end
11
2
 
12
- def extract
13
- csv = initialize_csv
14
- csv.each do |row|
15
- result = row.to_h
16
- yield result
3
+ module Chronicle
4
+ module ETL
5
+ class CsvExtractor < Chronicle::ETL::Extractor
6
+ include Extractors::Helpers::FilesystemReader
7
+
8
+ register_connector do |r|
9
+ r.description = 'input as CSV'
10
+ end
11
+
12
+ DEFAULT_OPTIONS = {
13
+ headers: true,
14
+ filename: $stdin
15
+ }.freeze
16
+
17
+ def initialize(options = {})
18
+ super(DEFAULT_OPTIONS.merge(options))
19
+ end
20
+
21
+ def extract
22
+ csv = initialize_csv
23
+ csv.each do |row|
24
+ yield Chronicle::ETL::Extraction.new(data: row.to_h)
25
+ end
26
+ end
27
+
28
+ def results_count
29
+ CSV.read(@options[:filename], headers: @options[:headers]).count unless stdin?(@options[:filename])
30
+ end
31
+
32
+ private
33
+
34
+ def initialize_csv
35
+ headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
36
+
37
+ csv_options = {
38
+ headers: headers,
39
+ converters: :all
40
+ }
41
+
42
+ open_from_filesystem(filename: @options[:filename]) do |file|
43
+ return CSV.new(file, **csv_options)
44
+ end
45
+ end
17
46
  end
18
47
  end
19
-
20
- def results_count
21
- CSV.read(@options[:filename], headers: @options[:headers]).count if read_from_file?
22
- end
23
-
24
- private
25
-
26
- def initialize_csv
27
- headers = @options[:headers].is_a?(String) ? @options[:headers].split(',') : @options[:headers]
28
-
29
- csv_options = {
30
- headers: headers,
31
- converters: :all
32
- }
33
-
34
- stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
35
- CSV.new(stream, **csv_options)
36
- end
37
-
38
- def read_from_file?
39
- @options[:filename] != $stdin
40
- end
41
48
  end
@@ -4,7 +4,7 @@ module Chronicle
4
4
  module ETL
5
5
  # Abstract class representing an Extractor for an ETL job
6
6
  class Extractor
7
- extend Chronicle::ETL::Catalog
7
+ extend Chronicle::ETL::Registry::SelfRegistering
8
8
 
9
9
  # Construct a new instance of this extractor. Options are passed in from a Runner
10
10
  # == Paramters:
@@ -12,6 +12,7 @@ module Chronicle
12
12
  # Options for configuring this Extractor
13
13
  def initialize(options = {})
14
14
  @options = options.transform_keys!(&:to_sym)
15
+ sanitize_options
15
16
  handle_continuation
16
17
  end
17
18
 
@@ -26,6 +27,11 @@ module Chronicle
26
27
 
27
28
  private
28
29
 
30
+ def sanitize_options
31
+ @options[:load_since] = Time.parse(@options[:load_since]) if @options[:load_since] && @options[:load_since].is_a?(String)
32
+ @options[:load_until] = Time.parse(@options[:load_until]) if @options[:load_until] && @options[:load_until].is_a?(String)
33
+ end
34
+
29
35
  def handle_continuation
30
36
  return unless @options[:continuation]
31
37
 
@@ -36,6 +42,8 @@ module Chronicle
36
42
  end
37
43
  end
38
44
 
45
+ require_relative 'helpers/filesystem_reader'
39
46
  require_relative 'csv_extractor'
40
47
  require_relative 'file_extractor'
48
+ require_relative 'json_extractor'
41
49
  require_relative 'stdin_extractor'
@@ -3,49 +3,31 @@ require 'pathname'
3
3
  module Chronicle
4
4
  module ETL
5
5
  class FileExtractor < Chronicle::ETL::Extractor
6
- def extract
7
- if file?
8
- extract_file do |data, metadata|
9
- yield(data, metadata)
10
- end
11
- elsif directory?
12
- extract_from_directory do |data, metadata|
13
- yield(data, metadata)
14
- end
15
- end
16
- end
6
+ include Extractors::Helpers::FilesystemReader
17
7
 
18
- def results_count
19
- if file?
20
- return 1
21
- else
22
- search_pattern = File.join(@options[:filename], '**/*.eml')
23
- Dir.glob(search_pattern).count
24
- end
8
+ register_connector do |r|
9
+ r.description = 'file or directory of files'
25
10
  end
26
11
 
27
- private
28
-
29
- def extract_from_directory
30
- search_pattern = File.join(@options[:filename], '**/*.eml')
31
- filenames = Dir.glob(search_pattern)
12
+ def extract
32
13
  filenames.each do |filename|
33
- file = File.open(filename)
34
- yield(file.read, {filename: file})
14
+ yield Chronicle::ETL::Extraction.new(data: filename)
35
15
  end
36
16
  end
37
17
 
38
- def extract_file
39
- file = File.open(@options[:filename])
40
- yield(file.read, {filename: @options[:filename]})
18
+ def results_count
19
+ filenames.count
41
20
  end
42
21
 
43
- def directory?
44
- Pathname.new(@options[:filename]).directory?
45
- end
22
+ private
46
23
 
47
- def file?
48
- Pathname.new(@options[:filename]).file?
24
+ def filenames
25
+ @filenames ||= filenames_in_directory(
26
+ path: @options[:filename],
27
+ dir_glob_pattern: @options[:dir_glob_pattern],
28
+ load_since: @options[:load_since],
29
+ load_until: @options[:load_until]
30
+ )
49
31
  end
50
32
  end
51
33
  end
@@ -0,0 +1,104 @@
1
+ require 'pathname'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Extractors
6
+ module Helpers
7
+ module FilesystemReader
8
+
9
+ def filenames_in_directory(...)
10
+ filenames = gather_files(...)
11
+ if block_given?
12
+ filenames.each do |filename|
13
+ yield filename
14
+ end
15
+ else
16
+ filenames
17
+ end
18
+ end
19
+
20
+ def read_from_filesystem(filename:, yield_each_line: true, dir_glob_pattern: '**/*')
21
+ open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
22
+ if yield_each_line
23
+ file.each_line do |line|
24
+ yield line
25
+ end
26
+ else
27
+ yield file.read
28
+ end
29
+ end
30
+ end
31
+
32
+ def open_from_filesystem(filename:, dir_glob_pattern: '**/*')
33
+ open_files(filename: filename, dir_glob_pattern: dir_glob_pattern) do |file|
34
+ yield file
35
+ end
36
+ end
37
+
38
+ def results_count
39
+ raise NotImplementedError
40
+ # if file?
41
+ # return 1
42
+ # else
43
+ # search_pattern = File.join(@options[:filename], '**/*')
44
+ # Dir.glob(search_pattern).count
45
+ # end
46
+ end
47
+
48
+ private
49
+
50
+ def gather_files(path:, dir_glob_pattern: '**/*', load_since: nil, load_until: nil, smaller_than: nil, larger_than: nil, sort: :mtime)
51
+ search_pattern = File.join(path, '**', dir_glob_pattern)
52
+ files = Dir.glob(search_pattern)
53
+
54
+ files = files.keep_if {|f| (File.mtime(f) > load_since)} if load_since
55
+ files = files.keep_if {|f| (File.mtime(f) < load_until)} if load_until
56
+
57
+ # pass in file sizes in bytes
58
+ files = files.keep_if {|f| (File.size(f) < smaller_than)} if smaller_than
59
+ files = files.keep_if {|f| (File.size(f) > larger_than)} if larger_than
60
+
61
+ # TODO: incorporate sort argument
62
+ files.sort_by{ |f| File.mtime(f) }
63
+ end
64
+
65
+ def select_files_in_directory(path:, dir_glob_pattern: '**/*')
66
+ raise IOError.new("#{path} is not a directory.") unless directory?(path)
67
+
68
+ search_pattern = File.join(path, dir_glob_pattern)
69
+ Dir.glob(search_pattern).each do |filename|
70
+ yield(filename)
71
+ end
72
+ end
73
+
74
+ def open_files(filename:, dir_glob_pattern:)
75
+ if stdin?(filename)
76
+ yield $stdin
77
+ elsif directory?(filename)
78
+ search_pattern = File.join(filename, dir_glob_pattern)
79
+ filenames = Dir.glob(search_pattern)
80
+ filenames.each do |filename|
81
+ file = File.open(filename)
82
+ yield(file)
83
+ end
84
+ elsif file?(filename)
85
+ yield File.open(filename)
86
+ end
87
+ end
88
+
89
+ def stdin?(filename)
90
+ filename == $stdin
91
+ end
92
+
93
+ def directory?(filename)
94
+ Pathname.new(filename).directory?
95
+ end
96
+
97
+ def file?(filename)
98
+ Pathname.new(filename).file?
99
+ end
100
+ end
101
+ end
102
+ end
103
+ end
104
+ end