chronicle-etl 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 12a38a335c05b1626b9f259318956480df3f96e447cc2b1a25b8a9c23d591e49
4
- data.tar.gz: d8ed027154403e68e5684213b0d0f58218a23dc2f667a882dcd3b2e8ab0c69b7
3
+ metadata.gz: 7a02a2377d0e8d4135f3b931bc73641eac28058d736d9c1dba0a97107c1d4c0e
4
+ data.tar.gz: 810d5bff80e852fa08ef9824ed6b313aa309bb69e84228bc1fbb7595069e043b
5
5
  SHA512:
6
- metadata.gz: 396863ed665137905cfa9fe51ee925776e0a0f616721658a889b9b587dda83b9cd1e0fa2a483b08fc65ec70797f07facec082c1c88403aa8d61e1ce4ae791779
7
- data.tar.gz: 705d626f45c816494949d6bc5c4f83cc4a8cd1c527aef72911bb90000e2151f758889c6080e9ac489f235df4169fbc158a7713dbf76d5e7ba5fdaf2a6ad51567
6
+ metadata.gz: 0d5fbea3c63349bb3f566e6137755f6cc8a4060d0e401abf5a0e7d8b44a4c4278089c10ffb8bb9cf2d783a238449140e5e54d90f3ad158aa362c6335eedca5aa
7
+ data.tar.gz: bf6fa83b1d5e55760e62d3cc090bf09bb69a7c761ae4a9358fb4d82192c7efc7500b6db361f39adac3581982862654aa4603a78dfbb3aed53b51d01137ffd736
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- chronicle-etl (0.2.3)
4
+ chronicle-etl (0.2.4)
5
5
  colorize (~> 0.8.1)
6
6
  deep_merge (~> 1.2)
7
7
  sequel (~> 5.35)
data/README.md CHANGED
@@ -2,9 +2,9 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl)
4
4
 
5
- Chronicle ETL is a utility tool for archiving and processing personal data. You can extract it from a variety of source, transform it, and load it to different APIs or file formats.
5
+ Chronicle ETL is a utility that helps you archive and process personal data. You can *extract* it from a variety of sources, *transform* it, and *load* it to an external API, file, or stdout.
6
6
 
7
- This project is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex).
7
+ This tool is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex), and the dozens of existing importers are being migrated to Chronicle.
8
8
 
9
9
  ## Installation
10
10
 
@@ -52,7 +52,7 @@ Built in connectors:
52
52
 
53
53
  In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
54
54
 
55
- - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` files. Transformers for chronicle schema
55
+ - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files. Transformers for chronicle schema
56
56
  - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`. Transform it for chronicle schema
57
57
 
58
58
  To install any of these, run `gem install chronicle-PROVIDER`.
@@ -1,13 +1,19 @@
1
1
  require_relative 'etl/catalog'
2
2
  require_relative 'etl/config'
3
- require_relative 'etl/job_definition'
4
3
  require_relative 'etl/exceptions'
5
4
  require_relative 'etl/extractors/extractor'
5
+ require_relative 'etl/job_definition'
6
6
  require_relative 'etl/job_log'
7
7
  require_relative 'etl/job_logger'
8
8
  require_relative 'etl/job'
9
9
  require_relative 'etl/loaders/loader'
10
+ require_relative 'etl/models/activity'
11
+ require_relative 'etl/models/base'
12
+ require_relative 'etl/models/entity'
13
+ require_relative 'etl/models/generic'
10
14
  require_relative 'etl/runner'
11
15
  require_relative 'etl/transformers/transformer'
16
+ require_relative 'etl/utils/hash_utilities'
17
+ require_relative 'etl/utils/jsonapi'
12
18
  require_relative 'etl/utils/progress_bar'
13
- require_relative 'etl/version'
19
+ require_relative 'etl/version'
@@ -2,6 +2,8 @@ module Chronicle
2
2
  module ETL
3
3
  class Error < StandardError; end;
4
4
 
5
+ class InvalidTransformedRecordError < Error; end
6
+
5
7
  class ConnectorNotAvailableError < Error
6
8
  def initialize(message, provider: nil, name: nil)
7
9
  super(message)
@@ -28,8 +28,7 @@ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
28
28
 
29
29
  csv_options = {
30
30
  headers: headers,
31
- header_converters: :symbol,
32
- converters: [:all]
31
+ converters: :all
33
32
  }
34
33
 
35
34
  stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
@@ -27,7 +27,10 @@ module Chronicle
27
27
  private
28
28
 
29
29
  def handle_continuation
30
- @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation] && @options[:continuation].highest_timestamp
30
+ return unless @options[:continuation]
31
+
32
+ @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
33
+ @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
31
34
  end
32
35
  end
33
36
  end
@@ -24,7 +24,7 @@ module Chronicle
24
24
  @loader_klass = load_klass(:loader, definition[:loader][:name])
25
25
  @loader_options = definition[:loader][:options] || {}
26
26
 
27
- set_continuation
27
+ set_continuation if load_continuation?
28
28
  yield self if block_given?
29
29
  end
30
30
 
@@ -32,7 +32,7 @@ module Chronicle
32
32
  instantiate_klass(:extractor)
33
33
  end
34
34
 
35
- def instantiate_transformer data
35
+ def instantiate_transformer(data)
36
36
  instantiate_klass(:transformer, data)
37
37
  end
38
38
 
@@ -40,6 +40,11 @@ module Chronicle
40
40
  instantiate_klass(:loader)
41
41
  end
42
42
 
43
+ def save_log?
44
+ # TODO: this needs more nuance
45
+ return !id.nil?
46
+ end
47
+
43
48
  private
44
49
 
45
50
  def instantiate_klass(phase, *args)
@@ -57,6 +62,10 @@ module Chronicle
57
62
  continuation = Chronicle::ETL::JobLogger.load_latest(@job_id)
58
63
  @extractor_options[:continuation] = continuation
59
64
  end
65
+
66
+ def load_continuation?
67
+ save_log?
68
+ end
60
69
  end
61
70
  end
62
71
  end
@@ -1,4 +1,4 @@
1
- require 'pry'
1
+ require 'forwardable'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
@@ -6,6 +6,8 @@ module Chronicle
6
6
  # tracking when it ran, if it was successful, and what the latest record
7
7
  # we found is (to use as a cursor for the next time)
8
8
  class JobLog
9
+ extend Forwardable
10
+
9
11
  attr_accessor :job,
10
12
  :job_id,
11
13
  :last_id,
@@ -15,6 +17,8 @@ module Chronicle
15
17
  :finished_at,
16
18
  :success
17
19
 
20
+ def_delegators :@job, :save_log?
21
+
18
22
  # Create a new JobLog for a given Job
19
23
  def initialize
20
24
  @num_records_processed = 0
@@ -64,6 +68,8 @@ module Chronicle
64
68
  }
65
69
  end
66
70
 
71
+ private
72
+
67
73
  # Create a new JobLog and set its instance variables from a serialized hash
68
74
  def self.build_from_serialized attrs
69
75
  attrs.delete(:id)
@@ -20,6 +20,8 @@ module Chronicle
20
20
 
21
21
  # Save this JobLogger's JobLog to db
22
22
  def save
23
+ return unless @job_log.save_log?
24
+
23
25
  JobLogger.with_db_connection do |db|
24
26
  dataset = db[:job_logs]
25
27
  dataset.insert(@job_log.serialize)
@@ -8,12 +8,8 @@ module Chronicle
8
8
  @rows = []
9
9
  end
10
10
 
11
- def load(result)
12
- if (result.is_a? Hash)
13
- @rows << result.values
14
- else
15
- @rows << result
16
- end
11
+ def load(record)
12
+ @rows << record.to_h_flattened.values
17
13
  end
18
14
 
19
15
  def finish
@@ -5,7 +5,7 @@ module Chronicle
5
5
  extend Chronicle::ETL::Catalog
6
6
 
7
7
  # Construct a new instance of this loader. Options are passed in from a Runner
8
- # == Paramters:
8
+ # == Parameters:
9
9
  # options::
10
10
  # Options for configuring this Loader
11
11
  def initialize(options = {})
@@ -5,25 +5,28 @@ require 'json'
5
5
  module Chronicle
6
6
  module ETL
7
7
  class RestLoader < Chronicle::ETL::Loader
8
- def initialize(options={})
8
+ def initialize( options={} )
9
9
  super(options)
10
10
  end
11
11
 
12
- def load(result)
12
+ def load(record)
13
+ payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
14
+ # have the outer data key that json-api expects
15
+ payload = { data: payload } unless payload[:data]
16
+
13
17
  uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
14
18
 
15
19
  header = {
16
20
  "Authorization" => "Bearer #{@options[:access_token]}",
17
21
  "Content-Type": 'application/json'
18
22
  }
23
+ use_ssl = uri.scheme == 'https'
19
24
 
20
- http = Net::HTTP.new(uri.host, uri.port)
21
- request = Net::HTTP::Post.new(uri.request_uri, header)
22
-
23
- obj = {data: result} unless result[:data]
24
- request.body = obj.to_json
25
-
26
- response = http.request(request)
25
+ Net::HTTP.start(uri.host, uri.port, use_ssl: use_ssl) do |http|
26
+ request = Net::HTTP::Post.new(uri.request_uri, header)
27
+ request.body = payload.to_json
28
+ http.request(request)
29
+ end
27
30
  end
28
31
  end
29
32
  end
@@ -1,9 +1,9 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class StdoutLoader < Chronicle::ETL::Loader
4
- def load(result)
5
- puts result.inspect
4
+ def load(record)
5
+ puts record.to_h
6
6
  end
7
7
  end
8
8
  end
9
- end
9
+ end
@@ -7,14 +7,15 @@ module Chronicle
7
7
  super(options)
8
8
  end
9
9
 
10
- def load(result)
11
- @table ||= TTY::Table.new(header: result.keys)
12
- values = result.values.map{|x| x.to_s[0..30]}
10
+ def load(record)
11
+ record_hash = record.to_h_flattened
12
+ @table ||= TTY::Table.new(header: record_hash.keys)
13
+ values = record_hash.values.map{|x| x.to_s[0..30]}
13
14
  @table << values
14
15
  end
15
16
 
16
17
  def finish
17
- puts @table.render(:ascii, padding: [0, 1])
18
+ puts @table.render(:ascii, padding: [0, 1]) if @table
18
19
  end
19
20
  end
20
21
  end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ class Activity < Chronicle::ETL::Models::Base
7
+ TYPE = 'activities'.freeze
8
+ ATTRIBUTES = [:verb, :start_at, :end_at].freeze
9
+ ASSOCIATIONS = [:involved, :actor].freeze
10
+
11
+ attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,103 @@
1
+ require 'digest'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ # Represents a record that's been transformed by a Transformer and
7
+ # ready to be loaded. Loosely based on ActiveModel.
8
+ class Base
9
+ ATTRIBUTES = [:provider, :provider_id, :lat, :lng].freeze
10
+ ASSOCIATIONS = [].freeze
11
+
12
+ attr_accessor(:id, :dedupe_on, *ATTRIBUTES)
13
+
14
+ def initialize(attributes = {})
15
+ assign_attributes(attributes) if attributes
16
+ @dedupe_on = []
17
+ end
18
+
19
+ # A unique identifier for this model is formed from a type
20
+ # and either an id or lids.
21
+ def identifier_hash
22
+ {
23
+ type: self.class::TYPE,
24
+ id: @id,
25
+ lids: lids
26
+ }.compact
27
+ end
28
+
29
+ # Array of local ids that uniquely identify this record
30
+ def lids
31
+ @dedupe_on.map do |fields|
32
+ generate_lid(fields)
33
+ end.compact.uniq
34
+ end
35
+
36
+ # For a given set of fields of this model, generate a
37
+ # unique local id by hashing the field values
38
+ def generate_lid fields
39
+ values = fields.sort.map do |field|
40
+ instance_variable = "@#{field.to_s}"
41
+ self.instance_variable_get(instance_variable)
42
+ end
43
+
44
+ return if values.any? { |e| e.nil? }
45
+
46
+ Digest::SHA256.hexdigest(values.join(","))
47
+ end
48
+
49
+ # Set of attribute names that this model has is Base's shared
50
+ # attributes combined with the child class's
51
+ def attribute_list
52
+ (ATTRIBUTES + self.class::ATTRIBUTES).uniq
53
+ end
54
+
55
+ # All of this record's attributes
56
+ def attributes
57
+ attributes = {}
58
+ attribute_list.each do |attribute|
59
+ instance_variable = "@#{attribute.to_s}"
60
+ attributes[attribute] = self.instance_variable_get(instance_variable)
61
+ end
62
+ attributes.compact
63
+ end
64
+
65
+ # All of this record's associations
66
+ def associations
67
+ association_list = ASSOCIATIONS + self.class::ASSOCIATIONS
68
+ attributes = {}
69
+ association_list.each do |attribute|
70
+ instance_variable = "@#{attribute.to_s}"
71
+ association = self.instance_variable_get(instance_variable)
72
+ attributes[attribute] = association if association
73
+ end
74
+ attributes.compact
75
+ end
76
+
77
+ def associations_hash
78
+ Hash[associations.map do |k, v|
79
+ [k, v.to_h]
80
+ end]
81
+ end
82
+
83
+ # FIXME: move this to a Utils module
84
+ def to_h_flattened
85
+ Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
86
+ end
87
+
88
+ def to_h
89
+ identifier_hash.merge(attributes).merge(associations_hash)
90
+ end
91
+
92
+ private
93
+
94
+ def assign_attributes attributes
95
+ attributes.each do |k, v|
96
+ setter = :"#{k}="
97
+ public_send(setter, v) if respond_to? setter
98
+ end
99
+ end
100
+ end
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ class Entity < Chronicle::ETL::Models::Base
7
+ TYPE = 'entities'.freeze
8
+ ATTRIBUTES = [:title, :body, :represents, :slug].freeze
9
+ ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema
10
+
11
+ attr_accessor(*ATTRIBUTES)
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,23 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ class Generic < Chronicle::ETL::Models::Base
7
+ TYPE = 'generic'
8
+
9
+ attr_accessor :properties
10
+
11
+ def initialize(properties = {})
12
+ @properties = properties
13
+ super
14
+ end
15
+
16
+ # Generic models have arbitrary attributes stored in @properties
17
+ def attributes
18
+ @properties.transform_keys(&:to_sym)
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
@@ -18,9 +18,14 @@ class Chronicle::ETL::Runner
18
18
 
19
19
  extractor.extract do |data, metadata|
20
20
  transformer = @job.instantiate_transformer(data)
21
- transformed_data = transformer.transform
21
+ record = transformer.transform
22
+
23
+ unless record.is_a?(Chronicle::ETL::Models::Base)
24
+ raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
25
+ end
26
+
22
27
  @job_logger.log_transformation(transformer)
23
- loader.load(transformed_data)
28
+ loader.load(record)
24
29
  progress_bar.increment
25
30
  end
26
31
 
@@ -2,9 +2,8 @@ module Chronicle
2
2
  module ETL
3
3
  class NullTransformer < Chronicle::ETL::Transformer
4
4
  def transform
5
- return @data
5
+ Chronicle::ETL::Models::Generic.new(@data)
6
6
  end
7
7
  end
8
-
9
8
  end
10
- end
9
+ end
@@ -11,12 +11,12 @@ module Chronicle
11
11
  def initialize(options = {}, data)
12
12
  @options = options
13
13
  @data = data
14
+ @record = Chronicle::ETL::Models::Activity.new
14
15
  end
15
16
 
16
- # The main entrypoint for transforming a record. Called by a Runner on each extracted record
17
- def transform
18
- raise NotImplementedError
19
- end
17
+ # @abstract Subclass is expected to implement #transform
18
+ # @!method transform
19
+ # The main entrypoint for transforming a record. Called by a Runner on each extracted record
20
20
 
21
21
  # The domain or provider-specific id of the record this transformer is working on.
22
22
  # Used for building a cursor so an extractor doesn't have to start from the beginning of a
@@ -31,5 +31,4 @@ module Chronicle
31
31
  end
32
32
  end
33
33
 
34
- require_relative 'json_transformer'
35
34
  require_relative 'null_transformer'
@@ -0,0 +1,19 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Utils
4
+ module HashUtilities
5
+ def self.flatten_hash(hash)
6
+ hash.each_with_object({}) do |(k, v), h|
7
+ if v.is_a? Hash
8
+ flatten_hash(v).map do |h_k, h_v|
9
+ h["#{k}.#{h_k}".to_sym] = h_v
10
+ end
11
+ else
12
+ h[k] = v
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,28 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Utils
4
+ module JSONAPI
5
+ # For a given Chronicle::ETL::Model, serialize it as jsonapi
6
+ def self.serialize(record)
7
+ return unless record.is_a? Chronicle::ETL::Models::Base
8
+
9
+ obj = record.identifier_hash
10
+ obj[:attributes] = record.attributes
11
+
12
+ relationships = Hash[record.associations.map do |k, v|
13
+ if v.is_a?(Array)
14
+ data = { data: v.map{ |association| serialize(association) } }
15
+ else
16
+ data = { data: serialize(v) }
17
+ end
18
+
19
+ [k, data]
20
+ end]
21
+
22
+ obj[:relationships] = relationships if relationships.any?
23
+ obj
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
@@ -64,7 +64,7 @@ module Chronicle
64
64
  end
65
65
 
66
66
  def log(message)
67
- @pbar.log message.inspect
67
+ @pbar.log message
68
68
  end
69
69
 
70
70
  def finish
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.2.3"
3
+ VERSION = "0.2.4"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chronicle-etl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Louis
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-09-04 00:00:00.000000000 Z
11
+ date: 2020-09-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -238,10 +238,15 @@ files:
238
238
  - lib/chronicle/etl/loaders/rest_loader.rb
239
239
  - lib/chronicle/etl/loaders/stdout_loader.rb
240
240
  - lib/chronicle/etl/loaders/table_loader.rb
241
+ - lib/chronicle/etl/models/activity.rb
242
+ - lib/chronicle/etl/models/base.rb
243
+ - lib/chronicle/etl/models/entity.rb
244
+ - lib/chronicle/etl/models/generic.rb
241
245
  - lib/chronicle/etl/runner.rb
242
- - lib/chronicle/etl/transformers/json_transformer.rb
243
246
  - lib/chronicle/etl/transformers/null_transformer.rb
244
247
  - lib/chronicle/etl/transformers/transformer.rb
248
+ - lib/chronicle/etl/utils/hash_utilities.rb
249
+ - lib/chronicle/etl/utils/jsonapi.rb
245
250
  - lib/chronicle/etl/utils/progress_bar.rb
246
251
  - lib/chronicle/etl/version.rb
247
252
  homepage: https://github.com/chronicle-app
@@ -1,11 +0,0 @@
1
- require 'json'
2
-
3
- module Chronicle
4
- module ETL
5
- class JsonTransformer < Chronicle::ETL::Transformer
6
- def transform data
7
- return JSON.parse(data)
8
- end
9
- end
10
- end
11
- end