chronicle-etl 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 12a38a335c05b1626b9f259318956480df3f96e447cc2b1a25b8a9c23d591e49
4
- data.tar.gz: d8ed027154403e68e5684213b0d0f58218a23dc2f667a882dcd3b2e8ab0c69b7
3
+ metadata.gz: 7a02a2377d0e8d4135f3b931bc73641eac28058d736d9c1dba0a97107c1d4c0e
4
+ data.tar.gz: 810d5bff80e852fa08ef9824ed6b313aa309bb69e84228bc1fbb7595069e043b
5
5
  SHA512:
6
- metadata.gz: 396863ed665137905cfa9fe51ee925776e0a0f616721658a889b9b587dda83b9cd1e0fa2a483b08fc65ec70797f07facec082c1c88403aa8d61e1ce4ae791779
7
- data.tar.gz: 705d626f45c816494949d6bc5c4f83cc4a8cd1c527aef72911bb90000e2151f758889c6080e9ac489f235df4169fbc158a7713dbf76d5e7ba5fdaf2a6ad51567
6
+ metadata.gz: 0d5fbea3c63349bb3f566e6137755f6cc8a4060d0e401abf5a0e7d8b44a4c4278089c10ffb8bb9cf2d783a238449140e5e54d90f3ad158aa362c6335eedca5aa
7
+ data.tar.gz: bf6fa83b1d5e55760e62d3cc090bf09bb69a7c761ae4a9358fb4d82192c7efc7500b6db361f39adac3581982862654aa4603a78dfbb3aed53b51d01137ffd736
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- chronicle-etl (0.2.3)
4
+ chronicle-etl (0.2.4)
5
5
  colorize (~> 0.8.1)
6
6
  deep_merge (~> 1.2)
7
7
  sequel (~> 5.35)
data/README.md CHANGED
@@ -2,9 +2,9 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl)
4
4
 
5
- Chronicle ETL is a utility tool for archiving and processing personal data. You can extract it from a variety of source, transform it, and load it to different APIs or file formats.
5
+ Chronicle ETL is a utility that helps you archive and process personal data. You can *extract* it from a variety of sources, *transform* it, and *load* it to an external API, file, or stdout.
6
6
 
7
- This project is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex).
7
+ This tool is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex); the dozens of existing Memex importers are being migrated to Chronicle.
8
8
 
9
9
  ## Installation
10
10
 
@@ -52,7 +52,7 @@ Built in connectors:
52
52
 
53
53
  In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
54
54
 
55
- - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` files. Transformers for chronicle schema
55
+ - [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files. Transformers for chronicle schema
56
56
  - [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`. Transform it for chronicle schema
57
57
 
58
58
  To install any of these, run `gem install chronicle-PROVIDER`.
@@ -1,13 +1,19 @@
1
1
  require_relative 'etl/catalog'
2
2
  require_relative 'etl/config'
3
- require_relative 'etl/job_definition'
4
3
  require_relative 'etl/exceptions'
5
4
  require_relative 'etl/extractors/extractor'
5
+ require_relative 'etl/job_definition'
6
6
  require_relative 'etl/job_log'
7
7
  require_relative 'etl/job_logger'
8
8
  require_relative 'etl/job'
9
9
  require_relative 'etl/loaders/loader'
10
+ require_relative 'etl/models/activity'
11
+ require_relative 'etl/models/base'
12
+ require_relative 'etl/models/entity'
13
+ require_relative 'etl/models/generic'
10
14
  require_relative 'etl/runner'
11
15
  require_relative 'etl/transformers/transformer'
16
+ require_relative 'etl/utils/hash_utilities'
17
+ require_relative 'etl/utils/jsonapi'
12
18
  require_relative 'etl/utils/progress_bar'
13
- require_relative 'etl/version'
19
+ require_relative 'etl/version'
@@ -2,6 +2,8 @@ module Chronicle
2
2
  module ETL
3
3
  class Error < StandardError; end;
4
4
 
5
+ class InvalidTransformedRecordError < Error; end
6
+
5
7
  class ConnectorNotAvailableError < Error
6
8
  def initialize(message, provider: nil, name: nil)
7
9
  super(message)
@@ -28,8 +28,7 @@ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
28
28
 
29
29
  csv_options = {
30
30
  headers: headers,
31
- header_converters: :symbol,
32
- converters: [:all]
31
+ converters: :all
33
32
  }
34
33
 
35
34
  stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
@@ -27,7 +27,10 @@ module Chronicle
27
27
  private
28
28
 
29
29
  def handle_continuation
30
- @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation] && @options[:continuation].highest_timestamp
30
+ return unless @options[:continuation]
31
+
32
+ @options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
33
+ @options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
31
34
  end
32
35
  end
33
36
  end
@@ -24,7 +24,7 @@ module Chronicle
24
24
  @loader_klass = load_klass(:loader, definition[:loader][:name])
25
25
  @loader_options = definition[:loader][:options] || {}
26
26
 
27
- set_continuation
27
+ set_continuation if load_continuation?
28
28
  yield self if block_given?
29
29
  end
30
30
 
@@ -32,7 +32,7 @@ module Chronicle
32
32
  instantiate_klass(:extractor)
33
33
  end
34
34
 
35
- def instantiate_transformer data
35
+ def instantiate_transformer(data)
36
36
  instantiate_klass(:transformer, data)
37
37
  end
38
38
 
@@ -40,6 +40,11 @@ module Chronicle
40
40
  instantiate_klass(:loader)
41
41
  end
42
42
 
43
+ def save_log?
44
+ # TODO: this needs more nuance
45
+ return !id.nil?
46
+ end
47
+
43
48
  private
44
49
 
45
50
  def instantiate_klass(phase, *args)
@@ -57,6 +62,10 @@ module Chronicle
57
62
  continuation = Chronicle::ETL::JobLogger.load_latest(@job_id)
58
63
  @extractor_options[:continuation] = continuation
59
64
  end
65
+
66
+ def load_continuation?
67
+ save_log?
68
+ end
60
69
  end
61
70
  end
62
71
  end
@@ -1,4 +1,4 @@
1
- require 'pry'
1
+ require 'forwardable'
2
2
 
3
3
  module Chronicle
4
4
  module ETL
@@ -6,6 +6,8 @@ module Chronicle
6
6
  # tracking when it ran, if it was successful, and what the latest record
7
7
  # we found is (to use as a cursor for the next time)
8
8
  class JobLog
9
+ extend Forwardable
10
+
9
11
  attr_accessor :job,
10
12
  :job_id,
11
13
  :last_id,
@@ -15,6 +17,8 @@ module Chronicle
15
17
  :finished_at,
16
18
  :success
17
19
 
20
+ def_delegators :@job, :save_log?
21
+
18
22
  # Create a new JobLog for a given Job
19
23
  def initialize
20
24
  @num_records_processed = 0
@@ -64,6 +68,8 @@ module Chronicle
64
68
  }
65
69
  end
66
70
 
71
+ private
72
+
67
73
  # Create a new JobLog and set its instance variables from a serialized hash
68
74
  def self.build_from_serialized attrs
69
75
  attrs.delete(:id)
@@ -20,6 +20,8 @@ module Chronicle
20
20
 
21
21
  # Save this JobLogger's JobLog to db
22
22
  def save
23
+ return unless @job_log.save_log?
24
+
23
25
  JobLogger.with_db_connection do |db|
24
26
  dataset = db[:job_logs]
25
27
  dataset.insert(@job_log.serialize)
@@ -8,12 +8,8 @@ module Chronicle
8
8
  @rows = []
9
9
  end
10
10
 
11
- def load(result)
12
- if (result.is_a? Hash)
13
- @rows << result.values
14
- else
15
- @rows << result
16
- end
11
+ def load(record)
12
+ @rows << record.to_h_flattened.values
17
13
  end
18
14
 
19
15
  def finish
@@ -5,7 +5,7 @@ module Chronicle
5
5
  extend Chronicle::ETL::Catalog
6
6
 
7
7
  # Construct a new instance of this loader. Options are passed in from a Runner
8
- # == Paramters:
8
+ # == Parameters:
9
9
  # options::
10
10
  # Options for configuring this Loader
11
11
  def initialize(options = {})
@@ -5,25 +5,28 @@ require 'json'
5
5
  module Chronicle
6
6
  module ETL
7
7
  class RestLoader < Chronicle::ETL::Loader
8
- def initialize(options={})
8
+ def initialize( options={} )
9
9
  super(options)
10
10
  end
11
11
 
12
- def load(result)
12
+ def load(record)
13
+ payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
14
+ # have the outer data key that json-api expects
15
+ payload = { data: payload } unless payload[:data]
16
+
13
17
  uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
14
18
 
15
19
  header = {
16
20
  "Authorization" => "Bearer #{@options[:access_token]}",
17
21
  "Content-Type": 'application/json'
18
22
  }
23
+ use_ssl = uri.scheme == 'https'
19
24
 
20
- http = Net::HTTP.new(uri.host, uri.port)
21
- request = Net::HTTP::Post.new(uri.request_uri, header)
22
-
23
- obj = {data: result} unless result[:data]
24
- request.body = obj.to_json
25
-
26
- response = http.request(request)
25
+ Net::HTTP.start(uri.host, uri.port, use_ssl: use_ssl) do |http|
26
+ request = Net::HTTP::Post.new(uri.request_uri, header)
27
+ request.body = payload.to_json
28
+ http.request(request)
29
+ end
27
30
  end
28
31
  end
29
32
  end
@@ -1,9 +1,9 @@
1
1
  module Chronicle
2
2
  module ETL
3
3
  class StdoutLoader < Chronicle::ETL::Loader
4
- def load(result)
5
- puts result.inspect
4
+ def load(record)
5
+ puts record.to_h
6
6
  end
7
7
  end
8
8
  end
9
- end
9
+ end
@@ -7,14 +7,15 @@ module Chronicle
7
7
  super(options)
8
8
  end
9
9
 
10
- def load(result)
11
- @table ||= TTY::Table.new(header: result.keys)
12
- values = result.values.map{|x| x.to_s[0..30]}
10
+ def load(record)
11
+ record_hash = record.to_h_flattened
12
+ @table ||= TTY::Table.new(header: record_hash.keys)
13
+ values = record_hash.values.map{|x| x.to_s[0..30]}
13
14
  @table << values
14
15
  end
15
16
 
16
17
  def finish
17
- puts @table.render(:ascii, padding: [0, 1])
18
+ puts @table.render(:ascii, padding: [0, 1]) if @table
18
19
  end
19
20
  end
20
21
  end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ class Activity < Chronicle::ETL::Models::Base
7
+ TYPE = 'activities'.freeze
8
+ ATTRIBUTES = [:verb, :start_at, :end_at].freeze
9
+ ASSOCIATIONS = [:involved, :actor].freeze
10
+
11
+ attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,103 @@
1
+ require 'digest'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ # Represents a record that's been transformed by a Transformer and
7
+ # is ready to be loaded. Loosely based on ActiveModel.
8
+ class Base
9
+ ATTRIBUTES = [:provider, :provider_id, :lat, :lng].freeze
10
+ ASSOCIATIONS = [].freeze
11
+
12
+ attr_accessor(:id, :dedupe_on, *ATTRIBUTES)
13
+
14
+ def initialize(attributes = {})
15
+ assign_attributes(attributes) if attributes
16
+ @dedupe_on = []
17
+ end
18
+
19
+ # A unique identifier for this model is formed from a type
20
+ # and either an id or lids.
21
+ def identifier_hash
22
+ {
23
+ type: self.class::TYPE,
24
+ id: @id,
25
+ lids: lids
26
+ }.compact
27
+ end
28
+
29
+ # Array of local ids that uniquely identify this record
30
+ def lids
31
+ @dedupe_on.map do |fields|
32
+ generate_lid(fields)
33
+ end.compact.uniq
34
+ end
35
+
36
+ # For a given set of fields of this model, generate a
37
+ # unique local id by hashing the field values
38
+ def generate_lid fields
39
+ values = fields.sort.map do |field|
40
+ instance_variable = "@#{field.to_s}"
41
+ self.instance_variable_get(instance_variable)
42
+ end
43
+
44
+ return if values.any? { |e| e.nil? }
45
+
46
+ Digest::SHA256.hexdigest(values.join(","))
47
+ end
48
+
49
+ # The attribute names for this model: Base's shared attributes
50
+ # combined with the child class's own
51
+ def attribute_list
52
+ (ATTRIBUTES + self.class::ATTRIBUTES).uniq
53
+ end
54
+
55
+ # All of this record's attributes
56
+ def attributes
57
+ attributes = {}
58
+ attribute_list.each do |attribute|
59
+ instance_variable = "@#{attribute.to_s}"
60
+ attributes[attribute] = self.instance_variable_get(instance_variable)
61
+ end
62
+ attributes.compact
63
+ end
64
+
65
+ # All of this record's associations
66
+ def associations
67
+ association_list = ASSOCIATIONS + self.class::ASSOCIATIONS
68
+ attributes = {}
69
+ association_list.each do |attribute|
70
+ instance_variable = "@#{attribute.to_s}"
71
+ association = self.instance_variable_get(instance_variable)
72
+ attributes[attribute] = association if association
73
+ end
74
+ attributes.compact
75
+ end
76
+
77
+ def associations_hash
78
+ Hash[associations.map do |k, v|
79
+ [k, v.to_h]
80
+ end]
81
+ end
82
+
83
+ # FIXME: move this to a Utils module
84
+ def to_h_flattened
85
+ Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
86
+ end
87
+
88
+ def to_h
89
+ identifier_hash.merge(attributes).merge(associations_hash)
90
+ end
91
+
92
+ private
93
+
94
+ def assign_attributes attributes
95
+ attributes.each do |k, v|
96
+ setter = :"#{k}="
97
+ public_send(setter, v) if respond_to? setter
98
+ end
99
+ end
100
+ end
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,15 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ class Entity < Chronicle::ETL::Models::Base
7
+ TYPE = 'entities'.freeze
8
+ ATTRIBUTES = [:title, :body, :represents, :slug].freeze
9
+ ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema
10
+
11
+ attr_accessor(*ATTRIBUTES)
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,23 @@
1
+ require 'chronicle/etl/models/base'
2
+
3
+ module Chronicle
4
+ module ETL
5
+ module Models
6
+ class Generic < Chronicle::ETL::Models::Base
7
+ TYPE = 'generic'
8
+
9
+ attr_accessor :properties
10
+
11
+ def initialize(properties = {})
12
+ @properties = properties
13
+ super
14
+ end
15
+
16
+ # Generic models have arbitrary attributes stored in @properties
17
+ def attributes
18
+ @properties.transform_keys(&:to_sym)
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
@@ -18,9 +18,14 @@ class Chronicle::ETL::Runner
18
18
 
19
19
  extractor.extract do |data, metadata|
20
20
  transformer = @job.instantiate_transformer(data)
21
- transformed_data = transformer.transform
21
+ record = transformer.transform
22
+
23
+ unless record.is_a?(Chronicle::ETL::Models::Base)
24
+ raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
25
+ end
26
+
22
27
  @job_logger.log_transformation(transformer)
23
- loader.load(transformed_data)
28
+ loader.load(record)
24
29
  progress_bar.increment
25
30
  end
26
31
 
@@ -2,9 +2,8 @@ module Chronicle
2
2
  module ETL
3
3
  class NullTransformer < Chronicle::ETL::Transformer
4
4
  def transform
5
- return @data
5
+ Chronicle::ETL::Models::Generic.new(@data)
6
6
  end
7
7
  end
8
-
9
8
  end
10
- end
9
+ end
@@ -11,12 +11,12 @@ module Chronicle
11
11
  def initialize(options = {}, data)
12
12
  @options = options
13
13
  @data = data
14
+ @record = Chronicle::ETL::Models::Activity.new
14
15
  end
15
16
 
16
- # The main entrypoint for transforming a record. Called by a Runner on each extracted record
17
- def transform
18
- raise NotImplementedError
19
- end
17
+ # @abstract Subclass is expected to implement #transform
18
+ # @!method transform
19
+ # The main entrypoint for transforming a record. Called by a Runner on each extracted record
20
20
 
21
21
  # The domain or provider-specific id of the record this transformer is working on.
22
22
  # Used for building a cursor so an extractor doesn't have to start from the beginning of a
@@ -31,5 +31,4 @@ module Chronicle
31
31
  end
32
32
  end
33
33
 
34
- require_relative 'json_transformer'
35
34
  require_relative 'null_transformer'
@@ -0,0 +1,19 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Utils
4
+ module HashUtilities
5
+ def self.flatten_hash(hash)
6
+ hash.each_with_object({}) do |(k, v), h|
7
+ if v.is_a? Hash
8
+ flatten_hash(v).map do |h_k, h_v|
9
+ h["#{k}.#{h_k}".to_sym] = h_v
10
+ end
11
+ else
12
+ h[k] = v
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,28 @@
1
+ module Chronicle
2
+ module ETL
3
+ module Utils
4
+ module JSONAPI
5
+ # For a given Chronicle::ETL::Models::Base record, serialize it as JSON:API
6
+ def self.serialize(record)
7
+ return unless record.is_a? Chronicle::ETL::Models::Base
8
+
9
+ obj = record.identifier_hash
10
+ obj[:attributes] = record.attributes
11
+
12
+ relationships = Hash[record.associations.map do |k, v|
13
+ if v.is_a?(Array)
14
+ data = { data: v.map{ |association| serialize(association) } }
15
+ else
16
+ data = { data: serialize(v) }
17
+ end
18
+
19
+ [k, data]
20
+ end]
21
+
22
+ obj[:relationships] = relationships if relationships.any?
23
+ obj
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
@@ -64,7 +64,7 @@ module Chronicle
64
64
  end
65
65
 
66
66
  def log(message)
67
- @pbar.log message.inspect
67
+ @pbar.log message
68
68
  end
69
69
 
70
70
  def finish
@@ -1,5 +1,5 @@
1
1
  module Chronicle
2
2
  module ETL
3
- VERSION = "0.2.3"
3
+ VERSION = "0.2.4"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chronicle-etl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Louis
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-09-04 00:00:00.000000000 Z
11
+ date: 2020-09-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -238,10 +238,15 @@ files:
238
238
  - lib/chronicle/etl/loaders/rest_loader.rb
239
239
  - lib/chronicle/etl/loaders/stdout_loader.rb
240
240
  - lib/chronicle/etl/loaders/table_loader.rb
241
+ - lib/chronicle/etl/models/activity.rb
242
+ - lib/chronicle/etl/models/base.rb
243
+ - lib/chronicle/etl/models/entity.rb
244
+ - lib/chronicle/etl/models/generic.rb
241
245
  - lib/chronicle/etl/runner.rb
242
- - lib/chronicle/etl/transformers/json_transformer.rb
243
246
  - lib/chronicle/etl/transformers/null_transformer.rb
244
247
  - lib/chronicle/etl/transformers/transformer.rb
248
+ - lib/chronicle/etl/utils/hash_utilities.rb
249
+ - lib/chronicle/etl/utils/jsonapi.rb
245
250
  - lib/chronicle/etl/utils/progress_bar.rb
246
251
  - lib/chronicle/etl/version.rb
247
252
  homepage: https://github.com/chronicle-app
@@ -1,11 +0,0 @@
1
- require 'json'
2
-
3
- module Chronicle
4
- module ETL
5
- class JsonTransformer < Chronicle::ETL::Transformer
6
- def transform data
7
- return JSON.parse(data)
8
- end
9
- end
10
- end
11
- end