chronicle-etl 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +3 -3
- data/lib/chronicle/etl.rb +8 -2
- data/lib/chronicle/etl/exceptions.rb +2 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -2
- data/lib/chronicle/etl/extractors/extractor.rb +4 -1
- data/lib/chronicle/etl/job.rb +11 -2
- data/lib/chronicle/etl/job_log.rb +7 -1
- data/lib/chronicle/etl/job_logger.rb +2 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +2 -6
- data/lib/chronicle/etl/loaders/loader.rb +1 -1
- data/lib/chronicle/etl/loaders/rest_loader.rb +12 -9
- data/lib/chronicle/etl/loaders/stdout_loader.rb +3 -3
- data/lib/chronicle/etl/loaders/table_loader.rb +5 -4
- data/lib/chronicle/etl/models/activity.rb +15 -0
- data/lib/chronicle/etl/models/base.rb +103 -0
- data/lib/chronicle/etl/models/entity.rb +15 -0
- data/lib/chronicle/etl/models/generic.rb +23 -0
- data/lib/chronicle/etl/runner.rb +7 -2
- data/lib/chronicle/etl/transformers/null_transformer.rb +2 -3
- data/lib/chronicle/etl/transformers/transformer.rb +4 -5
- data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
- data/lib/chronicle/etl/utils/jsonapi.rb +28 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
- data/lib/chronicle/etl/version.rb +1 -1
- metadata +8 -3
- data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a02a2377d0e8d4135f3b931bc73641eac28058d736d9c1dba0a97107c1d4c0e
|
4
|
+
data.tar.gz: 810d5bff80e852fa08ef9824ed6b313aa309bb69e84228bc1fbb7595069e043b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0d5fbea3c63349bb3f566e6137755f6cc8a4060d0e401abf5a0e7d8b44a4c4278089c10ffb8bb9cf2d783a238449140e5e54d90f3ad158aa362c6335eedca5aa
|
7
|
+
data.tar.gz: bf6fa83b1d5e55760e62d3cc090bf09bb69a7c761ae4a9358fb4d82192c7efc7500b6db361f39adac3581982862654aa4603a78dfbb3aed53b51d01137ffd736
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
[](https://badge.fury.io/rb/chronicle-etl)
|
4
4
|
|
5
|
-
Chronicle ETL is a utility
|
5
|
+
Chronicle ETL is a utility that helps you archive and process personal data. You can *extract* it from a variety of sources, *transform* it, and *load* it to an external API, file, or stdout.
|
6
6
|
|
7
|
-
This
|
7
|
+
This tool is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex) and the dozens of existing importers are being migrated to Chronicle.
|
8
8
|
|
9
9
|
## Installation
|
10
10
|
|
@@ -52,7 +52,7 @@ Built in connectors:
|
|
52
52
|
|
53
53
|
In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
|
54
54
|
|
55
|
-
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` files. Transformers for chronicle schema
|
55
|
+
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files. Transformers for chronicle schema
|
56
56
|
- [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`. Transform it for chronicle schema
|
57
57
|
|
58
58
|
To install any of these, run `gem install chronicle-PROVIDER`.
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,13 +1,19 @@
|
|
1
1
|
require_relative 'etl/catalog'
|
2
2
|
require_relative 'etl/config'
|
3
|
-
require_relative 'etl/job_definition'
|
4
3
|
require_relative 'etl/exceptions'
|
5
4
|
require_relative 'etl/extractors/extractor'
|
5
|
+
require_relative 'etl/job_definition'
|
6
6
|
require_relative 'etl/job_log'
|
7
7
|
require_relative 'etl/job_logger'
|
8
8
|
require_relative 'etl/job'
|
9
9
|
require_relative 'etl/loaders/loader'
|
10
|
+
require_relative 'etl/models/activity'
|
11
|
+
require_relative 'etl/models/base'
|
12
|
+
require_relative 'etl/models/entity'
|
13
|
+
require_relative 'etl/models/generic'
|
10
14
|
require_relative 'etl/runner'
|
11
15
|
require_relative 'etl/transformers/transformer'
|
16
|
+
require_relative 'etl/utils/hash_utilities'
|
17
|
+
require_relative 'etl/utils/jsonapi'
|
12
18
|
require_relative 'etl/utils/progress_bar'
|
13
|
-
require_relative 'etl/version'
|
19
|
+
require_relative 'etl/version'
|
@@ -28,8 +28,7 @@ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
|
28
28
|
|
29
29
|
csv_options = {
|
30
30
|
headers: headers,
|
31
|
-
|
32
|
-
converters: [:all]
|
31
|
+
converters: :all
|
33
32
|
}
|
34
33
|
|
35
34
|
stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
|
@@ -27,7 +27,10 @@ module Chronicle
|
|
27
27
|
private
|
28
28
|
|
29
29
|
def handle_continuation
|
30
|
-
|
30
|
+
return unless @options[:continuation]
|
31
|
+
|
32
|
+
@options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
|
33
|
+
@options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
|
31
34
|
end
|
32
35
|
end
|
33
36
|
end
|
data/lib/chronicle/etl/job.rb
CHANGED
@@ -24,7 +24,7 @@ module Chronicle
|
|
24
24
|
@loader_klass = load_klass(:loader, definition[:loader][:name])
|
25
25
|
@loader_options = definition[:loader][:options] || {}
|
26
26
|
|
27
|
-
set_continuation
|
27
|
+
set_continuation if load_continuation?
|
28
28
|
yield self if block_given?
|
29
29
|
end
|
30
30
|
|
@@ -32,7 +32,7 @@ module Chronicle
|
|
32
32
|
instantiate_klass(:extractor)
|
33
33
|
end
|
34
34
|
|
35
|
-
def instantiate_transformer
|
35
|
+
def instantiate_transformer(data)
|
36
36
|
instantiate_klass(:transformer, data)
|
37
37
|
end
|
38
38
|
|
@@ -40,6 +40,11 @@ module Chronicle
|
|
40
40
|
instantiate_klass(:loader)
|
41
41
|
end
|
42
42
|
|
43
|
+
def save_log?
|
44
|
+
# TODO: this needs more nuance
|
45
|
+
return !id.nil?
|
46
|
+
end
|
47
|
+
|
43
48
|
private
|
44
49
|
|
45
50
|
def instantiate_klass(phase, *args)
|
@@ -57,6 +62,10 @@ module Chronicle
|
|
57
62
|
continuation = Chronicle::ETL::JobLogger.load_latest(@job_id)
|
58
63
|
@extractor_options[:continuation] = continuation
|
59
64
|
end
|
65
|
+
|
66
|
+
def load_continuation?
|
67
|
+
save_log?
|
68
|
+
end
|
60
69
|
end
|
61
70
|
end
|
62
71
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'forwardable'
|
2
2
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
@@ -6,6 +6,8 @@ module Chronicle
|
|
6
6
|
# tracking when it ran, if it was successful, and what the latest record
|
7
7
|
# we found is (to use as a cursor for the next time)
|
8
8
|
class JobLog
|
9
|
+
extend Forwardable
|
10
|
+
|
9
11
|
attr_accessor :job,
|
10
12
|
:job_id,
|
11
13
|
:last_id,
|
@@ -15,6 +17,8 @@ module Chronicle
|
|
15
17
|
:finished_at,
|
16
18
|
:success
|
17
19
|
|
20
|
+
def_delegators :@job, :save_log?
|
21
|
+
|
18
22
|
# Create a new JobLog for a given Job
|
19
23
|
def initialize
|
20
24
|
@num_records_processed = 0
|
@@ -64,6 +68,8 @@ module Chronicle
|
|
64
68
|
}
|
65
69
|
end
|
66
70
|
|
71
|
+
private
|
72
|
+
|
67
73
|
# Create a new JobLog and set its instance variables from a serialized hash
|
68
74
|
def self.build_from_serialized attrs
|
69
75
|
attrs.delete(:id)
|
@@ -5,25 +5,28 @@ require 'json'
|
|
5
5
|
module Chronicle
|
6
6
|
module ETL
|
7
7
|
class RestLoader < Chronicle::ETL::Loader
|
8
|
-
def initialize(options={})
|
8
|
+
def initialize( options={} )
|
9
9
|
super(options)
|
10
10
|
end
|
11
11
|
|
12
|
-
def load(
|
12
|
+
def load(record)
|
13
|
+
payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
|
14
|
+
# have the outer data key that json-api expects
|
15
|
+
payload = { data: payload } unless payload[:data]
|
16
|
+
|
13
17
|
uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
|
14
18
|
|
15
19
|
header = {
|
16
20
|
"Authorization" => "Bearer #{@options[:access_token]}",
|
17
21
|
"Content-Type": 'application/json'
|
18
22
|
}
|
23
|
+
use_ssl = uri.scheme == 'https'
|
19
24
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
response = http.request(request)
|
25
|
+
Net::HTTP.start(uri.host, uri.port, use_ssl: use_ssl) do |http|
|
26
|
+
request = Net::HTTP::Post.new(uri.request_uri, header)
|
27
|
+
request.body = payload.to_json
|
28
|
+
http.request(request)
|
29
|
+
end
|
27
30
|
end
|
28
31
|
end
|
29
32
|
end
|
@@ -7,14 +7,15 @@ module Chronicle
|
|
7
7
|
super(options)
|
8
8
|
end
|
9
9
|
|
10
|
-
def load(
|
11
|
-
|
12
|
-
|
10
|
+
def load(record)
|
11
|
+
record_hash = record.to_h_flattened
|
12
|
+
@table ||= TTY::Table.new(header: record_hash.keys)
|
13
|
+
values = record_hash.values.map{|x| x.to_s[0..30]}
|
13
14
|
@table << values
|
14
15
|
end
|
15
16
|
|
16
17
|
def finish
|
17
|
-
puts @table.render(:ascii, padding: [0, 1])
|
18
|
+
puts @table.render(:ascii, padding: [0, 1]) if @table
|
18
19
|
end
|
19
20
|
end
|
20
21
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
class Activity < Chronicle::ETL::Models::Base
|
7
|
+
TYPE = 'activities'.freeze
|
8
|
+
ATTRIBUTES = [:verb, :start_at, :end_at].freeze
|
9
|
+
ASSOCIATIONS = [:involved, :actor].freeze
|
10
|
+
|
11
|
+
attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'digest'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# Represents a record that's been transformed by a Transformer and
|
7
|
+
# ready to be loaded. Loosely based on ActiveModel.
|
8
|
+
class Base
|
9
|
+
ATTRIBUTES = [:provider, :provider_id, :lat, :lng].freeze
|
10
|
+
ASSOCIATIONS = [].freeze
|
11
|
+
|
12
|
+
attr_accessor(:id, :dedupe_on, *ATTRIBUTES)
|
13
|
+
|
14
|
+
def initialize(attributes = {})
|
15
|
+
assign_attributes(attributes) if attributes
|
16
|
+
@dedupe_on = []
|
17
|
+
end
|
18
|
+
|
19
|
+
# A unique identifier for this model is formed from a type
|
20
|
+
# and either an id or lids.
|
21
|
+
def identifier_hash
|
22
|
+
{
|
23
|
+
type: self.class::TYPE,
|
24
|
+
id: @id,
|
25
|
+
lids: lids
|
26
|
+
}.compact
|
27
|
+
end
|
28
|
+
|
29
|
+
# Array of local ids that uniquely identify this record
|
30
|
+
def lids
|
31
|
+
@dedupe_on.map do |fields|
|
32
|
+
generate_lid(fields)
|
33
|
+
end.compact.uniq
|
34
|
+
end
|
35
|
+
|
36
|
+
# For a given set of fields of this model, generate a
|
37
|
+
# unique local id by hashing the field values
|
38
|
+
def generate_lid fields
|
39
|
+
values = fields.sort.map do |field|
|
40
|
+
instance_variable = "@#{field.to_s}"
|
41
|
+
self.instance_variable_get(instance_variable)
|
42
|
+
end
|
43
|
+
|
44
|
+
return if values.any? { |e| e.nil? }
|
45
|
+
|
46
|
+
Digest::SHA256.hexdigest(values.join(","))
|
47
|
+
end
|
48
|
+
|
49
|
+
# Set of attribute names that this model has is Base's shared
|
50
|
+
# attributes combined with the child class's
|
51
|
+
def attribute_list
|
52
|
+
(ATTRIBUTES + self.class::ATTRIBUTES).uniq
|
53
|
+
end
|
54
|
+
|
55
|
+
# All of this record's attributes
|
56
|
+
def attributes
|
57
|
+
attributes = {}
|
58
|
+
attribute_list.each do |attribute|
|
59
|
+
instance_variable = "@#{attribute.to_s}"
|
60
|
+
attributes[attribute] = self.instance_variable_get(instance_variable)
|
61
|
+
end
|
62
|
+
attributes.compact
|
63
|
+
end
|
64
|
+
|
65
|
+
# All of this record's associations
|
66
|
+
def associations
|
67
|
+
association_list = ASSOCIATIONS + self.class::ASSOCIATIONS
|
68
|
+
attributes = {}
|
69
|
+
association_list.each do |attribute|
|
70
|
+
instance_variable = "@#{attribute.to_s}"
|
71
|
+
association = self.instance_variable_get(instance_variable)
|
72
|
+
attributes[attribute] = association if association
|
73
|
+
end
|
74
|
+
attributes.compact
|
75
|
+
end
|
76
|
+
|
77
|
+
def associations_hash
|
78
|
+
Hash[associations.map do |k, v|
|
79
|
+
[k, v.to_h]
|
80
|
+
end]
|
81
|
+
end
|
82
|
+
|
83
|
+
# FIXME: move this to a Utils module
|
84
|
+
def to_h_flattened
|
85
|
+
Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
|
86
|
+
end
|
87
|
+
|
88
|
+
def to_h
|
89
|
+
identifier_hash.merge(attributes).merge(associations_hash)
|
90
|
+
end
|
91
|
+
|
92
|
+
private
|
93
|
+
|
94
|
+
def assign_attributes attributes
|
95
|
+
attributes.each do |k, v|
|
96
|
+
setter = :"#{k}="
|
97
|
+
public_send(setter, v) if respond_to? setter
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
class Entity < Chronicle::ETL::Models::Base
|
7
|
+
TYPE = 'entities'.freeze
|
8
|
+
ATTRIBUTES = [:title, :body, :represents, :slug].freeze
|
9
|
+
ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema
|
10
|
+
|
11
|
+
attr_accessor(*ATTRIBUTES)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
class Generic < Chronicle::ETL::Models::Base
|
7
|
+
TYPE = 'generic'
|
8
|
+
|
9
|
+
attr_accessor :properties
|
10
|
+
|
11
|
+
def initialize(properties = {})
|
12
|
+
@properties = properties
|
13
|
+
super
|
14
|
+
end
|
15
|
+
|
16
|
+
# Generic models have arbitrary attributes stored in @properties
|
17
|
+
def attributes
|
18
|
+
@properties.transform_keys(&:to_sym)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -18,9 +18,14 @@ class Chronicle::ETL::Runner
|
|
18
18
|
|
19
19
|
extractor.extract do |data, metadata|
|
20
20
|
transformer = @job.instantiate_transformer(data)
|
21
|
-
|
21
|
+
record = transformer.transform
|
22
|
+
|
23
|
+
unless record.is_a?(Chronicle::ETL::Models::Base)
|
24
|
+
raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
|
25
|
+
end
|
26
|
+
|
22
27
|
@job_logger.log_transformation(transformer)
|
23
|
-
loader.load(
|
28
|
+
loader.load(record)
|
24
29
|
progress_bar.increment
|
25
30
|
end
|
26
31
|
|
@@ -11,12 +11,12 @@ module Chronicle
|
|
11
11
|
def initialize(options = {}, data)
|
12
12
|
@options = options
|
13
13
|
@data = data
|
14
|
+
@record = Chronicle::ETL::Models::Activity.new
|
14
15
|
end
|
15
16
|
|
16
|
-
#
|
17
|
-
|
18
|
-
|
19
|
-
end
|
17
|
+
# @abstract Subclass is expected to implement #transform
|
18
|
+
# @!method transform
|
19
|
+
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
20
20
|
|
21
21
|
# The domain or provider-specific id of the record this transformer is working on.
|
22
22
|
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
@@ -31,5 +31,4 @@ module Chronicle
|
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
-
require_relative 'json_transformer'
|
35
34
|
require_relative 'null_transformer'
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
module Utils
|
4
|
+
module HashUtilities
|
5
|
+
def self.flatten_hash(hash)
|
6
|
+
hash.each_with_object({}) do |(k, v), h|
|
7
|
+
if v.is_a? Hash
|
8
|
+
flatten_hash(v).map do |h_k, h_v|
|
9
|
+
h["#{k}.#{h_k}".to_sym] = h_v
|
10
|
+
end
|
11
|
+
else
|
12
|
+
h[k] = v
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
module Utils
|
4
|
+
module JSONAPI
|
5
|
+
# For a given Chronicle::ETL::Models::Base record, serialize it as jsonapi
|
6
|
+
def self.serialize(record)
|
7
|
+
return unless record.is_a? Chronicle::ETL::Models::Base
|
8
|
+
|
9
|
+
obj = record.identifier_hash
|
10
|
+
obj[:attributes] = record.attributes
|
11
|
+
|
12
|
+
relationships = Hash[record.associations.map do |k, v|
|
13
|
+
if v.is_a?(Array)
|
14
|
+
data = { data: v.map{ |association| serialize(association) } }
|
15
|
+
else
|
16
|
+
data = { data: serialize(v) }
|
17
|
+
end
|
18
|
+
|
19
|
+
[k, data]
|
20
|
+
end]
|
21
|
+
|
22
|
+
obj[:relationships] = relationships if relationships.any?
|
23
|
+
obj
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -238,10 +238,15 @@ files:
|
|
238
238
|
- lib/chronicle/etl/loaders/rest_loader.rb
|
239
239
|
- lib/chronicle/etl/loaders/stdout_loader.rb
|
240
240
|
- lib/chronicle/etl/loaders/table_loader.rb
|
241
|
+
- lib/chronicle/etl/models/activity.rb
|
242
|
+
- lib/chronicle/etl/models/base.rb
|
243
|
+
- lib/chronicle/etl/models/entity.rb
|
244
|
+
- lib/chronicle/etl/models/generic.rb
|
241
245
|
- lib/chronicle/etl/runner.rb
|
242
|
-
- lib/chronicle/etl/transformers/json_transformer.rb
|
243
246
|
- lib/chronicle/etl/transformers/null_transformer.rb
|
244
247
|
- lib/chronicle/etl/transformers/transformer.rb
|
248
|
+
- lib/chronicle/etl/utils/hash_utilities.rb
|
249
|
+
- lib/chronicle/etl/utils/jsonapi.rb
|
245
250
|
- lib/chronicle/etl/utils/progress_bar.rb
|
246
251
|
- lib/chronicle/etl/version.rb
|
247
252
|
homepage: https://github.com/chronicle-app
|