chronicle-etl 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +3 -3
- data/lib/chronicle/etl.rb +8 -2
- data/lib/chronicle/etl/exceptions.rb +2 -0
- data/lib/chronicle/etl/extractors/csv_extractor.rb +1 -2
- data/lib/chronicle/etl/extractors/extractor.rb +4 -1
- data/lib/chronicle/etl/job.rb +11 -2
- data/lib/chronicle/etl/job_log.rb +7 -1
- data/lib/chronicle/etl/job_logger.rb +2 -0
- data/lib/chronicle/etl/loaders/csv_loader.rb +2 -6
- data/lib/chronicle/etl/loaders/loader.rb +1 -1
- data/lib/chronicle/etl/loaders/rest_loader.rb +12 -9
- data/lib/chronicle/etl/loaders/stdout_loader.rb +3 -3
- data/lib/chronicle/etl/loaders/table_loader.rb +5 -4
- data/lib/chronicle/etl/models/activity.rb +15 -0
- data/lib/chronicle/etl/models/base.rb +103 -0
- data/lib/chronicle/etl/models/entity.rb +15 -0
- data/lib/chronicle/etl/models/generic.rb +23 -0
- data/lib/chronicle/etl/runner.rb +7 -2
- data/lib/chronicle/etl/transformers/null_transformer.rb +2 -3
- data/lib/chronicle/etl/transformers/transformer.rb +4 -5
- data/lib/chronicle/etl/utils/hash_utilities.rb +19 -0
- data/lib/chronicle/etl/utils/jsonapi.rb +28 -0
- data/lib/chronicle/etl/utils/progress_bar.rb +1 -1
- data/lib/chronicle/etl/version.rb +1 -1
- metadata +8 -3
- data/lib/chronicle/etl/transformers/json_transformer.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a02a2377d0e8d4135f3b931bc73641eac28058d736d9c1dba0a97107c1d4c0e
|
4
|
+
data.tar.gz: 810d5bff80e852fa08ef9824ed6b313aa309bb69e84228bc1fbb7595069e043b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0d5fbea3c63349bb3f566e6137755f6cc8a4060d0e401abf5a0e7d8b44a4c4278089c10ffb8bb9cf2d783a238449140e5e54d90f3ad158aa362c6335eedca5aa
|
7
|
+
data.tar.gz: bf6fa83b1d5e55760e62d3cc090bf09bb69a7c761ae4a9358fb4d82192c7efc7500b6db361f39adac3581982862654aa4603a78dfbb3aed53b51d01137ffd736
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl)
|
4
4
|
|
5
|
-
Chronicle ETL is a utility
|
5
|
+
Chronicle ETL is a utility that helps you archive and process personal data. You can *extract* it from a variety of sources, *transform* it, and *load* it to an external API, file, or stdout.
|
6
6
|
|
7
|
-
This
|
7
|
+
This tool is an adaptation of Andrew Louis's experimental [Memex project](https://hyfen.net/memex); the dozens of existing importers are being migrated to Chronicle.
|
8
8
|
|
9
9
|
## Installation
|
10
10
|
|
@@ -52,7 +52,7 @@ Built in connectors:
|
|
52
52
|
|
53
53
|
In addition to the built-in importers, importers for third-party platforms are available. They are packaged as individual Ruby gems.
|
54
54
|
|
55
|
-
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` files. Transformers for chronicle schema
|
55
|
+
- [email](https://github.com/chronicle-app/chronicle-email). Extractors for `mbox` and other email files. Transformers for chronicle schema
|
56
56
|
- [bash](https://github.com/chronicle-app/chronicle-bash). Extract bash history from `~/.bash_history`. Transform it for chronicle schema
|
57
57
|
|
58
58
|
To install any of these, run `gem install chronicle-PROVIDER`.
|
data/lib/chronicle/etl.rb
CHANGED
@@ -1,13 +1,19 @@
|
|
1
1
|
require_relative 'etl/catalog'
|
2
2
|
require_relative 'etl/config'
|
3
|
-
require_relative 'etl/job_definition'
|
4
3
|
require_relative 'etl/exceptions'
|
5
4
|
require_relative 'etl/extractors/extractor'
|
5
|
+
require_relative 'etl/job_definition'
|
6
6
|
require_relative 'etl/job_log'
|
7
7
|
require_relative 'etl/job_logger'
|
8
8
|
require_relative 'etl/job'
|
9
9
|
require_relative 'etl/loaders/loader'
|
10
|
+
require_relative 'etl/models/activity'
|
11
|
+
require_relative 'etl/models/base'
|
12
|
+
require_relative 'etl/models/entity'
|
13
|
+
require_relative 'etl/models/generic'
|
10
14
|
require_relative 'etl/runner'
|
11
15
|
require_relative 'etl/transformers/transformer'
|
16
|
+
require_relative 'etl/utils/hash_utilities'
|
17
|
+
require_relative 'etl/utils/jsonapi'
|
12
18
|
require_relative 'etl/utils/progress_bar'
|
13
|
-
require_relative 'etl/version'
|
19
|
+
require_relative 'etl/version'
|
@@ -28,8 +28,7 @@ class Chronicle::ETL::CsvExtractor < Chronicle::ETL::Extractor
|
|
28
28
|
|
29
29
|
csv_options = {
|
30
30
|
headers: headers,
|
31
|
-
|
32
|
-
converters: [:all]
|
31
|
+
converters: :all
|
33
32
|
}
|
34
33
|
|
35
34
|
stream = read_from_file? ? File.open(@options[:filename]) : @options[:filename]
|
@@ -27,7 +27,10 @@ module Chronicle
|
|
27
27
|
private
|
28
28
|
|
29
29
|
def handle_continuation
|
30
|
-
|
30
|
+
return unless @options[:continuation]
|
31
|
+
|
32
|
+
@options[:load_since] = @options[:continuation].highest_timestamp if @options[:continuation].highest_timestamp
|
33
|
+
@options[:load_after_id] = @options[:continuation].last_id if @options[:continuation].last_id
|
31
34
|
end
|
32
35
|
end
|
33
36
|
end
|
data/lib/chronicle/etl/job.rb
CHANGED
@@ -24,7 +24,7 @@ module Chronicle
|
|
24
24
|
@loader_klass = load_klass(:loader, definition[:loader][:name])
|
25
25
|
@loader_options = definition[:loader][:options] || {}
|
26
26
|
|
27
|
-
set_continuation
|
27
|
+
set_continuation if load_continuation?
|
28
28
|
yield self if block_given?
|
29
29
|
end
|
30
30
|
|
@@ -32,7 +32,7 @@ module Chronicle
|
|
32
32
|
instantiate_klass(:extractor)
|
33
33
|
end
|
34
34
|
|
35
|
-
def instantiate_transformer
|
35
|
+
def instantiate_transformer(data)
|
36
36
|
instantiate_klass(:transformer, data)
|
37
37
|
end
|
38
38
|
|
@@ -40,6 +40,11 @@ module Chronicle
|
|
40
40
|
instantiate_klass(:loader)
|
41
41
|
end
|
42
42
|
|
43
|
+
def save_log?
|
44
|
+
# TODO: this needs more nuance
|
45
|
+
return !id.nil?
|
46
|
+
end
|
47
|
+
|
43
48
|
private
|
44
49
|
|
45
50
|
def instantiate_klass(phase, *args)
|
@@ -57,6 +62,10 @@ module Chronicle
|
|
57
62
|
continuation = Chronicle::ETL::JobLogger.load_latest(@job_id)
|
58
63
|
@extractor_options[:continuation] = continuation
|
59
64
|
end
|
65
|
+
|
66
|
+
def load_continuation?
|
67
|
+
save_log?
|
68
|
+
end
|
60
69
|
end
|
61
70
|
end
|
62
71
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'forwardable'
|
2
2
|
|
3
3
|
module Chronicle
|
4
4
|
module ETL
|
@@ -6,6 +6,8 @@ module Chronicle
|
|
6
6
|
# tracking when it ran, if it was successful, and what the latest record
|
7
7
|
# we found is (to use as a cursor for the next time)
|
8
8
|
class JobLog
|
9
|
+
extend Forwardable
|
10
|
+
|
9
11
|
attr_accessor :job,
|
10
12
|
:job_id,
|
11
13
|
:last_id,
|
@@ -15,6 +17,8 @@ module Chronicle
|
|
15
17
|
:finished_at,
|
16
18
|
:success
|
17
19
|
|
20
|
+
def_delegators :@job, :save_log?
|
21
|
+
|
18
22
|
# Create a new JobLog for a given Job
|
19
23
|
def initialize
|
20
24
|
@num_records_processed = 0
|
@@ -64,6 +68,8 @@ module Chronicle
|
|
64
68
|
}
|
65
69
|
end
|
66
70
|
|
71
|
+
private
|
72
|
+
|
67
73
|
# Create a new JobLog and set its instance variables from a serialized hash
|
68
74
|
def self.build_from_serialized attrs
|
69
75
|
attrs.delete(:id)
|
@@ -5,25 +5,28 @@ require 'json'
|
|
5
5
|
module Chronicle
|
6
6
|
module ETL
|
7
7
|
class RestLoader < Chronicle::ETL::Loader
|
8
|
-
def initialize(options={})
|
8
|
+
def initialize( options={} )
|
9
9
|
super(options)
|
10
10
|
end
|
11
11
|
|
12
|
-
def load(
|
12
|
+
def load(record)
|
13
|
+
payload = Chronicle::ETL::Utils::JSONAPI.serialize(record)
|
14
|
+
# have the outer data key that json-api expects
|
15
|
+
payload = { data: payload } unless payload[:data]
|
16
|
+
|
13
17
|
uri = URI.parse("#{@options[:hostname]}#{@options[:endpoint]}")
|
14
18
|
|
15
19
|
header = {
|
16
20
|
"Authorization" => "Bearer #{@options[:access_token]}",
|
17
21
|
"Content-Type": 'application/json'
|
18
22
|
}
|
23
|
+
use_ssl = uri.scheme == 'https'
|
19
24
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
response = http.request(request)
|
25
|
+
Net::HTTP.start(uri.host, uri.port, use_ssl: use_ssl) do |http|
|
26
|
+
request = Net::HTTP::Post.new(uri.request_uri, header)
|
27
|
+
request.body = payload.to_json
|
28
|
+
http.request(request)
|
29
|
+
end
|
27
30
|
end
|
28
31
|
end
|
29
32
|
end
|
@@ -7,14 +7,15 @@ module Chronicle
|
|
7
7
|
super(options)
|
8
8
|
end
|
9
9
|
|
10
|
-
def load(
|
11
|
-
|
12
|
-
|
10
|
+
def load(record)
|
11
|
+
record_hash = record.to_h_flattened
|
12
|
+
@table ||= TTY::Table.new(header: record_hash.keys)
|
13
|
+
values = record_hash.values.map{|x| x.to_s[0..30]}
|
13
14
|
@table << values
|
14
15
|
end
|
15
16
|
|
16
17
|
def finish
|
17
|
-
puts @table.render(:ascii, padding: [0, 1])
|
18
|
+
puts @table.render(:ascii, padding: [0, 1]) if @table
|
18
19
|
end
|
19
20
|
end
|
20
21
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
class Activity < Chronicle::ETL::Models::Base
|
7
|
+
TYPE = 'activities'.freeze
|
8
|
+
ATTRIBUTES = [:verb, :start_at, :end_at].freeze
|
9
|
+
ASSOCIATIONS = [:involved, :actor].freeze
|
10
|
+
|
11
|
+
attr_accessor(*ATTRIBUTES, *ASSOCIATIONS)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'digest'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
# Represents a record that's been transformed by a Transformer and
|
7
|
+
# ready to be loaded. Loosely based on ActiveModel.
|
8
|
+
class Base
|
9
|
+
ATTRIBUTES = [:provider, :provider_id, :lat, :lng].freeze
|
10
|
+
ASSOCIATIONS = [].freeze
|
11
|
+
|
12
|
+
attr_accessor(:id, :dedupe_on, *ATTRIBUTES)
|
13
|
+
|
14
|
+
def initialize(attributes = {})
|
15
|
+
assign_attributes(attributes) if attributes
|
16
|
+
@dedupe_on = []
|
17
|
+
end
|
18
|
+
|
19
|
+
# A unique identifier for this model is formed from a type
|
20
|
+
# and either an id or lids.
|
21
|
+
def identifier_hash
|
22
|
+
{
|
23
|
+
type: self.class::TYPE,
|
24
|
+
id: @id,
|
25
|
+
lids: lids
|
26
|
+
}.compact
|
27
|
+
end
|
28
|
+
|
29
|
+
# Array of local ids that uniquely identify this record
|
30
|
+
def lids
|
31
|
+
@dedupe_on.map do |fields|
|
32
|
+
generate_lid(fields)
|
33
|
+
end.compact.uniq
|
34
|
+
end
|
35
|
+
|
36
|
+
# For a given set of fields of this model, generate a
|
37
|
+
# unique local id by hashing the field values
|
38
|
+
def generate_lid fields
|
39
|
+
values = fields.sort.map do |field|
|
40
|
+
instance_variable = "@#{field.to_s}"
|
41
|
+
self.instance_variable_get(instance_variable)
|
42
|
+
end
|
43
|
+
|
44
|
+
return if values.any? { |e| e.nil? }
|
45
|
+
|
46
|
+
Digest::SHA256.hexdigest(values.join(","))
|
47
|
+
end
|
48
|
+
|
49
|
+
# The attribute names for this model: Base's shared
|
50
|
+
# attributes combined with the child class's
|
51
|
+
def attribute_list
|
52
|
+
(ATTRIBUTES + self.class::ATTRIBUTES).uniq
|
53
|
+
end
|
54
|
+
|
55
|
+
# All of this record's attributes
|
56
|
+
def attributes
|
57
|
+
attributes = {}
|
58
|
+
attribute_list.each do |attribute|
|
59
|
+
instance_variable = "@#{attribute.to_s}"
|
60
|
+
attributes[attribute] = self.instance_variable_get(instance_variable)
|
61
|
+
end
|
62
|
+
attributes.compact
|
63
|
+
end
|
64
|
+
|
65
|
+
# All of this record's associations
|
66
|
+
def associations
|
67
|
+
association_list = ASSOCIATIONS + self.class::ASSOCIATIONS
|
68
|
+
attributes = {}
|
69
|
+
association_list.each do |attribute|
|
70
|
+
instance_variable = "@#{attribute.to_s}"
|
71
|
+
association = self.instance_variable_get(instance_variable)
|
72
|
+
attributes[attribute] = association if association
|
73
|
+
end
|
74
|
+
attributes.compact
|
75
|
+
end
|
76
|
+
|
77
|
+
def associations_hash
|
78
|
+
Hash[associations.map do |k, v|
|
79
|
+
[k, v.to_h]
|
80
|
+
end]
|
81
|
+
end
|
82
|
+
|
83
|
+
# FIXME: move this to a Utils module
|
84
|
+
def to_h_flattened
|
85
|
+
Chronicle::ETL::Utils::HashUtilities.flatten_hash(to_h)
|
86
|
+
end
|
87
|
+
|
88
|
+
def to_h
|
89
|
+
identifier_hash.merge(attributes).merge(associations_hash)
|
90
|
+
end
|
91
|
+
|
92
|
+
private
|
93
|
+
|
94
|
+
def assign_attributes attributes
|
95
|
+
attributes.each do |k, v|
|
96
|
+
setter = :"#{k}="
|
97
|
+
public_send(setter, v) if respond_to? setter
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
class Entity < Chronicle::ETL::Models::Base
|
7
|
+
TYPE = 'entities'.freeze
|
8
|
+
ATTRIBUTES = [:title, :body, :represents, :slug].freeze
|
9
|
+
ASSOCIATIONS = [].freeze # TODO: add these to reflect Chronicle Schema
|
10
|
+
|
11
|
+
attr_accessor(*ATTRIBUTES)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'chronicle/etl/models/base'
|
2
|
+
|
3
|
+
module Chronicle
|
4
|
+
module ETL
|
5
|
+
module Models
|
6
|
+
class Generic < Chronicle::ETL::Models::Base
|
7
|
+
TYPE = 'generic'
|
8
|
+
|
9
|
+
attr_accessor :properties
|
10
|
+
|
11
|
+
def initialize(properties = {})
|
12
|
+
@properties = properties
|
13
|
+
super
|
14
|
+
end
|
15
|
+
|
16
|
+
# Generic models have arbitrary attributes stored in @properties
|
17
|
+
def attributes
|
18
|
+
@properties.transform_keys(&:to_sym)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/chronicle/etl/runner.rb
CHANGED
@@ -18,9 +18,14 @@ class Chronicle::ETL::Runner
|
|
18
18
|
|
19
19
|
extractor.extract do |data, metadata|
|
20
20
|
transformer = @job.instantiate_transformer(data)
|
21
|
-
|
21
|
+
record = transformer.transform
|
22
|
+
|
23
|
+
unless record.is_a?(Chronicle::ETL::Models::Base)
|
24
|
+
raise Chronicle::ETL::InvalidTransformedRecordError, "Transformed data is not a type of Chronicle::ETL::Models"
|
25
|
+
end
|
26
|
+
|
22
27
|
@job_logger.log_transformation(transformer)
|
23
|
-
loader.load(
|
28
|
+
loader.load(record)
|
24
29
|
progress_bar.increment
|
25
30
|
end
|
26
31
|
|
@@ -11,12 +11,12 @@ module Chronicle
|
|
11
11
|
def initialize(options = {}, data)
|
12
12
|
@options = options
|
13
13
|
@data = data
|
14
|
+
@record = Chronicle::ETL::Models::Activity.new
|
14
15
|
end
|
15
16
|
|
16
|
-
#
|
17
|
-
|
18
|
-
|
19
|
-
end
|
17
|
+
# @abstract Subclass is expected to implement #transform
|
18
|
+
# @!method transform
|
19
|
+
# The main entrypoint for transforming a record. Called by a Runner on each extracted record
|
20
20
|
|
21
21
|
# The domain or provider-specific id of the record this transformer is working on.
|
22
22
|
# Used for building a cursor so an extractor doesn't have to start from the beginning of a
|
@@ -31,5 +31,4 @@ module Chronicle
|
|
31
31
|
end
|
32
32
|
end
|
33
33
|
|
34
|
-
require_relative 'json_transformer'
|
35
34
|
require_relative 'null_transformer'
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
module Utils
|
4
|
+
module HashUtilities
|
5
|
+
def self.flatten_hash(hash)
|
6
|
+
hash.each_with_object({}) do |(k, v), h|
|
7
|
+
if v.is_a? Hash
|
8
|
+
flatten_hash(v).map do |h_k, h_v|
|
9
|
+
h["#{k}.#{h_k}".to_sym] = h_v
|
10
|
+
end
|
11
|
+
else
|
12
|
+
h[k] = v
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Chronicle
|
2
|
+
module ETL
|
3
|
+
module Utils
|
4
|
+
module JSONAPI
|
5
|
+
# For a given Chronicle::ETL::Models::Base record, serialize it as JSON:API
|
6
|
+
def self.serialize(record)
|
7
|
+
return unless record.is_a? Chronicle::ETL::Models::Base
|
8
|
+
|
9
|
+
obj = record.identifier_hash
|
10
|
+
obj[:attributes] = record.attributes
|
11
|
+
|
12
|
+
relationships = Hash[record.associations.map do |k, v|
|
13
|
+
if v.is_a?(Array)
|
14
|
+
data = { data: v.map{ |association| serialize(association) } }
|
15
|
+
else
|
16
|
+
data = { data: serialize(v) }
|
17
|
+
end
|
18
|
+
|
19
|
+
[k, data]
|
20
|
+
end]
|
21
|
+
|
22
|
+
obj[:relationships] = relationships if relationships.any?
|
23
|
+
obj
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chronicle-etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Louis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -238,10 +238,15 @@ files:
|
|
238
238
|
- lib/chronicle/etl/loaders/rest_loader.rb
|
239
239
|
- lib/chronicle/etl/loaders/stdout_loader.rb
|
240
240
|
- lib/chronicle/etl/loaders/table_loader.rb
|
241
|
+
- lib/chronicle/etl/models/activity.rb
|
242
|
+
- lib/chronicle/etl/models/base.rb
|
243
|
+
- lib/chronicle/etl/models/entity.rb
|
244
|
+
- lib/chronicle/etl/models/generic.rb
|
241
245
|
- lib/chronicle/etl/runner.rb
|
242
|
-
- lib/chronicle/etl/transformers/json_transformer.rb
|
243
246
|
- lib/chronicle/etl/transformers/null_transformer.rb
|
244
247
|
- lib/chronicle/etl/transformers/transformer.rb
|
248
|
+
- lib/chronicle/etl/utils/hash_utilities.rb
|
249
|
+
- lib/chronicle/etl/utils/jsonapi.rb
|
245
250
|
- lib/chronicle/etl/utils/progress_bar.rb
|
246
251
|
- lib/chronicle/etl/version.rb
|
247
252
|
homepage: https://github.com/chronicle-app
|