micdrop 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.devcontainer/devcontainer.json +22 -0
- data/.rubocop.yml +8 -0
- data/.vscode/tasks.json +17 -0
- data/README.md +448 -0
- data/Rakefile +12 -0
- data/TODO.md +15 -0
- data/examples/csvs_to_sql.rb +227 -0
- data/examples/data/customers-100.csv +101 -0
- data/examples/data/json/1.json +1 -0
- data/examples/data/json/2.json +1 -0
- data/examples/data/json/3.json +1 -0
- data/examples/data/json/4.json +1 -0
- data/examples/data/json/5.json +1 -0
- data/examples/data/json/6.json +1 -0
- data/examples/data/json/7.json +1 -0
- data/examples/data/json/8.json +1 -0
- data/examples/data/json/9.json +1 -0
- data/examples/data/json/a.json +1 -0
- data/examples/data/organizations-100.csv +101 -0
- data/examples/data/people-100.csv +101 -0
- data/examples/data/readme.md +5 -0
- data/examples/json_files_to_sql.rb +54 -0
- data/lib/micdrop/errors.rb +23 -0
- data/lib/micdrop/ext/sequel.rb +121 -0
- data/lib/micdrop/files_source.rb +73 -0
- data/lib/micdrop/item_context.rb +512 -0
- data/lib/micdrop/record_context.rb +195 -0
- data/lib/micdrop/stop_skip.rb +7 -0
- data/lib/micdrop/structure_builder.rb +170 -0
- data/lib/micdrop/version.rb +5 -0
- data/lib/micdrop.rb +54 -0
- data/sig/micdrop.rbs +4 -0
- metadata +79 -0
data/examples/json_files_to_sql.rb
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
+require "micdrop"
+require "sequel"
+require "micdrop/ext/sequel"
+
+DB = Sequel.sqlite "test.db"
+
+# Create the destination data structure.
+# Obviously in a real import script, these would probably already exist.
+
+DB.create_table :people do
+  primary_key :id
+  String :f_name
+  String :l_name
+  String :addr1
+  String :addr2
+  String :city
+  String :state
+  String :zip
+  String :_tmp_id # Add a temporary column for storing the old system IDs
+end
+
+# Now start the migration
+
+# Our source will iterate over all the JSON files in the given directory
+source = Micdrop::FilesSource.new(__dir__, glob: "data/json/*.json")
+sink = Micdrop::Ext::Sequel::InsertSink.new DB[:people]
+
+Micdrop.migrate source, sink do
+  # The files source exposes the basename and content as takeable items
+  take :basename, put: :_tmp_id
+  take(:content).parse_json do
+    # parse_json accepts a block that will enter the sub-record
+    take "residency" do
+      regex(/(?<street>.+?) ?(?<unit>(?:Apt\.?|Suite|Unit|#) [#0-9a-zA-Z-]+)?\n(?<city>.+?), (?<state>[A-Z]{2}) (?<zip>\d{5}(?:-\d{4})?)/) do
+        # regex also enters a sub-record
+        take :street, put: :addr1
+        take :unit, put: :addr2
+        take :city, put: :city
+        take :state, put: :state
+        take :zip, put: :zip
+      end
+    end
+    take "name" do
+      split " " do
+        # split can enter a sub-record as well
+        take 0, put: :f_name
+        take 1, put: :l_name
+      end
+    end
+  end
+end
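
The contents of the bundled fixtures (data/examples/data/json/*.json) are not shown in this diff, but a hypothetical document shaped like the one sketched below would flow through the pipeline above; the resulting insert against DB[:people] is also a sketch, with the field values and the a.json contents assumed for illustration only.

# Hypothetical contents of data/json/a.json (one JSON document per file):
#   {"name": "Jane Doe", "residency": "123 Main St Apt 4\nSpringfield, IL 62704"}
#
# After `take "name"` is split on spaces and the residency regex captures its
# named groups, the InsertSink would issue roughly:
DB[:people].insert(
  _tmp_id: "a.json",   # basename of the source file
  f_name: "Jane",
  l_name: "Doe",
  addr1: "123 Main St",
  addr2: "Apt 4",
  city: "Springfield",
  state: "IL",
  zip: "62704"
)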
data/lib/micdrop/errors.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module Micdrop
+  ##
+  # An error on the sink side of the migration
+  class SinkError < StandardError
+  end
+
+  ##
+  # An error on the source side of the migration
+  class SourceError < StandardError
+  end
+
+  ##
+  # An error with the current data value that prevents conversion operations from working
+  class ValueError < StandardError
+  end
+
+  ##
+  # An error with the data pipeline itself
+  class PipelineError < StandardError
+  end
+end
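
These are plain StandardError subclasses. As a minimal, hypothetical sketch of where they surface: the InsertUpdateSink later in this diff raises Micdrop::SinkError when its key columns match more than one row, and a migration script could rescue it like so.

begin
  Micdrop.migrate source, sink do
    take :basename, put: :_tmp_id
  end
rescue Micdrop::SinkError => e
  # Sink-side failure, e.g. a non-unique key in InsertUpdateSink
  warn "sink rejected a record: #{e.message}"
end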
data/lib/micdrop/ext/sequel.rb
@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+
+require "sequel"
+
+module Micdrop
+  module Ext
+    module Sequel
+      ##
+      # A sink which will exclusively insert new items into the database
+      class InsertSink
+        def initialize(dataset)
+          @dataset = dataset
+        end
+
+        def <<(collector)
+          @dataset.insert(**collector)
+        end
+      end
+
+      ##
+      # A sink which will always issue an update statement
+      class UpdateSink
+        def initialize(dataset, key_columns)
+          @dataset = dataset
+          @key_columns = if key_columns.is_a? Symbol
+                           [key_columns]
+                         elsif key_columns.respond_to? :each
+                           key_columns
+                           # TODO: else throw error
+                         end
+        end
+
+        def <<(collector)
+          dataset = @dataset
+          @key_columns.each do |col|
+            dataset = dataset.where(**{ col => collector[col] })
+          end
+          dataset.update(**collector)
+        end
+      end
+
+      ##
+      # A sink which will update an item if it exists, or insert it otherwise
+      class InsertUpdateSink
+        def initialize(dataset, key_columns, update_actions: {}, default_update_action: :coalesce,
+                       match_empty_key: false)
+          @dataset = dataset
+          @key_columns = if key_columns.is_a? Symbol
+                           [key_columns]
+                         elsif key_columns.respond_to? :each
+                           key_columns
+                           # TODO: else throw error
+                         end
+          @update_actions = update_actions
+          @default_update_action = default_update_action
+          @match_empty_key = match_empty_key
+        end
+
+        def <<(collector)
+          dataset = @dataset
+          @key_columns.each do |col|
+            dataset = dataset.where(**{ col => collector[col] })
+          end
+          existing = dataset.limit(2).all
+          if existing.count > 1
+            raise Micdrop::SinkError, "Key column(s) of this InsertUpdateSink are not unique"
+          elsif existing.empty?
+            dataset.insert(**collector)
+          else
+            dataset.update(**update_merge(existing.first, collector))
+          end
+        end
+
+        private
+
+        def update_merge(existing, collector)
+          if @update_actions.empty?
+            # If we don't have per-column actions, we can take shortcuts for some action types
+            return collector if @default_update_action == :always_overwrite
+            return collector.compact if @default_update_action == :coalesce
+          end
+          # Otherwise merge according to the rules specified
+          existing.merge(collector) do |key, oldval, newval|
+            case @update_actions.fetch(key, @default_update_action)
+            when :coalesce then newval.nil? ? oldval : newval
+            when :overwrite_nulls then oldval.nil? ? newval : oldval
+            when :always_overwrite then newval
+            when :keep_existing then oldval
+            when :append then format("%s %s", oldval, newval)
+            when :append_line then format("%s\n%s", oldval, newval)
+            when :prepend then format("%s %s", newval, oldval)
+            when :prepend_line then format("%s\n%s", newval, oldval)
+            when :add then oldval + newval
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+##
+# Sequel-specific extensions for ItemContext
+class ItemContext
+  def db_lookup(dataset, key_col, val_col, pass_if_not_found: false, warn_if_not_found: nil, apply_if_not_found: nil)
+    # TODO: allow registering db_lookups like we do normal lookups
+    warn_if_not_found = true if warn_if_not_found.nil? && apply_if_not_found.nil?
+    found = dataset.where(key_col => @value).get(val_col)
+    if found.nil?
+      warn format "Value %s not found in db_lookup", @value if warn_if_not_found
+      if !apply_if_not_found.nil?
+        apply apply_if_not_found
+      elsif !pass_if_not_found
+        @value = nil
+      end
+    else
+      @value = found
+    end
+    self
+  end
+end
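
To make the merge rules above concrete, here is a hedged sketch (not part of the gem) that wires InsertUpdateSink to the :people table from the example script, keyed on _tmp_id; the column names, per-column actions, and collector values are assumptions for illustration.

sink = Micdrop::Ext::Sequel::InsertUpdateSink.new(
  DB[:people],
  :_tmp_id,                          # a single Symbol is wrapped into [:_tmp_id]
  update_actions: { l_name: :keep_existing, addr2: :append },
  default_update_action: :coalesce   # nil incoming values fall back to the existing value
)

# `<<` inserts when no row matches _tmp_id, raises Micdrop::SinkError when more
# than one row matches, and otherwise merges column-by-column via update_merge.
sink << { _tmp_id: "a.json", f_name: "Jane", l_name: nil, addr2: "Apt 4" }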
data/lib/micdrop/files_source.rb
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+
+module Micdrop
+  ##
+  # Takes a list of files, directory, or glob pattern as a source
+  #
+  # Records in a file source will have the following items available to `take`:
+  # * :content (The full content of the file, lazy-loaded only if you `take` it)
+  # * :stream (A file IO stream)
+  # * :filename (The filename that was used to load the file, e.g. files/s.json)
+  # * :basename (The basename of the file, e.g. x.json)
+  # * :path (The full path to the file, e.g. /data/migration/files/x.json)
+  # * Anything returned by File.stat (:ctime, :mtime, :size, etc...)
+  class FilesSource
+    def initialize(dir, files: nil, glob: nil, binary_mode: false, **file_opts)
+      @dir = dir
+      @files = if files.nil?
+                 if glob.nil?
+                   Dir.children(dir)
+                 else
+                   Dir.glob(glob, flags: File::FNM_EXTGLOB, base: dir)
+                 end
+               else
+                 files
+               end
+      @binary_mode = binary_mode
+      @file_opts = file_opts
+    end
+
+    def each_pair
+      return enum_for :each_pair unless block_given?
+
+      @files.each do |filename|
+        path = File.join(@dir, filename)
+        unless File.file? path
+          warn format("%s is not a file and will be skipped", path)
+          next
+        end
+
+        yield filename, FilesSourceRecord.new(path, @binary_mode, @file_opts)
+      end
+    end
+  end
+
+  ##
+  # Wrapper object to expose files as a source item
+  class FilesSourceRecord
+    def initialize(filename, binary_mode, file_opts)
+      @filename = filename
+      @binary_mode = binary_mode
+      @file_opts = file_opts
+      @stat = nil
+    end
+
+    def [](key)
+      case key
+      when :contents, :content
+        File.open @filename, @binary_mode ? "rb" : "r", **@file_opts, &:read
+      when :stream
+        File.open @filename, @binary_mode ? "rb" : "r", **@file_opts
+      when :path
+        File.absolute_path @filename
+      when :basename
+        File.basename @filename
+      when :filename
+        @filename
+      else
+        @stat = File.stat(@filename) if @stat.nil?
+        @stat.method(key).call
+      end
+    end
+  end
+end
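
As a usage note, FilesSource can also be exercised outside a migration. The sketch below assumes the directory layout from the example script earlier in this diff (data/json/*.json) and uses only the record keys documented above.

source = Micdrop::FilesSource.new(__dir__, glob: "data/json/*.json")

source.each_pair do |filename, record|
  # :content reads the file lazily; unknown keys such as :size fall through to File.stat
  puts format("%s (%d bytes)", filename, record[:size])
  puts record[:content]
end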