forklift_etl 1.1.8 → 1.1.9
This diff compares the published contents of two versions of the package as they appear in their public registry, and is provided for informational purposes only. Note that several removed (`-`) lines in the hunks below survive only as truncated stubs.
- data/Gemfile.lock +1 -1
- data/lib/forklift/patterns/mysql_patterns.rb +36 -4
- data/lib/forklift/plan.rb +1 -0
- data/lib/forklift/transports/csv.rb +1 -1
- data/lib/forklift/transports/elasticsearch.rb +1 -1
- data/lib/forklift/transports/mysql.rb +46 -24
- data/lib/forklift/version.rb +1 -1
- data/readme.md +4 -1
- data/spec/integration/mysql_patterns_spec.rb +2 -0
- data/spec/integration/mysql_spec.rb +3 -2
- data/spec/unit/connection/mysql_spec.rb +6 -4
- metadata +2 -2
data/Gemfile.lock
CHANGED
data/lib/forklift/patterns/mysql_patterns.rb
CHANGED

```diff
@@ -2,14 +2,18 @@ module Forklift
   module Patterns
     class Mysql
 
-      def self.pipe(source, from_table, destination, to_table)
+      def self.pipe(source, from_table, destination, to_table, tmp_table="_forklift_tmp")
         start = Time.new.to_i
         from_db = source.current_database
         to_db = destination.current_database
         source.forklift.logger.log("mysql pipe: `#{from_db}`.`#{from_table}` => `#{to_db}`.`#{to_table}`")
+
+        source.q("drop table if exists `#{to_db}`.`#{tmp_table}`")
+        source.q("create table `#{to_db}`.`#{tmp_table}` like `#{from_db}`.`#{from_table}`")
+        source.q("insert into `#{to_db}`.`#{tmp_table}` select * from `#{from_db}`.`#{from_table}`")
         source.q("drop table if exists `#{to_db}`.`#{to_table}`")
-        source.q("create table `#{to_db}`.`#{to_table}` like `#{from_db}`.`#{from_table}`")
-        source.q("insert into `#{to_db}`.`#{to_table}` select * from `#{from_db}`.`#{from_table}`")
+        source.q("rename table `#{to_db}`.`#{tmp_table}` to `#{to_db}`.`#{to_table}`")
+
         delta = Time.new.to_i - start
         source.forklift.logger.log(" ^ moved #{destination.count(to_table, to_db)} rows in #{delta}s")
       end
```
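The rewritten `pipe` stages the copy into a scratch table and swaps it in with `rename table`, so the destination table is only missing for the instant between the final `drop` and the `rename` instead of for the whole copy. A minimal sketch of driving the pattern from a plan — the connection keys here are hypothetical and depend on how your plan's config names its mysql connections:

```ruby
require 'forklift'

plan = Forklift::Plan.new
plan.do! do
  # :source and :destination are made-up connection names; substitute
  # whatever mysql connections your plan actually defines.
  source      = plan.connections[:mysql][:source]
  destination = plan.connections[:mysql][:destination]

  # The trailing argument is new in 1.1.9: the scratch-table name
  # (defaults to "_forklift_tmp").
  Forklift::Patterns::Mysql.pipe(source, 'users', destination, 'users', '_scratch_users')
end
```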
```diff
@@ -59,7 +63,9 @@ module Forklift
       if self.can_incremental_pipe?(source, from_table, destination, to_table, matcher)
         begin
           incremental_pipe(source, from_table, destination, to_table, matcher, primary_key)
-        rescue
+        rescue Exception => e
+          source.forklift.logger.log("! incremental_pipe failure on #{from_table} => #{to_table}: #{e} ")
+          source.forklift.logger.log("! falling back to pipe...")
           pipe(source, from_table, destination, to_table)
         end
       else
```
```diff
@@ -98,6 +104,32 @@ module Forklift
       end
     end
 
+      # The high water method will stub a row in all tables with a `default_matcher` column prentending to have a record from `time`
+      # This enabled partial forklift funs which will only extract data "later than X"
+      # TODO: assumes all columns have a default NULL setting
+      def self.write_high_water_mark(db, time, matcher=source.default_matcher)
+        db.tables.each do |table|
+          columns, types = db.columns(table, nil, true)
+          if columns.include?(matcher)
+            row = {}
+            i = 0
+            while( i < columns.length )
+              if(columns[i] == matcher)
+                row[column[i]] << time.to_s(:db)
+              elsif( types[i] =~ /text/ || types[i] =~ /varchar/ )
+                row[column[i]] << "~~stub~~"
+              elsif( types[i] =~ /float/ || types[i] =~ /int/ )
+                row[column[i]] << 0
+              else
+                row[column[i]] << "NULL"
+              end
+              i = i + 1
+            end
+            db.write([row], table)
+          end
+        end
+      end
+
     end
   end
 end
```
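As released, `write_high_water_mark` appears buggy: the default argument `matcher=source.default_matcher` references a `source` that is not in scope for a method whose connection parameter is `db`, and `row[column[i]] <<` both misspells `columns` and shovels into hash entries that are never initialized. A corrected sketch of the apparent intent — hypothetical, not the shipped code:

```ruby
# Hypothetical repair: `db` is assumed to be a Forklift mysql connection
# responding to #tables, #columns, #write, and #default_matcher.
def self.write_high_water_mark(db, time, matcher=db.default_matcher)
  db.tables.each do |table|
    columns, types = db.columns(table, nil, true)  # names plus mysql types
    next unless columns.include?(matcher)

    row = {}
    columns.each_with_index do |col, i|
      row[col.to_sym] = if col == matcher
        time.to_s(:db)                 # the stubbed high-water timestamp
      elsif types[i] =~ /text|varchar/
        "~~stub~~"
      elsif types[i] =~ /float|int/
        0
      else
        nil                            # rely on the column's NULL default
      end
    end
    db.write([row], table)
  end
end
```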
data/lib/forklift/plan.rb
CHANGED
data/lib/forklift/transports/mysql.rb
CHANGED

```diff
@@ -28,7 +28,11 @@ module Forklift
       q("DROP table `#{database}`.`#{table}`");
     end
 
-    def
+    def rename(table, new_table, database=current_database, new_database=current_database)
+      q("RENAME TABLE `#{database}`.`#{table}` TO `#{new_database}`.`#{new_table}`")
+    end
+
+    def read(query, database=current_database, looping=true, limit=forklift.config[:batch_size], offset=0)
       loop_count = 0
       # TODO: Detect limit/offset already present in query
       q("USE `#{database}`")
```
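The new `rename` is a thin wrapper over MySQL's atomic `RENAME TABLE`. Usage, assuming `db` is any Forklift mysql connection:

```ruby
db.rename('users_tmp', 'users')                        # swap within the current database
db.rename('events', 'events', 'staging', 'warehouse')  # move a table between databases
```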
```diff
@@ -77,14 +81,25 @@ module Forklift
           end
         end
       end
+      end
 
+      insert_q = "INSERT INTO `#{database}`.`#{table}` (#{safe_columns(columns)}) VALUES "
+      delete_q = "DELETE FROM `#{database}`.`#{table}` WHERE `#{primary_key}` IN "
+      delete_keys = []
+      data.each do |d|
         if(to_update == true && !d[primary_key.to_sym].nil?)
-
+          delete_keys << d[primary_key.to_sym]
         end
-
-        insert_q
-
+        insert_q << safe_values(columns, d)
+        insert_q << ","
+      end
+
+      if delete_keys.length > 0
+        delete_q << "(#{delete_keys.join(',')})"
+        q(delete_q)
       end
+      insert_q = insert_q[0...-1]
+      q(insert_q)
       forklift.logger.log "wrote #{data.length} rows to `#{database}`.`#{table}`"
     end
   end
```
```diff
@@ -177,12 +192,15 @@ module Forklift
     end
   end
 
-    def columns(table, database=current_database)
+    def columns(table, database=current_database, return_types=false)
       cols = []
+      types = []
       read("describe `#{database}`.`#{table}`").each do |row|
-        cols << row[:Field]
+        cols << row[:Field]
+        types << row[:Type]
       end
-      cols
+      return cols if return_types == false
+      return cols, types
     end
 
     def dump(file, options=[])
```
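`columns` keeps its old return shape unless the new `return_types` flag is set, so existing callers are unaffected:

```ruby
cols        = db.columns('users')                 # => ["id", "email", ...]
cols, types = db.columns('users', 'my_db', true)  # types => ["int(11)", "varchar(255)", ...]
```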
```diff
@@ -257,27 +275,31 @@ module Forklift
       return a.join(', ')
     end
 
-    def safe_values(
+    def safe_values(columns, d)
       a = []
-
+      sym_cols = columns.map { |s| s.to_sym }
+      sym_cols.each do |c|
         part = "NULL"
-        …
+        v = d[c]
+        unless v.nil?
+          if( [::String, ::Symbol].include?(v.class) )
+            s = v.to_s
+            s.gsub!('\\') { '\\\\' }
+            s.gsub!('\"', '\/"')
+            s.gsub!('"', '\"')
+            part = "\"#{s}\""
+          elsif( [::Date, ::Time, ::DateTime].include?(v.class) )
+            s = v.to_s(:db)
+            part = "\"#{s}\""
+          elsif( [::Fixnum].include?(v.class) )
+            part = v
+          elsif( [::Float, ::BigDecimal].include?(v.class) )
+            part = v.to_f
+          end
         end
         a << part
       end
-      return a.join(', ')
+      return "( #{a.join(', ')} )"
     end
 
     #/private
```
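`safe_values` now takes the column list alongside the row and wraps its output in parentheses, which is what lets the batched `INSERT` above join row tuples with commas. A quick illustration (via `send`, since the helper is private):

```ruby
db.send(:safe_values, ['id', 'email'], { id: 1, email: 'say "hi"' })
# => ( 1, "say \"hi\"" )
```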
data/lib/forklift/version.rb
CHANGED
data/readme.md
CHANGED
```diff
@@ -370,7 +370,8 @@ end
 - max_timestamp(table, matcher=default_matcher, database=current_database)
   - return the timestamp of the max(matcher) or 1970-01-01
 - truncate!(table, database=current_database)
-- columns(table, database=current_database)
+- columns(table, database=current_database, return_types=false)
+- rename(table, new_table, database, new_database)
 - dump(file)
   - mysqldump the database to `file` via gzip
 
@@ -384,6 +385,8 @@ end
 - mysql_optimistic_import(source, destination)
   - tries to do an incramental table copy, falls back to a full table copy
   - this differs from `pipe`, as all data is loaded into forklift, rather than relying on mysql transfer methods
+- write_high_water_mark(db, time, matcher)
+  - The high water method will stub a row in all tables with a `default_matcher` column prentending to have a record from `time`
 
 ### Elasticsearch
 
```
data/spec/integration/mysql_spec.rb
CHANGED

```diff
@@ -37,7 +37,8 @@ describe 'mysql' do
   it "can write new data" do
     table = "users"
     data = [
-      {email: 'other@example.com', first_name: 'other', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
+      {email: 'other@example.com', first_name: 'other', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)},
+      {email: 'else@example.com', first_name: 'else', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
     ]
     plan = SpecPlan.new
     plan.do! {
@@ -47,7 +48,7 @@ describe 'mysql' do
 
     destination = SpecClient.mysql('forklift_test_source_a')
     count = destination.query('select count(1) as "count" from users').first['count']
-    expect(count).to eql 6
+    expect(count).to eql 7
   end
 
   it "can update existing data" do
```
data/spec/unit/connection/mysql_spec.rb
CHANGED

```diff
@@ -90,13 +90,15 @@ describe Forklift::Connection::Mysql do
   subject { described_class.new({}, {}) }
 
   it "escapes one trailing backslash" do
-
-
+    columns = ['col']
+    values = {:col => "foo\\"}
+    expect(subject.send(:safe_values, columns, values)).to eq("( \"foo\\\\\" )")
   end
 
   it "escapes two trailing backslashes" do
-
-
+    columns = ['col']
+    values = {:col => "foo\\\\" }
+    expect(subject.send(:safe_values, columns, values)).to eq("( \"foo\\\\\\\\\" )")
   end
 end
 end
```
metadata
CHANGED
```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: forklift_etl
 version: !ruby/object:Gem::Version
-  version: 1.1.8
+  version: 1.1.9
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-08-
+date: 2014-08-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
```