forklift_etl 1.1.8 → 1.1.9

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
data/Gemfile.lock CHANGED
```diff
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    forklift_etl (1.1.8)
+    forklift_etl (1.1.9)
       activesupport (~> 4.0, >= 4.0.0)
       elasticsearch (~> 1.0, >= 1.0.0)
       lumberjack (~> 1.0, >= 1.0.0)
```
In `Forklift::Patterns::Mysql`, `pipe` now stages the copy through a temporary table:

```diff
@@ -2,14 +2,18 @@ module Forklift
   module Patterns
     class Mysql
 
-      def self.pipe(source, from_table, destination, to_table)
+      def self.pipe(source, from_table, destination, to_table, tmp_table="_forklift_tmp")
         start = Time.new.to_i
         from_db = source.current_database
         to_db = destination.current_database
         source.forklift.logger.log("mysql pipe: `#{from_db}`.`#{from_table}` => `#{to_db}`.`#{to_table}`")
+
+        source.q("drop table if exists `#{to_db}`.`#{tmp_table}`")
+        source.q("create table `#{to_db}`.`#{tmp_table}` like `#{from_db}`.`#{from_table}`")
+        source.q("insert into `#{to_db}`.`#{tmp_table}` select * from `#{from_db}`.`#{from_table}`")
         source.q("drop table if exists `#{to_db}`.`#{to_table}`")
-        source.q("create table `#{to_db}`.`#{to_table}` like `#{from_db}`.`#{from_table}`")
-        source.q("insert into `#{to_db}`.`#{to_table}` select * from `#{from_db}`.`#{from_table}`")
+        source.q("rename table `#{to_db}`.`#{tmp_table}` to `#{to_db}`.`#{to_table}`")
+
         delta = Time.new.to_i - start
         source.forklift.logger.log(" ^ moved #{destination.count(to_table, to_db)} rows in #{delta}s")
       end
```
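Previously `pipe` dropped the destination table and rebuilt it in place, so readers saw a missing or half-loaded table for the whole copy. Staging into `_forklift_tmp` shrinks that window to the `drop` plus the near-instant `RENAME TABLE`. A sketch of the statement sequence, assuming `source` is a connected Forklift mysql connection and all database/table names are placeholders:

```ruby
# Sketch of the SQL the new pipe issues (names are placeholders;
# source.q runs a raw statement on the connection).
tmp = "_forklift_tmp"
[
  "drop table if exists `to_db`.`#{tmp}`",
  "create table `to_db`.`#{tmp}` like `from_db`.`src_table`",
  "insert into `to_db`.`#{tmp}` select * from `from_db`.`src_table`", # the slow copy, off to the side
  "drop table if exists `to_db`.`dst_table`",
  "rename table `to_db`.`#{tmp}` to `to_db`.`dst_table`"              # near-instant swap
].each { |sql| source.q(sql) }
```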
```diff
@@ -59,7 +63,9 @@ module Forklift
       if self.can_incremental_pipe?(source, from_table, destination, to_table, matcher)
         begin
           incremental_pipe(source, from_table, destination, to_table, matcher, primary_key)
-        rescue
+        rescue Exception => e
+          source.forklift.logger.log("! incremental_pipe failure on #{from_table} => #{to_table}: #{e} ")
+          source.forklift.logger.log("! falling back to pipe...")
           pipe(source, from_table, destination, to_table)
         end
       else
```
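The old bare `rescue` swallowed the incremental failure silently, and since a bare `rescue` only catches `StandardError`, anything lower-level skipped the fallback entirely. The new clause logs why the fast path failed before retrying with a full `pipe`. A runnable sketch of the shape, with stand-in methods for `incremental_pipe` and `pipe`:

```ruby
# fast_path and full_copy are stand-ins for incremental_pipe and pipe.
def fast_path
  raise "matcher column missing mid-run"
end

def full_copy
  puts "doing the full table copy instead"
end

begin
  fast_path
rescue Exception => e  # broader than a bare rescue, and the error is surfaced
  puts "! fast path failed: #{e}"
  puts "! falling back to full copy..."
  full_copy
end
```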
```diff
@@ -98,6 +104,32 @@ module Forklift
         end
       end
 
+      # The high water mark method will stub a row into every table that has a `default_matcher` column, pretending to hold a record from `time`
+      # This enables partial forklift runs which will only extract data "later than X"
+      # TODO: assumes all columns have a default NULL setting
+      def self.write_high_water_mark(db, time, matcher=db.default_matcher)
+        db.tables.each do |table|
+          columns, types = db.columns(table, db.current_database, true)
+          if columns.include?(matcher)
+            row = {}
+            i = 0
+            while( i < columns.length )
+              if(columns[i] == matcher)
+                row[columns[i]] = time.to_s(:db)
+              elsif( types[i] =~ /text/ || types[i] =~ /varchar/ )
+                row[columns[i]] = "~~stub~~"
+              elsif( types[i] =~ /float/ || types[i] =~ /int/ )
+                row[columns[i]] = 0
+              else
+                row[columns[i]] = "NULL"
+              end
+              i = i + 1
+            end
+            db.write([row], table)
+          end
+        end
+      end
+
     end
   end
 end
```
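A hedged usage sketch: stub high-water rows into a destination so that subsequent partial runs only extract data newer than the cutoff. The spec for this method is still pending (see the spec hunk below), so treat the call shape as an assumption; the connection name and matcher column are placeholders, and `plan.connections[:mysql][...]` follows the readme's connection-lookup convention:

```ruby
# Assumed usage; 'updated_at' is the matcher column here.
plan = Forklift::Plan.new
plan.do! do
  destination = plan.connections[:mysql][:destination]
  Forklift::Patterns::Mysql.write_high_water_mark(destination, 1.day.ago, 'updated_at')
end
```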
data/lib/forklift/plan.rb CHANGED
```diff
@@ -137,6 +137,7 @@ module Forklift
     def default_config
       return {
         project_root: Dir.pwd,
+        batch_size: 1000,
         logger: {
           stdout: true,
           debug: false,
```
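The new `batch_size` default (1000) is what the readers below now fall back to, so one plan-level setting tunes CSV, Elasticsearch, and MySQL read batching together. A sketch, assuming `Forklift::Plan.new` merges a passed hash over `default_config`:

```ruby
# Assumption: Plan.new accepts overrides for default_config keys.
plan = Forklift::Plan.new(batch_size: 5000)  # larger batches per read loop
```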
In the CSV connection, `read` now falls back to the shared batch size:

```diff
@@ -18,7 +18,7 @@ module Forklift
       @forklift
     end
 
-    def read(size=1000)
+    def read(size=forklift.config[:batch_size])
       data = []
       CSV.foreach(config[:file], headers: true, converters: :all) do |row|
         data << row.to_hash.symbolize_keys
```
The Elasticsearch connection's `read` does the same:

```diff
@@ -18,7 +18,7 @@ module Forklift
       @forklift
     end
 
-    def read(index, query, looping=true, from=0, size=1000)
+    def read(index, query, looping=true, from=0, size=forklift.config[:batch_size])
       offset = 0
       loop_count = 0
 
```
The MySQL connection gains a `rename` helper and the same batch-size default:

```diff
@@ -28,7 +28,11 @@ module Forklift
       q("DROP table `#{database}`.`#{table}`");
     end
 
-    def read(query, database=current_database, looping=true, limit=1000, offset=0)
+    def rename(table, new_table, database=current_database, new_database=current_database)
+      q("RENAME TABLE `#{database}`.`#{table}` TO `#{new_database}`.`#{new_table}`")
+    end
+
+    def read(query, database=current_database, looping=true, limit=forklift.config[:batch_size], offset=0)
       loop_count = 0
       # TODO: Detect limit/offset already present in query
       q("USE `#{database}`")
```
Further down the same file, `write` now batches its SQL:

```diff
@@ -77,14 +81,25 @@ module Forklift
           end
         end
       end
+    end
 
+    insert_q = "INSERT INTO `#{database}`.`#{table}` (#{safe_columns(columns)}) VALUES "
+    delete_q = "DELETE FROM `#{database}`.`#{table}` WHERE `#{primary_key}` IN "
+    delete_keys = []
+    data.each do |d|
       if(to_update == true && !d[primary_key.to_sym].nil?)
-        q("DELETE FROM `#{database}`.`#{table}` WHERE `#{primary_key}` = #{d[primary_key.to_sym]}")
+        delete_keys << d[primary_key.to_sym]
       end
-
-      insert_q = "INSERT INTO `#{database}`.`#{table}` (#{safe_columns(d.keys)}) VALUES (#{safe_values(d.values)});"
-      q(insert_q)
+      insert_q << safe_values(columns, d)
+      insert_q << ","
+    end
+
+    if delete_keys.length > 0
+      delete_q << "(#{delete_keys.join(',')})"
+      q(delete_q)
     end
+    insert_q = insert_q[0...-1]
+    q(insert_q)
     forklift.logger.log "wrote #{data.length} rows to `#{database}`.`#{table}`"
   end
 end
```
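Where the old path issued one `DELETE` and one `INSERT` per row, the new path builds a single `DELETE ... IN (...)` plus one multi-row `INSERT`, so a batch costs two round trips instead of two per row. A runnable illustration of the strings being assembled, with a placeholder table and hand-escaped data (the real code derives the column list from the data and escapes values via `safe_values`):

```ruby
rows = [{ id: 1, name: "a" }, { id: 2, name: "b" }]

insert_q = "INSERT INTO `db`.`users` (`id`, `name`) VALUES "
rows.each { |d| insert_q << "( #{d[:id]}, \"#{d[:name]}\" )," }
insert_q = insert_q[0...-1]  # trim the trailing comma, as the diff does

delete_q = "DELETE FROM `db`.`users` WHERE `id` IN (#{rows.map { |d| d[:id] }.join(',')})"

puts delete_q  # DELETE FROM `db`.`users` WHERE `id` IN (1,2)
puts insert_q  # INSERT INTO `db`.`users` (`id`, `name`) VALUES ( 1, "a" ),( 2, "b" )
```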
```diff
@@ -177,12 +192,15 @@ module Forklift
       end
     end
 
-    def columns(table, database=current_database)
+    def columns(table, database=current_database, return_types=false)
       cols = []
+      types = []
       read("describe `#{database}`.`#{table}`").each do |row|
-        cols << row[:Field]
+        cols << row[:Field]
+        types << row[:Type]
       end
-      cols
+      return cols if return_types == false
+      return cols, types
     end
 
     def dump(file, options=[])
```
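Callers that only want names are unchanged; passing `return_types=true` yields parallel name/type arrays, which is what `write_high_water_mark` uses to pick stub values. A sketch, where `db` stands for a connected mysql connection:

```ruby
cols = db.columns('users')
# => ["id", "email", "created_at", ...]
cols, types = db.columns('users', db.current_database, true)
# => parallel arrays, e.g. ["id", ...] and ["int(11)", "varchar(255)", ...]
```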
```diff
@@ -257,27 +275,31 @@ module Forklift
       return a.join(', ')
     end
 
-    def safe_values(values)
+    def safe_values(columns, d)
       a = []
-      values.each do |v|
+      sym_cols = columns.map { |s| s.to_sym }
+      sym_cols.each do |c|
         part = "NULL"
-        if( [::String, ::Symbol].include?(v.class) )
-          s = v.to_s
-          s.gsub!('\\') { '\\\\' }
-          s.gsub!('\"', '\/"')
-          s.gsub!('"', '\"')
-          part = "\"#{s}\""
-        elsif( [::Date, ::Time, ::DateTime].include?(v.class) )
-          s = v.to_s(:db)
-          part = "\"#{s}\""
-        elsif( [::Fixnum].include?(v.class) )
-          part = v
-        elsif( [::Float, ::BigDecimal].include?(v.class) )
-          part = v.to_f
+        v = d[c]
+        unless v.nil?
+          if( [::String, ::Symbol].include?(v.class) )
+            s = v.to_s
+            s.gsub!('\\') { '\\\\' }
+            s.gsub!('\"', '\/"')
+            s.gsub!('"', '\"')
+            part = "\"#{s}\""
+          elsif( [::Date, ::Time, ::DateTime].include?(v.class) )
+            s = v.to_s(:db)
+            part = "\"#{s}\""
+          elsif( [::Fixnum].include?(v.class) )
+            part = v
+          elsif( [::Float, ::BigDecimal].include?(v.class) )
+            part = v.to_f
+          end
         end
         a << part
       end
-      return a.join(', ')
+      return "( #{a.join(', ')} )"
     end
 
     #/private
```
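Keying off column names instead of `d.values` means every row in a batch is emitted in the same column order, rows missing a key fall through to `NULL`, and the wrapping parens make each result a ready-made `VALUES` tuple. Conceptually, per the updated specs below (`safe_values` is private; the specs exercise it via `subject.send`):

```ruby
columns = ['email', 'visits', 'note']
row     = { visits: 3, email: 'x@example.com' }  # :note absent
# safe_values(columns, row)
# => "( \"x@example.com\", 3, NULL )"
```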
```diff
@@ -1,3 +1,3 @@
 module Forklift
-  VERSION = "1.1.8"
+  VERSION = "1.1.9"
 end
```
data/readme.md CHANGED
```diff
@@ -370,7 +370,8 @@ end
   - max_timestamp(table, matcher=default_matcher, database=current_database)
     - return the timestamp of the max(matcher) or 1970-01-01
   - truncate!(table, database=current_database)
-  - columns(table, database=current_database)
+  - columns(table, database=current_database, return_types=false)
+  - rename(table, new_table, database, new_database)
   - dump(file)
     - mysqldump the database to `file` via gzip
 
@@ -384,6 +385,8 @@ end
   - mysql_optimistic_import(source, destination)
     - tries to do an incremental table copy, falls back to a full table copy
     - this differs from `pipe`, as all data is loaded into forklift, rather than relying on mysql transfer methods
+  - write_high_water_mark(db, time, matcher)
+    - stubs a row into every table with a `default_matcher` column, pretending to hold a record from `time`
 
 ### Elasticsearch
 
```
  ### Elasticsearch
389
392
 
@@ -73,4 +73,6 @@ describe 'mysql patterns' do
73
73
  expect(destination.read('select first_name from users where id = 1')[0][:first_name]).to eql 'EvanAgain'
74
74
  }
75
75
  end
76
+
77
+ it "can write the high_water_mark"
76
78
  end
```diff
@@ -37,7 +37,8 @@ describe 'mysql' do
   it "can write new data" do
     table = "users"
     data = [
-      {email: 'other@example.com', first_name: 'other', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
+      {email: 'other@example.com', first_name: 'other', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)},
+      {email: 'else@example.com', first_name: 'else', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
     ]
     plan = SpecPlan.new
     plan.do! {
@@ -47,7 +48,7 @@ describe 'mysql' do
 
     destination = SpecClient.mysql('forklift_test_source_a')
     count = destination.query('select count(1) as "count" from users').first['count']
-    expect(count).to eql 6
+    expect(count).to eql 7
   end
 
   it "can update existing data" do
```
```diff
@@ -90,13 +90,15 @@ describe Forklift::Connection::Mysql do
   subject { described_class.new({}, {}) }
 
   it "escapes one trailing backslash" do
-    values = ["foo\\"]
-    expect(subject.send(:safe_values, values)).to eq("\"foo\\\\\"")
+    columns = ['col']
+    values = {:col => "foo\\"}
+    expect(subject.send(:safe_values, columns, values)).to eq("( \"foo\\\\\" )")
   end
 
   it "escapes two trailing backslashes" do
-    values = ["foo\\\\"]
-    expect(subject.send(:safe_values, values)).to eq("\"foo\\\\\\\\\"")
+    columns = ['col']
+    values = {:col => "foo\\\\" }
+    expect(subject.send(:safe_values, columns, values)).to eq("( \"foo\\\\\\\\\" )")
   end
 end
 end
```
metadata CHANGED
```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: forklift_etl
 version: !ruby/object:Gem::Version
-  version: 1.1.8
+  version: 1.1.9
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-08-20 00:00:00.000000000 Z
+date: 2014-08-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
```