forklift_etl 1.1.8 → 1.1.9

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- forklift_etl (1.1.8)
4
+ forklift_etl (1.1.9)
5
5
  activesupport (~> 4.0, >= 4.0.0)
6
6
  elasticsearch (~> 1.0, >= 1.0.0)
7
7
  lumberjack (~> 1.0, >= 1.0.0)
@@ -2,14 +2,18 @@ module Forklift
2
2
  module Patterns
3
3
  class Mysql
4
4
 
5
- def self.pipe(source, from_table, destination, to_table)
5
+ def self.pipe(source, from_table, destination, to_table, tmp_table="_forklift_tmp")
6
6
  start = Time.new.to_i
7
7
  from_db = source.current_database
8
8
  to_db = destination.current_database
9
9
  source.forklift.logger.log("mysql pipe: `#{from_db}`.`#{from_table}` => `#{to_db}`.`#{to_table}`")
10
+
11
+ source.q("drop table if exists `#{to_db}`.`#{tmp_table}`")
12
+ source.q("create table `#{to_db}`.`#{tmp_table}` like `#{from_db}`.`#{from_table}`")
13
+ source.q("insert into `#{to_db}`.`#{tmp_table}` select * from `#{from_db}`.`#{from_table}`")
10
14
  source.q("drop table if exists `#{to_db}`.`#{to_table}`")
11
- source.q("create table `#{to_db}`.`#{to_table}` like `#{from_db}`.`#{from_table}`")
12
- source.q("insert into `#{to_db}`.`#{to_table}` select * from `#{from_db}`.`#{from_table}`")
15
+ source.q("rename table `#{to_db}`.`#{tmp_table}` to `#{to_db}`.`#{to_table}`")
16
+
13
17
  delta = Time.new.to_i - start
14
18
  source.forklift.logger.log(" ^ moved #{destination.count(to_table, to_db)} rows in #{delta}s")
15
19
  end
@@ -59,7 +63,9 @@ module Forklift
59
63
  if self.can_incremental_pipe?(source, from_table, destination, to_table, matcher)
60
64
  begin
61
65
  incremental_pipe(source, from_table, destination, to_table, matcher, primary_key)
62
- rescue
66
+ rescue Exception => e
67
+ source.forklift.logger.log("! incremental_pipe failure on #{from_table} => #{to_table}: #{e} ")
68
+ source.forklift.logger.log("! falling back to pipe...")
63
69
  pipe(source, from_table, destination, to_table)
64
70
  end
65
71
  else
@@ -98,6 +104,32 @@ module Forklift
98
104
  end
99
105
  end
100
106
 
107
+ # The high water method will stub a row in all tables with a `default_matcher` column pretending to have a record from `time`
108
+ # This enables partial forklift runs which will only extract data "later than X"
109
+ # TODO: assumes all columns have a default NULL setting
110
+ def self.write_high_water_mark(db, time, matcher=source.default_matcher)
111
+ db.tables.each do |table|
112
+ columns, types = db.columns(table, nil, true)
113
+ if columns.include?(matcher)
114
+ row = {}
115
+ i = 0
116
+ while( i < columns.length )
117
+ if(columns[i] == matcher)
118
+ row[column[i]] << time.to_s(:db)
119
+ elsif( types[i] =~ /text/ || types[i] =~ /varchar/ )
120
+ row[column[i]] << "~~stub~~"
121
+ elsif( types[i] =~ /float/ || types[i] =~ /int/ )
122
+ row[column[i]] << 0
123
+ else
124
+ row[column[i]] << "NULL"
125
+ end
126
+ i = i + 1
127
+ end
128
+ db.write([row], table)
129
+ end
130
+ end
131
+ end
132
+
101
133
  end
102
134
  end
103
135
  end
data/lib/forklift/plan.rb CHANGED
@@ -137,6 +137,7 @@ module Forklift
137
137
  def default_config
138
138
  return {
139
139
  project_root: Dir.pwd,
140
+ batch_size: 1000,
140
141
  logger: {
141
142
  stdout: true,
142
143
  debug: false,
@@ -18,7 +18,7 @@ module Forklift
18
18
  @forklift
19
19
  end
20
20
 
21
- def read(size=1000)
21
+ def read(size=forklift.config[:batch_size])
22
22
  data = []
23
23
  CSV.foreach(config[:file], headers: true, converters: :all) do |row|
24
24
  data << row.to_hash.symbolize_keys
@@ -18,7 +18,7 @@ module Forklift
18
18
  @forklift
19
19
  end
20
20
 
21
- def read(index, query, looping=true, from=0, size=1000)
21
+ def read(index, query, looping=true, from=0, size=forklift.config[:batch_size])
22
22
  offset = 0
23
23
  loop_count = 0
24
24
 
@@ -28,7 +28,11 @@ module Forklift
28
28
  q("DROP table `#{database}`.`#{table}`");
29
29
  end
30
30
 
31
- def read(query, database=current_database, looping=true, limit=1000, offset=0)
31
+ def rename(table, new_table, database=current_database, new_database=current_database)
32
+ q("RENAME TABLE `#{database}`.`#{table}` TO `#{new_database}`.`#{new_table}`")
33
+ end
34
+
35
+ def read(query, database=current_database, looping=true, limit=forklift.config[:batch_size], offset=0)
32
36
  loop_count = 0
33
37
  # TODO: Detect limit/offset already present in query
34
38
  q("USE `#{database}`")
@@ -77,14 +81,25 @@ module Forklift
77
81
  end
78
82
  end
79
83
  end
84
+ end
80
85
 
86
+ insert_q = "INSERT INTO `#{database}`.`#{table}` (#{safe_columns(columns)}) VALUES "
87
+ delete_q = "DELETE FROM `#{database}`.`#{table}` WHERE `#{primary_key}` IN "
88
+ delete_keys = []
89
+ data.each do |d|
81
90
  if(to_update == true && !d[primary_key.to_sym].nil?)
82
- q("DELETE FROM `#{database}`.`#{table}` WHERE `#{primary_key}` = #{d[primary_key.to_sym]}")
91
+ delete_keys << d[primary_key.to_sym]
83
92
  end
84
-
85
- insert_q = "INSERT INTO `#{database}`.`#{table}` (#{safe_columns(d.keys)}) VALUES (#{safe_values(d.values)});"
86
- q(insert_q)
93
+ insert_q << safe_values(columns, d)
94
+ insert_q << ","
95
+ end
96
+
97
+ if delete_keys.length > 0
98
+ delete_q << "(#{delete_keys.join(',')})"
99
+ q(delete_q)
87
100
  end
101
+ insert_q = insert_q[0...-1]
102
+ q(insert_q)
88
103
  forklift.logger.log "wrote #{data.length} rows to `#{database}`.`#{table}`"
89
104
  end
90
105
  end
@@ -177,12 +192,15 @@ module Forklift
177
192
  end
178
193
  end
179
194
 
180
- def columns(table, database=current_database)
195
+ def columns(table, database=current_database, return_types=false)
181
196
  cols = []
197
+ types = []
182
198
  read("describe `#{database}`.`#{table}`").each do |row|
183
- cols << row[:Field]
199
+ cols << row[:Field]
200
+ types << row[:Type]
184
201
  end
185
- cols
202
+ return cols if return_types == false
203
+ return cols, types
186
204
  end
187
205
 
188
206
  def dump(file, options=[])
@@ -257,27 +275,31 @@ module Forklift
257
275
  return a.join(', ')
258
276
  end
259
277
 
260
- def safe_values(values)
278
+ def safe_values(columns, d)
261
279
  a = []
262
- values.each do |v|
280
+ sym_cols = columns.map { |s| s.to_sym }
281
+ sym_cols.each do |c|
263
282
  part = "NULL"
264
- if( [::String, ::Symbol].include?(v.class) )
265
- s = v.to_s
266
- s.gsub!('\\') { '\\\\' }
267
- s.gsub!('\"', '\/"')
268
- s.gsub!('"', '\"')
269
- part = "\"#{s}\""
270
- elsif( [::Date, ::Time, ::DateTime].include?(v.class) )
271
- s = v.to_s(:db)
272
- part = "\"#{s}\""
273
- elsif( [::Fixnum].include?(v.class) )
274
- part = v
275
- elsif( [::Float, ::BigDecimal].include?(v.class) )
276
- part = v.to_f
283
+ v = d[c]
284
+ unless v.nil?
285
+ if( [::String, ::Symbol].include?(v.class) )
286
+ s = v.to_s
287
+ s.gsub!('\\') { '\\\\' }
288
+ s.gsub!('\"', '\/"')
289
+ s.gsub!('"', '\"')
290
+ part = "\"#{s}\""
291
+ elsif( [::Date, ::Time, ::DateTime].include?(v.class) )
292
+ s = v.to_s(:db)
293
+ part = "\"#{s}\""
294
+ elsif( [::Fixnum].include?(v.class) )
295
+ part = v
296
+ elsif( [::Float, ::BigDecimal].include?(v.class) )
297
+ part = v.to_f
298
+ end
277
299
  end
278
300
  a << part
279
301
  end
280
- return a.join(', ')
302
+ return "( #{a.join(', ')} )"
281
303
  end
282
304
 
283
305
  #/private
@@ -1,3 +1,3 @@
1
1
  module Forklift
2
- VERSION = "1.1.8"
2
+ VERSION = "1.1.9"
3
3
  end
data/readme.md CHANGED
@@ -370,7 +370,8 @@ end
370
370
  - max_timestamp(table, matcher=default_matcher, database=current_database)
371
371
  - return the timestamp of the max(matcher) or 1970-01-01
372
372
  - truncate!(table, database=current_database)
373
- - columns(table, database=current_database)
373
+ - columns(table, database=current_database, return_types=false)
374
+ - rename(table, new_table, database, new_database)
374
375
  - dump(file)
375
376
  - mysqldump the database to `file` via gzip
376
377
 
@@ -384,6 +385,8 @@ end
384
385
  - mysql_optimistic_import(source, destination)
385
386
  - tries to do an incremental table copy, falls back to a full table copy
386
387
  - this differs from `pipe`, as all data is loaded into forklift, rather than relying on mysql transfer methods
388
+ - write_high_water_mark(db, time, matcher)
389
+ - The high water method will stub a row in all tables with a `default_matcher` column pretending to have a record from `time`
387
390
 
388
391
  ### Elasticsearch
389
392
 
@@ -73,4 +73,6 @@ describe 'mysql patterns' do
73
73
  expect(destination.read('select first_name from users where id = 1')[0][:first_name]).to eql 'EvanAgain'
74
74
  }
75
75
  end
76
+
77
+ it "can write the high_water_mark"
76
78
  end
@@ -37,7 +37,8 @@ describe 'mysql' do
37
37
  it "can write new data" do
38
38
  table = "users"
39
39
  data = [
40
- {email: 'other@example.com', first_name: 'other', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
40
+ {email: 'other@example.com', first_name: 'other', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)},
41
+ {email: 'else@example.com', first_name: 'else', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
41
42
  ]
42
43
  plan = SpecPlan.new
43
44
  plan.do! {
@@ -47,7 +48,7 @@ describe 'mysql' do
47
48
 
48
49
  destination = SpecClient.mysql('forklift_test_source_a')
49
50
  count = destination.query('select count(1) as "count" from users').first['count']
50
- expect(count).to eql 6
51
+ expect(count).to eql 7
51
52
  end
52
53
 
53
54
  it "can update existing data" do
@@ -90,13 +90,15 @@ describe Forklift::Connection::Mysql do
90
90
  subject { described_class.new({}, {}) }
91
91
 
92
92
  it "escapes one trailing backslash" do
93
- values = ["foo\\"]
94
- expect(subject.send(:safe_values, values)).to eq("\"foo\\\\\"")
93
+ columns = ['col']
94
+ values = {:col => "foo\\"}
95
+ expect(subject.send(:safe_values, columns, values)).to eq("( \"foo\\\\\" )")
95
96
  end
96
97
 
97
98
  it "escapes two trailing backslashes" do
98
- values = ["foo\\\\"]
99
- expect(subject.send(:safe_values, values)).to eq("\"foo\\\\\\\\\"")
99
+ columns = ['col']
100
+ values = {:col => "foo\\\\" }
101
+ expect(subject.send(:safe_values, columns, values)).to eq("( \"foo\\\\\\\\\" )")
100
102
  end
101
103
  end
102
104
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forklift_etl
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.8
4
+ version: 1.1.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-08-20 00:00:00.000000000 Z
12
+ date: 2014-08-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport