forklift_etl 1.0.11 → 1.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/Gemfile.lock +1 -1
- data/bin/forklift +2 -0
- data/example/config/connections/csv/csv.yml +1 -0
- data/lib/forklift/transports/csv.rb +70 -0
- data/lib/forklift/version.rb +1 -1
- data/readme.md +21 -16
- data/spec/config/connections/csv/forklift_test_destination.yml +1 -0
- data/spec/config/connections/csv/forklift_test_source.yml +1 -0
- data/spec/integration/csv_spec.rb +84 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/support/dumps/csv/source.csv +6 -0
- data/spec/support/spec_client.rb +4 -0
- data/spec/support/spec_seeds.rb +10 -0
- metadata +12 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
N2E0NGRiYzA0ZGVmYzIyMDY5MDRjMDI3ODI5NGNmMTZkMjJiNzc2OQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZDI5Y2VjYWRmNWRmZGY2ZWZjZDBjNDk5N2RlNTQ5MzBhMjBlOWU5ZA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjZkZWZhMzk3MjE1NjUzMDg3ODg0ZmZkMDFkMGViMjQ0NGQ3NzRjZTZlNzBl
|
10
|
+
MGJlNmVkZTI2ZjdkZWNhZGUzZjQ5OTcwMGM3ZmVjMWEyOGFkNzE5NGRlMDA3
|
11
|
+
NzI5ZjZmNzI2MDNiZDIzMzY1MzM3MTI1YWZiMDFiYmFmMTJhZWU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NDk3NDU5NDQwZjBjYWMwZmQ1MjYyNjU4ZTI5NmMwMjFiMWY2YjQwYWU3Nzdj
|
14
|
+
MmI1YjM1MzJlYmQ4ZjAwZjUzYTMzZDBiYWI5MWI3MTJhOGRlNDRjOTgwZDY1
|
15
|
+
NmZlYjJkZTcyYzgzMTUzODY5ZDdhNGQ2NDI0YWU1ZGIwNjA2OTQ=
|
data/Gemfile.lock
CHANGED
data/bin/forklift
CHANGED
@@ -15,6 +15,8 @@ def generate
|
|
15
15
|
Dir.mkdir "#{p}/config"
|
16
16
|
Dir.mkdir "#{p}/config/connections"
|
17
17
|
Dir.mkdir "#{p}/config/connections/mysql"
|
18
|
+
Dir.mkdir "#{p}/config/connections/elasticsearch"
|
19
|
+
Dir.mkdir "#{p}/config/connections/csv"
|
18
20
|
Dir.mkdir "#{p}/log"
|
19
21
|
Dir.mkdir "#{p}/pid"
|
20
22
|
Dir.mkdir "#{p}/template"
|
@@ -0,0 +1 @@
|
|
1
|
+
:file: /path/to/file.csv
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
module Forklift
|
5
|
+
module Connection
|
6
|
+
class Csv < Forklift::Base::Connection
|
7
|
+
|
8
|
+
def initialize(config, forklift)
|
9
|
+
@config = config
|
10
|
+
@forklift = forklift
|
11
|
+
end
|
12
|
+
|
13
|
+
def config
|
14
|
+
@config
|
15
|
+
end
|
16
|
+
|
17
|
+
def forklift
|
18
|
+
@forklift
|
19
|
+
end
|
20
|
+
|
21
|
+
def read(size=1000)
|
22
|
+
data = []
|
23
|
+
CSV.foreach(config[:file], :headers => true, :converters => :all) do |row|
|
24
|
+
data << row.to_hash.symbolize_keys
|
25
|
+
if(data.length == size)
|
26
|
+
if block_given?
|
27
|
+
yield data
|
28
|
+
data = []
|
29
|
+
else
|
30
|
+
return data
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
if block_given?
|
36
|
+
yield data
|
37
|
+
else
|
38
|
+
return data
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def write(data, append=true)
|
43
|
+
if (append == false)
|
44
|
+
FileUtils.rm(config[:file], {:force => true})
|
45
|
+
end
|
46
|
+
|
47
|
+
if( !File.exists?(config[:file]) )
|
48
|
+
keys = data.first.keys
|
49
|
+
row = {}
|
50
|
+
keys.each do |k|
|
51
|
+
row[k] = k
|
52
|
+
end
|
53
|
+
data = [row] + data
|
54
|
+
end
|
55
|
+
|
56
|
+
CSV.open(config[:file],'a') do |file|
|
57
|
+
data.each do |row|
|
58
|
+
file << row.values
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
#/private
|
67
|
+
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
data/lib/forklift/version.rb
CHANGED
data/readme.md
CHANGED
@@ -6,13 +6,7 @@ Moving heavy databases around. [![Gem Version](https://badge.fury.io/rb/forklift
|
|
6
6
|
|
7
7
|
## What?
|
8
8
|
|
9
|
-
[Forklift](https://github.com/taskrabbit/forklift) is a ruby gem that makes it easy for you to move your data around. Forklift can be an integral part of your datawarehouse pipeline or a backup
|
10
|
-
|
11
|
-
## What does TaskRabbit use this for?
|
12
|
-
|
13
|
-
At TaskRabbit, the website you see at [www.taskrabbit.com](https://www.taskrabbit.com) is actually made up of many [smaller rails applications](http://en.wikipedia.org/wiki/Service-oriented_architecture). When analyzing our site, we need to collect all of this data into one place so we can easily join across it.
|
14
|
-
|
15
|
-
We replicate all of our databases into one server in our office, and then use Forklift to extract the data we want into a common place. This gives us the option to both look at live data and to have a more accessible transformed set which we create on a rolling basis. Our "Forklift Loop" also git-pulls to check for any new transformations before each run.
|
9
|
+
[Forklift](https://github.com/taskrabbit/forklift) is a ruby gem that makes it easy for you to move your data around. Forklift can be an integral part of your datawarehouse pipeline or a backup tool. Forklift can collect and collapse data from multiple sources or across a single source. In forklift's first version, it was only a MySQL tool but now, you can create transports to deal with the data of your choice.
|
16
10
|
|
17
11
|
## Set up
|
18
12
|
|
@@ -42,6 +36,8 @@ Forklift expects your project to be arranged like:
|
|
42
36
|
| ├── (DB).yml
|
43
37
|
| ├── elasticsearch/
|
44
38
|
| ├── (DB).yml
|
39
|
+
| ├── csv/
|
40
|
+
| ├── (file).yml
|
45
41
|
├── log/
|
46
42
|
├── pid/
|
47
43
|
├── template/
|
@@ -53,11 +49,13 @@ Forklift expects your project to be arranged like:
|
|
53
49
|
```
|
54
50
|
|
55
51
|
To enable a forklift connection, all you need to do is place the yml config file for it within `/config/connections/(type)/(name).yml`
|
56
|
-
|
52
|
+
Files you place within `/patterns/` or `connections/(type)/` will be loaded automatically.
|
53
|
+
|
54
|
+
## Examples
|
57
55
|
|
58
|
-
|
56
|
+
### Example Project
|
59
57
|
|
60
|
-
Visit the [`/example`]() directory to see a whole forklift project.
|
58
|
+
Visit the [`/example`](https://github.com/taskrabbit/forklift/tree/master/example) directory to see a whole forklift project.
|
61
59
|
|
62
60
|
### Simple extract and load (no transformations)
|
63
61
|
|
@@ -145,7 +143,7 @@ plan.do! do
|
|
145
143
|
end
|
146
144
|
```
|
147
145
|
|
148
|
-
|
146
|
+
## Forklift Emails
|
149
147
|
|
150
148
|
#### Setup
|
151
149
|
Put this at the end of your plan inside the `do!` block.
|
@@ -239,7 +237,7 @@ end
|
|
239
237
|
|
240
238
|
### Steps
|
241
239
|
|
242
|
-
You can optionally
|
240
|
+
You can optionally divide up your forklift plan into steps:
|
243
241
|
```ruby
|
244
242
|
plan = Forklift::Plan.new
|
245
243
|
plan.do! do
|
@@ -265,15 +263,15 @@ plan.do! do
|
|
265
263
|
end
|
266
264
|
```
|
267
265
|
|
268
|
-
When you use steps, you can run your whole plan, or just part if it with command line arguments. For example, `forklift plan.rb "Elasticsearch Import"` would just run that
|
266
|
+
When you use steps, you can run your whole plan, or just part of it with command line arguments. For example, `forklift plan.rb "Elasticsearch Import"` would just run that single portion of the plan. Note that any parts of your plan not within a step will be run each time.
|
269
267
|
|
270
268
|
## Transports
|
271
269
|
|
272
|
-
Transports are how you interact with your data. Every transport defines `read` and `write` methods which handle arrays of data objects (and helper methods required).
|
270
|
+
Transports are how you interact with your data. Every transport defines `read` and `write` methods which handle arrays of data objects (and the helper methods required).
|
273
271
|
|
274
272
|
Each transport should have a config file in `./config/connections/#{transport}/`. It will be loaded at boot.
|
275
273
|
|
276
|
-
Transports
|
274
|
+
Transports optionally define helper methods which are a shortcut to copy data *within* a transport, like the mysql `pipe` methods (i.e.: `insert into #{to_db}.#{to_table}; select * from #{from_db}.#{from_table}`). A transport may also define other helpers (like how to create a MySQL dump). These should be defined in `/patterns/#{type}.rb` within the `Forklift::Patterns::#{type}` namespace.
|
277
275
|
|
278
276
|
### Creating your own transport
|
279
277
|
|
@@ -375,6 +373,13 @@ end
|
|
375
373
|
|
376
374
|
- delete_index(index)
|
377
375
|
|
376
|
+
### Csv
|
377
|
+
|
378
|
+
#### Forklift methods
|
379
|
+
|
380
|
+
- read(size)
|
381
|
+
- write(data, append=true)
|
382
|
+
|
378
383
|
## Transformations
|
379
384
|
|
380
385
|
Forklift allows you to create both Ruby transformations and script transformations.
|
@@ -407,4 +412,4 @@ end
|
|
407
412
|
- If testing locally, mailcatcher (https://github.com/sj26/mailcatcher) is a helpful gem to test your email sending
|
408
413
|
|
409
414
|
## Contributing and Testing
|
410
|
-
To run this test suite, you will need access to both a mysql and elasticsearch database. Test configurations are saved in `/spec/config/connections`.
|
415
|
+
To run this test suite, you will need access to both a mysql and elasticsearch database. Test configurations are saved in `/spec/config/connections`.
|
@@ -0,0 +1 @@
|
|
1
|
+
:file: /tmp/destination.csv
|
@@ -0,0 +1 @@
|
|
1
|
+
:file: /tmp/source.csv
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'csv'
|
3
|
+
|
4
|
+
describe 'csv' do
|
5
|
+
|
6
|
+
after(:each) do
|
7
|
+
SpecSeeds.setup_csv
|
8
|
+
end
|
9
|
+
|
10
|
+
it "can read data (simple)" do
|
11
|
+
plan = SpecPlan.new
|
12
|
+
@rows = []
|
13
|
+
|
14
|
+
plan.do! {
|
15
|
+
source = plan.connections[:csv][:forklift_test_source]
|
16
|
+
source.read {|data|
|
17
|
+
@rows = (@rows + data)
|
18
|
+
}
|
19
|
+
}
|
20
|
+
|
21
|
+
expect(@rows.length).to eql 5
|
22
|
+
expect(@rows.first[:vendor_id]).to eql 1
|
23
|
+
expect(@rows.last[:vendor_id]).to eql 5
|
24
|
+
end
|
25
|
+
|
26
|
+
it "can read partial data" do
|
27
|
+
plan = SpecPlan.new
|
28
|
+
@rows = []
|
29
|
+
|
30
|
+
plan.do! {
|
31
|
+
source = plan.connections[:csv][:forklift_test_source]
|
32
|
+
@rows = source.read(3)
|
33
|
+
}
|
34
|
+
|
35
|
+
expect(@rows.length).to eql 3
|
36
|
+
expect(@rows.first[:vendor_id]).to eql 1
|
37
|
+
expect(@rows.last[:vendor_id]).to eql 3
|
38
|
+
end
|
39
|
+
|
40
|
+
it "can write data (simple)" do
|
41
|
+
plan = SpecPlan.new
|
42
|
+
data = [
|
43
|
+
{:thing => 1, :when => Time.now},
|
44
|
+
{:thing => 2, :when => Time.now},
|
45
|
+
]
|
46
|
+
|
47
|
+
plan.do! {
|
48
|
+
destination = plan.connections[:csv][:forklift_test_destination]
|
49
|
+
destination.write(data)
|
50
|
+
}
|
51
|
+
|
52
|
+
@rows = SpecClient.csv('/tmp/destination.csv')
|
53
|
+
expect(@rows.length).to eql 2
|
54
|
+
expect(@rows.first[:thing]).to eql 1
|
55
|
+
expect(@rows.last[:thing]).to eql 2
|
56
|
+
end
|
57
|
+
|
58
|
+
it "can append data" do
|
59
|
+
plan = SpecPlan.new
|
60
|
+
|
61
|
+
plan.do! {
|
62
|
+
destination = plan.connections[:csv][:forklift_test_destination]
|
63
|
+
|
64
|
+
data = [
|
65
|
+
{:thing => 1, :when => Time.now},
|
66
|
+
{:thing => 2, :when => Time.now},
|
67
|
+
]
|
68
|
+
|
69
|
+
destination.write(data)
|
70
|
+
|
71
|
+
data = [
|
72
|
+
{:thing => 3, :when => Time.now},
|
73
|
+
]
|
74
|
+
|
75
|
+
destination.write(data)
|
76
|
+
}
|
77
|
+
|
78
|
+
@rows = SpecClient.csv('/tmp/destination.csv')
|
79
|
+
expect(@rows.length).to eql 3
|
80
|
+
expect(@rows.first[:thing]).to eql 1
|
81
|
+
expect(@rows.last[:thing]).to eql 3
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -0,0 +1,6 @@
|
|
1
|
+
vendor_id,vendor_name,created_at,updated_at
|
2
|
+
1,Evan's Hats,2000-01-01 00:00:01,2000-01-01 00:00:01
|
3
|
+
2,Aaron's Scarves,2000-01-01 00:00:02,2000-01-01 00:00:02
|
4
|
+
3,Pablo's Shirts,2000-01-01 00:00:03,2000-01-01 00:00:03
|
5
|
+
4,Kevin's Headies,2000-01-01 00:00:04,2000-01-01 00:00:04
|
6
|
+
5,Brian's Boots,2000-01-01 00:00:05,2000-01-01 00:00:05
|
data/spec/support/spec_client.rb
CHANGED
data/spec/support/spec_seeds.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'json'
|
2
|
+
require 'fileutils'
|
2
3
|
|
3
4
|
class SpecSeeds
|
4
5
|
|
@@ -66,4 +67,13 @@ class SpecSeeds
|
|
66
67
|
end
|
67
68
|
end
|
68
69
|
|
70
|
+
def self.setup_csv
|
71
|
+
seed = File.join(File.dirname(__FILE__), '..', 'support', 'dumps', 'csv', "source.csv")
|
72
|
+
source = '/tmp/source.csv'
|
73
|
+
destination = '/tmp/destination.csv'
|
74
|
+
FileUtils.rm(source, {:force => true})
|
75
|
+
FileUtils.rm(destination, {:force => true})
|
76
|
+
FileUtils.copy(seed, source)
|
77
|
+
end
|
78
|
+
|
69
79
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forklift_etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.11
|
4
|
+
version: 1.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Evan Tahler
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-04-
|
11
|
+
date: 2014-04-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -127,6 +127,7 @@ files:
|
|
127
127
|
- bin/forklift
|
128
128
|
- example/Gemfile
|
129
129
|
- example/Gemfile.lock
|
130
|
+
- example/config/connections/csv/csv.yml
|
130
131
|
- example/config/connections/elasticsearch/source.yml
|
131
132
|
- example/config/connections/mysql/destination.yml
|
132
133
|
- example/config/connections/mysql/source.yml
|
@@ -147,10 +148,13 @@ files:
|
|
147
148
|
- lib/forklift/patterns/elasticsearch_patterns.rb
|
148
149
|
- lib/forklift/patterns/mysql_patterns.rb
|
149
150
|
- lib/forklift/plan.rb
|
151
|
+
- lib/forklift/transports/csv.rb
|
150
152
|
- lib/forklift/transports/elasticsearch.rb
|
151
153
|
- lib/forklift/transports/mysql.rb
|
152
154
|
- lib/forklift/version.rb
|
153
155
|
- readme.md
|
156
|
+
- spec/config/connections/csv/forklift_test_destination.yml
|
157
|
+
- spec/config/connections/csv/forklift_test_source.yml
|
154
158
|
- spec/config/connections/elasticsearch/forklift_test.yml
|
155
159
|
- spec/config/connections/mysql/forklift_test_destination.yml
|
156
160
|
- spec/config/connections/mysql/forklift_test_source_a.yml
|
@@ -158,12 +162,14 @@ files:
|
|
158
162
|
- spec/config/connections/mysql/forklift_test_working.yml
|
159
163
|
- spec/config/email.yml
|
160
164
|
- spec/integration/basic_spec.rb
|
165
|
+
- spec/integration/csv_spec.rb
|
161
166
|
- spec/integration/elasticsearch_patterns_spec.rb
|
162
167
|
- spec/integration/elasticsearch_spec.rb
|
163
168
|
- spec/integration/multi_transport_spec.rb
|
164
169
|
- spec/integration/mysql_patterns_spec.rb
|
165
170
|
- spec/integration/mysql_spec.rb
|
166
171
|
- spec/spec_helper.rb
|
172
|
+
- spec/support/dumps/csv/source.csv
|
167
173
|
- spec/support/dumps/elasticsearch/forklift_test.json
|
168
174
|
- spec/support/dumps/mysql/forklift_test_source_a.sql
|
169
175
|
- spec/support/dumps/mysql/forklift_test_source_b.sql
|
@@ -205,6 +211,8 @@ signing_key:
|
|
205
211
|
specification_version: 4
|
206
212
|
summary: ! 'Forklift: Moving big databases around. A ruby ETL tool.'
|
207
213
|
test_files:
|
214
|
+
- spec/config/connections/csv/forklift_test_destination.yml
|
215
|
+
- spec/config/connections/csv/forklift_test_source.yml
|
208
216
|
- spec/config/connections/elasticsearch/forklift_test.yml
|
209
217
|
- spec/config/connections/mysql/forklift_test_destination.yml
|
210
218
|
- spec/config/connections/mysql/forklift_test_source_a.yml
|
@@ -212,12 +220,14 @@ test_files:
|
|
212
220
|
- spec/config/connections/mysql/forklift_test_working.yml
|
213
221
|
- spec/config/email.yml
|
214
222
|
- spec/integration/basic_spec.rb
|
223
|
+
- spec/integration/csv_spec.rb
|
215
224
|
- spec/integration/elasticsearch_patterns_spec.rb
|
216
225
|
- spec/integration/elasticsearch_spec.rb
|
217
226
|
- spec/integration/multi_transport_spec.rb
|
218
227
|
- spec/integration/mysql_patterns_spec.rb
|
219
228
|
- spec/integration/mysql_spec.rb
|
220
229
|
- spec/spec_helper.rb
|
230
|
+
- spec/support/dumps/csv/source.csv
|
221
231
|
- spec/support/dumps/elasticsearch/forklift_test.json
|
222
232
|
- spec/support/dumps/mysql/forklift_test_source_a.sql
|
223
233
|
- spec/support/dumps/mysql/forklift_test_source_b.sql
|