data_task 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +39 -0
- data/.travis.yml +17 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +93 -0
- data/Rakefile +17 -0
- data/data_task.gemspec +31 -0
- data/lib/data_task/adapters/greenplum.rb +159 -0
- data/lib/data_task/adapters/postgres.rb +430 -0
- data/lib/data_task/adapters/sqlite.rb +239 -0
- data/lib/data_task/adapters/support/booleans.rb +19 -0
- data/lib/data_task/adapters/support/connection_persistence.rb +18 -0
- data/lib/data_task/adapters/support/transactions.rb +28 -0
- data/lib/data_task/data.rb +34 -0
- data/lib/data_task/db.rb +57 -0
- data/lib/data_task/sql.rb +49 -0
- data/lib/data_task/tasks/examples.rake +35 -0
- data/lib/data_task/util.rb +6 -0
- data/lib/data_task/version.rb +5 -0
- data/lib/data_task.rb +76 -0
- data/test/config/database.yml +10 -0
- data/test/data_spec.rb +81 -0
- data/test/db_spec.rb +202 -0
- data/test/helper.rb +37 -0
- data/test/postgresql_spec.rb +249 -0
- data/test/sql_spec.rb +46 -0
- data/test/table_creation.rb +34 -0
- data/test/test_rake_table_task.rb +207 -0
- metadata +179 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ZDk5OWQzMDU3YzNkZGUyZTczODUxNTdhODk4MTg4MjU0MGU3NDYwOQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NzRmMWMzNjM4MGJjYmZjYTkwYTIwOTFiNjQ0YTVjZWZjNmU1NmQwNg==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NmNmMjc3MzFmMzBmOGI2MmFjNDBkODAyNDQ5YmZiNzBjMjFmY2JhNDFhMTk5
|
10
|
+
M2NmZjk5MzU4ZWRiNzFkMDYxYTgxN2I1MTM0ZjI0OGUzMjRjOTM4YmVmNmVm
|
11
|
+
ZjlkYzhhM2Y0YmU5YTdiNTY2NTMyMTVlMzhiNDE2ODU5YzQ2M2I=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MzdiNGYyN2IyOWU4YWNkYjgwNzE1MzRmYThlNGNlODU1NmQ2N2ZhZDU1YTcz
|
14
|
+
YWEwZjQyM2Q4NWJkYmFlZGI4YjVhNWIxY2M4ODZjMzc3OGM4YmFlZWQ3YTcx
|
15
|
+
ZTcwNDMyZTQ3YThmYjY2MjIyNGQ1OGMyMzNlMmEzOTdjY2Y2NTQ=
|
data/.gitignore
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.config
|
4
|
+
Gemfile.lock
|
5
|
+
InstalledFiles
|
6
|
+
coverage
|
7
|
+
doc/
|
8
|
+
pkg
|
9
|
+
rdoc
|
10
|
+
spec/reports
|
11
|
+
test/tmp
|
12
|
+
test/version_tmp
|
13
|
+
tmp
|
14
|
+
*.bundle
|
15
|
+
*.so
|
16
|
+
*.o
|
17
|
+
*.a
|
18
|
+
mkmf.log
|
19
|
+
|
20
|
+
## Documentation cache and generated files:
|
21
|
+
/.yardoc/
|
22
|
+
/_yardoc/
|
23
|
+
/doc/
|
24
|
+
/rdoc/
|
25
|
+
|
26
|
+
## Environment normalisation:
|
27
|
+
/.bundle/
|
28
|
+
/lib/bundler/man/
|
29
|
+
|
30
|
+
# vim
|
31
|
+
[._]*.s[a-w][a-z]
|
32
|
+
[._]s[a-w][a-z]
|
33
|
+
*.un~
|
34
|
+
Session.vim
|
35
|
+
.netrwhist
|
36
|
+
*~
|
37
|
+
|
38
|
+
# host-specific configuration
|
39
|
+
/config/database.yml
|
data/.travis.yml
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
language: ruby
|
2
|
+
cache: bundler
|
3
|
+
rvm:
|
4
|
+
- 1.9.3
|
5
|
+
services: postgresql
|
6
|
+
before_script:
|
7
|
+
- psql -c 'create database ci_test;' -U postgres
|
8
|
+
- sqlite3 ci_test ''
|
9
|
+
|
10
|
+
env:
|
11
|
+
- DATATASK_ENV=postgres_test
|
12
|
+
- DATATASK_ENV=sqlite_test
|
13
|
+
|
14
|
+
notifications:
|
15
|
+
email:
|
16
|
+
on_success: never
|
17
|
+
on_failure: change
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Shahin Saneinejad
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/shahin/data_task.svg?branch=master)](https://travis-ci.org/shahin/data_task)
|
2
|
+
[![Coverage Status](https://img.shields.io/coveralls/shahin/data_task.svg)](https://coveralls.io/r/shahin/data_task?branch=master)
|
3
|
+
|
4
|
+
# DataTask
|
5
|
+
|
6
|
+
DataTask extends Rake's dependency-based programming language to databases. This gem provides the `data` task, analogous to Rake's built-in `file` task but extended to work with pluggable backends beyond the local filesystem.
|
7
|
+
|
8
|
+
Adapters are included for Sqlite3, PostgreSQL, and Greenplum.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Add this line to your application's Gemfile:
|
13
|
+
|
14
|
+
gem 'data_task'
|
15
|
+
|
16
|
+
And then execute:
|
17
|
+
|
18
|
+
$ bundle
|
19
|
+
|
20
|
+
Or install it yourself as:
|
21
|
+
|
22
|
+
$ gem install data_task
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
To write your first data task, connect to your database by instantiating an adapter:
|
27
|
+
|
28
|
+
```
|
29
|
+
postgres = Rake::DataTask::Postgres.new(
|
30
|
+
'host' => 'localhost',
|
31
|
+
'port' => 5432,
|
32
|
+
'database' => 'example',
|
33
|
+
'username' => 'postgres'
|
34
|
+
)
|
35
|
+
```
|
36
|
+
|
37
|
+
Then use this adapter instance as the target for a data task:
|
38
|
+
|
39
|
+
```
|
40
|
+
desc "Load a data file into PostgreSQL for analysis."
|
41
|
+
data postgres['raw'] => 'raw.txt' do
|
42
|
+
# Add loading logic here
|
43
|
+
end
|
44
|
+
```
|
45
|
+
|
46
|
+
Rake will run this task if and only if (a) the table 'raw' does not exist yet, or (b) the table 'raw' exists but has a timestamp earlier than that of the file 'raw.txt'. Since database tables now have timestamps associated with them, they can serve as targets or as dependencies in data tasks.
|
47
|
+
|
48
|
+
Here's a runnable example Rakefile:
|
49
|
+
|
50
|
+
```
|
51
|
+
require 'rake'
|
52
|
+
require 'data_task'
|
53
|
+
|
54
|
+
# connect to the database
|
55
|
+
postgres = Rake::DataTask::Postgres.new(
|
56
|
+
'host' => 'localhost',
|
57
|
+
'port' => 5432,
|
58
|
+
'database' => 'example',
|
59
|
+
'username' => 'postgres'
|
60
|
+
)
|
61
|
+
|
62
|
+
# mark raw.txt as a potential dependency
|
63
|
+
file 'raw.txt'
|
64
|
+
|
65
|
+
# define a loader for the postgres table 'raw', dependent on raw.txt
|
66
|
+
desc "Load a data file into PostgreSQL for analysis."
|
67
|
+
data postgres['raw'] => 'raw.txt' do
|
68
|
+
postgres.create_table 'raw', nil, '(var1 text)'
|
69
|
+
postgres.execute "copy raw from '#{File.expand_path('raw.txt')}'"
|
70
|
+
end
|
71
|
+
```
|
72
|
+
|
73
|
+
To run it:
|
74
|
+
|
75
|
+
1. paste the example into a file named 'Rakefile',
|
76
|
+
2. make sure the PostgreSQL configuration matches your server,
|
77
|
+
3. open a terminal and run the commands below:
|
78
|
+
|
79
|
+
```
|
80
|
+
$ echo "v1" > raw.txt
|
81
|
+
$ rake 'raw'
|
82
|
+
```
|
83
|
+
|
84
|
+
The contents of raw.txt should be in your table 'raw' on PostgreSQL. Running the rake command a second time will result in no operations as long as raw.txt hasn't changed. With big data files, this can be a big time-saver.
|
85
|
+
|
86
|
+
|
87
|
+
## Contributing
|
88
|
+
|
89
|
+
1. Fork it ( https://github.com/shahin/data_task/fork )
|
90
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
91
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
92
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
93
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'data_task'
|
2
|
+
|
3
|
+
desc "Run tests"
|
4
|
+
task :default => :'data_task:test'
|
5
|
+
|
6
|
+
namespace :data_task do
|
7
|
+
|
8
|
+
require 'bundler/gem_tasks'
|
9
|
+
require 'rake/testtask'
|
10
|
+
|
11
|
+
# Defines the test task. Rake::TestTask's default task name is `test`, which is
# what the `:'data_task:test'` default dependency refers to when this runs
# inside the `data_task` namespace.
Rake::TestTask.new do |t|
  # NOTE(review): the test files matched below live under test/, not spec/ --
  # confirm whether "spec" is the intended load-path entry.
  t.libs << "spec"
  t.test_files = FileList['test/**/*_spec.rb', 'test/test_*.rb']
  # A bare `t.verbose` only reads the attribute; assign it to turn verbose
  # test output on.
  t.verbose = true
end
|
16
|
+
|
17
|
+
end
|
data/data_task.gemspec
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'data_task/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "data_task"
|
8
|
+
spec.version = Rake::DataTask::VERSION
|
9
|
+
spec.authors = ["Shahin Saneinejad"]
|
10
|
+
spec.email = ["shahin.saneinejad@gmail.com"]
|
11
|
+
spec.summary = %q{A Rake task for managing data across multiple datastores.}
|
12
|
+
spec.description = %q{DataTask provides dependency-based programming for data workflows on top of the Rake build tool.}
|
13
|
+
spec.homepage = "https://github.com/shahin/data_task"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files -z`.split("\x0")
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.required_ruby_version = '>= 1.9.3'
|
22
|
+
|
23
|
+
spec.add_runtime_dependency 'rake', '~> 10.0.4'
|
24
|
+
spec.add_runtime_dependency 'pg', '~> 0.17.1'
|
25
|
+
spec.add_runtime_dependency 'sqlite3'
|
26
|
+
|
27
|
+
spec.add_development_dependency 'bundler', '~> 1.6'
|
28
|
+
spec.add_development_dependency 'minitest-around', '~> 0.2'
|
29
|
+
spec.add_development_dependency 'minitest-spec-context', '~> 0.0.3'
|
30
|
+
spec.add_development_dependency 'coveralls'
|
31
|
+
end
|
data/lib/data_task/adapters/greenplum.rb
ADDED
@@ -0,0 +1,159 @@
|
|
1
|
+
require 'pg'
|
2
|
+
require_relative 'support/transactions'
|
3
|
+
require_relative 'support/booleans'
|
4
|
+
require_relative './postgres'
|
5
|
+
|
6
|
+
module Rake
|
7
|
+
module DataTask
|
8
|
+
|
9
|
+
class Greenplum < Postgres
|
10
|
+
|
11
|
+
TABLE_TRACKER_HELPER_NAME = "operations"
|
12
|
+
|
13
|
+
# Installs the relations used to track table operations on Greenplum.
#
# In order, this:
#   1. removes any existing tracking relations (tear_down_tracking),
#   2. runs the parent class's set_up_tracking (bare `super` forwards `options`),
#   3. renames TABLE_TRACKER_NAME to the helper table TABLE_TRACKER_HELPER_NAME,
#   4. creates the fixed_pg_stat_operations view,
#   5. creates a view named TABLE_TRACKER_NAME over that view and the helper table,
#   6. creates rules that redirect deletes and inserts on the view to the helper table.
#
# options - passed to tear_down_tracking and forwarded implicitly to super.
#
# Returns whatever the final `execute` call returns.
def self.set_up_tracking options
  tear_down_tracking options
  super

  # The relation renamed here is presumably the one the parent set-up just
  # created; its name is reused below for the tracking view.
  rename_sql = "alter table #{TABLE_TRACKER_NAME} rename to #{TABLE_TRACKER_HELPER_NAME}"
  execute rename_sql

  # Greenplum records CREATE and TRUNCATE operations in its pg_stat_operations
  # system view. Joining that view with the tracking table lets those two
  # operations be tracked inside the database rather than by application code.

  fixed_operations_view_sql = <<-EOSQL
    create view fixed_pg_stat_operations as
    -- GP's pg_stat_operations enum values like 'TABLE' are inconsistent so fix them here
    select
      pso.classname,
      pso.objname,
      pso.objid,
      pso.schemaname,
      pso.usestatus,
      pso.usename,
      pso.actionname,
      case
        when pso.actionname = 'TRUNCATE' then '#{relation_type_values[:table]}'
        when pso.subtype = 'TABLE' then '#{relation_type_values[:table]}'
        else pso.subtype
      end as subtype,
      pso.statime
    from pg_stat_operations pso
  EOSQL
  execute fixed_operations_view_sql

  tracker_view_sql = <<-EOSQL
    create view #{TABLE_TRACKER_NAME} as
    select
      relation_name,
      relation_type,
      operation,
      time
    from (

      select
        a.*,
        rank() over (partition by relation_name, relation_type order by time desc)
      from (

        -- select all CREATE and TRUNCATE operations tracked by Greenplum
        select
          pso.objname as relation_name,
          pso.subtype as relation_type,
          pso.actionname as operation,
          pso.statime as time
        from fixed_pg_stat_operations pso
        where pso.actionname not in ('ANALYZE', 'VACUUM')

        union all

        -- select all operations tracked by Greenplum (PostgreSQL) table rules
        select
          ttb.relation_name,
          ttb.relation_type,
          ttb.operation,
          ttb.time
        from
          #{TABLE_TRACKER_HELPER_NAME} ttb
          -- return only operations for tables that exist in system tables
          join fixed_pg_stat_operations pso on (
            ttb.relation_name = pso.objname and
            ttb.relation_type = pso.subtype and
            pso.actionname = 'CREATE'
          )

      ) a
    ) b
    -- take only the latest operation per table
    where rank = 1
  EOSQL
  execute tracker_view_sql

  # Deletes and inserts aimed at the view must land on the helper table, so
  # both rules below are sent in a single statement batch.
  redirect_rules_sql = <<-EOSQL
    create rule delete_operation_record as on delete to #{TABLE_TRACKER_NAME}
      do instead
      delete from #{TABLE_TRACKER_HELPER_NAME}
      where
        relation_name = OLD.relation_name and
        relation_type = OLD.relation_type and
        operation = OLD.operation
      ;

    create rule insert_operation_record as on insert to #{TABLE_TRACKER_NAME}
      do instead
      insert into #{TABLE_TRACKER_HELPER_NAME} values (
        NEW.relation_name,
        NEW.relation_type,
        NEW.operation,
        NEW.time
      )
      ;
  EOSQL
  execute redirect_rules_sql
end
|
111
|
+
|
112
|
+
# Removes the tracking relations: the fixed_pg_stat_operations view, the
# TABLE_TRACKER_NAME view, and finally the TABLE_TRACKER_HELPER_NAME table.
#
# options - accepted for signature compatibility; not used in this method.
#
# NOTE(review): this runs in class context, so drop_view and drop_table must
# exist as class-level methods; they are not defined in this file --
# presumably provided by Postgres, confirm.
def self.tear_down_tracking options
  ["fixed_pg_stat_operations", TABLE_TRACKER_NAME].each do |view_name|
    drop_view view_name
  end
  drop_table TABLE_TRACKER_HELPER_NAME
end
|
117
|
+
|
118
|
+
# Reports whether tracking is set up, judged by the existence of the
# TABLE_TRACKER_NAME view (on Greenplum the tracker is a view, not a table).
#
# Returns the result of view_exists?.
def tracking_tables?
  view_exists? TABLE_TRACKER_NAME
end
|
121
|
+
|
122
|
+
# Drops a table (with cascade) if it exists and removes its tracking record.
#
# table_name - name of the table to drop; interpolated into the SQL as-is.
#
# The tracking record is left alone when the dropped relation is itself one
# of the tracking relations (compared case-insensitively).
#
# Returns nil for a tracking relation, otherwise the result of track_drop.
def drop_table table_name
  execute "drop table if exists #{table_name} cascade"

  tracking_relations = [TABLE_TRACKER_HELPER_NAME, TABLE_TRACKER_NAME]
  dropping_tracker = tracking_relations.any? do |tracker_name|
    table_name.casecmp(tracker_name) == 0
  end

  track_drop table_name unless dropping_tracker
end
|
128
|
+
|
129
|
+
# Removes the tracking record(s) for a dropped table.
#
# table_name - name of the dropped table; matched exactly against
#              relation_name and interpolated into the SQL as-is.
#
# The delete goes straight to the helper table (TABLE_TRACKER_HELPER_NAME)
# rather than through the tracking view.
#
# Returns the result of execute.
def track_drop table_name
  delete_record_sql = <<-EOSQL
    delete from #{TABLE_TRACKER_HELPER_NAME}
    where
      relation_name = '#{table_name}' and
      relation_type = '#{relation_type_values[:table]}'
  EOSQL
  execute delete_record_sql
end
|
137
|
+
|
138
|
+
|
139
|
+
|
140
|
+
private
|
141
|
+
|
142
|
+
# Returns the entries of operations_supported_by_rules that are :create or
# :truncate, in the order operations_supported_by_rules lists them.
def operations_supported_by_db
  db_tracked_operations = [:create, :truncate]
  operations_supported_by_rules & db_tracked_operations
end
|
145
|
+
|
146
|
+
# Deliberate no-op: Greenplum already records table creation in its system
# tables, so nothing is written from application code.
#
# table_name - unused.
# n_tuples   - unused.
#
# Returns nil.
def track_creation table_name, n_tuples
  nil
end
|
150
|
+
|
151
|
+
# Deliberate no-op: Greenplum already records truncation in its system
# tables, so nothing is written from application code.
#
# table_name - unused.
#
# Returns nil.
def track_truncate table_name
  nil
end
|
155
|
+
|
156
|
+
end
|
157
|
+
|
158
|
+
end
|
159
|
+
end
|