data_task 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +39 -0
- data/.travis.yml +17 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +93 -0
- data/Rakefile +17 -0
- data/data_task.gemspec +31 -0
- data/lib/data_task/adapters/greenplum.rb +159 -0
- data/lib/data_task/adapters/postgres.rb +430 -0
- data/lib/data_task/adapters/sqlite.rb +239 -0
- data/lib/data_task/adapters/support/booleans.rb +19 -0
- data/lib/data_task/adapters/support/connection_persistence.rb +18 -0
- data/lib/data_task/adapters/support/transactions.rb +28 -0
- data/lib/data_task/data.rb +34 -0
- data/lib/data_task/db.rb +57 -0
- data/lib/data_task/sql.rb +49 -0
- data/lib/data_task/tasks/examples.rake +35 -0
- data/lib/data_task/util.rb +6 -0
- data/lib/data_task/version.rb +5 -0
- data/lib/data_task.rb +76 -0
- data/test/config/database.yml +10 -0
- data/test/data_spec.rb +81 -0
- data/test/db_spec.rb +202 -0
- data/test/helper.rb +37 -0
- data/test/postgresql_spec.rb +249 -0
- data/test/sql_spec.rb +46 -0
- data/test/table_creation.rb +34 -0
- data/test/test_rake_table_task.rb +207 -0
- metadata +179 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ZDk5OWQzMDU3YzNkZGUyZTczODUxNTdhODk4MTg4MjU0MGU3NDYwOQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NzRmMWMzNjM4MGJjYmZjYTkwYTIwOTFiNjQ0YTVjZWZjNmU1NmQwNg==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NmNmMjc3MzFmMzBmOGI2MmFjNDBkODAyNDQ5YmZiNzBjMjFmY2JhNDFhMTk5
|
10
|
+
M2NmZjk5MzU4ZWRiNzFkMDYxYTgxN2I1MTM0ZjI0OGUzMjRjOTM4YmVmNmVm
|
11
|
+
ZjlkYzhhM2Y0YmU5YTdiNTY2NTMyMTVlMzhiNDE2ODU5YzQ2M2I=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
MzdiNGYyN2IyOWU4YWNkYjgwNzE1MzRmYThlNGNlODU1NmQ2N2ZhZDU1YTcz
|
14
|
+
YWEwZjQyM2Q4NWJkYmFlZGI4YjVhNWIxY2M4ODZjMzc3OGM4YmFlZWQ3YTcx
|
15
|
+
ZTcwNDMyZTQ3YThmYjY2MjIyNGQ1OGMyMzNlMmEzOTdjY2Y2NTQ=
|
data/.gitignore
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.config
|
4
|
+
Gemfile.lock
|
5
|
+
InstalledFiles
|
6
|
+
coverage
|
7
|
+
doc/
|
8
|
+
pkg
|
9
|
+
rdoc
|
10
|
+
spec/reports
|
11
|
+
test/tmp
|
12
|
+
test/version_tmp
|
13
|
+
tmp
|
14
|
+
*.bundle
|
15
|
+
*.so
|
16
|
+
*.o
|
17
|
+
*.a
|
18
|
+
mkmf.log
|
19
|
+
|
20
|
+
## Documentation cache and generated files:
|
21
|
+
/.yardoc/
|
22
|
+
/_yardoc/
|
23
|
+
/doc/
|
24
|
+
/rdoc/
|
25
|
+
|
26
|
+
## Environment normalisation:
|
27
|
+
/.bundle/
|
28
|
+
/lib/bundler/man/
|
29
|
+
|
30
|
+
# vim
|
31
|
+
[._]*.s[a-w][a-z]
|
32
|
+
[._]s[a-w][a-z]
|
33
|
+
*.un~
|
34
|
+
Session.vim
|
35
|
+
.netrwhist
|
36
|
+
*~
|
37
|
+
|
38
|
+
# host-specific configuration
|
39
|
+
/config/database.yml
|
data/.travis.yml
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
language: ruby
|
2
|
+
cache: bundler
|
3
|
+
rvm:
|
4
|
+
- 1.9.3
|
5
|
+
services: postgresql
|
6
|
+
before_script:
|
7
|
+
- psql -c 'create database ci_test;' -U postgres
|
8
|
+
- sqlite3 ci_test ''
|
9
|
+
|
10
|
+
env:
|
11
|
+
- DATATASK_ENV=postgres_test
|
12
|
+
- DATATASK_ENV=sqlite_test
|
13
|
+
|
14
|
+
notifications:
|
15
|
+
email:
|
16
|
+
on_success: never
|
17
|
+
on_failure: change
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Shahin Saneinejad
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
[Build Status](https://travis-ci.org/shahin/data_task)
|
2
|
+
[Coverage Status](https://coveralls.io/r/shahin/data_task?branch=master)
|
3
|
+
|
4
|
+
# DataTask
|
5
|
+
|
6
|
+
DataTask extends Rake's dependency-based programming language to databases. This gem provides the `data` task, analogous to Rake's built-in `file` task but extended to work with pluggable backends beyond the local filesystem.
|
7
|
+
|
8
|
+
Adapters are included for Sqlite3, PostgreSQL, and Greenplum.
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Add this line to your application's Gemfile:
|
13
|
+
|
14
|
+
gem 'data_task'
|
15
|
+
|
16
|
+
And then execute:
|
17
|
+
|
18
|
+
$ bundle
|
19
|
+
|
20
|
+
Or install it yourself as:
|
21
|
+
|
22
|
+
$ gem install data_task
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
To write your first data task, connect to your database by instantiating an adapter:
|
27
|
+
|
28
|
+
```
|
29
|
+
postgres = Rake::DataTask::Postgres.new(
|
30
|
+
'host' => 'localhost',
|
31
|
+
'port' => 5432,
|
32
|
+
'database' => 'example',
|
33
|
+
'username' => 'postgres'
|
34
|
+
)
|
35
|
+
```
|
36
|
+
|
37
|
+
Then use this adapter instance as the target for a data task:
|
38
|
+
|
39
|
+
```
|
40
|
+
desc "Load a data file into PostgreSQL for analysis."
|
41
|
+
data postgres['raw'] => 'raw.txt' do
|
42
|
+
# Add loading logic here
|
43
|
+
end
|
44
|
+
```
|
45
|
+
|
46
|
+
Rake will run this task if and only if (a) the table 'raw' does not exist yet, or (b) the table 'raw' exists but has a timestamp earlier than the file 'raw.txt'. Since database tables now have timestamps associated with them, they can serve as targets or as dependencies in data tasks.
|
47
|
+
|
48
|
+
Here's a runnable example Rakefile:
|
49
|
+
|
50
|
+
```
|
51
|
+
require 'rake'
|
52
|
+
require 'data_task'
|
53
|
+
|
54
|
+
# connect to the database
|
55
|
+
postgres = Rake::DataTask::Postgres.new(
|
56
|
+
'host' => 'localhost',
|
57
|
+
'port' => 5432,
|
58
|
+
'database' => 'example',
|
59
|
+
'username' => 'postgres'
|
60
|
+
)
|
61
|
+
|
62
|
+
# mark raw.txt as a potential dependency
|
63
|
+
file 'raw.txt'
|
64
|
+
|
65
|
+
# define a loader for the postgres table 'raw', dependent on raw.txt
|
66
|
+
desc "Load a data file into PostgreSQL for analysis."
|
67
|
+
data postgres['raw'] => 'raw.txt' do
|
68
|
+
postgres.create_table 'raw', nil, '(var1 text)'
|
69
|
+
postgres.execute "copy raw from '#{File.expand_path('raw.txt')}'"
|
70
|
+
end
|
71
|
+
```
|
72
|
+
|
73
|
+
To run it:
|
74
|
+
|
75
|
+
1. paste the example into a file named 'Rakefile',
|
76
|
+
2. make sure the PostgreSQL configuration matches your server,
|
77
|
+
3. open a terminal and run the commands below:
|
78
|
+
|
79
|
+
```
|
80
|
+
$ echo "v1" > raw.txt
|
81
|
+
$ rake 'raw'
|
82
|
+
```
|
83
|
+
|
84
|
+
The contents of raw.txt should be in your table 'raw' on PostgreSQL. Running the rake command a second time will result in no operations as long as raw.txt hasn't changed. With big data files, this can be a big time-saver.
|
85
|
+
|
86
|
+
|
87
|
+
## Contributing
|
88
|
+
|
89
|
+
1. Fork it ( https://github.com/shahin/data_task/fork )
|
90
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
91
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
92
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
93
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Rakefile for the data_task gem: wires up the gem packaging tasks and the
# test task, and makes the test task the default.
require 'data_task'

desc "Run tests"
task :default => :'data_task:test'

namespace :data_task do

  require 'bundler/gem_tasks'
  require 'rake/testtask'

  # Defines data_task:test over every spec and test file under test/.
  Rake::TestTask.new do |t|
    t.libs << "spec"
    t.test_files = FileList['test/**/*_spec.rb', 'test/test_*.rb']
    # Assign the flag: a bare `t.verbose` only reads the attribute and
    # leaves verbose output switched off.
    t.verbose = true
  end

end
|
data/data_task.gemspec
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'data_task/version'
|
5
|
+
|
6
|
+
# Gem specification for data_task.
Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name          = "data_task"
  gem.version       = Rake::DataTask::VERSION
  gem.authors       = ["Shahin Saneinejad"]
  gem.email         = ["shahin.saneinejad@gmail.com"]
  gem.summary       = "A Rake task for managing data across multiple datastores."
  gem.description   = "DataTask provides dependency-based programming for data workflows on top of the Rake build tool."
  gem.homepage      = "https://github.com/shahin/data_task"
  gem.license       = "MIT"

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by path prefix.
  tracked_files      = `git ls-files -z`.split("\x0")
  gem.files          = tracked_files
  gem.executables    = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files     = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths  = ["lib"]

  gem.required_ruby_version = '>= 1.9.3'

  # Runtime dependencies.
  [
    ['rake', '~> 10.0.4'],
    ['pg', '~> 0.17.1'],
    ['sqlite3']
  ].each { |dependency| gem.add_runtime_dependency(*dependency) }

  # Development-only dependencies.
  [
    ['bundler', '~> 1.6'],
    ['minitest-around', '~> 0.2'],
    ['minitest-spec-context', '~> 0.0.3'],
    ['coveralls']
  ].each { |dependency| gem.add_development_dependency(*dependency) }
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
require 'pg'
|
2
|
+
require_relative 'support/transactions'
|
3
|
+
require_relative 'support/booleans'
|
4
|
+
require_relative './postgres'
|
5
|
+
|
6
|
+
module Rake
|
7
|
+
module DataTask
|
8
|
+
|
9
|
+
class Greenplum < Postgres
|
10
|
+
|
11
|
+
TABLE_TRACKER_HELPER_NAME = "operations"
|
12
|
+
|
13
|
+
# Rebuilds the tracking objects for a Greenplum database.
#
# Runs, in order: tear_down_tracking, the inherited set_up_tracking (via
# super), a rename of TABLE_TRACKER_NAME to TABLE_TRACKER_HELPER_NAME, and
# then three statements that create the fixed_pg_stat_operations view, a view
# named TABLE_TRACKER_NAME, and delete/insert rules on that view.
#
# options - forwarded unchanged to tear_down_tracking and to super.
def self.set_up_tracking options
  tear_down_tracking options
  super

  # Free up the TABLE_TRACKER_NAME name so that a view can take it over below.
  rename_sql = "alter table #{TABLE_TRACKER_NAME} rename to #{TABLE_TRACKER_HELPER_NAME}"
  execute rename_sql

  # Greenplum records CREATE and TRUNCATE operations in its pg_stat_operations
  # system view. Joining that view with the tracking table lets those two
  # operations be tracked inside the database rather than by application code.

  fixed_view_sql = <<-EOSQL
    create view fixed_pg_stat_operations as
    -- GP's pg_stat_operations enum values like 'TABLE' are inconsistent so fix them here
    select
      pso.classname,
      pso.objname,
      pso.objid,
      pso.schemaname,
      pso.usestatus,
      pso.usename,
      pso.actionname,
      case
        when pso.actionname = 'TRUNCATE' then '#{relation_type_values[:table]}'
        when pso.subtype = 'TABLE' then '#{relation_type_values[:table]}'
        else pso.subtype
      end as subtype,
      pso.statime
    from pg_stat_operations pso
  EOSQL
  execute fixed_view_sql

  tracker_view_sql = <<-EOSQL
    create view #{TABLE_TRACKER_NAME} as
    select
      relation_name,
      relation_type,
      operation,
      time
    from (

      select
        a.*,
        rank() over (partition by relation_name, relation_type order by time desc)
      from (

        -- select all CREATE and TRUNCATE operations tracked by Greenplum
        select
          pso.objname as relation_name,
          pso.subtype as relation_type,
          pso.actionname as operation,
          pso.statime as time
        from fixed_pg_stat_operations pso
        where pso.actionname not in ('ANALYZE', 'VACUUM')

        union all

        -- select all operations tracked by Greenplum (PostgreSQL) table rules
        select
          ttb.relation_name,
          ttb.relation_type,
          ttb.operation,
          ttb.time
        from
          #{TABLE_TRACKER_HELPER_NAME} ttb
          -- return only operations for tables that exist in system tables
          join fixed_pg_stat_operations pso on (
            ttb.relation_name = pso.objname and
            ttb.relation_type = pso.subtype and
            pso.actionname = 'CREATE'
          )

      ) a
    ) b
    -- take only the latest operation per table
    where rank = 1
  EOSQL
  execute tracker_view_sql

  # Deletes and inserts aimed at the view are redirected to the helper table.
  redirect_rules_sql = <<-EOSQL
    create rule delete_operation_record as on delete to #{TABLE_TRACKER_NAME}
      do instead
      delete from #{TABLE_TRACKER_HELPER_NAME}
      where
        relation_name = OLD.relation_name and
        relation_type = OLD.relation_type and
        operation = OLD.operation
      ;

    create rule insert_operation_record as on insert to #{TABLE_TRACKER_NAME}
      do instead
      insert into #{TABLE_TRACKER_HELPER_NAME} values (
        NEW.relation_name,
        NEW.relation_type,
        NEW.operation,
        NEW.time
      )
      ;
  EOSQL
  execute redirect_rules_sql
end
|
111
|
+
|
112
|
+
# Drops the objects created by set_up_tracking: the fixed_pg_stat_operations
# view, the TABLE_TRACKER_NAME view, and the TABLE_TRACKER_HELPER_NAME table.
#
# options - accepted but not used by this method.
def self.tear_down_tracking options
  ["fixed_pg_stat_operations", TABLE_TRACKER_NAME].each do |view_name|
    drop_view view_name
  end
  drop_table TABLE_TRACKER_HELPER_NAME
end
|
117
|
+
|
118
|
+
# Reports whether tracking is set up, i.e. whatever view_exists? returns for
# the view named TABLE_TRACKER_NAME.
def tracking_tables?
  view_exists? TABLE_TRACKER_NAME
end
|
121
|
+
|
122
|
+
# Drops the named table (with cascade, if it exists) and then removes its
# tracking records via track_drop.
#
# The tracking step is skipped when table_name matches, ignoring case, either
# TABLE_TRACKER_HELPER_NAME or TABLE_TRACKER_NAME.
#
# table_name - name of the table to drop.
def drop_table table_name
  execute "drop table if exists #{table_name} cascade"

  tracker_relations = [TABLE_TRACKER_HELPER_NAME, TABLE_TRACKER_NAME]
  is_tracker_relation = tracker_relations.any? do |tracker_name|
    table_name.casecmp(tracker_name) == 0
  end

  track_drop table_name unless is_tracker_relation
end
|
128
|
+
|
129
|
+
# Deletes the rows of TABLE_TRACKER_HELPER_NAME whose relation_name is
# table_name and whose relation_type is relation_type_values[:table].
#
# table_name - name of the table that was dropped; interpolated into the SQL
#              as-is.
def track_drop table_name
  delete_sql = <<-EOSQL
    delete from #{TABLE_TRACKER_HELPER_NAME}
    where
      relation_name = '#{table_name}' and
      relation_type = '#{relation_type_values[:table]}'
  EOSQL
  execute delete_sql
end
|
137
|
+
|
138
|
+
|
139
|
+
|
140
|
+
private
|
141
|
+
|
142
|
+
# Returns the entries of operations_supported_by_rules that are also
# :create or :truncate.
#
# NOTE(review): `&` is an intersection. If the intent is "everything rules
# track plus CREATE and TRUNCATE", a union (`|`) may have been meant; confirm
# against operations_supported_by_rules in the parent adapter.
def operations_supported_by_db
  natively_tracked = [:create, :truncate]
  operations_supported_by_rules & natively_tracked
end
|
145
|
+
|
146
|
+
# No-op: Greenplum already records table creation in its system tables, so
# nothing is written here.
#
# table_name - ignored.
# n_tuples   - ignored.
#
# Always returns nil.
def track_creation table_name, n_tuples
  nil
end
|
150
|
+
|
151
|
+
# No-op: Greenplum already records truncation in its system tables, so
# nothing is written here.
#
# table_name - ignored.
#
# Always returns nil.
def track_truncate table_name
  nil
end
|
155
|
+
|
156
|
+
end
|
157
|
+
|
158
|
+
end
|
159
|
+
end
|