data_task 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZDk5OWQzMDU3YzNkZGUyZTczODUxNTdhODk4MTg4MjU0MGU3NDYwOQ==
5
+ data.tar.gz: !binary |-
6
+ NzRmMWMzNjM4MGJjYmZjYTkwYTIwOTFiNjQ0YTVjZWZjNmU1NmQwNg==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ NmNmMjc3MzFmMzBmOGI2MmFjNDBkODAyNDQ5YmZiNzBjMjFmY2JhNDFhMTk5
10
+ M2NmZjk5MzU4ZWRiNzFkMDYxYTgxN2I1MTM0ZjI0OGUzMjRjOTM4YmVmNmVm
11
+ ZjlkYzhhM2Y0YmU5YTdiNTY2NTMyMTVlMzhiNDE2ODU5YzQ2M2I=
12
+ data.tar.gz: !binary |-
13
+ MzdiNGYyN2IyOWU4YWNkYjgwNzE1MzRmYThlNGNlODU1NmQ2N2ZhZDU1YTcz
14
+ YWEwZjQyM2Q4NWJkYmFlZGI4YjVhNWIxY2M4ODZjMzc3OGM4YmFlZWQ3YTcx
15
+ ZTcwNDMyZTQ3YThmYjY2MjIyNGQ1OGMyMzNlMmEzOTdjY2Y2NTQ=
data/.gitignore ADDED
@@ -0,0 +1,39 @@
1
+ *.gem
2
+ *.rbc
3
+ .config
4
+ Gemfile.lock
5
+ InstalledFiles
6
+ coverage
7
+ doc/
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+ *.bundle
15
+ *.so
16
+ *.o
17
+ *.a
18
+ mkmf.log
19
+
20
+ ## Documentation cache and generated files:
21
+ /.yardoc/
22
+ /_yardoc/
23
+ /doc/
24
+ /rdoc/
25
+
26
+ ## Environment normalisation:
27
+ /.bundle/
28
+ /lib/bundler/man/
29
+
30
+ # vim
31
+ [._]*.s[a-w][a-z]
32
+ [._]s[a-w][a-z]
33
+ *.un~
34
+ Session.vim
35
+ .netrwhist
36
+ *~
37
+
38
+ # host-specific configuration
39
+ /config/database.yml
data/.travis.yml ADDED
@@ -0,0 +1,17 @@
1
+ language: ruby
2
+ cache: bundler
3
+ rvm:
4
+ - 1.9.3
5
+ services: postgresql
6
+ before_script:
7
+ - psql -c 'create database ci_test;' -U postgres
8
+ - sqlite3 ci_test ''
9
+
10
+ env:
11
+ - DATATASK_ENV=postgres_test
12
+ - DATATASK_ENV=sqlite_test
13
+
14
+ notifications:
15
+ email:
16
+ on_success: never
17
+ on_failure: change
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Resolve gems from the public RubyGems index.
+ source "https://rubygems.org"
2
+
3
# Take the dependency list from the gemspec that sits next to this Gemfile.
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Shahin Saneinejad
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,93 @@
1
+ [![Build Status](https://travis-ci.org/shahin/data_task.svg?branch=master)](https://travis-ci.org/shahin/data_task)
2
+ [![Coverage Status](https://img.shields.io/coveralls/shahin/data_task.svg)](https://coveralls.io/r/shahin/data_task?branch=master)
3
+
4
+ # DataTask
5
+
6
+ DataTask extends Rake's dependency-based programming language to databases. This gem provides the `data` task, analogous to Rake's built-in `file` task but extended to work with pluggable backends beyond the local filesystem.
7
+
8
+ Adapters are included for Sqlite3, PostgreSQL, and Greenplum.
9
+
10
+ ## Installation
11
+
12
+ Add this line to your application's Gemfile:
13
+
14
+ gem 'data_task'
15
+
16
+ And then execute:
17
+
18
+ $ bundle
19
+
20
+ Or install it yourself as:
21
+
22
+ $ gem install data_task
23
+
24
+ ## Usage
25
+
26
+ To write your first data task, connect to your database by instantiating an adapter:
27
+
28
+ ```
29
+ postgres = Rake::DataTask::Postgres.new(
30
+ 'host' => 'localhost',
31
+ 'port' => 5432,
32
+ 'database' => 'example',
33
+ 'username' => 'postgres'
34
+ )
35
+ ```
36
+
37
+ Then use this adapter instance as the target for a data task:
38
+
39
+ ```
40
+ desc "Load a data file into PostgreSQL for analysis."
41
+ data postgres['raw'] => 'raw.txt' do
42
+ # Add loading logic here
43
+ end
44
+ ```
45
+
46
+ Rake will run this task if and only if (a) the table 'raw' does not exist yet, or (b) the table 'raw' exists but has a timestamp earlier than the file 'raw.txt'. Since database tables now have timestamps associated with them, they can serve as targets or as dependencies in data tasks.
47
+
48
+ Here's a runnable example Rakefile:
49
+
50
+ ```
51
+ require 'rake'
52
+ require 'data_task'
53
+
54
+ # connect to the database
55
+ postgres = Rake::DataTask::Postgres.new(
56
+ 'host' => 'localhost',
57
+ 'port' => 5432,
58
+ 'database' => 'example',
59
+ 'username' => 'postgres'
60
+ )
61
+
62
+ # mark raw.txt as a potential dependency
63
+ file 'raw.txt'
64
+
65
+ # define a loader for the postgres table 'raw', dependent on raw.txt
66
+ desc "Load a data file into PostgreSQL for analysis."
67
+ data postgres['raw'] => 'raw.txt' do
68
+ postgres.create_table 'raw', nil, '(var1 text)'
69
+ postgres.execute "copy raw from '#{File.expand_path('raw.txt')}'"
70
+ end
71
+ ```
72
+
73
+ To run it:
74
+
75
+ 1. paste the example into a file named 'Rakefile',
76
+ 2. make sure the PostgreSQL configuration matches your server,
77
+ 3. open a terminal and run the commands below:
78
+
79
+ ```
80
+ $ echo "v1" > raw.txt
81
+ $ rake 'raw'
82
+ ```
83
+
84
+ The contents of raw.txt should be in your table 'raw' on PostgreSQL. Running the rake command a second time will result in no operations as long as raw.txt hasn't changed. With big data files, this can be a big time-saver.
85
+
86
+
87
+ ## Contributing
88
+
89
+ 1. Fork it ( https://github.com/shahin/data_task/fork )
90
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
91
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
92
+ 4. Push to the branch (`git push origin my-new-feature`)
93
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ require 'data_task'
2
+
3
+ desc "Run tests"
4
+ task :default => :'data_task:test'
5
+
6
+ namespace :data_task do
7
+
8
+ require 'bundler/gem_tasks'
9
+ require 'rake/testtask'
10
+
11
+ Rake::TestTask.new do |t|
12
+ t.libs << "spec"
13
+ t.test_files = FileList['test/**/*_spec.rb', 'test/test_*.rb']
14
+ t.verbose
15
+ end
16
+
17
+ end
data/data_task.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
# Put this gem's lib/ directory on the load path so the version file can be
# required below without the gem being installed.
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'data_task/version'
5
+
6
# Gem metadata, packaged file list, and dependency declarations.
+ Gem::Specification.new do |spec|
7
+ spec.name = "data_task"
8
+ spec.version = Rake::DataTask::VERSION
9
+ spec.authors = ["Shahin Saneinejad"]
10
+ spec.email = ["shahin.saneinejad@gmail.com"]
11
+ spec.summary = %q{A Rake task for managing data across multiple datastores.}
12
+ spec.description = %q{DataTask provides dependency-based programming for data workflows on top of the Rake build tool.}
13
+ spec.homepage = "https://github.com/shahin/data_task"
14
+ spec.license = "MIT"
15
+
16
# Package every file tracked by git; -z makes git separate names with NUL
# bytes, which is why the output is split on "\x0".
+ spec.files = `git ls-files -z`.split("\x0")
17
# Executables are the basenames of packaged files under bin/.
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
# Test files are the packaged files under test/, spec/ or features/.
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.required_ruby_version = '>= 1.9.3'
22
+
23
# Runtime dependencies. NOTE(review): sqlite3 carries no version constraint,
# unlike rake and pg -- confirm that is intended.
+ spec.add_runtime_dependency 'rake', '~> 10.0.4'
24
+ spec.add_runtime_dependency 'pg', '~> 0.17.1'
25
+ spec.add_runtime_dependency 'sqlite3'
26
+
27
# Dependencies needed only for development and testing.
+ spec.add_development_dependency 'bundler', '~> 1.6'
28
+ spec.add_development_dependency 'minitest-around', '~> 0.2'
29
+ spec.add_development_dependency 'minitest-spec-context', '~> 0.0.3'
30
+ spec.add_development_dependency 'coveralls'
31
+ end
@@ -0,0 +1,159 @@
1
+ require 'pg'
2
+ require_relative 'support/transactions'
3
+ require_relative 'support/booleans'
4
+ require_relative './postgres'
5
+
6
+ module Rake
7
+ module DataTask
8
+
9
+ class Greenplum < Postgres
10
+
11
+ TABLE_TRACKER_HELPER_NAME = "operations"
12
+
13
# Rebuilds the tracking objects for Greenplum.
#
# Steps, in order:
#   1. tear_down_tracking drops anything left from a previous setup.
#   2. super runs the parent setup (bare super forwards `options` unchanged).
#   3. The table named TABLE_TRACKER_NAME is renamed to TABLE_TRACKER_HELPER_NAME.
#   4. The view fixed_pg_stat_operations is created over pg_stat_operations,
#      rewriting subtype to relation_type_values[:table] for TRUNCATE actions
#      and for rows whose subtype is 'TABLE'.
#   5. A view named TABLE_TRACKER_NAME is created. It unions the rows of
#      fixed_pg_stat_operations (minus ANALYZE and VACUUM) with helper-table
#      rows that have a matching CREATE record, and keeps only rank = 1 per
#      (relation_name, relation_type) ordered by time descending.
#   6. Rules redirect DELETE and INSERT on that view to the helper table.
#
# options - passed to tear_down_tracking and, via super, to the parent.
#
# NOTE(review): the inline SQL comment says "CREATE and TRUNCATE", but the
# WHERE clause only excludes ANALYZE and VACUUM, so other action names pass
# through. rank() also returns every row tied for the latest time.
# NOTE(review): this is a class method, yet it calls execute and
# relation_type_values without a receiver; those must resolve at class level
# in the parent, which is not visible here -- confirm.
+ def self.set_up_tracking options
14
+ tear_down_tracking options
15
+ super
16
+
17
+ execute "alter table #{TABLE_TRACKER_NAME} rename to #{TABLE_TRACKER_HELPER_NAME}"
18
+
19
+ # Greenplum tracks CREATE and TRUNCATE operations in its pg_stat_operations system view.
20
+ # Join this view with the tracking table so that we can track CREATE and TRUNCATE from within
21
+ # the database instead of from application code.
22
+
23
+ execute <<-EOSQL
24
+ create view fixed_pg_stat_operations as
25
+ -- GP's pg_stat_operations enum values like 'TABLE' are inconsistent so fix them here
26
+ select
27
+ pso.classname,
28
+ pso.objname,
29
+ pso.objid,
30
+ pso.schemaname,
31
+ pso.usestatus,
32
+ pso.usename,
33
+ pso.actionname,
34
+ case
35
+ when pso.actionname = 'TRUNCATE' then '#{relation_type_values[:table]}'
36
+ when pso.subtype = 'TABLE' then '#{relation_type_values[:table]}'
37
+ else pso.subtype
38
+ end as subtype,
39
+ pso.statime
40
+ from pg_stat_operations pso
41
+ EOSQL
42
+
43
+ execute <<-EOSQL
44
+ create view #{TABLE_TRACKER_NAME} as
45
+ select
46
+ relation_name,
47
+ relation_type,
48
+ operation,
49
+ time
50
+ from (
51
+
52
+ select
53
+ a.*,
54
+ rank() over (partition by relation_name, relation_type order by time desc)
55
+ from (
56
+
57
+ -- select all CREATE and TRUNCATE operations tracked by Greenplum
58
+ select
59
+ pso.objname as relation_name,
60
+ pso.subtype as relation_type,
61
+ pso.actionname as operation,
62
+ pso.statime as time
63
+ from fixed_pg_stat_operations pso
64
+ where pso.actionname not in ('ANALYZE', 'VACUUM')
65
+
66
+ union all
67
+
68
+ -- select all operations tracked by Greenplum (PostgreSQL) table rules
69
+ select
70
+ ttb.relation_name,
71
+ ttb.relation_type,
72
+ ttb.operation,
73
+ ttb.time
74
+ from
75
+ #{TABLE_TRACKER_HELPER_NAME} ttb
76
+ -- return only operations for tables that exist in system tables
77
+ join fixed_pg_stat_operations pso on (
78
+ ttb.relation_name = pso.objname and
79
+ ttb.relation_type = pso.subtype and
80
+ pso.actionname = 'CREATE'
81
+ )
82
+
83
+ ) a
84
+ ) b
85
+ -- take only the latest operation per table
86
+ where rank = 1
87
+ EOSQL
88
+
89
+ # make sure we do deletes and inserts on the helper table, not the view
90
+ execute <<-EOSQL
91
+ create rule delete_operation_record as on delete to #{TABLE_TRACKER_NAME}
92
+ do instead
93
+ delete from #{TABLE_TRACKER_HELPER_NAME}
94
+ where
95
+ relation_name = OLD.relation_name and
96
+ relation_type = OLD.relation_type and
97
+ operation = OLD.operation
98
+ ;
99
+
100
+ create rule insert_operation_record as on insert to #{TABLE_TRACKER_NAME}
101
+ do instead
102
+ insert into #{TABLE_TRACKER_HELPER_NAME} values (
103
+ NEW.relation_name,
104
+ NEW.relation_type,
105
+ NEW.operation,
106
+ NEW.time
107
+ )
108
+ ;
109
+ EOSQL
110
+ end
111
+
112
# Drops the objects that set_up_tracking creates: the fixed_pg_stat_operations
# view, the view named TABLE_TRACKER_NAME, and the helper table.
#
# options - accepted but not used in this body.
#
# NOTE(review): drop_view and drop_table are called without a receiver from a
# class method, while the drop_table defined in this class is an instance
# method; confirm which class-level definitions these resolve to.
+ def self.tear_down_tracking options
113
+ drop_view "fixed_pg_stat_operations"
114
+ drop_view TABLE_TRACKER_NAME
115
+ drop_table TABLE_TRACKER_HELPER_NAME
116
+ end
117
+
118
# Reports whether tracking is set up by asking view_exists? about
# TABLE_TRACKER_NAME (a view in this adapter, per set_up_tracking).
+ def tracking_tables?
119
+ view_exists?(TABLE_TRACKER_NAME)
120
+ end
121
+
122
# Drops table_name (if it exists, cascading to dependent objects) and then
# removes its tracking record via track_drop.
#
# The tracking record is left alone when table_name matches, ignoring case,
# either TABLE_TRACKER_HELPER_NAME or TABLE_TRACKER_NAME.
#
# table_name - name of the table to drop; interpolated into the SQL as-is.
+ def drop_table table_name
123
+ execute "drop table if exists #{table_name} cascade"
124
+ return if table_name.casecmp(TABLE_TRACKER_HELPER_NAME) == 0 ||
125
+ table_name.casecmp(TABLE_TRACKER_NAME) == 0
126
+ track_drop table_name
127
+ end
128
+
129
# Deletes the rows in the helper table whose relation_name is table_name and
# whose relation_type is relation_type_values[:table].
#
# table_name - name of the dropped table.
#
# NOTE(review): table_name is interpolated into a quoted SQL literal without
# escaping; a name containing a single quote would break the statement.
+ def track_drop table_name
130
+ execute <<-EOSQL
131
+ delete from #{TABLE_TRACKER_HELPER_NAME}
132
+ where
133
+ relation_name = '#{table_name}' and
134
+ relation_type = '#{relation_type_values[:table]}'
135
+ EOSQL
136
+ end
137
+
138
+
139
+
140
+ private
141
+
142
# Returns the entries of operations_supported_by_rules that are also :create
# or :truncate (presumably an Array intersection -- the helper is defined
# outside this view).
+ def operations_supported_by_db
143
+ operations_supported_by_rules & [:create, :truncate]
144
+ end
145
+
146
# Deliberate no-op: ignores both arguments and always returns nil.
#
# table_name - unused.
# n_tuples   - unused.
+ def track_creation table_name, n_tuples
147
+ # nothing to do; Greenplum tracks this operation in system tables already
148
+ return nil
149
+ end
150
+
151
# Deliberate no-op: ignores table_name and always returns nil.
#
# table_name - unused.
+ def track_truncate table_name
152
+ # nothing to do; Greenplum tracks this operation in system tables already
153
+ return nil
154
+ end
155
+
156
+ end
157
+
158
+ end
159
+ end