data_task 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZDk5OWQzMDU3YzNkZGUyZTczODUxNTdhODk4MTg4MjU0MGU3NDYwOQ==
5
+ data.tar.gz: !binary |-
6
+ NzRmMWMzNjM4MGJjYmZjYTkwYTIwOTFiNjQ0YTVjZWZjNmU1NmQwNg==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ NmNmMjc3MzFmMzBmOGI2MmFjNDBkODAyNDQ5YmZiNzBjMjFmY2JhNDFhMTk5
10
+ M2NmZjk5MzU4ZWRiNzFkMDYxYTgxN2I1MTM0ZjI0OGUzMjRjOTM4YmVmNmVm
11
+ ZjlkYzhhM2Y0YmU5YTdiNTY2NTMyMTVlMzhiNDE2ODU5YzQ2M2I=
12
+ data.tar.gz: !binary |-
13
+ MzdiNGYyN2IyOWU4YWNkYjgwNzE1MzRmYThlNGNlODU1NmQ2N2ZhZDU1YTcz
14
+ YWEwZjQyM2Q4NWJkYmFlZGI4YjVhNWIxY2M4ODZjMzc3OGM4YmFlZWQ3YTcx
15
+ ZTcwNDMyZTQ3YThmYjY2MjIyNGQ1OGMyMzNlMmEzOTdjY2Y2NTQ=
data/.gitignore ADDED
@@ -0,0 +1,39 @@
1
+ *.gem
2
+ *.rbc
3
+ .config
4
+ Gemfile.lock
5
+ InstalledFiles
6
+ coverage
7
+ doc/
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+ *.bundle
15
+ *.so
16
+ *.o
17
+ *.a
18
+ mkmf.log
19
+
20
+ ## Documentation cache and generated files:
21
+ /.yardoc/
22
+ /_yardoc/
23
+ /doc/
24
+ /rdoc/
25
+
26
+ ## Environment normalisation:
27
+ /.bundle/
28
+ /lib/bundler/man/
29
+
30
+ # vim
31
+ [._]*.s[a-w][a-z]
32
+ [._]s[a-w][a-z]
33
+ *.un~
34
+ Session.vim
35
+ .netrwhist
36
+ *~
37
+
38
+ # host-specific configuration
39
+ /config/database.yml
data/.travis.yml ADDED
@@ -0,0 +1,17 @@
1
+ language: ruby
2
+ cache: bundler
3
+ rvm:
4
+ - 1.9.3
5
+ services: postgresql
6
+ before_script:
7
+ - psql -c 'create database ci_test;' -U postgres
8
+ - sqlite3 ci_test ''
9
+
10
+ env:
11
+ - DATATASK_ENV=postgres_test
12
+ - DATATASK_ENV=sqlite_test
13
+
14
+ notifications:
15
+ email:
16
+ on_success: never
17
+ on_failure: change
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Shahin Saneinejad
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,93 @@
1
+ [![Build Status](https://travis-ci.org/shahin/data_task.svg?branch=master)](https://travis-ci.org/shahin/data_task)
2
+ [![Coverage Status](https://img.shields.io/coveralls/shahin/data_task.svg)](https://coveralls.io/r/shahin/data_task?branch=master)
3
+
4
+ # DataTask
5
+
6
+ DataTask extends Rake's dependency-based programming language to databases. This gem provides the `data` task, analogous to Rake's built-in `file` task but extended to work with pluggable backends beyond the local filesystem.
7
+
8
+ Adapters are included for Sqlite3, PostgreSQL, and Greenplum.
9
+
10
+ ## Installation
11
+
12
+ Add this line to your application's Gemfile:
13
+
14
+ gem 'data_task'
15
+
16
+ And then execute:
17
+
18
+ $ bundle
19
+
20
+ Or install it yourself as:
21
+
22
+ $ gem install data_task
23
+
24
+ ## Usage
25
+
26
+ To write your first data task, connect to your database by instantiating an adapter:
27
+
28
+ ```
29
+ postgres = Rake::DataTask::Postgres.new(
30
+ 'host' => 'localhost',
31
+ 'port' => 5432,
32
+ 'database' => 'example',
33
+ 'username' => 'postgres'
34
+ )
35
+ ```
36
+
37
+ Then use this adapter instance as the target for a data task:
38
+
39
+ ```
40
+ desc "Load a data file into PostgreSQL for analysis."
41
+ data postgres['raw'] => 'raw.txt' do
42
+ # Add loading logic here
43
+ end
44
+ ```
45
+
46
+ Rake will run this task if and only if (a) the table 'raw' does not exist yet, or (b) the table 'raw' exists but has a timestamp earlier than the file 'raw.txt'. Since database tables now have timestamps associated with them, they can serve as targets or as dependencies in data tasks.
47
+
48
+ Here's a runnable example Rakefile:
49
+
50
+ ```
51
+ require 'rake'
52
+ require 'data_task'
53
+
54
+ # connect to the database
55
+ postgres = Rake::DataTask::Postgres.new(
56
+ 'host' => 'localhost',
57
+ 'port' => 5432,
58
+ 'database' => 'example',
59
+ 'username' => 'postgres'
60
+ )
61
+
62
+ # mark raw.txt as a potential dependency
63
+ file 'raw.txt'
64
+
65
+ # define a loader for the postgres table 'raw', dependent on raw.txt
66
+ desc "Load a data file into PostgreSQL for analysis."
67
+ data postgres['raw'] => 'raw.txt' do
68
+ postgres.create_table 'raw', nil, '(var1 text)'
69
+ postgres.execute "copy raw.txt to raw"
70
+ end
71
+ ```
72
+
73
+ To run it:
74
+
75
+ 1. paste the example into a file named 'Rakefile',
76
+ 2. make sure the PostgreSQL configuration matches your server,
77
+ 3. open a terminal and run the commands below:
78
+
79
+ ```
80
+ $ echo "v1" > raw.txt
81
+ $ rake 'raw'
82
+ ```
83
+
84
+ The contents of raw.txt should be in your table 'raw' on PostgreSQL. Running the rake command a second time will result in no operations as long as raw.txt hasn't changed. With big data files, this can be a big time-saver.
85
+
86
+
87
+ ## Contributing
88
+
89
+ 1. Fork it ( https://github.com/shahin/data_task/fork )
90
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
91
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
92
+ 4. Push to the branch (`git push origin my-new-feature`)
93
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ require 'data_task'
2
+
3
+ desc "Run tests"
4
+ task :default => :'data_task:test'
5
+
6
+ namespace :data_task do
7
+
8
+ require 'bundler/gem_tasks'
9
+ require 'rake/testtask'
10
+
11
+ Rake::TestTask.new do |t|
12
+ t.libs << "spec"
13
+ t.test_files = FileList['test/**/*_spec.rb', 'test/test_*.rb']
14
+ t.verbose
15
+ end
16
+
17
+ end
data/data_task.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__) # absolute path of the lib/ directory next to this gemspec
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) # lets the version file below be required by its short path
4
+ require 'data_task/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+   spec.name          = "data_task"
8
+   spec.version       = Rake::DataTask::VERSION # taken from the version file required above
9
+   spec.authors       = ["Shahin Saneinejad"]
10
+   spec.email         = ["shahin.saneinejad@gmail.com"]
11
+   spec.summary       = %q{A Rake task for managing data across multiple datastores.}
12
+   spec.description   = %q{DataTask provides dependency-based programming for data workflows on top of the Rake build tool.}
13
+   spec.homepage      = "https://github.com/shahin/data_task"
14
+   spec.license       = "MIT"
15
+
16
+   spec.files         = `git ls-files -z`.split("\x0") # file list comes from git (NUL-separated), so git must be available at build time
17
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } # basenames of packaged files under bin/
18
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/}) # packaged files under test/, spec/ or features/
19
+   spec.require_paths = ["lib"]
20
+
21
+   spec.required_ruby_version = '>= 1.9.3'
22
+
23
+   spec.add_runtime_dependency 'rake', '~> 10.0.4'
24
+   spec.add_runtime_dependency 'pg', '~> 0.17.1'
25
+   spec.add_runtime_dependency 'sqlite3' # no version constraint, unlike rake and pg above
26
+
27
+   spec.add_development_dependency 'bundler', '~> 1.6'
28
+   spec.add_development_dependency 'minitest-around', '~> 0.2'
29
+   spec.add_development_dependency 'minitest-spec-context', '~> 0.0.3'
30
+   spec.add_development_dependency 'coveralls' # no version constraint
31
+ end
@@ -0,0 +1,159 @@
1
+ require 'pg'
2
+ require_relative 'support/transactions'
3
+ require_relative 'support/booleans'
4
+ require_relative './postgres'
5
+
6
+ module Rake
7
+ module DataTask
8
+
9
+ class Greenplum < Postgres
10
+
11
+ TABLE_TRACKER_HELPER_NAME = "operations"
12
+
13
+       def self.set_up_tracking options # rebuilds tracking: helper table, two views, and insert/delete redirect rules; options is only passed along
14
+         tear_down_tracking options # drop any tracking objects left from an earlier set-up
15
+         super # bare super forwards options to the parent class's set_up_tracking
16
+
17
+         execute "alter table #{TABLE_TRACKER_NAME} rename to #{TABLE_TRACKER_HELPER_NAME}" # frees the tracker name for the view created below
18
+
19
+         # Greenplum tracks CREATE and TRUNCATE operations in its pg_stat_operations system view.
20
+         # Join this view with the tracking table so that we can track CREATE and TRUNCATE from within
21
+         # the database instead of from application code.
22
+
23
+         execute <<-EOSQL
24
+           create view fixed_pg_stat_operations as
25
+           -- GP's pg_stat_operations enum values like 'TABLE' are inconsistent so fix them here
26
+           select
27
+             pso.classname,
28
+             pso.objname,
29
+             pso.objid,
30
+             pso.schemaname,
31
+             pso.usestatus,
32
+             pso.usename,
33
+             pso.actionname,
34
+             case
35
+               when pso.actionname = 'TRUNCATE' then '#{relation_type_values[:table]}'
36
+               when pso.subtype = 'TABLE' then '#{relation_type_values[:table]}'
37
+               else pso.subtype
38
+             end as subtype,
39
+             pso.statime
40
+           from pg_stat_operations pso
41
+         EOSQL
42
+
43
+         execute <<-EOSQL
44
+           create view #{TABLE_TRACKER_NAME} as
45
+           select
46
+             relation_name,
47
+             relation_type,
48
+             operation,
49
+             time
50
+           from (
51
+
52
+             select
53
+               a.*,
54
+               rank() over (partition by relation_name, relation_type order by time desc)
55
+             from (
56
+
57
+               -- select all CREATE and TRUNCATE operations tracked by Greenplum
58
+               select
59
+                 pso.objname as relation_name,
60
+                 pso.subtype as relation_type,
61
+                 pso.actionname as operation,
62
+                 pso.statime as time
63
+               from fixed_pg_stat_operations pso
64
+               where pso.actionname not in ('ANALYZE', 'VACUUM')
65
+
66
+               union all
67
+
68
+               -- select all operations tracked by Greenplum (PostgreSQL) table rules
69
+               select
70
+                 ttb.relation_name,
71
+                 ttb.relation_type,
72
+                 ttb.operation,
73
+                 ttb.time
74
+               from
75
+                 #{TABLE_TRACKER_HELPER_NAME} ttb
76
+                 -- return only operations for tables that exist in system tables
77
+                 join fixed_pg_stat_operations pso on (
78
+                   ttb.relation_name = pso.objname and
79
+                   ttb.relation_type = pso.subtype and
80
+                   pso.actionname = 'CREATE'
81
+                 )
82
+
83
+             ) a
84
+           ) b
85
+           -- take only the latest operation per table
86
+           where rank = 1
87
+         EOSQL
88
+
89
+         # Redirect deletes and inserts issued against the tracker view to the helper table via "do instead" rules.
90
+         execute <<-EOSQL
91
+           create rule delete_operation_record as on delete to #{TABLE_TRACKER_NAME}
92
+             do instead
93
+             delete from #{TABLE_TRACKER_HELPER_NAME}
94
+             where
95
+               relation_name = OLD.relation_name and
96
+               relation_type = OLD.relation_type and
97
+               operation = OLD.operation
98
+             ;
99
+
100
+           create rule insert_operation_record as on insert to #{TABLE_TRACKER_NAME}
101
+             do instead
102
+             insert into #{TABLE_TRACKER_HELPER_NAME} values (
103
+               NEW.relation_name,
104
+               NEW.relation_type,
105
+               NEW.operation,
106
+               NEW.time
107
+             )
108
+             ;
109
+         EOSQL
110
+       end
111
+
112
+       def self.tear_down_tracking options # drops the two views and the helper table that set_up_tracking creates; options is not used here
113
+         drop_view "fixed_pg_stat_operations" # NOTE(review): the tracker view selects from this view, so dropping it first relies on drop_view cascading - confirm
114
+         drop_view TABLE_TRACKER_NAME # the tracker view; its redirect rules are defined on it
115
+         drop_table TABLE_TRACKER_HELPER_NAME # the renamed table that stores the rule-tracked operations
116
+       end
117
+
118
+       def tracking_tables? # tracking counts as set up when TABLE_TRACKER_NAME exists as a view (set_up_tracking creates it as one)
119
+         view_exists?(TABLE_TRACKER_NAME)
120
+       end
121
+
122
+       def drop_table table_name # drops table_name, then removes its tracking rows unless it is one of the tracking relations
123
+         execute "drop table if exists #{table_name} cascade" # cascade also drops objects that depend on the table
124
+         return if table_name.casecmp(TABLE_TRACKER_HELPER_NAME) == 0 ||
125
+           table_name.casecmp(TABLE_TRACKER_NAME) == 0 # casecmp == 0 is a case-insensitive name match; tracking relations are not tracked themselves
126
+         track_drop table_name # forget the operations recorded for the dropped table
127
+       end
128
+
129
+       def track_drop table_name # deletes the helper-table rows recorded for table_name as a table; the name is interpolated into the SQL unescaped
130
+         execute <<-EOSQL
131
+           delete from #{TABLE_TRACKER_HELPER_NAME}
132
+           where
133
+             relation_name = '#{table_name}' and
134
+             relation_type = '#{relation_type_values[:table]}'
135
+         EOSQL
136
+       end
137
+
138
+
139
+
140
+ private
141
+
142
+       def operations_supported_by_db # returns the entries of operations_supported_by_rules that are :create or :truncate
143
+         operations_supported_by_rules & [:create, :truncate] # NOTE(review): array intersection - empty unless the rules list itself contains these symbols; confirm that is intended
144
+       end
145
+
146
+       def track_creation table_name, n_tuples # deliberate no-op: both arguments are ignored and nil is returned
147
+         # nothing to do; Greenplum tracks this operation in system tables already
148
+         return nil
149
+       end
150
+
151
+       def track_truncate table_name # deliberate no-op: table_name is ignored and nil is returned
152
+         # nothing to do; Greenplum tracks this operation in system tables already
153
+         return nil
154
+       end
155
+
156
+ end
157
+
158
+ end
159
+ end