cranium 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +3 -0
- data/Vagrantfile +24 -0
- data/bin/cranium +9 -0
- data/config/cucumber.yml +9 -0
- data/cranium.gemspec +26 -0
- data/db/setup.sql +8 -0
- data/docker-compose.yml +8 -0
- data/examples/config.rb +14 -0
- data/examples/deduplication.rb +27 -0
- data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb +26 -0
- data/examples/incremental_extract.rb +17 -0
- data/examples/lookup_with_multiple_fields.rb +25 -0
- data/features/archive.feature +49 -0
- data/features/extract/incremental_extract.feature +56 -0
- data/features/extract/simple_extract.feature +85 -0
- data/features/import/import_csv_to_database_as_delta.feature +38 -0
- data/features/import/import_csv_to_database_with_delete_insert_merging.feature +51 -0
- data/features/import/import_csv_to_database_with_truncate_insert.feature +49 -0
- data/features/import/import_csv_to_database_with_update_merging.feature +46 -0
- data/features/import/import_csv_with_always_inserting_new_dimension_keys.feature +137 -0
- data/features/import/import_csv_with_field_lookup_inserting_new_dimension_keys.feature +62 -0
- data/features/import/import_csv_with_field_lookup_transformation.feature +125 -0
- data/features/import/import_csv_with_transformation.feature +55 -0
- data/features/import/import_multiple_csv_files_without_transformations.feature +44 -0
- data/features/import/import_with_load_id_from_sequence.feature +53 -0
- data/features/import/import_with_lookup_from_multiple_fields.feature +64 -0
- data/features/read.feature +56 -0
- data/features/remove.feature +44 -0
- data/features/restore_database_connection.feature +55 -0
- data/features/step_definitions/database_table_steps.rb +40 -0
- data/features/step_definitions/definition_steps.rb +3 -0
- data/features/step_definitions/execution_steps.rb +23 -0
- data/features/step_definitions/file_steps.rb +39 -0
- data/features/support/class_extensions.rb +24 -0
- data/features/support/env.rb +27 -0
- data/features/support/randomize.rb +22 -0
- data/features/support/stop_on_first_error.rb +5 -0
- data/features/transform/deduplication.feature +37 -0
- data/features/transform/empty_transformation.feature +72 -0
- data/features/transform/join.feature +180 -0
- data/features/transform/join_multiple_files_into_one_output_file.feature +46 -0
- data/features/transform/output_rows.feature +70 -0
- data/features/transform/projection.feature +34 -0
- data/features/transform/raw_ruby_transformation.feature +69 -0
- data/features/transform/split_field.feature +39 -0
- data/lib/cranium/application.rb +104 -0
- data/lib/cranium/archiver.rb +36 -0
- data/lib/cranium/attribute_dsl.rb +43 -0
- data/lib/cranium/command_line_options.rb +27 -0
- data/lib/cranium/configuration.rb +33 -0
- data/lib/cranium/data_importer.rb +35 -0
- data/lib/cranium/data_reader.rb +48 -0
- data/lib/cranium/data_transformer.rb +126 -0
- data/lib/cranium/database.rb +36 -0
- data/lib/cranium/definition_registry.rb +21 -0
- data/lib/cranium/dimension_manager.rb +65 -0
- data/lib/cranium/dsl/database_definition.rb +23 -0
- data/lib/cranium/dsl/extract_definition.rb +28 -0
- data/lib/cranium/dsl/import_definition.rb +50 -0
- data/lib/cranium/dsl/source_definition.rb +67 -0
- data/lib/cranium/dsl.rb +100 -0
- data/lib/cranium/extensions/file.rb +7 -0
- data/lib/cranium/extensions/sequel_greenplum.rb +30 -0
- data/lib/cranium/external_table.rb +75 -0
- data/lib/cranium/extract/data_extractor.rb +11 -0
- data/lib/cranium/extract/storage.rb +57 -0
- data/lib/cranium/extract/strategy/base.rb +27 -0
- data/lib/cranium/extract/strategy/incremental.rb +16 -0
- data/lib/cranium/extract/strategy/simple.rb +9 -0
- data/lib/cranium/extract/strategy.rb +7 -0
- data/lib/cranium/extract.rb +7 -0
- data/lib/cranium/import_strategy/base.rb +55 -0
- data/lib/cranium/import_strategy/delete_insert.rb +40 -0
- data/lib/cranium/import_strategy/delta.rb +8 -0
- data/lib/cranium/import_strategy/merge.rb +50 -0
- data/lib/cranium/import_strategy/truncate_insert.rb +19 -0
- data/lib/cranium/import_strategy.rb +9 -0
- data/lib/cranium/logging.rb +15 -0
- data/lib/cranium/profiling.rb +13 -0
- data/lib/cranium/progress_output.rb +37 -0
- data/lib/cranium/sequel/hash.rb +32 -0
- data/lib/cranium/sequel.rb +5 -0
- data/lib/cranium/source_registry.rb +21 -0
- data/lib/cranium/test_framework/cucumber_table.rb +140 -0
- data/lib/cranium/test_framework/database_entity.rb +29 -0
- data/lib/cranium/test_framework/database_sequence.rb +16 -0
- data/lib/cranium/test_framework/database_table.rb +33 -0
- data/lib/cranium/test_framework/upload_directory.rb +39 -0
- data/lib/cranium/test_framework/world.rb +66 -0
- data/lib/cranium/test_framework.rb +10 -0
- data/lib/cranium/transformation/duplication_index.rb +42 -0
- data/lib/cranium/transformation/index.rb +83 -0
- data/lib/cranium/transformation/join.rb +141 -0
- data/lib/cranium/transformation/sequence.rb +42 -0
- data/lib/cranium/transformation.rb +8 -0
- data/lib/cranium/transformation_record.rb +45 -0
- data/lib/cranium.rb +57 -0
- data/rake/test.rake +31 -0
- data/spec/cranium/application_spec.rb +166 -0
- data/spec/cranium/archiver_spec.rb +44 -0
- data/spec/cranium/command_line_options_spec.rb +32 -0
- data/spec/cranium/configuration_spec.rb +31 -0
- data/spec/cranium/data_importer_spec.rb +55 -0
- data/spec/cranium/data_transformer_spec.rb +16 -0
- data/spec/cranium/database_spec.rb +69 -0
- data/spec/cranium/definition_registry_spec.rb +45 -0
- data/spec/cranium/dimension_manager_spec.rb +63 -0
- data/spec/cranium/dsl/database_definition_spec.rb +23 -0
- data/spec/cranium/dsl/extract_definition_spec.rb +76 -0
- data/spec/cranium/dsl/import_definition_spec.rb +153 -0
- data/spec/cranium/dsl/source_definition_spec.rb +84 -0
- data/spec/cranium/dsl_spec.rb +119 -0
- data/spec/cranium/external_table_spec.rb +71 -0
- data/spec/cranium/extract/storage_spec.rb +125 -0
- data/spec/cranium/logging_spec.rb +37 -0
- data/spec/cranium/sequel/hash_spec.rb +56 -0
- data/spec/cranium/source_registry_spec.rb +31 -0
- data/spec/cranium/test_framework/cucumber_table_spec.rb +144 -0
- data/spec/cranium/transformation/duplication_index_spec.rb +75 -0
- data/spec/cranium/transformation/index_spec.rb +178 -0
- data/spec/cranium/transformation/join_spec.rb +43 -0
- data/spec/cranium/transformation/sequence_spec.rb +83 -0
- data/spec/cranium/transformation_record_spec.rb +78 -0
- data/spec/cranium_spec.rb +53 -0
- data/spec/spec_helper.rb +1 -0
- metadata +362 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: aa44efc27b0fc354b994ea3c8738d5e6aeab2415
|
|
4
|
+
data.tar.gz: 036dfbc242e5858bad5fd374cc0b1c27bda4b6da
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: a90bc3b7ee0cd635b13e9deb325e4bc58fe83889816ac9b7bc02637318f096b953caa01ce44a6314ac67a74d0374beba991548ba46349250f01914e591cf1554
|
|
7
|
+
data.tar.gz: d95a477491949134ef37a86c165d4fdc19615fa12e53fed042af1bd838788a290c64b9ffa9a59db420b446e46544103315f6a18f337c1d09d2713665a9a2fc7d
|
data/.gitignore
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
*.gem
|
|
2
|
+
*.rbc
|
|
3
|
+
.bundle
|
|
4
|
+
.config
|
|
5
|
+
.yardoc
|
|
6
|
+
.vagrant
|
|
7
|
+
.idea
|
|
8
|
+
Gemfile.lock
|
|
9
|
+
InstalledFiles
|
|
10
|
+
_yardoc
|
|
11
|
+
coverage
|
|
12
|
+
doc/
|
|
13
|
+
lib/bundler/man
|
|
14
|
+
pkg
|
|
15
|
+
rdoc
|
|
16
|
+
spec/reports
|
|
17
|
+
test/tmp
|
|
18
|
+
test/version_tmp
|
|
19
|
+
tmp
|
|
20
|
+
atlassian-ide-plugin.xml
|
|
21
|
+
log/
|
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
2.3.0
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Copyright (c) 2013 Zoltan Ormandi
|
|
2
|
+
|
|
3
|
+
MIT License
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
a copy of this software and associated documentation files (the
|
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
11
|
+
the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be
|
|
14
|
+
included in all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Cranium
|
|
2
|
+
|
|
3
|
+
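Cranium is a pure Ruby ETL framework. It provides Extract, Transform and Load functionality for loading data from CSV files to a Greenplum database.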
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Add this line to your application's Gemfile:
|
|
8
|
+
|
|
9
|
+
gem 'cranium'
|
|
10
|
+
|
|
11
|
+
And then execute:
|
|
12
|
+
|
|
13
|
+
$ bundle
|
|
14
|
+
|
|
15
|
+
Or install it yourself as:
|
|
16
|
+
|
|
17
|
+
$ gem install cranium
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
TODO: Write usage instructions here
|
|
22
|
+
|
|
23
|
+
## Contributing
|
|
24
|
+
|
|
25
|
+
1. Fork it
|
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/Vagrantfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# -*- mode: ruby -*-
|
|
2
|
+
# vi: set ft=ruby :
|
|
3
|
+
|
|
4
|
+
# Ensure the host-side folder synced into the VM exists. mkdir_p is a no-op when
# the directory is already present, so no existence guard is needed (the old
# guard used Dir.exists?, which was removed in Ruby 3.2).
FileUtils.mkdir_p("tmp/custdata")
|
|
5
|
+
|
|
6
|
+
# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
|
|
7
|
+
VAGRANTFILE_API_VERSION = "2"
|
|
8
|
+
|
|
9
|
+
Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
|
|
10
|
+
config.vm.box = "si-build.v2"
|
|
11
|
+
config.vm.box_url = "http://vboxes.ett.local/si-build.v2.box"
|
|
12
|
+
config.vbguest.auto_update = false
|
|
13
|
+
|
|
14
|
+
config.vm.hostname = 'cranium-build'
|
|
15
|
+
config.vm.network :private_network, ip: "192.168.56.43"
|
|
16
|
+
|
|
17
|
+
config.vm.provider :virtualbox do |virtual_machine|
|
|
18
|
+
virtual_machine.name = "cranium"
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
config.vm.synced_folder "tmp/custdata", "/home/gpadmin/gpfdist-data", owner: "gpadmin", group: "gpadmin"
|
|
22
|
+
|
|
23
|
+
config.vm.provision :shell, inline: "su - gpadmin -c 'cat /vagrant/db/setup.sql | psql'"
|
|
24
|
+
end
|
data/bin/cranium
ADDED
data/config/cucumber.yml
ADDED
data/cranium.gemspec
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Gem::Specification.new do |spec|
|
|
2
|
+
spec.name = 'cranium'
|
|
3
|
+
spec.version = '0.2.0'
|
|
4
|
+
spec.authors = ['Emarsys Technologies']
|
|
5
|
+
spec.email = ['smart-insight-dev@emarsys.com']
|
|
6
|
+
spec.description = %q{Provides Extract, Transform and Load functionality for loading data from CSV files to a Greenplum database.}
|
|
7
|
+
spec.summary = %q{Pure Ruby ETL framework}
|
|
8
|
+
spec.homepage = 'https://github.com/emartech/cranium'
|
|
9
|
+
spec.license = 'MIT'
|
|
10
|
+
|
|
11
|
+
spec.files = `git ls-files`.split($/)
|
|
12
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
|
13
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
|
14
|
+
spec.require_paths = ['lib']
|
|
15
|
+
|
|
16
|
+
spec.add_runtime_dependency 'pg', '~> 0'
|
|
17
|
+
spec.add_runtime_dependency 'progressbar', '~> 0'
|
|
18
|
+
spec.add_runtime_dependency 'sequel', '~> 4'
|
|
19
|
+
spec.add_runtime_dependency 'slop', '~> 3'
|
|
20
|
+
|
|
21
|
+
spec.add_development_dependency 'bundler', '~> 1'
|
|
22
|
+
spec.add_development_dependency 'rake', '~> 10'
|
|
23
|
+
spec.add_development_dependency 'rspec', '~> 3'
|
|
24
|
+
spec.add_development_dependency 'ruby-prof', '~> 0'
|
|
25
|
+
spec.add_development_dependency 'cucumber', '~> 1'
|
|
26
|
+
end
|
data/db/setup.sql
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
CREATE RESOURCE QUEUE smart_insight WITH (ACTIVE_STATEMENTS=10, PRIORITY=MEDIUM);
|
|
2
|
+
|
|
3
|
+
CREATE ROLE cranium WITH RESOURCE QUEUE smart_insight CREATEEXTTABLE LOGIN PASSWORD 'cranium';
|
|
4
|
+
COMMENT ON ROLE cranium IS 'Cranium test user';
|
|
5
|
+
|
|
6
|
+
CREATE DATABASE cranium WITH OWNER=cranium;
|
|
7
|
+
|
|
8
|
+
CREATE ROLE database_administrator WITH SUPERUSER LOGIN PASSWORD 'emarsys';
|
data/docker-compose.yml
ADDED
data/examples/config.rb
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
require 'logger'
|
|
2
|
+
|
|
3
|
+
Cranium.configure do |config|
|
|
4
|
+
config.greenplum_connection_string = "postgres://cranium:cranium@192.168.56.43:5432/cranium"
|
|
5
|
+
config.gpfdist_url = "192.168.56.43:8123"
|
|
6
|
+
config.gpfdist_home_directory = "tmp/custdata"
|
|
7
|
+
config.upload_directory = "cranium_build"
|
|
8
|
+
config.archive_directory = "cranium_archive"
|
|
9
|
+
config.loggers << Logger.new("log/application.log")
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
database :suite do
|
|
13
|
+
connect_to "postgres://cranium:cranium@192.168.56.43:5432/cranium"
|
|
14
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require_relative 'config'
|
|
2
|
+
|
|
3
|
+
source :sales_items do
|
|
4
|
+
file "sales_items*.csv"
|
|
5
|
+
field :order_id, String
|
|
6
|
+
field :date, Date
|
|
7
|
+
field :customer, Integer
|
|
8
|
+
field :item, String
|
|
9
|
+
field :item_name, String
|
|
10
|
+
field :quantity, Float
|
|
11
|
+
field :c_sales_amount, Float
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
source :products do
|
|
15
|
+
field :item_id
|
|
16
|
+
field :item_name
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
deduplicate :sales_items, into: :products, by: [:item]
|
|
21
|
+
|
|
22
|
+
# Equivalent to
|
|
23
|
+
|
|
24
|
+
transform :sales_items => :products do |record|
|
|
25
|
+
deduplicate_by :item
|
|
26
|
+
end
|
|
27
|
+
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require_relative 'config'
|
|
2
|
+
|
|
3
|
+
source :purchases do
|
|
4
|
+
field :user_id, String
|
|
5
|
+
field :amount, String
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
source :transformed_purchases do
|
|
9
|
+
field :contact_key, Integer
|
|
10
|
+
field :amount, String
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
transform :purchases => :transformed_purchases do |record|
|
|
14
|
+
record[:contact_key] = lookup :contact_key,
|
|
15
|
+
from_table: :dim_contact,
|
|
16
|
+
match_column: :user_id,
|
|
17
|
+
to_value: record[:user_id],
|
|
18
|
+
if_not_found_then_insert: { contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
|
|
19
|
+
name: "Unknown contact #{record[:user_id]}" }
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
import :transformed_purchases do
|
|
23
|
+
into :fct_purchases
|
|
24
|
+
put :contact_key
|
|
25
|
+
put :amount
|
|
26
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require_relative 'config'
|
|
2
|
+
|
|
3
|
+
extract :contacts do
|
|
4
|
+
from :suite
|
|
5
|
+
incrementally_by :created
|
|
6
|
+
query <<-sql
|
|
7
|
+
SELECT *
|
|
8
|
+
FROM contacts
|
|
9
|
+
WHERE created BETWEEN '#{last_extracted_value_of :created, "1970-01-01 00:00:00"}' AND '#{Time.now - 60*10}'
|
|
10
|
+
sql
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
extract :contacts do
|
|
14
|
+
from :suite
|
|
15
|
+
incrementally_by :id
|
|
16
|
+
query "SELECT * FROM akarmi WHERE id > #{last_extracted_value_of :id, 0}"
|
|
17
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require_relative 'config'
|
|
2
|
+
|
|
3
|
+
source :purchases do
|
|
4
|
+
field :user_id, String
|
|
5
|
+
field :amount, String
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
source :transformed_purchases do
|
|
9
|
+
field :contact_key, Integer
|
|
10
|
+
field :amount, String
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
transform :purchases => :transformed_purchases do |record|
|
|
14
|
+
record[:contact_key] = lookup :contact_key,
|
|
15
|
+
from_table: :dim_contact,
|
|
16
|
+
match: {:user_id => record[:user_id], :another_field => record[:another_value]},
|
|
17
|
+
if_not_found_then_insert: { contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
|
|
18
|
+
name: "Unknown contact #{record[:user_id]}" }
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
import :transformed_purchases do
|
|
22
|
+
into :fct_purchases
|
|
23
|
+
put :contact_key
|
|
24
|
+
put :amount
|
|
25
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Feature: Archive source files
|
|
2
|
+
|
|
3
|
+
Scenario:
|
|
4
|
+
Given no "/tmp/cranium_archive" directory
|
|
5
|
+
And a "products_1.csv" data file containing:
|
|
6
|
+
"""
|
|
7
|
+
"""
|
|
8
|
+
And a "products_2.csv" data file containing:
|
|
9
|
+
"""
|
|
10
|
+
"""
|
|
11
|
+
And a "contacts.csv" data file containing:
|
|
12
|
+
"""
|
|
13
|
+
"""
|
|
14
|
+
And a "purchases.csv" data file containing:
|
|
15
|
+
"""
|
|
16
|
+
"""
|
|
17
|
+
And the following definition:
|
|
18
|
+
"""
|
|
19
|
+
Cranium.configure do |config|
|
|
20
|
+
config.archive_directory = "/tmp/cranium_archive"
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
source :products do
|
|
24
|
+
file "products_*.csv"
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
source :products_transformed do end
|
|
28
|
+
|
|
29
|
+
source :contacts do
|
|
30
|
+
file "contacts.csv"
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
source :purchases do
|
|
34
|
+
file "purchases.csv"
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
transform :products => :products_transformed do |record|
|
|
38
|
+
output record
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
archive :products, :contacts
|
|
42
|
+
"""
|
|
43
|
+
When I execute the definition
|
|
44
|
+
Then the process should exit successfully
|
|
45
|
+
And the "/tmp/cranium_archive/" directory should contain the following files:
|
|
46
|
+
| filename |
|
|
47
|
+
| .*contacts.csv |
|
|
48
|
+
| .*products_1.csv |
|
|
49
|
+
| .*products_2.csv |
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
Feature: Extracting data incrementally from a database table to CSV
|
|
2
|
+
|
|
3
|
+
Incremental extracts work by indicating that a field (or fields) should be used to detect new data rows
|
|
4
|
+
in the table. The highest extracted values are saved from one process and passed on to the next when the
|
|
5
|
+
process is run again. This approach typically works best with id or timestamp fields.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
Background:
|
|
9
|
+
Given the following definition:
|
|
10
|
+
"""
|
|
11
|
+
database :suite do
|
|
12
|
+
connect_to Cranium.configuration.greenplum_connection_string
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
extract :contacts do
|
|
16
|
+
from :suite
|
|
17
|
+
incrementally_by :id
|
|
18
|
+
query "SELECT id, name FROM contacts WHERE id > #{last_extracted_value_of :id, 0} ORDER BY id DESC"
|
|
19
|
+
end
|
|
20
|
+
"""
|
|
21
|
+
And a database table called "contacts" with the following fields:
|
|
22
|
+
| field_name | field_type |
|
|
23
|
+
| id | INTEGER |
|
|
24
|
+
| name | TEXT |
|
|
25
|
+
And only the following rows in the "contacts" database table:
|
|
26
|
+
| id | name |
|
|
27
|
+
| 1 | John Doe |
|
|
28
|
+
| 2 | Jane Doe |
|
|
29
|
+
And the definition is executed
|
|
30
|
+
And the "contacts.csv" file is deleted
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
Scenario: Successful extract
|
|
34
|
+
Given the following new rows in the "contacts" database table:
|
|
35
|
+
| id | name |
|
|
36
|
+
| 3 | John Smith |
|
|
37
|
+
| 4 | Jane Smith |
|
|
38
|
+
When I execute the definition again
|
|
39
|
+
Then the process should exit successfully
|
|
40
|
+
And there should be a "contacts.csv" data file in the upload directory containing:
|
|
41
|
+
"""
|
|
42
|
+
id,name
|
|
43
|
+
4,Jane Smith
|
|
44
|
+
3,John Smith
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
Scenario: Incremental extract doesn't remember empty 'last extracted value' - bugfix
|
|
49
|
+
Given the definition is executed again
|
|
50
|
+
And the "contacts.csv" file is deleted
|
|
51
|
+
When I execute the definition again
|
|
52
|
+
Then the process should exit successfully
|
|
53
|
+
And there should be a "contacts.csv" data file in the upload directory containing:
|
|
54
|
+
"""
|
|
55
|
+
id,name
|
|
56
|
+
"""
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
Feature: Extracting data from a database table to CSV
|
|
2
|
+
|
|
3
|
+
Data can be extracted from a database table into a CSV file. The CSV file is named after the extract process
|
|
4
|
+
and is placed in the upload directory specified in the configuration.
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
Background:
|
|
8
|
+
Given a database table called "contacts" with the following fields:
|
|
9
|
+
| field_name | field_type |
|
|
10
|
+
| id | INTEGER |
|
|
11
|
+
| name | TEXT |
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
Scenario: Successful extract using raw SQL
|
|
15
|
+
Given only the following rows in the "contacts" database table:
|
|
16
|
+
| id | name |
|
|
17
|
+
| 1 | John Doe |
|
|
18
|
+
| 2 | Jane Doe |
|
|
19
|
+
| 3 | John Smith |
|
|
20
|
+
And the following definition:
|
|
21
|
+
"""
|
|
22
|
+
database :suite do
|
|
23
|
+
connect_to Cranium.configuration.greenplum_connection_string
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
extract :contacts do
|
|
27
|
+
from :suite
|
|
28
|
+
query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
|
|
29
|
+
end
|
|
30
|
+
"""
|
|
31
|
+
When I execute the definition
|
|
32
|
+
Then the process should exit successfully
|
|
33
|
+
And there should be a "contacts.csv" data file in the upload directory containing:
|
|
34
|
+
"""
|
|
35
|
+
id,name
|
|
36
|
+
1,John Doe
|
|
37
|
+
2,Jane Doe
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
Scenario: Successful extract with overridden columns
|
|
41
|
+
Given only the following rows in the "contacts" database table:
|
|
42
|
+
| id | name |
|
|
43
|
+
| 1 | John Doe |
|
|
44
|
+
| 2 | Jane Doe |
|
|
45
|
+
| 3 | John Smith |
|
|
46
|
+
And the following definition:
|
|
47
|
+
"""
|
|
48
|
+
database :suite do
|
|
49
|
+
connect_to Cranium.configuration.greenplum_connection_string
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
extract :contacts do
|
|
53
|
+
from :suite
|
|
54
|
+
columns %w(uid full_name)
|
|
55
|
+
query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
|
|
56
|
+
end
|
|
57
|
+
"""
|
|
58
|
+
When I execute the definition
|
|
59
|
+
Then the process should exit successfully
|
|
60
|
+
And there should be a "contacts.csv" data file in the upload directory containing:
|
|
61
|
+
"""
|
|
62
|
+
uid,full_name
|
|
63
|
+
1,John Doe
|
|
64
|
+
2,Jane Doe
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
Scenario: Extract should fail if file already exists
|
|
68
|
+
Given an empty "contacts.csv" data file
|
|
69
|
+
And the following definition:
|
|
70
|
+
"""
|
|
71
|
+
database :suite do
|
|
72
|
+
connect_to Cranium.configuration.greenplum_connection_string
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
extract :contacts do
|
|
76
|
+
from :suite
|
|
77
|
+
query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
|
|
78
|
+
end
|
|
79
|
+
"""
|
|
80
|
+
When I execute the definition
|
|
81
|
+
Then the process should exit with an error
|
|
82
|
+
And the error message should contain:
|
|
83
|
+
"""
|
|
84
|
+
Extract halted: a file named "contacts.csv" already exists
|
|
85
|
+
"""
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
Feature: Import a CSV file into the database as a delta
|
|
2
|
+
|
|
3
|
+
Scenario: Successful import
|
|
4
|
+
Given a database table called "dim_product" with the following fields:
|
|
5
|
+
| field_name | field_type |
|
|
6
|
+
| item | TEXT |
|
|
7
|
+
| title | TEXT |
|
|
8
|
+
| category | TEXT |
|
|
9
|
+
| description | TEXT |
|
|
10
|
+
And a "products.csv" data file containing:
|
|
11
|
+
"""
|
|
12
|
+
id,name,category,description
|
|
13
|
+
JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory,Some description
|
|
14
|
+
CDI-234,Another product name,Smart Insight > Cool stuff > Scripts,Another description
|
|
15
|
+
"""
|
|
16
|
+
And the following definition:
|
|
17
|
+
"""
|
|
18
|
+
source :products do
|
|
19
|
+
field :id, String
|
|
20
|
+
field :name, String
|
|
21
|
+
field :category, String
|
|
22
|
+
field :description, String
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
import :products do
|
|
26
|
+
into :dim_product
|
|
27
|
+
put :id => :item
|
|
28
|
+
put :name => :title
|
|
29
|
+
put :category => :category
|
|
30
|
+
put :description => :description
|
|
31
|
+
end
|
|
32
|
+
"""
|
|
33
|
+
When I execute the definition
|
|
34
|
+
Then the process should exit successfully
|
|
35
|
+
And the "dim_product" table should contain:
|
|
36
|
+
| item | title | category | description |
|
|
37
|
+
| JNI-123 | Just a product name | Main category > Subcategory > Sub-subcategory | Some description |
|
|
38
|
+
| CDI-234 | Another product name | Smart Insight > Cool stuff > Scripts | Another description |
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
Feature: Import a CSV file into the database with merging
|
|
2
|
+
|
|
3
|
+
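The delete_insert_on property can be used to specify an id field that selects which existing rows are replaced while importing.
|
|
4
|
+
Existing rows whose id appears in the imported file are deleted and all imported rows are inserted; rows with other ids are left untouched.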
|
|
5
|
+
|
|
6
|
+
Scenario: Successful import with merged items
|
|
7
|
+
Given a database table called "lkp_categories" with the following fields:
|
|
8
|
+
| field_name | field_type |
|
|
9
|
+
| contact_id | INTEGER |
|
|
10
|
+
| category_id | TEXT |
|
|
11
|
+
And only the following rows in the "lkp_categories" database table:
|
|
12
|
+
| contact_id (i) | category_id (s) |
|
|
13
|
+
| 1 | A |
|
|
14
|
+
| 1 | B |
|
|
15
|
+
| 1 | C |
|
|
16
|
+
| 2 | A |
|
|
17
|
+
| 2 | D |
|
|
18
|
+
And a "category_lookup.csv" data file containing:
|
|
19
|
+
"""
|
|
20
|
+
user_id,category_id
|
|
21
|
+
1,A
|
|
22
|
+
1,E
|
|
23
|
+
3,E
|
|
24
|
+
3,F
|
|
25
|
+
"""
|
|
26
|
+
And the following definition:
|
|
27
|
+
"""
|
|
28
|
+
source :category_lookup do
|
|
29
|
+
field :user_id, Integer
|
|
30
|
+
field :category_id, String
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
import :category_lookup do
|
|
34
|
+
into :lkp_categories
|
|
35
|
+
|
|
36
|
+
put :user_id => :contact_id
|
|
37
|
+
put :category_id => :category_id
|
|
38
|
+
|
|
39
|
+
delete_insert_on :contact_id
|
|
40
|
+
end
|
|
41
|
+
"""
|
|
42
|
+
When I execute the definition
|
|
43
|
+
Then the process should exit successfully
|
|
44
|
+
And the "lkp_categories" table should contain:
|
|
45
|
+
| contact_id (i) | category_id (s) |
|
|
46
|
+
| 1 | A |
|
|
47
|
+
| 1 | E |
|
|
48
|
+
| 2 | A |
|
|
49
|
+
| 2 | D |
|
|
50
|
+
| 3 | E |
|
|
51
|
+
| 3 | F |
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Feature: Import a CSV file into the database with truncation
|
|
2
|
+
|
|
3
|
+
When the truncate_insert property is enabled, the target table is truncated as part of the import.
|
|
4
|
+
All previously existing rows are removed and only the imported rows remain.
|
|
5
|
+
|
|
6
|
+
Scenario: Successful import into a truncated table
|
|
7
|
+
Given a database table called "lkp_categories" with the following fields:
|
|
8
|
+
| field_name | field_type |
|
|
9
|
+
| contact_id | INTEGER |
|
|
10
|
+
| category_id | TEXT |
|
|
11
|
+
And only the following rows in the "lkp_categories" database table:
|
|
12
|
+
| contact_id (i) | category_id (s) |
|
|
13
|
+
| 1 | A |
|
|
14
|
+
| 1 | B |
|
|
15
|
+
| 1 | C |
|
|
16
|
+
| 2 | A |
|
|
17
|
+
| 2 | D |
|
|
18
|
+
And a "category_lookup.csv" data file containing:
|
|
19
|
+
"""
|
|
20
|
+
user_id,category_id
|
|
21
|
+
1,A
|
|
22
|
+
1,E
|
|
23
|
+
3,E
|
|
24
|
+
3,F
|
|
25
|
+
"""
|
|
26
|
+
And the following definition:
|
|
27
|
+
"""
|
|
28
|
+
source :category_lookup do
|
|
29
|
+
field :user_id, Integer
|
|
30
|
+
field :category_id, String
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
import :category_lookup do
|
|
34
|
+
into :lkp_categories
|
|
35
|
+
|
|
36
|
+
put :user_id => :contact_id
|
|
37
|
+
put :category_id => :category_id
|
|
38
|
+
|
|
39
|
+
truncate_insert true
|
|
40
|
+
end
|
|
41
|
+
"""
|
|
42
|
+
When I execute the definition
|
|
43
|
+
Then the process should exit successfully
|
|
44
|
+
And the "lkp_categories" table should contain:
|
|
45
|
+
| contact_id (i) | category_id (s) |
|
|
46
|
+
| 1 | A |
|
|
47
|
+
| 1 | E |
|
|
48
|
+
| 3 | E |
|
|
49
|
+
| 3 | F |
|