cranium 0.2.0

Files changed (132)
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +29 -0
  7. data/Rakefile +3 -0
  8. data/Vagrantfile +24 -0
  9. data/bin/cranium +9 -0
  10. data/config/cucumber.yml +9 -0
  11. data/cranium.gemspec +26 -0
  12. data/db/setup.sql +8 -0
  13. data/docker-compose.yml +8 -0
  14. data/examples/config.rb +14 -0
  15. data/examples/deduplication.rb +27 -0
  16. data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb +26 -0
  17. data/examples/incremental_extract.rb +17 -0
  18. data/examples/lookup_with_multiple_fields.rb +25 -0
  19. data/features/archive.feature +49 -0
  20. data/features/extract/incremental_extract.feature +56 -0
  21. data/features/extract/simple_extract.feature +85 -0
  22. data/features/import/import_csv_to_database_as_delta.feature +38 -0
  23. data/features/import/import_csv_to_database_with_delete_insert_merging.feature +51 -0
  24. data/features/import/import_csv_to_database_with_truncate_insert.feature +49 -0
  25. data/features/import/import_csv_to_database_with_update_merging.feature +46 -0
  26. data/features/import/import_csv_with_always_inserting_new_dimension_keys.feature +137 -0
  27. data/features/import/import_csv_with_field_lookup_inserting_new_dimension_keys.feature +62 -0
  28. data/features/import/import_csv_with_field_lookup_transformation.feature +125 -0
  29. data/features/import/import_csv_with_transformation.feature +55 -0
  30. data/features/import/import_multiple_csv_files_without_transformations.feature +44 -0
  31. data/features/import/import_with_load_id_from_sequence.feature +53 -0
  32. data/features/import/import_with_lookup_from_multiple_fields.feature +64 -0
  33. data/features/read.feature +56 -0
  34. data/features/remove.feature +44 -0
  35. data/features/restore_database_connection.feature +55 -0
  36. data/features/step_definitions/database_table_steps.rb +40 -0
  37. data/features/step_definitions/definition_steps.rb +3 -0
  38. data/features/step_definitions/execution_steps.rb +23 -0
  39. data/features/step_definitions/file_steps.rb +39 -0
  40. data/features/support/class_extensions.rb +24 -0
  41. data/features/support/env.rb +27 -0
  42. data/features/support/randomize.rb +22 -0
  43. data/features/support/stop_on_first_error.rb +5 -0
  44. data/features/transform/deduplication.feature +37 -0
  45. data/features/transform/empty_transformation.feature +72 -0
  46. data/features/transform/join.feature +180 -0
  47. data/features/transform/join_multiple_files_into_one_output_file.feature +46 -0
  48. data/features/transform/output_rows.feature +70 -0
  49. data/features/transform/projection.feature +34 -0
  50. data/features/transform/raw_ruby_transformation.feature +69 -0
  51. data/features/transform/split_field.feature +39 -0
  52. data/lib/cranium/application.rb +104 -0
  53. data/lib/cranium/archiver.rb +36 -0
  54. data/lib/cranium/attribute_dsl.rb +43 -0
  55. data/lib/cranium/command_line_options.rb +27 -0
  56. data/lib/cranium/configuration.rb +33 -0
  57. data/lib/cranium/data_importer.rb +35 -0
  58. data/lib/cranium/data_reader.rb +48 -0
  59. data/lib/cranium/data_transformer.rb +126 -0
  60. data/lib/cranium/database.rb +36 -0
  61. data/lib/cranium/definition_registry.rb +21 -0
  62. data/lib/cranium/dimension_manager.rb +65 -0
  63. data/lib/cranium/dsl/database_definition.rb +23 -0
  64. data/lib/cranium/dsl/extract_definition.rb +28 -0
  65. data/lib/cranium/dsl/import_definition.rb +50 -0
  66. data/lib/cranium/dsl/source_definition.rb +67 -0
  67. data/lib/cranium/dsl.rb +100 -0
  68. data/lib/cranium/extensions/file.rb +7 -0
  69. data/lib/cranium/extensions/sequel_greenplum.rb +30 -0
  70. data/lib/cranium/external_table.rb +75 -0
  71. data/lib/cranium/extract/data_extractor.rb +11 -0
  72. data/lib/cranium/extract/storage.rb +57 -0
  73. data/lib/cranium/extract/strategy/base.rb +27 -0
  74. data/lib/cranium/extract/strategy/incremental.rb +16 -0
  75. data/lib/cranium/extract/strategy/simple.rb +9 -0
  76. data/lib/cranium/extract/strategy.rb +7 -0
  77. data/lib/cranium/extract.rb +7 -0
  78. data/lib/cranium/import_strategy/base.rb +55 -0
  79. data/lib/cranium/import_strategy/delete_insert.rb +40 -0
  80. data/lib/cranium/import_strategy/delta.rb +8 -0
  81. data/lib/cranium/import_strategy/merge.rb +50 -0
  82. data/lib/cranium/import_strategy/truncate_insert.rb +19 -0
  83. data/lib/cranium/import_strategy.rb +9 -0
  84. data/lib/cranium/logging.rb +15 -0
  85. data/lib/cranium/profiling.rb +13 -0
  86. data/lib/cranium/progress_output.rb +37 -0
  87. data/lib/cranium/sequel/hash.rb +32 -0
  88. data/lib/cranium/sequel.rb +5 -0
  89. data/lib/cranium/source_registry.rb +21 -0
  90. data/lib/cranium/test_framework/cucumber_table.rb +140 -0
  91. data/lib/cranium/test_framework/database_entity.rb +29 -0
  92. data/lib/cranium/test_framework/database_sequence.rb +16 -0
  93. data/lib/cranium/test_framework/database_table.rb +33 -0
  94. data/lib/cranium/test_framework/upload_directory.rb +39 -0
  95. data/lib/cranium/test_framework/world.rb +66 -0
  96. data/lib/cranium/test_framework.rb +10 -0
  97. data/lib/cranium/transformation/duplication_index.rb +42 -0
  98. data/lib/cranium/transformation/index.rb +83 -0
  99. data/lib/cranium/transformation/join.rb +141 -0
  100. data/lib/cranium/transformation/sequence.rb +42 -0
  101. data/lib/cranium/transformation.rb +8 -0
  102. data/lib/cranium/transformation_record.rb +45 -0
  103. data/lib/cranium.rb +57 -0
  104. data/rake/test.rake +31 -0
  105. data/spec/cranium/application_spec.rb +166 -0
  106. data/spec/cranium/archiver_spec.rb +44 -0
  107. data/spec/cranium/command_line_options_spec.rb +32 -0
  108. data/spec/cranium/configuration_spec.rb +31 -0
  109. data/spec/cranium/data_importer_spec.rb +55 -0
  110. data/spec/cranium/data_transformer_spec.rb +16 -0
  111. data/spec/cranium/database_spec.rb +69 -0
  112. data/spec/cranium/definition_registry_spec.rb +45 -0
  113. data/spec/cranium/dimension_manager_spec.rb +63 -0
  114. data/spec/cranium/dsl/database_definition_spec.rb +23 -0
  115. data/spec/cranium/dsl/extract_definition_spec.rb +76 -0
  116. data/spec/cranium/dsl/import_definition_spec.rb +153 -0
  117. data/spec/cranium/dsl/source_definition_spec.rb +84 -0
  118. data/spec/cranium/dsl_spec.rb +119 -0
  119. data/spec/cranium/external_table_spec.rb +71 -0
  120. data/spec/cranium/extract/storage_spec.rb +125 -0
  121. data/spec/cranium/logging_spec.rb +37 -0
  122. data/spec/cranium/sequel/hash_spec.rb +56 -0
  123. data/spec/cranium/source_registry_spec.rb +31 -0
  124. data/spec/cranium/test_framework/cucumber_table_spec.rb +144 -0
  125. data/spec/cranium/transformation/duplication_index_spec.rb +75 -0
  126. data/spec/cranium/transformation/index_spec.rb +178 -0
  127. data/spec/cranium/transformation/join_spec.rb +43 -0
  128. data/spec/cranium/transformation/sequence_spec.rb +83 -0
  129. data/spec/cranium/transformation_record_spec.rb +78 -0
  130. data/spec/cranium_spec.rb +53 -0
  131. data/spec/spec_helper.rb +1 -0
  132. metadata +362 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: aa44efc27b0fc354b994ea3c8738d5e6aeab2415
+   data.tar.gz: 036dfbc242e5858bad5fd374cc0b1c27bda4b6da
+ SHA512:
+   metadata.gz: a90bc3b7ee0cd635b13e9deb325e4bc58fe83889816ac9b7bc02637318f096b953caa01ce44a6314ac67a74d0374beba991548ba46349250f01914e591cf1554
+   data.tar.gz: d95a477491949134ef37a86c165d4fdc19615fa12e53fed042af1bd838788a290c64b9ffa9a59db420b446e46544103315f6a18f337c1d09d2713665a9a2fc7d
data/.gitignore ADDED
@@ -0,0 +1,21 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ .vagrant
+ .idea
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ atlassian-ide-plugin.xml
+ log/
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ 2.3.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in cranium.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2013 Zoltan Ormandi
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # Cranium
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'cranium'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install cranium
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
+ require "bundler/gem_tasks"
+
+ import *Dir['rake/**/*.rake']
data/Vagrantfile ADDED
@@ -0,0 +1,24 @@
+ # -*- mode: ruby -*-
+ # vi: set ft=ruby :
+
+ FileUtils.mkdir_p("tmp/custdata") unless Dir.exists?("tmp/custdata")
+
+ # Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
+ VAGRANTFILE_API_VERSION = "2"
+
+ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
+   config.vm.box = "si-build.v2"
+   config.vm.box_url = "http://vboxes.ett.local/si-build.v2.box"
+   config.vbguest.auto_update = false
+
+   config.vm.hostname = 'cranium-build'
+   config.vm.network :private_network, ip: "192.168.56.43"
+
+   config.vm.provider :virtualbox do |virtual_machine|
+     virtual_machine.name = "cranium"
+   end
+
+   config.vm.synced_folder "tmp/custdata", "/home/gpadmin/gpfdist-data", owner: "gpadmin", group: "gpadmin"
+
+   config.vm.provision :shell, inline: "su - gpadmin -c 'cat /vagrant/db/setup.sql | psql'"
+ end
data/bin/cranium ADDED
@@ -0,0 +1,9 @@
+ #!/usr/bin/env ruby
+
+ require 'cranium'
+
+ if ENV["PROFILING"] == "yes"
+   require 'cranium/profiling'
+ end
+
+ Cranium.application(ARGV).run
data/config/cucumber.yml ADDED
@@ -0,0 +1,9 @@
+ default:
+   --backtrace
+   --tags ~@wip
+   --require features
+
+ build:
+   --backtrace
+   --format progress
+   --tags ~@wip
data/cranium.gemspec ADDED
@@ -0,0 +1,26 @@
+ Gem::Specification.new do |spec|
+   spec.name = 'cranium'
+   spec.version = '0.2.0'
+   spec.authors = ['Emarsys Technologies']
+   spec.email = ['smart-insight-dev@emarsys.com']
+   spec.description = %q{Provides Extract, Transform and Load functionality for loading data from CSV files to a Greenplum database.}
+   spec.summary = %q{Pure Ruby ETL framework}
+   spec.homepage = 'https://github.com/emartech/cranium'
+   spec.license = 'MIT'
+
+   spec.files = `git ls-files`.split($/)
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ['lib']
+
+   spec.add_runtime_dependency 'pg', '~> 0'
+   spec.add_runtime_dependency 'progressbar', '~> 0'
+   spec.add_runtime_dependency 'sequel', '~> 4'
+   spec.add_runtime_dependency 'slop', '~> 3'
+
+   spec.add_development_dependency 'bundler', '~> 1'
+   spec.add_development_dependency 'rake', '~> 10'
+   spec.add_development_dependency 'rspec', '~> 3'
+   spec.add_development_dependency 'ruby-prof', '~> 0'
+   spec.add_development_dependency 'cucumber', '~> 1'
+ end
data/db/setup.sql ADDED
@@ -0,0 +1,8 @@
+ CREATE RESOURCE QUEUE smart_insight WITH (ACTIVE_STATEMENTS=10, PRIORITY=MEDIUM);
+
+ CREATE ROLE cranium WITH RESOURCE QUEUE smart_insight CREATEEXTTABLE LOGIN PASSWORD 'cranium';
+ COMMENT ON ROLE cranium IS 'Cranium test user';
+
+ CREATE DATABASE cranium WITH OWNER=cranium;
+
+ CREATE ROLE database_administrator WITH SUPERUSER LOGIN PASSWORD 'emarsys';
data/docker-compose.yml ADDED
@@ -0,0 +1,8 @@
+ greenplum:
+   image: kevinmtrowbridge/gpdb-docker
+   ports:
+     - 22:22
+     - 5432:5432
+     - 8123:8123
+   volumes:
+     - ./tmp/custdata:/home/gpadmin/gpfdist-data
data/examples/config.rb ADDED
@@ -0,0 +1,14 @@
+ require 'logger'
+
+ Cranium.configure do |config|
+   config.greenplum_connection_string = "postgres://cranium:cranium@192.168.56.43:5432/cranium"
+   config.gpfdist_url = "192.168.56.43:8123"
+   config.gpfdist_home_directory = "tmp/custdata"
+   config.upload_directory = "cranium_build"
+   config.archive_directory = "cranium_archive"
+   config.loggers << Logger.new("log/application.log")
+ end
+
+ database :suite do
+   connect_to "postgres://cranium:cranium@192.168.56.43:5432/cranium"
+ end
data/examples/deduplication.rb ADDED
@@ -0,0 +1,27 @@
+ require_relative 'config'
+
+ source :sales_items do
+   file "sales_items*.csv"
+   field :order_id, String
+   field :date, Date
+   field :customer, Integer
+   field :item, String
+   field :item_name, String
+   field :quantity, Float
+   field :c_sales_amount, Float
+ end
+
+ source :products do
+   field :item_id
+   field :item_name
+ end
+
+
+ deduplicate :sales_items, into: :products, by: [:item]
+
+ # Equivalent to
+
+ transform :sales_items => :products do |record|
+   deduplicate_by :item
+ end
+
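The `deduplicate` shortcut above and its `transform`/`deduplicate_by` equivalent both express "keep one record per key". A rough illustration of that semantics in plain Ruby; the records and the keep-first choice here are illustrative assumptions, not Cranium internals:

```ruby
# Hypothetical rows, shaped like the :sales_items source above.
sales_items = [
  { item: "JNI-123", item_name: "Spinner",   quantity: 2.0 },
  { item: "JNI-123", item_name: "Spinner",   quantity: 5.0 },
  { item: "CDI-234", item_name: "Gift card", quantity: 1.0 }
]

# deduplicate ... by: [:item] keeps a single record per :item value;
# Array#uniq keeps the first occurrence seen.
products = sales_items
             .uniq { |record| record[:item] }
             .map  { |record| { item_id: record[:item], item_name: record[:item_name] } }

p products
# => [{:item_id=>"JNI-123", :item_name=>"Spinner"},
#     {:item_id=>"CDI-234", :item_name=>"Gift card"}]
```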
data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb ADDED
@@ -0,0 +1,26 @@
+ require_relative 'config'
+
+ source :purchases do
+   field :user_id, String
+   field :amount, String
+ end
+
+ source :transformed_purchases do
+   field :contact_key, Integer
+   field :amount, String
+ end
+
+ transform :purchases => :transformed_purchases do |record|
+   record[:contact_key] = lookup :contact_key,
+                                 from_table: :dim_contact,
+                                 match_column: :user_id,
+                                 to_value: record[:user_id],
+                                 if_not_found_then_insert: { contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
+                                                             name: "Unknown contact #{record[:user_id]}" }
+ end
+
+ import :transformed_purchases do
+   into :fct_purchases
+   put :contact_key
+   put :amount
+ end
data/examples/incremental_extract.rb ADDED
@@ -0,0 +1,17 @@
+ require_relative 'config'
+
+ extract :contacts do
+   from :suite
+   incrementally_by :created
+   query <<-sql
+     SELECT *
+     FROM contacts
+     WHERE created BETWEEN '#{last_extracted_value_of :created, "1970-01-01 00:00:00"}' AND '#{Time.now - 60*10}'
+   sql
+ end
+
+ extract :contacts do
+   from :suite
+   incrementally_by :id
+   query "SELECT * FROM akarmi WHERE id > #{last_extracted_value_of :id, 0}"
+ end
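Both extracts above lean on `last_extracted_value_of`, which interpolates either the bookmark saved by the previous run or the supplied default. A minimal in-memory sketch of that substitution; the hash-based store is an assumption for illustration, while Cranium's real persistence lives in lib/cranium/extract/storage.rb:

```ruby
# Illustrative bookmark store; in Cranium this survives between runs.
bookmarks = {}

last_extracted_value_of = lambda do |field, default|
  bookmarks.fetch(field, default)
end

# First run: no bookmark yet, so the default applies.
puts "SELECT * FROM akarmi WHERE id > #{last_extracted_value_of.call(:id, 0)}"
# => SELECT * FROM akarmi WHERE id > 0

# Suppose the run extracted rows with ids up to 42; the highest value is saved...
bookmarks[:id] = 42

# ...and the next run continues from where the previous one left off.
puts "SELECT * FROM akarmi WHERE id > #{last_extracted_value_of.call(:id, 0)}"
# => SELECT * FROM akarmi WHERE id > 42
```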
data/examples/lookup_with_multiple_fields.rb ADDED
@@ -0,0 +1,25 @@
+ require_relative 'config'
+
+ source :purchases do
+   field :user_id, String
+   field :amount, String
+ end
+
+ source :transformed_purchases do
+   field :contact_key, Integer
+   field :amount, String
+ end
+
+ transform :purchases => :transformed_purchases do |record|
+   record[:contact_key] = lookup :contact_key,
+                                 from_table: :dim_contact,
+                                 match: {:user_id => record[:user_id], :another_field => record[:another_value]},
+                                 if_not_found_then_insert: { contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
+                                                             name: "Unknown contact #{record[:user_id]}" }
+ end
+
+ import :transformed_purchases do
+   into :fct_purchases
+   put :contact_key
+   put :amount
+ end
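The multi-field `match:` form above is a dimension lookup with an insert-on-miss fallback. A hedged sketch of the same logic written directly against Sequel (the gem's database layer); the helper name and connection handling are illustrative, not Cranium's API:

```ruby
require 'sequel'

DB = Sequel.connect(ENV.fetch('DATABASE_URL'))

# Find :contact_key in dim_contact by several match columns;
# on a miss, draw a fresh key from the sequence and insert a stub row.
def lookup_contact_key(match)
  key = DB[:dim_contact].where(match).get(:contact_key)
  return key if key

  key = DB["SELECT nextval('dim_contact_contact_key_seq') AS v"].first[:v]
  DB[:dim_contact].insert(match.merge(contact_key: key,
                                      name: "Unknown contact #{match[:user_id]}"))
  key
end

# Mirrors the transform block above:
# record[:contact_key] = lookup_contact_key(user_id: record[:user_id],
#                                           another_field: record[:another_value])
```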
data/features/archive.feature ADDED
@@ -0,0 +1,49 @@
+ Feature: Archive source files
+
+   Scenario:
+     Given no "/tmp/cranium_archive" directory
+     And a "products_1.csv" data file containing:
+       """
+       """
+     And a "products_2.csv" data file containing:
+       """
+       """
+     And a "contacts.csv" data file containing:
+       """
+       """
+     And a "purchases.csv" data file containing:
+       """
+       """
+     And the following definition:
+       """
+       Cranium.configure do |config|
+         config.archive_directory = "/tmp/cranium_archive"
+       end
+
+       source :products do
+         file "products_*.csv"
+       end
+
+       source :products_transformed do end
+
+       source :contacts do
+         file "contacts.csv"
+       end
+
+       source :purchases do
+         file "purchases.csv"
+       end
+
+       transform :products => :products_transformed do |record|
+         output record
+       end
+
+       archive :products, :contacts
+       """
+     When I execute the definition
+     Then the process should exit successfully
+     And the "/tmp/cranium_archive/" directory should contain the following files:
+       | filename         |
+       | .*contacts.csv   |
+       | .*products_1.csv |
+       | .*products_2.csv |
data/features/extract/incremental_extract.feature ADDED
@@ -0,0 +1,56 @@
+ Feature: Extracting data incrementally from a database table to CSV
+
+   Incremental extracts work by indicating that a field (or fields) should be used to detect new data rows
+   in the table. The highest extracted values are saved from one process and passed on to the next when the
+   process is run again. This approach typically works best with id or timestamp fields.
+
+
+   Background:
+     Given the following definition:
+       """
+       database :suite do
+         connect_to Cranium.configuration.greenplum_connection_string
+       end
+
+       extract :contacts do
+         from :suite
+         incrementally_by :id
+         query "SELECT id, name FROM contacts WHERE id > #{last_extracted_value_of :id, 0} ORDER BY id DESC"
+       end
+       """
+     And a database table called "contacts" with the following fields:
+       | field_name | field_type |
+       | id         | INTEGER    |
+       | name       | TEXT       |
+     And only the following rows in the "contacts" database table:
+       | id | name     |
+       | 1  | John Doe |
+       | 2  | Jane Doe |
+     And the definition is executed
+     And the "contacts.csv" file is deleted
+
+
+   Scenario: Successful extract
+     Given the following new rows in the "contacts" database table:
+       | id | name       |
+       | 3  | John Smith |
+       | 4  | Jane Smith |
+     When I execute the definition again
+     Then the process should exit successfully
+     And there should be a "contacts.csv" data file in the upload directory containing:
+       """
+       id,name
+       4,Jane Smith
+       3,John Smith
+       """
+
+
+   Scenario: Incremental extract doesn't remember empty 'last extracted value' - bugfix
+     Given the definition is executed again
+     And the "contacts.csv" file is deleted
+     When I execute the definition again
+     Then the process should exit successfully
+     And there should be a "contacts.csv" data file in the upload directory containing:
+       """
+       id,name
+       """
data/features/extract/simple_extract.feature ADDED
@@ -0,0 +1,85 @@
+ Feature: Extracting data from a database table to CSV
+
+   Data can be extracted from a database table into a CSV file. The CSV file is named after the extract process
+   and is placed in the upload directory specified in the configuration.
+
+
+   Background:
+     Given a database table called "contacts" with the following fields:
+       | field_name | field_type |
+       | id         | INTEGER    |
+       | name       | TEXT       |
+
+
+   Scenario: Successful extract using raw SQL
+     Given only the following rows in the "contacts" database table:
+       | id | name       |
+       | 1  | John Doe   |
+       | 2  | Jane Doe   |
+       | 3  | John Smith |
+     And the following definition:
+       """
+       database :suite do
+         connect_to Cranium.configuration.greenplum_connection_string
+       end
+
+       extract :contacts do
+         from :suite
+         query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
+       end
+       """
+     When I execute the definition
+     Then the process should exit successfully
+     And there should be a "contacts.csv" data file in the upload directory containing:
+       """
+       id,name
+       1,John Doe
+       2,Jane Doe
+       """
+
+   Scenario: Successful extract with overridden columns
+     Given only the following rows in the "contacts" database table:
+       | id | name       |
+       | 1  | John Doe   |
+       | 2  | Jane Doe   |
+       | 3  | John Smith |
+     And the following definition:
+       """
+       database :suite do
+         connect_to Cranium.configuration.greenplum_connection_string
+       end
+
+       extract :contacts do
+         from :suite
+         columns %w(uid full_name)
+         query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
+       end
+       """
+     When I execute the definition
+     Then the process should exit successfully
+     And there should be a "contacts.csv" data file in the upload directory containing:
+       """
+       uid,full_name
+       1,John Doe
+       2,Jane Doe
+       """
+
+   Scenario: Extract should fail if file already exists
+     Given an empty "contacts.csv" data file
+     And the following definition:
+       """
+       database :suite do
+         connect_to Cranium.configuration.greenplum_connection_string
+       end
+
+       extract :contacts do
+         from :suite
+         query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
+       end
+       """
+     When I execute the definition
+     Then the process should exit with an error
+     And the error message should contain:
+       """
+       Extract halted: a file named "contacts.csv" already exists
+       """
data/features/import/import_csv_to_database_as_delta.feature ADDED
@@ -0,0 +1,38 @@
+ Feature: Import a CSV file into the database as a delta
+
+   Scenario: Successful import
+     Given a database table called "dim_product" with the following fields:
+       | field_name  | field_type |
+       | item        | TEXT       |
+       | title       | TEXT       |
+       | category    | TEXT       |
+       | description | TEXT       |
+     And a "products.csv" data file containing:
+       """
+       id,name,category,description
+       JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory,Some description
+       CDI-234,Another product name,Smart Insight > Cool stuff > Scripts,Another description
+       """
+     And the following definition:
+       """
+       source :products do
+         field :id, String
+         field :name, String
+         field :category, String
+         field :description, String
+       end
+
+       import :products do
+         into :dim_product
+         put :id => :item
+         put :name => :title
+         put :category => :category
+         put :description => :description
+       end
+       """
+     When I execute the definition
+     Then the process should exit successfully
+     And the "dim_product" table should contain:
+       | item    | title                | category                                      | description         |
+       | JNI-123 | Just a product name  | Main category > Subcategory > Sub-subcategory | Some description    |
+       | CDI-234 | Another product name | Smart Insight > Cool stuff > Scripts          | Another description |
data/features/import/import_csv_to_database_with_delete_insert_merging.feature ADDED
@@ -0,0 +1,51 @@
+ Feature: Import a CSV file into the database with merging
+
+   The delete_insert_on property specifies the key field(s) used for merging: existing rows whose keys
+   appear in the incoming data are deleted, then all incoming rows are inserted. Rows whose keys do not
+   occur in the incoming data are left untouched.
+
+   Scenario: Successful import with merged items
+     Given a database table called "lkp_categories" with the following fields:
+       | field_name  | field_type |
+       | contact_id  | INTEGER    |
+       | category_id | TEXT       |
+     And only the following rows in the "lkp_categories" database table:
+       | contact_id (i) | category_id (s) |
+       | 1              | A               |
+       | 1              | B               |
+       | 1              | C               |
+       | 2              | A               |
+       | 2              | D               |
+     And a "category_lookup.csv" data file containing:
+       """
+       user_id,category_id
+       1,A
+       1,E
+       3,E
+       3,F
+       """
+     And the following definition:
+       """
+       source :category_lookup do
+         field :user_id, Integer
+         field :category_id, String
+       end
+
+       import :category_lookup do
+         into :lkp_categories
+
+         put :user_id => :contact_id
+         put :category_id => :category_id
+
+         delete_insert_on :contact_id
+       end
+       """
+     When I execute the definition
+     Then the process should exit successfully
+     And the "lkp_categories" table should contain:
+       | contact_id (i) | category_id (s) |
+       | 1              | A               |
+       | 1              | E               |
+       | 2              | A               |
+       | 2              | D               |
+       | 3              | E               |
+       | 3              | F               |
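In SQL terms, the `delete_insert_on :contact_id` strategy verified above is: delete target rows whose key occurs in the incoming batch, then insert the whole batch. A sketch with Sequel, assuming the CSV rows are already parsed; Cranium itself stages data through Greenplum external tables (see lib/cranium/import_strategy/delete_insert.rb):

```ruby
require 'sequel'

DB = Sequel.connect(ENV.fetch('DATABASE_URL'))

# Incoming rows, as in category_lookup.csv above.
incoming = [
  { contact_id: 1, category_id: 'A' },
  { contact_id: 1, category_id: 'E' },
  { contact_id: 3, category_id: 'E' },
  { contact_id: 3, category_id: 'F' }
]

DB.transaction do
  # Delete every existing row whose merge key appears in the batch...
  keys = incoming.map { |row| row[:contact_id] }.uniq
  DB[:lkp_categories].where(contact_id: keys).delete
  # ...then append the batch; rows for untouched keys (contact_id 2) survive.
  DB[:lkp_categories].multi_insert(incoming)
end
```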
data/features/import/import_csv_to_database_with_truncate_insert.feature ADDED
@@ -0,0 +1,49 @@
+ Feature: Import a CSV file into the database with truncation
+
+   The truncate_insert property replaces the entire contents of the target table: the table is truncated
+   before the incoming rows are inserted.
+
+   Scenario: Successful import replacing existing items
+     Given a database table called "lkp_categories" with the following fields:
+       | field_name  | field_type |
+       | contact_id  | INTEGER    |
+       | category_id | TEXT       |
+     And only the following rows in the "lkp_categories" database table:
+       | contact_id (i) | category_id (s) |
+       | 1              | A               |
+       | 1              | B               |
+       | 1              | C               |
+       | 2              | A               |
+       | 2              | D               |
+     And a "category_lookup.csv" data file containing:
+       """
+       user_id,category_id
+       1,A
+       1,E
+       3,E
+       3,F
+       """
+     And the following definition:
+       """
+       source :category_lookup do
+         field :user_id, Integer
+         field :category_id, String
+       end
+
+       import :category_lookup do
+         into :lkp_categories
+
+         put :user_id => :contact_id
+         put :category_id => :category_id
+
+         truncate_insert true
+       end
+       """
+     When I execute the definition
+     Then the process should exit successfully
+     And the "lkp_categories" table should contain:
+       | contact_id (i) | category_id (s) |
+       | 1              | A               |
+       | 1              | E               |
+       | 3              | E               |
+       | 3              | F               |
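Contrast this with delete-insert: `truncate_insert true` replaces the whole table, which is why the contact_id 2 rows are gone from the expected result. The equivalent flow, again sketched with Sequel under the same staging assumption; Cranium's version is in lib/cranium/import_strategy/truncate_insert.rb:

```ruby
require 'sequel'

DB = Sequel.connect(ENV.fetch('DATABASE_URL'))

incoming = [
  { contact_id: 1, category_id: 'A' },
  { contact_id: 1, category_id: 'E' },
  { contact_id: 3, category_id: 'E' },
  { contact_id: 3, category_id: 'F' }
]

DB.transaction do
  # Empty the target entirely, then load the incoming batch.
  DB[:lkp_categories].truncate
  DB[:lkp_categories].multi_insert(incoming)
end
```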