cranium 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +29 -0
  7. data/Rakefile +3 -0
  8. data/Vagrantfile +24 -0
  9. data/bin/cranium +9 -0
  10. data/config/cucumber.yml +9 -0
  11. data/cranium.gemspec +26 -0
  12. data/db/setup.sql +8 -0
  13. data/docker-compose.yml +8 -0
  14. data/examples/config.rb +14 -0
  15. data/examples/deduplication.rb +27 -0
  16. data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb +26 -0
  17. data/examples/incremental_extract.rb +17 -0
  18. data/examples/lookup_with_multiple_fields.rb +25 -0
  19. data/features/archive.feature +49 -0
  20. data/features/extract/incremental_extract.feature +56 -0
  21. data/features/extract/simple_extract.feature +85 -0
  22. data/features/import/import_csv_to_database_as_delta.feature +38 -0
  23. data/features/import/import_csv_to_database_with_delete_insert_merging.feature +51 -0
  24. data/features/import/import_csv_to_database_with_truncate_insert.feature +49 -0
  25. data/features/import/import_csv_to_database_with_update_merging.feature +46 -0
  26. data/features/import/import_csv_with_always_inserting_new_dimension_keys.feature +137 -0
  27. data/features/import/import_csv_with_field_lookup_inserting_new_dimension_keys.feature +62 -0
  28. data/features/import/import_csv_with_field_lookup_transformation.feature +125 -0
  29. data/features/import/import_csv_with_transformation.feature +55 -0
  30. data/features/import/import_multiple_csv_files_without_transformations.feature +44 -0
  31. data/features/import/import_with_load_id_from_sequence.feature +53 -0
  32. data/features/import/import_with_lookup_from_multiple_fields.feature +64 -0
  33. data/features/read.feature +56 -0
  34. data/features/remove.feature +44 -0
  35. data/features/restore_database_connection.feature +55 -0
  36. data/features/step_definitions/database_table_steps.rb +40 -0
  37. data/features/step_definitions/definition_steps.rb +3 -0
  38. data/features/step_definitions/execution_steps.rb +23 -0
  39. data/features/step_definitions/file_steps.rb +39 -0
  40. data/features/support/class_extensions.rb +24 -0
  41. data/features/support/env.rb +27 -0
  42. data/features/support/randomize.rb +22 -0
  43. data/features/support/stop_on_first_error.rb +5 -0
  44. data/features/transform/deduplication.feature +37 -0
  45. data/features/transform/empty_transformation.feature +72 -0
  46. data/features/transform/join.feature +180 -0
  47. data/features/transform/join_multiple_files_into_one_output_file.feature +46 -0
  48. data/features/transform/output_rows.feature +70 -0
  49. data/features/transform/projection.feature +34 -0
  50. data/features/transform/raw_ruby_transformation.feature +69 -0
  51. data/features/transform/split_field.feature +39 -0
  52. data/lib/cranium/application.rb +104 -0
  53. data/lib/cranium/archiver.rb +36 -0
  54. data/lib/cranium/attribute_dsl.rb +43 -0
  55. data/lib/cranium/command_line_options.rb +27 -0
  56. data/lib/cranium/configuration.rb +33 -0
  57. data/lib/cranium/data_importer.rb +35 -0
  58. data/lib/cranium/data_reader.rb +48 -0
  59. data/lib/cranium/data_transformer.rb +126 -0
  60. data/lib/cranium/database.rb +36 -0
  61. data/lib/cranium/definition_registry.rb +21 -0
  62. data/lib/cranium/dimension_manager.rb +65 -0
  63. data/lib/cranium/dsl/database_definition.rb +23 -0
  64. data/lib/cranium/dsl/extract_definition.rb +28 -0
  65. data/lib/cranium/dsl/import_definition.rb +50 -0
  66. data/lib/cranium/dsl/source_definition.rb +67 -0
  67. data/lib/cranium/dsl.rb +100 -0
  68. data/lib/cranium/extensions/file.rb +7 -0
  69. data/lib/cranium/extensions/sequel_greenplum.rb +30 -0
  70. data/lib/cranium/external_table.rb +75 -0
  71. data/lib/cranium/extract/data_extractor.rb +11 -0
  72. data/lib/cranium/extract/storage.rb +57 -0
  73. data/lib/cranium/extract/strategy/base.rb +27 -0
  74. data/lib/cranium/extract/strategy/incremental.rb +16 -0
  75. data/lib/cranium/extract/strategy/simple.rb +9 -0
  76. data/lib/cranium/extract/strategy.rb +7 -0
  77. data/lib/cranium/extract.rb +7 -0
  78. data/lib/cranium/import_strategy/base.rb +55 -0
  79. data/lib/cranium/import_strategy/delete_insert.rb +40 -0
  80. data/lib/cranium/import_strategy/delta.rb +8 -0
  81. data/lib/cranium/import_strategy/merge.rb +50 -0
  82. data/lib/cranium/import_strategy/truncate_insert.rb +19 -0
  83. data/lib/cranium/import_strategy.rb +9 -0
  84. data/lib/cranium/logging.rb +15 -0
  85. data/lib/cranium/profiling.rb +13 -0
  86. data/lib/cranium/progress_output.rb +37 -0
  87. data/lib/cranium/sequel/hash.rb +32 -0
  88. data/lib/cranium/sequel.rb +5 -0
  89. data/lib/cranium/source_registry.rb +21 -0
  90. data/lib/cranium/test_framework/cucumber_table.rb +140 -0
  91. data/lib/cranium/test_framework/database_entity.rb +29 -0
  92. data/lib/cranium/test_framework/database_sequence.rb +16 -0
  93. data/lib/cranium/test_framework/database_table.rb +33 -0
  94. data/lib/cranium/test_framework/upload_directory.rb +39 -0
  95. data/lib/cranium/test_framework/world.rb +66 -0
  96. data/lib/cranium/test_framework.rb +10 -0
  97. data/lib/cranium/transformation/duplication_index.rb +42 -0
  98. data/lib/cranium/transformation/index.rb +83 -0
  99. data/lib/cranium/transformation/join.rb +141 -0
  100. data/lib/cranium/transformation/sequence.rb +42 -0
  101. data/lib/cranium/transformation.rb +8 -0
  102. data/lib/cranium/transformation_record.rb +45 -0
  103. data/lib/cranium.rb +57 -0
  104. data/rake/test.rake +31 -0
  105. data/spec/cranium/application_spec.rb +166 -0
  106. data/spec/cranium/archiver_spec.rb +44 -0
  107. data/spec/cranium/command_line_options_spec.rb +32 -0
  108. data/spec/cranium/configuration_spec.rb +31 -0
  109. data/spec/cranium/data_importer_spec.rb +55 -0
  110. data/spec/cranium/data_transformer_spec.rb +16 -0
  111. data/spec/cranium/database_spec.rb +69 -0
  112. data/spec/cranium/definition_registry_spec.rb +45 -0
  113. data/spec/cranium/dimension_manager_spec.rb +63 -0
  114. data/spec/cranium/dsl/database_definition_spec.rb +23 -0
  115. data/spec/cranium/dsl/extract_definition_spec.rb +76 -0
  116. data/spec/cranium/dsl/import_definition_spec.rb +153 -0
  117. data/spec/cranium/dsl/source_definition_spec.rb +84 -0
  118. data/spec/cranium/dsl_spec.rb +119 -0
  119. data/spec/cranium/external_table_spec.rb +71 -0
  120. data/spec/cranium/extract/storage_spec.rb +125 -0
  121. data/spec/cranium/logging_spec.rb +37 -0
  122. data/spec/cranium/sequel/hash_spec.rb +56 -0
  123. data/spec/cranium/source_registry_spec.rb +31 -0
  124. data/spec/cranium/test_framework/cucumber_table_spec.rb +144 -0
  125. data/spec/cranium/transformation/duplication_index_spec.rb +75 -0
  126. data/spec/cranium/transformation/index_spec.rb +178 -0
  127. data/spec/cranium/transformation/join_spec.rb +43 -0
  128. data/spec/cranium/transformation/sequence_spec.rb +83 -0
  129. data/spec/cranium/transformation_record_spec.rb +78 -0
  130. data/spec/cranium_spec.rb +53 -0
  131. data/spec/spec_helper.rb +1 -0
  132. metadata +362 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+ metadata.gz: aa44efc27b0fc354b994ea3c8738d5e6aeab2415
+ data.tar.gz: 036dfbc242e5858bad5fd374cc0b1c27bda4b6da
+ SHA512:
+ metadata.gz: a90bc3b7ee0cd635b13e9deb325e4bc58fe83889816ac9b7bc02637318f096b953caa01ce44a6314ac67a74d0374beba991548ba46349250f01914e591cf1554
+ data.tar.gz: d95a477491949134ef37a86c165d4fdc19615fa12e53fed042af1bd838788a290c64b9ffa9a59db420b446e46544103315f6a18f337c1d09d2713665a9a2fc7d
data/.gitignore ADDED
@@ -0,0 +1,21 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ .vagrant
+ .idea
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ atlassian-ide-plugin.xml
+ log/
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ 2.3.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in cranium.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2013 Zoltan Ormandi
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # Cranium
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ gem 'cranium'
+
+ And then execute:
+
+ $ bundle
+
+ Or install it yourself as:
+
+ $ gem install cranium
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
+ require "bundler/gem_tasks"
+
+ import *Dir['rake/**/*.rake']
data/Vagrantfile ADDED
@@ -0,0 +1,24 @@
+ # -*- mode: ruby -*-
+ # vi: set ft=ruby :
+
+ FileUtils.mkdir_p("tmp/custdata") unless Dir.exists?("tmp/custdata")
+
+ # Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
+ VAGRANTFILE_API_VERSION = "2"
+
+ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
+ config.vm.box = "si-build.v2"
+ config.vm.box_url = "http://vboxes.ett.local/si-build.v2.box"
+ config.vbguest.auto_update = false
+
+ config.vm.hostname = 'cranium-build'
+ config.vm.network :private_network, ip: "192.168.56.43"
+
+ config.vm.provider :virtualbox do |virtual_machine|
+ virtual_machine.name = "cranium"
+ end
+
+ config.vm.synced_folder "tmp/custdata", "/home/gpadmin/gpfdist-data", owner: "gpadmin", group: "gpadmin"
+
+ config.vm.provision :shell, inline: "su - gpadmin -c 'cat /vagrant/db/setup.sql | psql'"
+ end
data/bin/cranium ADDED
@@ -0,0 +1,9 @@
+ #!/usr/bin/env ruby
+
+ require 'cranium'
+
+ if ENV["PROFILING"] == "yes"
+ require 'cranium/profiling'
+ end
+
+ Cranium.application(ARGV).run
data/config/cucumber.yml ADDED
@@ -0,0 +1,9 @@
+ default:
+ --backtrace
+ --tags ~@wip
+ --require features
+
+ build:
+ --backtrace
+ --format progress
+ --tags ~@wip
data/cranium.gemspec ADDED
@@ -0,0 +1,26 @@
+ Gem::Specification.new do |spec|
+ spec.name = 'cranium'
+ spec.version = '0.2.0'
+ spec.authors = ['Emarsys Technologies']
+ spec.email = ['smart-insight-dev@emarsys.com']
+ spec.description = %q{Provides Extract, Transform and Load functionality for loading data from CSV files to a Greenplum database.}
+ spec.summary = %q{Pure Ruby ETL framework}
+ spec.homepage = 'https://github.com/emartech/cranium'
+ spec.license = 'MIT'
+
+ spec.files = `git ls-files`.split($/)
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+ spec.require_paths = ['lib']
+
+ spec.add_runtime_dependency 'pg', '~> 0'
+ spec.add_runtime_dependency 'progressbar', '~> 0'
+ spec.add_runtime_dependency 'sequel', '~> 4'
+ spec.add_runtime_dependency 'slop', '~> 3'
+
+ spec.add_development_dependency 'bundler', '~> 1'
+ spec.add_development_dependency 'rake', '~> 10'
+ spec.add_development_dependency 'rspec', '~> 3'
+ spec.add_development_dependency 'ruby-prof', '~> 0'
+ spec.add_development_dependency 'cucumber', '~> 1'
+ end
data/db/setup.sql ADDED
@@ -0,0 +1,8 @@
+ CREATE RESOURCE QUEUE smart_insight WITH (ACTIVE_STATEMENTS=10, PRIORITY=MEDIUM);
+
+ CREATE ROLE cranium WITH RESOURCE QUEUE smart_insight CREATEEXTTABLE LOGIN PASSWORD 'cranium';
+ COMMENT ON ROLE cranium IS 'Cranium test user';
+
+ CREATE DATABASE cranium WITH OWNER=cranium;
+
+ CREATE ROLE database_administrator WITH SUPERUSER LOGIN PASSWORD 'emarsys';
data/docker-compose.yml ADDED
@@ -0,0 +1,8 @@
+ greenplum:
+ image: kevinmtrowbridge/gpdb-docker
+ ports:
+ - 22:22
+ - 5432:5432
+ - 8123:8123
+ volumes:
+ - ./tmp/custdata:/home/gpadmin/gpfdist-data
data/examples/config.rb ADDED
@@ -0,0 +1,14 @@
+ require 'logger'
+
+ Cranium.configure do |config|
+ config.greenplum_connection_string = "postgres://cranium:cranium@192.168.56.43:5432/cranium"
+ config.gpfdist_url = "192.168.56.43:8123"
+ config.gpfdist_home_directory = "tmp/custdata"
+ config.upload_directory = "cranium_build"
+ config.archive_directory = "cranium_archive"
+ config.loggers << Logger.new("log/application.log")
+ end
+
+ database :suite do
+ connect_to "postgres://cranium:cranium@192.168.56.43:5432/cranium"
+ end
data/examples/deduplication.rb ADDED
@@ -0,0 +1,27 @@
+ require_relative 'config'
+
+ source :sales_items do
+ file "sales_items*.csv"
+ field :order_id, String
+ field :date, Date
+ field :customer, Integer
+ field :item, String
+ field :item_name, String
+ field :quantity, Float
+ field :c_sales_amount, Float
+ end
+
+ source :products do
+ field :item_id
+ field :item_name
+ end
+
+
+ deduplicate :sales_items, into: :products, by: [:item]
+
+ # Equivalent to
+
+ transform :sales_items => :products do |record|
+ deduplicate_by :item
+ end
+
data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb ADDED
@@ -0,0 +1,26 @@
+ require_relative 'config'
+
+ source :purchases do
+ field :user_id, String
+ field :amount, String
+ end
+
+ source :transformed_purchases do
+ field :contact_key, Integer
+ field :amount, String
+ end
+
+ transform :purchases => :transformed_purchases do |record|
+ record[:contact_key] = lookup :contact_key,
+ from_table: :dim_contact,
+ match_column: :user_id,
+ to_value: record[:user_id],
+ if_not_found_then_insert: { contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
+ name: "Unknown contact #{record[:user_id]}" }
+ end
+
+ import :transformed_purchases do
+ into :fct_purchases
+ put :contact_key
+ put :amount
+ end
data/examples/incremental_extract.rb ADDED
@@ -0,0 +1,17 @@
+ require_relative 'config'
+
+ extract :contacts do
+ from :suite
+ incrementally_by :created
+ query <<-sql
+ SELECT *
+ FROM contacts
+ WHERE created BETWEEN '#{last_extracted_value_of :created, "1970-01-01 00:00:00"}' AND '#{Time.now - 60*10}'
+ sql
+ end
+
+ extract :contacts do
+ from :suite
+ incrementally_by :id
+ query "SELECT * FROM akarmi WHERE id > #{last_extracted_value_of :id, 0}"
+ end
data/examples/lookup_with_multiple_fields.rb ADDED
@@ -0,0 +1,25 @@
+ require_relative 'config'
+
+ source :purchases do
+ field :user_id, String
+ field :amount, String
+ end
+
+ source :transformed_purchases do
+ field :contact_key, Integer
+ field :amount, String
+ end
+
+ transform :purchases => :transformed_purchases do |record|
+ record[:contact_key] = lookup :contact_key,
+ from_table: :dim_contact,
+ match: {:user_id => record[:user_id], :another_field => record[:another_value]},
+ if_not_found_then_insert: { contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
+ name: "Unknown contact #{record[:user_id]}" }
+ end
+
+ import :transformed_purchases do
+ into :fct_purchases
+ put :contact_key
+ put :amount
+ end
data/features/archive.feature ADDED
@@ -0,0 +1,49 @@
+ Feature: Archive source files
+
+ Scenario:
+ Given no "/tmp/cranium_archive" directory
+ And a "products_1.csv" data file containing:
+ """
+ """
+ And a "products_2.csv" data file containing:
+ """
+ """
+ And a "contacts.csv" data file containing:
+ """
+ """
+ And a "purchases.csv" data file containing:
+ """
+ """
+ And the following definition:
+ """
+ Cranium.configure do |config|
+ config.archive_directory = "/tmp/cranium_archive"
+ end
+
+ source :products do
+ file "products_*.csv"
+ end
+
+ source :products_transformed do end
+
+ source :contacts do
+ file "contacts.csv"
+ end
+
+ source :purchases do
+ file "purchases.csv"
+ end
+
+ transform :products => :products_transformed do |record|
+ output record
+ end
+
+ archive :products, :contacts
+ """
+ When I execute the definition
+ Then the process should exit successfully
+ And the "/tmp/cranium_archive/" directory should contain the following files:
+ | filename |
+ | .*contacts.csv |
+ | .*products_1.csv |
+ | .*products_2.csv |
data/features/extract/incremental_extract.feature ADDED
@@ -0,0 +1,56 @@
+ Feature: Extracting data incrementally from a database table to CSV
+
+ Incremental extracts work by indicating that a field (or fields) should be used to detect new data rows
+ in the table. The highest extracted values are saved from one process and passed on to the next when the
+ process is run again. This approach typically works best with id or timestamp fields.
+
+
+ Background:
+ Given the following definition:
+ """
+ database :suite do
+ connect_to Cranium.configuration.greenplum_connection_string
+ end
+
+ extract :contacts do
+ from :suite
+ incrementally_by :id
+ query "SELECT id, name FROM contacts WHERE id > #{last_extracted_value_of :id, 0} ORDER BY id DESC"
+ end
+ """
+ And a database table called "contacts" with the following fields:
+ | field_name | field_type |
+ | id | INTEGER |
+ | name | TEXT |
+ And only the following rows in the "contacts" database table:
+ | id | name |
+ | 1 | John Doe |
+ | 2 | Jane Doe |
+ And the definition is executed
+ And the "contacts.csv" file is deleted
+
+
+ Scenario: Successful extract
+ Given the following new rows in the "contacts" database table:
+ | id | name |
+ | 3 | John Smith |
+ | 4 | Jane Smith |
+ When I execute the definition again
+ Then the process should exit successfully
+ And there should be a "contacts.csv" data file in the upload directory containing:
+ """
+ id,name
+ 4,Jane Smith
+ 3,John Smith
+ """
+
+
+ Scenario: Incremental extract doesn't remember empty 'last extracted value' - bugfix
+ Given the definition is executed again
+ And the "contacts.csv" file is deleted
+ When I execute the definition again
+ Then the process should exit successfully
+ And there should be a "contacts.csv" data file in the upload directory containing:
+ """
+ id,name
+ """
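The feature above relies on "last extracted value" bookkeeping: after each run the highest value of the incremental field is persisted, and the next run interpolates it into its query via last_extracted_value_of. A rough plain-Ruby illustration of that high-water-mark pattern follows; it is a sketch only, not Cranium's implementation (which lives in lib/cranium/extract/storage.rb), and the file name and JSON layout are assumptions.

    require "json"

    # Minimal high-water-mark store: remembers the last extracted value of a
    # field between runs so the next query can start where the previous one
    # stopped.
    class LastExtractedValues
      def initialize(path)
        @path = path
      end

      # Returns the saved value for +field+, or +default+ on the first run.
      def fetch(field, default)
        state.fetch(field.to_s, default)
      end

      # Persists the highest value seen during the current run.
      def save(field, value)
        File.write(@path, JSON.generate(state.merge(field.to_s => value)))
      end

      private

      def state
        File.exist?(@path) ? JSON.parse(File.read(@path)) : {}
      end
    end

    marks = LastExtractedValues.new("contacts_extract_state.json")
    sql = "SELECT id, name FROM contacts WHERE id > #{marks.fetch(:id, 0)} ORDER BY id DESC"
    # ...run the query, write contacts.csv, then remember the new maximum.
    # Skipping the save on an empty run keeps the old mark, which is the
    # behaviour the bugfix scenario above pins down:
    # marks.save(:id, extracted_ids.max) unless extracted_ids.empty?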
data/features/extract/simple_extract.feature ADDED
@@ -0,0 +1,85 @@
+ Feature: Extracting data from a database table to CSV
+
+ Data can be extracted from a database table into a CSV file. The CSV file is named after the extract process
+ and is placed in the upload directory specified in the configuration.
+
+
+ Background:
+ Given a database table called "contacts" with the following fields:
+ | field_name | field_type |
+ | id | INTEGER |
+ | name | TEXT |
+
+
+ Scenario: Successful extract using raw SQL
+ Given only the following rows in the "contacts" database table:
+ | id | name |
+ | 1 | John Doe |
+ | 2 | Jane Doe |
+ | 3 | John Smith |
+ And the following definition:
+ """
+ database :suite do
+ connect_to Cranium.configuration.greenplum_connection_string
+ end
+
+ extract :contacts do
+ from :suite
+ query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
+ end
+ """
+ When I execute the definition
+ Then the process should exit successfully
+ And there should be a "contacts.csv" data file in the upload directory containing:
+ """
+ id,name
+ 1,John Doe
+ 2,Jane Doe
+ """
+
+ Scenario: Successful extract with overrided columns
+ Given only the following rows in the "contacts" database table:
+ | id | name |
+ | 1 | John Doe |
+ | 2 | Jane Doe |
+ | 3 | John Smith |
+ And the following definition:
+ """
+ database :suite do
+ connect_to Cranium.configuration.greenplum_connection_string
+ end
+
+ extract :contacts do
+ from :suite
+ columns %w(uid full_name)
+ query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
+ end
+ """
+ When I execute the definition
+ Then the process should exit successfully
+ And there should be a "contacts.csv" data file in the upload directory containing:
+ """
+ uid,full_name
+ 1,John Doe
+ 2,Jane Doe
+ """
+
+ Scenario: Extract should fail if file already exists
+ Given an empty "contacts.csv" data file
+ And the following definition:
+ """
+ database :suite do
+ connect_to Cranium.configuration.greenplum_connection_string
+ end
+
+ extract :contacts do
+ from :suite
+ query "SELECT id, name FROM contacts WHERE name LIKE '%Doe%' ORDER BY id"
+ end
+ """
+ When I execute the definition
+ Then the process should exit with an error
+ And the error message should contain:
+ """
+ Extract halted: a file named "contacts.csv" already exists
+ """
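The contract these scenarios pin down is simple: query results go to "<extract name>.csv" in the upload directory, the header row can be overridden with columns, and an existing file aborts the run. A plain-Ruby sketch of that contract (illustrative only; the method and argument names are assumptions, not Cranium's API):

    require "csv"

    def extract_to_csv(name, rows, upload_directory:, columns: nil)
      path = File.join(upload_directory, "#{name}.csv")
      # Mirrors the "Extract should fail if file already exists" scenario.
      raise %(Extract halted: a file named "#{name}.csv" already exists) if File.exist?(path)

      CSV.open(path, "w") do |csv|
        # An optional columns list replaces the header derived from the rows.
        csv << (columns || (rows.first ? rows.first.keys.map(&:to_s) : []))
        rows.each { |row| csv << row.values }
      end
    end

    extract_to_csv(:contacts,
                   [{ id: 1, name: "John Doe" }, { id: 2, name: "Jane Doe" }],
                   upload_directory: ".",
                   columns: %w(uid full_name))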
data/features/import/import_csv_to_database_as_delta.feature ADDED
@@ -0,0 +1,38 @@
+ Feature: Import a CSV file into the database as a delta
+
+ Scenario: Successful import
+ Given a database table called "dim_product" with the following fields:
+ | field_name | field_type |
+ | item | TEXT |
+ | title | TEXT |
+ | category | TEXT |
+ | description | TEXT |
+ And a "products.csv" data file containing:
+ """
+ id,name,category,description
+ JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory,Some description
+ CDI-234,Another product name,Smart Insight > Cool stuff > Scripts,Another description
+ """
+ And the following definition:
+ """
+ source :products do
+ field :id, String
+ field :name, String
+ field :category, String
+ field :description, String
+ end
+
+ import :products do
+ into :dim_product
+ put :id => :item
+ put :name => :title
+ put :category => :category
+ put :description => :description
+ end
+ """
+ When I execute the definition
+ Then the process should exit successfully
+ And the "dim_product" table should contain:
+ | item | title | category | description |
+ | JNI-123 | Just a product name | Main category > Subcategory > Sub-subcategory | Some description |
+ | CDI-234 | Another product name | Smart Insight > Cool stuff > Scripts | Another description |
data/features/import/import_csv_to_database_with_delete_insert_merging.feature ADDED
@@ -0,0 +1,51 @@
+ Feature: Import a CSV file into the database with merging
+
+ The merge_on property can be used to specify an id field that is used to detect duplicates while importing.
+ Duplicates are updated and new items are added.
+
+ Scenario: Successful import with merged items
+ Given a database table called "lkp_categories" with the following fields:
+ | field_name | field_type |
+ | contact_id | INTEGER |
+ | category_id | TEXT |
+ And only the following rows in the "lkp_categories" database table:
+ | contact_id (i) | category_id (s) |
+ | 1 | A |
+ | 1 | B |
+ | 1 | C |
+ | 2 | A |
+ | 2 | D |
+ And a "category_lookup.csv" data file containing:
+ """
+ user_id,category_id
+ 1,A
+ 1,E
+ 3,E
+ 3,F
+ """
+ And the following definition:
+ """
+ source :category_lookup do
+ field :user_id, Integer
+ field :category_id, String
+ end
+
+ import :category_lookup do
+ into :lkp_categories
+
+ put :user_id => :contact_id
+ put :category_id => :category_id
+
+ delete_insert_on :contact_id
+ end
+ """
+ When I execute the definition
+ Then the process should exit successfully
+ And the "lkp_categories" table should contain:
+ | contact_id (i) | category_id (s) |
+ | 1 | A |
+ | 1 | E |
+ | 2 | A |
+ | 2 | D |
+ | 3 | E |
+ | 3 | F |
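In effect, delete_insert_on removes every target row whose merge key occurs in the incoming batch and then appends the whole batch, which is why contact 1's old B and C rows disappear while contact 2 is untouched. A sketch of those semantics using Sequel (a declared runtime dependency of the gem); the connection string and staging table name are assumptions, and the gem's actual strategy lives in lib/cranium/import_strategy/delete_insert.rb:

    require "sequel"

    DB = Sequel.connect("postgres://cranium:cranium@localhost:5432/cranium")

    def delete_insert(db, target:, staging:, key:)
      db.transaction do
        # Drop every target row whose key appears in the incoming batch...
        db[target].where(key => db[staging].select(key)).delete
        # ...then append the whole batch (INSERT ... SELECT).
        db[target].insert(db[staging])
      end
    end

    delete_insert(DB, target: :lkp_categories, staging: :category_lookup_stage, key: :contact_id)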
data/features/import/import_csv_to_database_with_truncate_insert.feature ADDED
@@ -0,0 +1,49 @@
+ Feature: Import a CSV file into the database with truncation
+
+ The merge_on property can be used to specify an id field that is used to detect duplicates while importing.
+ Duplicates are updated and new items are added.
+
+ Scenario: Successful import with merged items
+ Given a database table called "lkp_categories" with the following fields:
+ | field_name | field_type |
+ | contact_id | INTEGER |
+ | category_id | TEXT |
+ And only the following rows in the "lkp_categories" database table:
+ | contact_id (i) | category_id (s) |
+ | 1 | A |
+ | 1 | B |
+ | 1 | C |
+ | 2 | A |
+ | 2 | D |
+ And a "category_lookup.csv" data file containing:
+ """
+ user_id,category_id
+ 1,A
+ 1,E
+ 3,E
+ 3,F
+ """
+ And the following definition:
+ """
+ source :category_lookup do
+ field :user_id, Integer
+ field :category_id, String
+ end
+
+ import :category_lookup do
+ into :lkp_categories
+
+ put :user_id => :contact_id
+ put :category_id => :category_id
+
+ truncate_insert true
+ end
+ """
+ When I execute the definition
+ Then the process should exit successfully
+ And the "lkp_categories" table should contain:
+ | contact_id (i) | category_id (s) |
+ | 1 | A |
+ | 1 | E |
+ | 3 | E |
+ | 3 | F |
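The feature description above repeats the merge_on wording, but the expected table contents show what truncate_insert actually does: the target is emptied first, so only the imported rows remain (contact 2 vanishes entirely). A Sequel sketch of that behaviour, with the same caveats as before (names are illustrative assumptions, not the gem's internals):

    require "sequel"

    DB = Sequel.connect("postgres://cranium:cranium@localhost:5432/cranium")

    def truncate_insert(db, target:, staging:)
      db.transaction do
        db[target].truncate              # discard all existing rows
        db[target].insert(db[staging])   # load the incoming batch
      end
    end

    truncate_insert(DB, target: :lkp_categories, staging: :category_lookup_stage)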