cranium 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +3 -0
- data/Vagrantfile +24 -0
- data/bin/cranium +9 -0
- data/config/cucumber.yml +9 -0
- data/cranium.gemspec +26 -0
- data/db/setup.sql +8 -0
- data/docker-compose.yml +8 -0
- data/examples/config.rb +14 -0
- data/examples/deduplication.rb +27 -0
- data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb +26 -0
- data/examples/incremental_extract.rb +17 -0
- data/examples/lookup_with_multiple_fields.rb +25 -0
- data/features/archive.feature +49 -0
- data/features/extract/incremental_extract.feature +56 -0
- data/features/extract/simple_extract.feature +85 -0
- data/features/import/import_csv_to_database_as_delta.feature +38 -0
- data/features/import/import_csv_to_database_with_delete_insert_merging.feature +51 -0
- data/features/import/import_csv_to_database_with_truncate_insert.feature +49 -0
- data/features/import/import_csv_to_database_with_update_merging.feature +46 -0
- data/features/import/import_csv_with_always_inserting_new_dimension_keys.feature +137 -0
- data/features/import/import_csv_with_field_lookup_inserting_new_dimension_keys.feature +62 -0
- data/features/import/import_csv_with_field_lookup_transformation.feature +125 -0
- data/features/import/import_csv_with_transformation.feature +55 -0
- data/features/import/import_multiple_csv_files_without_transformations.feature +44 -0
- data/features/import/import_with_load_id_from_sequence.feature +53 -0
- data/features/import/import_with_lookup_from_multiple_fields.feature +64 -0
- data/features/read.feature +56 -0
- data/features/remove.feature +44 -0
- data/features/restore_database_connection.feature +55 -0
- data/features/step_definitions/database_table_steps.rb +40 -0
- data/features/step_definitions/definition_steps.rb +3 -0
- data/features/step_definitions/execution_steps.rb +23 -0
- data/features/step_definitions/file_steps.rb +39 -0
- data/features/support/class_extensions.rb +24 -0
- data/features/support/env.rb +27 -0
- data/features/support/randomize.rb +22 -0
- data/features/support/stop_on_first_error.rb +5 -0
- data/features/transform/deduplication.feature +37 -0
- data/features/transform/empty_transformation.feature +72 -0
- data/features/transform/join.feature +180 -0
- data/features/transform/join_multiple_files_into_one_output_file.feature +46 -0
- data/features/transform/output_rows.feature +70 -0
- data/features/transform/projection.feature +34 -0
- data/features/transform/raw_ruby_transformation.feature +69 -0
- data/features/transform/split_field.feature +39 -0
- data/lib/cranium/application.rb +104 -0
- data/lib/cranium/archiver.rb +36 -0
- data/lib/cranium/attribute_dsl.rb +43 -0
- data/lib/cranium/command_line_options.rb +27 -0
- data/lib/cranium/configuration.rb +33 -0
- data/lib/cranium/data_importer.rb +35 -0
- data/lib/cranium/data_reader.rb +48 -0
- data/lib/cranium/data_transformer.rb +126 -0
- data/lib/cranium/database.rb +36 -0
- data/lib/cranium/definition_registry.rb +21 -0
- data/lib/cranium/dimension_manager.rb +65 -0
- data/lib/cranium/dsl/database_definition.rb +23 -0
- data/lib/cranium/dsl/extract_definition.rb +28 -0
- data/lib/cranium/dsl/import_definition.rb +50 -0
- data/lib/cranium/dsl/source_definition.rb +67 -0
- data/lib/cranium/dsl.rb +100 -0
- data/lib/cranium/extensions/file.rb +7 -0
- data/lib/cranium/extensions/sequel_greenplum.rb +30 -0
- data/lib/cranium/external_table.rb +75 -0
- data/lib/cranium/extract/data_extractor.rb +11 -0
- data/lib/cranium/extract/storage.rb +57 -0
- data/lib/cranium/extract/strategy/base.rb +27 -0
- data/lib/cranium/extract/strategy/incremental.rb +16 -0
- data/lib/cranium/extract/strategy/simple.rb +9 -0
- data/lib/cranium/extract/strategy.rb +7 -0
- data/lib/cranium/extract.rb +7 -0
- data/lib/cranium/import_strategy/base.rb +55 -0
- data/lib/cranium/import_strategy/delete_insert.rb +40 -0
- data/lib/cranium/import_strategy/delta.rb +8 -0
- data/lib/cranium/import_strategy/merge.rb +50 -0
- data/lib/cranium/import_strategy/truncate_insert.rb +19 -0
- data/lib/cranium/import_strategy.rb +9 -0
- data/lib/cranium/logging.rb +15 -0
- data/lib/cranium/profiling.rb +13 -0
- data/lib/cranium/progress_output.rb +37 -0
- data/lib/cranium/sequel/hash.rb +32 -0
- data/lib/cranium/sequel.rb +5 -0
- data/lib/cranium/source_registry.rb +21 -0
- data/lib/cranium/test_framework/cucumber_table.rb +140 -0
- data/lib/cranium/test_framework/database_entity.rb +29 -0
- data/lib/cranium/test_framework/database_sequence.rb +16 -0
- data/lib/cranium/test_framework/database_table.rb +33 -0
- data/lib/cranium/test_framework/upload_directory.rb +39 -0
- data/lib/cranium/test_framework/world.rb +66 -0
- data/lib/cranium/test_framework.rb +10 -0
- data/lib/cranium/transformation/duplication_index.rb +42 -0
- data/lib/cranium/transformation/index.rb +83 -0
- data/lib/cranium/transformation/join.rb +141 -0
- data/lib/cranium/transformation/sequence.rb +42 -0
- data/lib/cranium/transformation.rb +8 -0
- data/lib/cranium/transformation_record.rb +45 -0
- data/lib/cranium.rb +57 -0
- data/rake/test.rake +31 -0
- data/spec/cranium/application_spec.rb +166 -0
- data/spec/cranium/archiver_spec.rb +44 -0
- data/spec/cranium/command_line_options_spec.rb +32 -0
- data/spec/cranium/configuration_spec.rb +31 -0
- data/spec/cranium/data_importer_spec.rb +55 -0
- data/spec/cranium/data_transformer_spec.rb +16 -0
- data/spec/cranium/database_spec.rb +69 -0
- data/spec/cranium/definition_registry_spec.rb +45 -0
- data/spec/cranium/dimension_manager_spec.rb +63 -0
- data/spec/cranium/dsl/database_definition_spec.rb +23 -0
- data/spec/cranium/dsl/extract_definition_spec.rb +76 -0
- data/spec/cranium/dsl/import_definition_spec.rb +153 -0
- data/spec/cranium/dsl/source_definition_spec.rb +84 -0
- data/spec/cranium/dsl_spec.rb +119 -0
- data/spec/cranium/external_table_spec.rb +71 -0
- data/spec/cranium/extract/storage_spec.rb +125 -0
- data/spec/cranium/logging_spec.rb +37 -0
- data/spec/cranium/sequel/hash_spec.rb +56 -0
- data/spec/cranium/source_registry_spec.rb +31 -0
- data/spec/cranium/test_framework/cucumber_table_spec.rb +144 -0
- data/spec/cranium/transformation/duplication_index_spec.rb +75 -0
- data/spec/cranium/transformation/index_spec.rb +178 -0
- data/spec/cranium/transformation/join_spec.rb +43 -0
- data/spec/cranium/transformation/sequence_spec.rb +83 -0
- data/spec/cranium/transformation_record_spec.rb +78 -0
- data/spec/cranium_spec.rb +53 -0
- data/spec/spec_helper.rb +1 -0
- metadata +362 -0
@@ -0,0 +1,46 @@
|
|
1
|
+
Feature: Import a CSV file into the database with merging
|
2
|
+
|
3
|
+
The merge_on property can be used to specify an id field that is used to detect duplicates while importing.
|
4
|
+
Duplicates are updated and new items are added.
|
5
|
+
|
6
|
+
Scenario: Successful import with merged items
|
7
|
+
Given a database table called "dim_product" with the following fields:
|
8
|
+
| field_name | field_type |
|
9
|
+
| item | TEXT |
|
10
|
+
| title | TEXT |
|
11
|
+
| description | TEXT |
|
12
|
+
And only the following rows in the "dim_product" database table:
|
13
|
+
| item | title | description |
|
14
|
+
| JNI-123 | Just a product name | Very interesting description |
|
15
|
+
| CDI-234 | Another product name | Yet another cool description |
|
16
|
+
And a "products.csv" data file containing:
|
17
|
+
"""
|
18
|
+
id,name,description
|
19
|
+
JNI-123,Just a product name,"Very interesting description, updated"
|
20
|
+
CDI-234,Updated product name,Yet another cool description
|
21
|
+
KLM-987,Inserted product name,This is the best product
|
22
|
+
"""
|
23
|
+
And the following definition:
|
24
|
+
"""
|
25
|
+
source :products do
|
26
|
+
field :id, String
|
27
|
+
field :name, String
|
28
|
+
field :description, String
|
29
|
+
end
|
30
|
+
|
31
|
+
import :products do
|
32
|
+
into :dim_product
|
33
|
+
put :id => :item
|
34
|
+
put :name => :title
|
35
|
+
put :description => :description
|
36
|
+
|
37
|
+
merge_on :id => :item
|
38
|
+
end
|
39
|
+
"""
|
40
|
+
When I execute the definition
|
41
|
+
Then the process should exit successfully
|
42
|
+
And the "dim_product" table should contain:
|
43
|
+
| item | title | description |
|
44
|
+
| JNI-123 | Just a product name | Very interesting description, updated |
|
45
|
+
| CDI-234 | Updated product name | Yet another cool description |
|
46
|
+
| KLM-987 | Inserted product name | This is the best product |
|
@@ -0,0 +1,137 @@
|
|
1
|
+
Feature: Import a CSV file into the database with new dimension values always inserted
|
2
|
+
|
3
|
+
Scenario: Successful import
|
4
|
+
Given a database table called "dim_contact" with the following fields:
|
5
|
+
| field_name | field_type |
|
6
|
+
| contact_key | SERIAL |
|
7
|
+
| user_id | TEXT |
|
8
|
+
| name | TEXT |
|
9
|
+
And only the following rows in the "dim_contact" database table:
|
10
|
+
| contact_key (i) | user_id | name |
|
11
|
+
| 10 | 1 | Alma |
|
12
|
+
And the current value in sequence "dim_contact_contact_key_seq" is 10
|
13
|
+
And a database table called "fct_purchases" with the following fields:
|
14
|
+
| field_name | field_type |
|
15
|
+
| contact_key | INTEGER |
|
16
|
+
| amount | TEXT |
|
17
|
+
And a "purchases.csv" data file containing:
|
18
|
+
"""
|
19
|
+
user_id,amount
|
20
|
+
1,100
|
21
|
+
NA,200
|
22
|
+
NA,300
|
23
|
+
"""
|
24
|
+
And the following definition:
|
25
|
+
"""
|
26
|
+
source :purchases do
|
27
|
+
field :user_id, String
|
28
|
+
field :amount, String
|
29
|
+
end
|
30
|
+
|
31
|
+
source :transformed_purchases do
|
32
|
+
field :contact_key, Integer
|
33
|
+
field :amount, String
|
34
|
+
end
|
35
|
+
|
36
|
+
transform :purchases => :transformed_purchases do |record|
|
37
|
+
record[:contact_key] = insert :contact_key,
|
38
|
+
table: :dim_contact,
|
39
|
+
record: {contact_key: next_value_in_sequence("dim_contact_contact_key_seq"), user_id: record[:user_id], name: "Unknown contact #{record[:user_id]}"}
|
40
|
+
output record
|
41
|
+
end
|
42
|
+
|
43
|
+
import :transformed_purchases do
|
44
|
+
into :fct_purchases
|
45
|
+
put :contact_key
|
46
|
+
put :amount
|
47
|
+
end
|
48
|
+
"""
|
49
|
+
When I execute the definition
|
50
|
+
Then the process should exit successfully
|
51
|
+
And the "fct_purchases" table should contain:
|
52
|
+
| contact_key (i) | amount |
|
53
|
+
| 11 | 100 |
|
54
|
+
| 12 | 200 |
|
55
|
+
| 13 | 300 |
|
56
|
+
And the "dim_contact" table should contain:
|
57
|
+
| contact_key (i) | user_id | name |
|
58
|
+
| 10 | 1 | Alma |
|
59
|
+
| 11 | 1 | Unknown contact 1 |
|
60
|
+
| 12 | NA | Unknown contact NA |
|
61
|
+
| 13 | NA | Unknown contact NA |
|
62
|
+
|
63
|
+
|
64
|
+
Scenario: Example use case for the insert
|
65
|
+
If a purchase is made by a predefined contact identifier (NA in this case), do not look the contact up; always insert a new one.
|
66
|
+
Otherwise, use lookup to find or create that contact.
|
67
|
+
|
68
|
+
Given a database table called "dim_contact" with the following fields:
|
69
|
+
| field_name | field_type |
|
70
|
+
| contact_key | SERIAL |
|
71
|
+
| user_id | TEXT |
|
72
|
+
| name | TEXT |
|
73
|
+
And only the following rows in the "dim_contact" database table:
|
74
|
+
| contact_key (i) | user_id | name |
|
75
|
+
| 10 | 1 | Alma |
|
76
|
+
And the current value in sequence "dim_contact_contact_key_seq" is 10
|
77
|
+
And a database table called "fct_purchases" with the following fields:
|
78
|
+
| field_name | field_type |
|
79
|
+
| contact_key | INTEGER |
|
80
|
+
| amount | TEXT |
|
81
|
+
And a "purchases.csv" data file containing:
|
82
|
+
"""
|
83
|
+
user_id,amount
|
84
|
+
1,100
|
85
|
+
NA,200
|
86
|
+
NA,300
|
87
|
+
2,400
|
88
|
+
2,500
|
89
|
+
"""
|
90
|
+
And the following definition:
|
91
|
+
"""
|
92
|
+
source :purchases do
|
93
|
+
field :user_id, String
|
94
|
+
field :amount, String
|
95
|
+
end
|
96
|
+
|
97
|
+
source :transformed_purchases do
|
98
|
+
field :contact_key, Integer
|
99
|
+
field :amount, String
|
100
|
+
end
|
101
|
+
|
102
|
+
transform :purchases => :transformed_purchases do |record|
|
103
|
+
if record[:user_id] == 'NA'
|
104
|
+
record[:contact_key] = insert :contact_key,
|
105
|
+
table: :dim_contact,
|
106
|
+
record: {contact_key: next_value_in_sequence("dim_contact_contact_key_seq"), user_id: record[:user_id], name: "Unknown contact #{record[:user_id]}"}
|
107
|
+
else
|
108
|
+
record[:contact_key] = lookup :contact_key,
|
109
|
+
from_table: :dim_contact,
|
110
|
+
match_column: :user_id,
|
111
|
+
to_value: record[:user_id],
|
112
|
+
if_not_found_then_insert: {contact_key: next_value_in_sequence("dim_contact_contact_key_seq"), name: "Unknown contact #{record[:user_id]}"}
|
113
|
+
end
|
114
|
+
output record
|
115
|
+
end
|
116
|
+
|
117
|
+
import :transformed_purchases do
|
118
|
+
into :fct_purchases
|
119
|
+
put :contact_key
|
120
|
+
put :amount
|
121
|
+
end
|
122
|
+
"""
|
123
|
+
When I execute the definition
|
124
|
+
Then the process should exit successfully
|
125
|
+
And the "fct_purchases" table should contain:
|
126
|
+
| contact_key (i) | amount |
|
127
|
+
| 10 | 100 |
|
128
|
+
| 11 | 200 |
|
129
|
+
| 12 | 300 |
|
130
|
+
| 13 | 400 |
|
131
|
+
| 13 | 500 |
|
132
|
+
And the "dim_contact" table should contain:
|
133
|
+
| contact_key (i) | user_id | name |
|
134
|
+
| 10 | 1 | Alma |
|
135
|
+
| 11 | NA | Unknown contact NA |
|
136
|
+
| 12 | NA | Unknown contact NA |
|
137
|
+
| 13 | 2 | Unknown contact 2 |
|
@@ -0,0 +1,62 @@
|
|
1
|
+
Feature: Import a CSV file into the database with new dimension values inserted when not found during lookup
|
2
|
+
|
3
|
+
Scenario: Successful import
|
4
|
+
Given a database table called "dim_contact" with the following fields:
|
5
|
+
| field_name | field_type |
|
6
|
+
| contact_key | SERIAL |
|
7
|
+
| user_id | TEXT |
|
8
|
+
| name | TEXT |
|
9
|
+
And only the following rows in the "dim_contact" database table:
|
10
|
+
| contact_key (i) | user_id | name |
|
11
|
+
| 10 | 1 | Alma |
|
12
|
+
And the current value in sequence "dim_contact_contact_key_seq" is 10
|
13
|
+
And a database table called "fct_purchases" with the following fields:
|
14
|
+
| field_name | field_type |
|
15
|
+
| contact_key | INTEGER |
|
16
|
+
| amount | TEXT |
|
17
|
+
And a "purchases.csv" data file containing:
|
18
|
+
"""
|
19
|
+
user_id,amount
|
20
|
+
1,100
|
21
|
+
2,200
|
22
|
+
2,300
|
23
|
+
"""
|
24
|
+
And the following definition:
|
25
|
+
"""
|
26
|
+
source :purchases do
|
27
|
+
field :user_id, String
|
28
|
+
field :amount, String
|
29
|
+
end
|
30
|
+
|
31
|
+
source :transformed_purchases do
|
32
|
+
field :contact_key, Integer
|
33
|
+
field :amount, String
|
34
|
+
end
|
35
|
+
|
36
|
+
transform :purchases => :transformed_purchases do |record|
|
37
|
+
record[:contact_key] = lookup :contact_key,
|
38
|
+
from_table: :dim_contact,
|
39
|
+
match_column: :user_id,
|
40
|
+
to_value: record[:user_id],
|
41
|
+
if_not_found_then_insert: { contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
|
42
|
+
name: "Unknown contact #{record[:user_id]}" }
|
43
|
+
output record
|
44
|
+
end
|
45
|
+
|
46
|
+
import :transformed_purchases do
|
47
|
+
into :fct_purchases
|
48
|
+
put :contact_key
|
49
|
+
put :amount
|
50
|
+
end
|
51
|
+
"""
|
52
|
+
When I execute the definition
|
53
|
+
Then the process should exit successfully
|
54
|
+
And the "fct_purchases" table should contain:
|
55
|
+
| contact_key (i) | amount |
|
56
|
+
| 10 | 100 |
|
57
|
+
| 11 | 200 |
|
58
|
+
| 11 | 300 |
|
59
|
+
And the "dim_contact" table should contain:
|
60
|
+
| contact_key (i) | user_id | name |
|
61
|
+
| 10 | 1 | Alma |
|
62
|
+
| 11 | 2 | Unknown contact 2 |
|
@@ -0,0 +1,125 @@
|
|
1
|
+
Feature: Import a CSV file into the database with IDs looked up from the database
|
2
|
+
|
3
|
+
Scenario: Successful import
|
4
|
+
Given a database table called "dim_contact" with the following fields:
|
5
|
+
| field_name | field_type |
|
6
|
+
| contact_key | SERIAL |
|
7
|
+
| user_id | TEXT |
|
8
|
+
| name | TEXT |
|
9
|
+
And only the following rows in the "dim_contact" database table:
|
10
|
+
| contact_key (i) | user_id | name |
|
11
|
+
| 10 | 1 | Alma |
|
12
|
+
| 20 | 2 | Korte |
|
13
|
+
And a database table called "fct_purchases" with the following fields:
|
14
|
+
| field_name | field_type |
|
15
|
+
| contact_key | INTEGER |
|
16
|
+
| amount | TEXT |
|
17
|
+
And a "purchases.csv" data file containing:
|
18
|
+
"""
|
19
|
+
user_id,amount
|
20
|
+
1,100
|
21
|
+
2,200
|
22
|
+
3,300
|
23
|
+
"""
|
24
|
+
And the following definition:
|
25
|
+
"""
|
26
|
+
source :purchases do
|
27
|
+
field :user_id, String
|
28
|
+
field :amount, String
|
29
|
+
end
|
30
|
+
|
31
|
+
source :transformed_purchases do
|
32
|
+
field :contact_key, Integer
|
33
|
+
field :amount, String
|
34
|
+
end
|
35
|
+
|
36
|
+
transform :purchases => :transformed_purchases do |record|
|
37
|
+
record[:contact_key] = lookup :contact_key,
|
38
|
+
from_table: :dim_contact,
|
39
|
+
match_column: :user_id,
|
40
|
+
to_value: record[:user_id],
|
41
|
+
if_not_found_then: -1
|
42
|
+
output record
|
43
|
+
end
|
44
|
+
|
45
|
+
import :transformed_purchases do
|
46
|
+
into :fct_purchases
|
47
|
+
put :contact_key => :contact_key
|
48
|
+
put :amount => :amount
|
49
|
+
end
|
50
|
+
"""
|
51
|
+
When I execute the definition
|
52
|
+
Then the process should exit successfully
|
53
|
+
And the "fct_purchases" table should contain:
|
54
|
+
| contact_key (i) | amount |
|
55
|
+
| 10 | 100 |
|
56
|
+
| 20 | 200 |
|
57
|
+
| -1 | 300 |
|
58
|
+
|
59
|
+
|
60
|
+
Scenario: Multiple fields looked up by one key
|
61
|
+
Given a database table called "dim_contact" with the following fields:
|
62
|
+
| field_name | field_type |
|
63
|
+
| contact_key_1 | INTEGER |
|
64
|
+
| contact_key_2 | INTEGER |
|
65
|
+
| user_id | TEXT |
|
66
|
+
| name | TEXT |
|
67
|
+
And only the following rows in the "dim_contact" database table:
|
68
|
+
| contact_key_1 (i) | contact_key_2 (i) | user_id | name |
|
69
|
+
| 10 | 100 | 1 | Alma |
|
70
|
+
| 20 | 200 | 2 | Korte |
|
71
|
+
And a database table called "fct_purchases" with the following fields:
|
72
|
+
| field_name | field_type |
|
73
|
+
| contact_key_1 | INTEGER |
|
74
|
+
| contact_key_2 | INTEGER |
|
75
|
+
| amount | TEXT |
|
76
|
+
And a "purchases.csv" data file containing:
|
77
|
+
"""
|
78
|
+
user_id,amount
|
79
|
+
1,100
|
80
|
+
2,200
|
81
|
+
3,300
|
82
|
+
"""
|
83
|
+
And the following definition:
|
84
|
+
"""
|
85
|
+
source :purchases do
|
86
|
+
field :user_id, String
|
87
|
+
field :amount, String
|
88
|
+
end
|
89
|
+
|
90
|
+
source :transformed_purchases do
|
91
|
+
field :contact_key_1, Integer
|
92
|
+
field :contact_key_2, Integer
|
93
|
+
field :amount, String
|
94
|
+
end
|
95
|
+
|
96
|
+
transform :purchases => :transformed_purchases do |record|
|
97
|
+
record[:contact_key_1] = lookup :contact_key_1,
|
98
|
+
from_table: :dim_contact,
|
99
|
+
match_column: :user_id,
|
100
|
+
to_value: record[:user_id],
|
101
|
+
if_not_found_then: -1
|
102
|
+
|
103
|
+
record[:contact_key_2] = lookup :contact_key_2,
|
104
|
+
from_table: :dim_contact,
|
105
|
+
match_column: :user_id,
|
106
|
+
to_value: record[:user_id],
|
107
|
+
if_not_found_then: -2
|
108
|
+
|
109
|
+
output record
|
110
|
+
end
|
111
|
+
|
112
|
+
import :transformed_purchases do
|
113
|
+
into :fct_purchases
|
114
|
+
put :contact_key_1 => :contact_key_1
|
115
|
+
put :contact_key_2 => :contact_key_2
|
116
|
+
put :amount => :amount
|
117
|
+
end
|
118
|
+
"""
|
119
|
+
When I execute the definition
|
120
|
+
Then the process should exit successfully
|
121
|
+
And the "fct_purchases" table should contain:
|
122
|
+
| contact_key_1 (i) | contact_key_2 (i) | amount |
|
123
|
+
| 10 | 100 | 100 |
|
124
|
+
| 20 | 200 | 200 |
|
125
|
+
| -1 | -2 | 300 |
|
@@ -0,0 +1,55 @@
|
|
1
|
+
Feature: Import a CSV file into the database with a split transformation
|
2
|
+
|
3
|
+
Scenario: Successful import
|
4
|
+
Given a database table called "dim_product" with the following fields:
|
5
|
+
| field_name | field_type |
|
6
|
+
| item | TEXT |
|
7
|
+
| title | TEXT |
|
8
|
+
| category1 | TEXT |
|
9
|
+
| category2 | TEXT |
|
10
|
+
| category3 | TEXT |
|
11
|
+
And a "products.csv" data file containing:
|
12
|
+
"""
|
13
|
+
id,name,category
|
14
|
+
JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory > Ultra-subcategory
|
15
|
+
CDI-234,Another product name,Smart Insight > Cool stuff | 3dim > 2dim > 1dim
|
16
|
+
"""
|
17
|
+
And the following definition:
|
18
|
+
"""
|
19
|
+
source :products do
|
20
|
+
encoding "UTF-8"
|
21
|
+
delimiter ','
|
22
|
+
field :id, String
|
23
|
+
field :name, String
|
24
|
+
field :category, String
|
25
|
+
end
|
26
|
+
|
27
|
+
source :transformed_products do
|
28
|
+
field :id, String
|
29
|
+
field :name, String
|
30
|
+
field :main_category, String
|
31
|
+
field :sub_category, String
|
32
|
+
field :department, String
|
33
|
+
end
|
34
|
+
|
35
|
+
transform :products => :transformed_products do |record|
|
36
|
+
record.split_field :category, into: [:category], by: "|"
|
37
|
+
record.split_field :category, into: [:main_category, :sub_category, :department], by: ">"
|
38
|
+
output record
|
39
|
+
end
|
40
|
+
|
41
|
+
import :transformed_products do
|
42
|
+
into :dim_product
|
43
|
+
put :id => :item
|
44
|
+
put :name => :title
|
45
|
+
put :main_category => :category1
|
46
|
+
put :sub_category => :category2
|
47
|
+
put :department => :category3
|
48
|
+
end
|
49
|
+
"""
|
50
|
+
When I execute the definition
|
51
|
+
Then the process should exit successfully
|
52
|
+
And the "dim_product" table should contain:
|
53
|
+
| item | title | category1 | category2 | category3 |
|
54
|
+
| JNI-123 | Just a product name | Main category | Subcategory | Sub-subcategory |
|
55
|
+
| CDI-234 | Another product name | Smart Insight | Cool stuff | Cool stuff |
|
@@ -0,0 +1,44 @@
|
|
1
|
+
Feature: Import multiple CSV files into the database without any transformations
|
2
|
+
|
3
|
+
Scenario: Successful import
|
4
|
+
Given a database table called "dim_product" with the following fields:
|
5
|
+
| field_name | field_type |
|
6
|
+
| item | TEXT |
|
7
|
+
| title | TEXT |
|
8
|
+
| category | TEXT |
|
9
|
+
And a "products1.csv" data file containing:
|
10
|
+
"""
|
11
|
+
id,name,category
|
12
|
+
PROD-1,product name 1,Main category > Subcategory > Sub-subcategory
|
13
|
+
PROD-2,product name 2,Main category > Subcategory > Sub-subcategory
|
14
|
+
"""
|
15
|
+
And a "products2.csv" data file containing:
|
16
|
+
"""
|
17
|
+
id,name,category
|
18
|
+
PROD-3,product name 3,Main category > Subcategory > Sub-subcategory
|
19
|
+
PROD-4,product name 4,Main category > Subcategory > Sub-subcategory
|
20
|
+
"""
|
21
|
+
And the following definition:
|
22
|
+
"""
|
23
|
+
source :products do
|
24
|
+
file "products*.csv"
|
25
|
+
field :id, String
|
26
|
+
field :name, String
|
27
|
+
field :category, String
|
28
|
+
end
|
29
|
+
|
30
|
+
import :products do
|
31
|
+
into :dim_product
|
32
|
+
put :id => :item
|
33
|
+
put :name => :title
|
34
|
+
put :category => :category
|
35
|
+
end
|
36
|
+
"""
|
37
|
+
When I execute the definition
|
38
|
+
Then the process should exit successfully
|
39
|
+
And the "dim_product" table should contain:
|
40
|
+
| item | title | category |
|
41
|
+
| PROD-1 | product name 1 | Main category > Subcategory > Sub-subcategory |
|
42
|
+
| PROD-2 | product name 2 | Main category > Subcategory > Sub-subcategory |
|
43
|
+
| PROD-3 | product name 3 | Main category > Subcategory > Sub-subcategory |
|
44
|
+
| PROD-4 | product name 4 | Main category > Subcategory > Sub-subcategory |
|
@@ -0,0 +1,53 @@
|
|
1
|
+
Feature: Import data and assign a load id (audit information) from a sequence to all records
|
2
|
+
|
3
|
+
Scenario: Successful import
|
4
|
+
Given a database table called "dim_product" with the following fields:
|
5
|
+
| field_name | field_type |
|
6
|
+
| load_id | INTEGER |
|
7
|
+
| item | TEXT |
|
8
|
+
| title | TEXT |
|
9
|
+
And a sequence called "some_sequence" starting from 33
|
10
|
+
And a "products.csv" data file containing:
|
11
|
+
"""
|
12
|
+
id,name,category
|
13
|
+
JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory > Ultra-subcategory
|
14
|
+
CDI-234,Another product name,Smart Insight > Cool stuff | 3dim > 2dim > 1dim
|
15
|
+
"""
|
16
|
+
And the following definition:
|
17
|
+
"""
|
18
|
+
LOAD_ID = sequence("some_sequence").next_value
|
19
|
+
|
20
|
+
source :products do
|
21
|
+
encoding "UTF-8"
|
22
|
+
delimiter ','
|
23
|
+
|
24
|
+
field :id, String
|
25
|
+
field :name, String
|
26
|
+
end
|
27
|
+
|
28
|
+
source :transformed_products do
|
29
|
+
field :load_id, Integer
|
30
|
+
|
31
|
+
field :id, String
|
32
|
+
field :name, String
|
33
|
+
end
|
34
|
+
|
35
|
+
transform :products => :transformed_products do |record|
|
36
|
+
record[:load_id] = LOAD_ID
|
37
|
+
output record
|
38
|
+
end
|
39
|
+
|
40
|
+
import :transformed_products do
|
41
|
+
into :dim_product
|
42
|
+
|
43
|
+
put :load_id
|
44
|
+
put :id => :item
|
45
|
+
put :name => :title
|
46
|
+
end
|
47
|
+
"""
|
48
|
+
When I execute the definition
|
49
|
+
Then the process should exit successfully
|
50
|
+
And the "dim_product" table should contain:
|
51
|
+
| load_id (i) | item | title |
|
52
|
+
| 34 | JNI-123 | Just a product name |
|
53
|
+
| 34 | CDI-234 | Another product name |
|
@@ -0,0 +1,64 @@
|
|
1
|
+
Feature: Import a CSV file into the database with IDs looked up from multiple columns of the database
|
2
|
+
|
3
|
+
Scenario: Successful import
|
4
|
+
Given a database table called "dim_contact" with the following fields:
|
5
|
+
| field_name | field_type |
|
6
|
+
| contact_key | SERIAL |
|
7
|
+
| user_id_part_1 | TEXT |
|
8
|
+
| user_id_part_2 | TEXT |
|
9
|
+
| name | TEXT |
|
10
|
+
And only the following rows in the "dim_contact" database table:
|
11
|
+
| contact_key (i) | user_id_part_1 | user_id_part_2 | name |
|
12
|
+
| 11 | 1 | 1 | Alma |
|
13
|
+
| 12 | 1 | 2 | Korte |
|
14
|
+
| 21 | 2 | 1 | Szilva |
|
15
|
+
| 22 | 2 | 2 | Barack |
|
16
|
+
And a database table called "fct_purchases" with the following fields:
|
17
|
+
| field_name | field_type |
|
18
|
+
| contact_key | INTEGER |
|
19
|
+
| amount | TEXT |
|
20
|
+
And a "purchases.csv" data file containing:
|
21
|
+
"""
|
22
|
+
user_id_1,user_id_2,amount
|
23
|
+
1,1,100
|
24
|
+
1,2,200
|
25
|
+
2,1,300
|
26
|
+
2,2,400
|
27
|
+
3,1,500
|
28
|
+
"""
|
29
|
+
And the following definition:
|
30
|
+
"""
|
31
|
+
source :purchases do
|
32
|
+
field :user_id_1, String
|
33
|
+
field :user_id_2, String
|
34
|
+
field :amount, String
|
35
|
+
end
|
36
|
+
|
37
|
+
source :transformed_purchases do
|
38
|
+
field :contact_key, Integer
|
39
|
+
field :amount, String
|
40
|
+
end
|
41
|
+
|
42
|
+
transform :purchases => :transformed_purchases do |record|
|
43
|
+
record[:contact_key] = lookup :contact_key,
|
44
|
+
from_table: :dim_contact,
|
45
|
+
match: { :user_id_part_1 => record[:user_id_1], :user_id_part_2 => record[:user_id_2] },
|
46
|
+
if_not_found_then: -1
|
47
|
+
output record
|
48
|
+
end
|
49
|
+
|
50
|
+
import :transformed_purchases do
|
51
|
+
into :fct_purchases
|
52
|
+
put :contact_key => :contact_key
|
53
|
+
put :amount => :amount
|
54
|
+
end
|
55
|
+
"""
|
56
|
+
When I execute the definition
|
57
|
+
Then the process should exit successfully
|
58
|
+
And the "fct_purchases" table should contain:
|
59
|
+
| contact_key (i) | amount |
|
60
|
+
| 11 | 100 |
|
61
|
+
| 12 | 200 |
|
62
|
+
| 21 | 300 |
|
63
|
+
| 22 | 400 |
|
64
|
+
| -1 | 500 |
|
@@ -0,0 +1,56 @@
|
|
1
|
+
Feature: Read source files
|
2
|
+
|
3
|
+
Scenario:
|
4
|
+
Given a "forms.csv" data file containing:
|
5
|
+
"""
|
6
|
+
id,name
|
7
|
+
1,Landing form
|
8
|
+
2,Other form
|
9
|
+
"""
|
10
|
+
And a "contacts_extract.csv" data file containing:
|
11
|
+
"""
|
12
|
+
id,created,form_id
|
13
|
+
1,2001-01-01,1
|
14
|
+
2,2002-02-02,2
|
15
|
+
3,2003-03-03,1
|
16
|
+
"""
|
17
|
+
And the following definition:
|
18
|
+
"""
|
19
|
+
source :forms do
|
20
|
+
field :id, Integer
|
21
|
+
field :name, String
|
22
|
+
end
|
23
|
+
|
24
|
+
source :contacts_extract do
|
25
|
+
field :id, Integer
|
26
|
+
field :created, String
|
27
|
+
field :form_id, Integer
|
28
|
+
end
|
29
|
+
|
30
|
+
source :contacts do
|
31
|
+
field :id, Integer
|
32
|
+
field :created, String
|
33
|
+
field :form, String
|
34
|
+
end
|
35
|
+
|
36
|
+
form_mapping = {}
|
37
|
+
|
38
|
+
read :forms do |record|
|
39
|
+
form_mapping[record[:id]] = record[:name]
|
40
|
+
end
|
41
|
+
|
42
|
+
transform :contacts_extract => :contacts do |record|
|
43
|
+
record[:form] = form_mapping[record[:form_id]]
|
44
|
+
output record
|
45
|
+
end
|
46
|
+
|
47
|
+
"""
|
48
|
+
When I execute the definition
|
49
|
+
Then the process should exit successfully
|
50
|
+
And there should be a "contacts.csv" data file in the upload directory containing:
|
51
|
+
"""
|
52
|
+
id,created,form
|
53
|
+
1,2001-01-01,Landing form
|
54
|
+
2,2002-02-02,Other form
|
55
|
+
3,2003-03-03,Landing form
|
56
|
+
"""
|
@@ -0,0 +1,44 @@
|
|
1
|
+
Feature: Remove source files
|
2
|
+
|
3
|
+
Scenario:
|
4
|
+
Given a "contacts_extract_1.csv" data file containing:
|
5
|
+
"""
|
6
|
+
"""
|
7
|
+
And a "contacts_extract_2.csv" data file containing:
|
8
|
+
"""
|
9
|
+
"""
|
10
|
+
And a "clicks_extract_1.csv" data file containing:
|
11
|
+
"""
|
12
|
+
"""
|
13
|
+
And a "products.csv" data file containing:
|
14
|
+
"""
|
15
|
+
"""
|
16
|
+
And the following definition:
|
17
|
+
"""
|
18
|
+
source :contacts_extract do
|
19
|
+
file "contacts_extract_*.csv"
|
20
|
+
end
|
21
|
+
|
22
|
+
source :clicks_extract do
|
23
|
+
file "clicks_extract_*.csv"
|
24
|
+
end
|
25
|
+
|
26
|
+
source :products do
|
27
|
+
file "products.csv"
|
28
|
+
end
|
29
|
+
|
30
|
+
source :products_transformed do end
|
31
|
+
|
32
|
+
transform :products => :products_transformed do |record|
|
33
|
+
output record
|
34
|
+
end
|
35
|
+
|
36
|
+
remove :contacts_extract, :clicks_extract
|
37
|
+
"""
|
38
|
+
When I execute the definition
|
39
|
+
Then the process should exit successfully
|
40
|
+
And the upload directory should contain the following files:
|
41
|
+
| filename |
|
42
|
+
| definition.rb |
|
43
|
+
| products.csv |
|
44
|
+
| products_transformed.csv |
|