cranium 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +21 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +29 -0
  7. data/Rakefile +3 -0
  8. data/Vagrantfile +24 -0
  9. data/bin/cranium +9 -0
  10. data/config/cucumber.yml +9 -0
  11. data/cranium.gemspec +26 -0
  12. data/db/setup.sql +8 -0
  13. data/docker-compose.yml +8 -0
  14. data/examples/config.rb +14 -0
  15. data/examples/deduplication.rb +27 -0
  16. data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb +26 -0
  17. data/examples/incremental_extract.rb +17 -0
  18. data/examples/lookup_with_multiple_fields.rb +25 -0
  19. data/features/archive.feature +49 -0
  20. data/features/extract/incremental_extract.feature +56 -0
  21. data/features/extract/simple_extract.feature +85 -0
  22. data/features/import/import_csv_to_database_as_delta.feature +38 -0
  23. data/features/import/import_csv_to_database_with_delete_insert_merging.feature +51 -0
  24. data/features/import/import_csv_to_database_with_truncate_insert.feature +49 -0
  25. data/features/import/import_csv_to_database_with_update_merging.feature +46 -0
  26. data/features/import/import_csv_with_always_inserting_new_dimension_keys.feature +137 -0
  27. data/features/import/import_csv_with_field_lookup_inserting_new_dimension_keys.feature +62 -0
  28. data/features/import/import_csv_with_field_lookup_transformation.feature +125 -0
  29. data/features/import/import_csv_with_transformation.feature +55 -0
  30. data/features/import/import_multiple_csv_files_without_transformations.feature +44 -0
  31. data/features/import/import_with_load_id_from_sequence.feature +53 -0
  32. data/features/import/import_with_lookup_from_multiple_fields.feature +64 -0
  33. data/features/read.feature +56 -0
  34. data/features/remove.feature +44 -0
  35. data/features/restore_database_connection.feature +55 -0
  36. data/features/step_definitions/database_table_steps.rb +40 -0
  37. data/features/step_definitions/definition_steps.rb +3 -0
  38. data/features/step_definitions/execution_steps.rb +23 -0
  39. data/features/step_definitions/file_steps.rb +39 -0
  40. data/features/support/class_extensions.rb +24 -0
  41. data/features/support/env.rb +27 -0
  42. data/features/support/randomize.rb +22 -0
  43. data/features/support/stop_on_first_error.rb +5 -0
  44. data/features/transform/deduplication.feature +37 -0
  45. data/features/transform/empty_transformation.feature +72 -0
  46. data/features/transform/join.feature +180 -0
  47. data/features/transform/join_multiple_files_into_one_output_file.feature +46 -0
  48. data/features/transform/output_rows.feature +70 -0
  49. data/features/transform/projection.feature +34 -0
  50. data/features/transform/raw_ruby_transformation.feature +69 -0
  51. data/features/transform/split_field.feature +39 -0
  52. data/lib/cranium/application.rb +104 -0
  53. data/lib/cranium/archiver.rb +36 -0
  54. data/lib/cranium/attribute_dsl.rb +43 -0
  55. data/lib/cranium/command_line_options.rb +27 -0
  56. data/lib/cranium/configuration.rb +33 -0
  57. data/lib/cranium/data_importer.rb +35 -0
  58. data/lib/cranium/data_reader.rb +48 -0
  59. data/lib/cranium/data_transformer.rb +126 -0
  60. data/lib/cranium/database.rb +36 -0
  61. data/lib/cranium/definition_registry.rb +21 -0
  62. data/lib/cranium/dimension_manager.rb +65 -0
  63. data/lib/cranium/dsl/database_definition.rb +23 -0
  64. data/lib/cranium/dsl/extract_definition.rb +28 -0
  65. data/lib/cranium/dsl/import_definition.rb +50 -0
  66. data/lib/cranium/dsl/source_definition.rb +67 -0
  67. data/lib/cranium/dsl.rb +100 -0
  68. data/lib/cranium/extensions/file.rb +7 -0
  69. data/lib/cranium/extensions/sequel_greenplum.rb +30 -0
  70. data/lib/cranium/external_table.rb +75 -0
  71. data/lib/cranium/extract/data_extractor.rb +11 -0
  72. data/lib/cranium/extract/storage.rb +57 -0
  73. data/lib/cranium/extract/strategy/base.rb +27 -0
  74. data/lib/cranium/extract/strategy/incremental.rb +16 -0
  75. data/lib/cranium/extract/strategy/simple.rb +9 -0
  76. data/lib/cranium/extract/strategy.rb +7 -0
  77. data/lib/cranium/extract.rb +7 -0
  78. data/lib/cranium/import_strategy/base.rb +55 -0
  79. data/lib/cranium/import_strategy/delete_insert.rb +40 -0
  80. data/lib/cranium/import_strategy/delta.rb +8 -0
  81. data/lib/cranium/import_strategy/merge.rb +50 -0
  82. data/lib/cranium/import_strategy/truncate_insert.rb +19 -0
  83. data/lib/cranium/import_strategy.rb +9 -0
  84. data/lib/cranium/logging.rb +15 -0
  85. data/lib/cranium/profiling.rb +13 -0
  86. data/lib/cranium/progress_output.rb +37 -0
  87. data/lib/cranium/sequel/hash.rb +32 -0
  88. data/lib/cranium/sequel.rb +5 -0
  89. data/lib/cranium/source_registry.rb +21 -0
  90. data/lib/cranium/test_framework/cucumber_table.rb +140 -0
  91. data/lib/cranium/test_framework/database_entity.rb +29 -0
  92. data/lib/cranium/test_framework/database_sequence.rb +16 -0
  93. data/lib/cranium/test_framework/database_table.rb +33 -0
  94. data/lib/cranium/test_framework/upload_directory.rb +39 -0
  95. data/lib/cranium/test_framework/world.rb +66 -0
  96. data/lib/cranium/test_framework.rb +10 -0
  97. data/lib/cranium/transformation/duplication_index.rb +42 -0
  98. data/lib/cranium/transformation/index.rb +83 -0
  99. data/lib/cranium/transformation/join.rb +141 -0
  100. data/lib/cranium/transformation/sequence.rb +42 -0
  101. data/lib/cranium/transformation.rb +8 -0
  102. data/lib/cranium/transformation_record.rb +45 -0
  103. data/lib/cranium.rb +57 -0
  104. data/rake/test.rake +31 -0
  105. data/spec/cranium/application_spec.rb +166 -0
  106. data/spec/cranium/archiver_spec.rb +44 -0
  107. data/spec/cranium/command_line_options_spec.rb +32 -0
  108. data/spec/cranium/configuration_spec.rb +31 -0
  109. data/spec/cranium/data_importer_spec.rb +55 -0
  110. data/spec/cranium/data_transformer_spec.rb +16 -0
  111. data/spec/cranium/database_spec.rb +69 -0
  112. data/spec/cranium/definition_registry_spec.rb +45 -0
  113. data/spec/cranium/dimension_manager_spec.rb +63 -0
  114. data/spec/cranium/dsl/database_definition_spec.rb +23 -0
  115. data/spec/cranium/dsl/extract_definition_spec.rb +76 -0
  116. data/spec/cranium/dsl/import_definition_spec.rb +153 -0
  117. data/spec/cranium/dsl/source_definition_spec.rb +84 -0
  118. data/spec/cranium/dsl_spec.rb +119 -0
  119. data/spec/cranium/external_table_spec.rb +71 -0
  120. data/spec/cranium/extract/storage_spec.rb +125 -0
  121. data/spec/cranium/logging_spec.rb +37 -0
  122. data/spec/cranium/sequel/hash_spec.rb +56 -0
  123. data/spec/cranium/source_registry_spec.rb +31 -0
  124. data/spec/cranium/test_framework/cucumber_table_spec.rb +144 -0
  125. data/spec/cranium/transformation/duplication_index_spec.rb +75 -0
  126. data/spec/cranium/transformation/index_spec.rb +178 -0
  127. data/spec/cranium/transformation/join_spec.rb +43 -0
  128. data/spec/cranium/transformation/sequence_spec.rb +83 -0
  129. data/spec/cranium/transformation_record_spec.rb +78 -0
  130. data/spec/cranium_spec.rb +53 -0
  131. data/spec/spec_helper.rb +1 -0
  132. metadata +362 -0
@@ -0,0 +1,46 @@
1
+ Feature: Import a CSV file into the database with merging
2
+
3
+ The merge_on property can be used to specify an id field that is used to detect duplicates while importing.
4
+ Duplicates are updated and new items are added.
5
+
6
+ Scenario: Successful import with merged items
7
+ Given a database table called "dim_product" with the following fields:
8
+ | field_name | field_type |
9
+ | item | TEXT |
10
+ | title | TEXT |
11
+ | description | TEXT |
12
+ And only the following rows in the "dim_product" database table:
13
+ | item | title | description |
14
+ | JNI-123 | Just a product name | Very interesting description |
15
+ | CDI-234 | Another product name | Yet another cool description |
16
+ And a "products.csv" data file containing:
17
+ """
18
+ id,name,description
19
+ JNI-123,Just a product name,"Very interesting description, updated"
20
+ CDI-234,Updated product name,Yet another cool description
21
+ KLM-987,Inserted product name,This is the best product
22
+ """
23
+ And the following definition:
24
+ """
25
+ source :products do
26
+ field :id, String
27
+ field :name, String
28
+ field :description, String
29
+ end
30
+
31
+ import :products do
32
+ into :dim_product
33
+ put :id => :item
34
+ put :name => :title
35
+ put :description => :description
36
+
37
+ merge_on :id => :item
38
+ end
39
+ """
40
+ When I execute the definition
41
+ Then the process should exit successfully
42
+ And the "dim_product" table should contain:
43
+ | item | title | description |
44
+ | JNI-123 | Just a product name | Very interesting description, updated |
45
+ | CDI-234 | Updated product name | Yet another cool description |
46
+ | KLM-987 | Inserted product name | This is the best product |
@@ -0,0 +1,137 @@
1
+ Feature: Import a CSV file into the database with new dimension values always inserted
2
+
3
+ Scenario: Successful import
4
+ Given a database table called "dim_contact" with the following fields:
5
+ | field_name | field_type |
6
+ | contact_key | SERIAL |
7
+ | user_id | TEXT |
8
+ | name | TEXT |
9
+ And only the following rows in the "dim_contact" database table:
10
+ | contact_key (i) | user_id | name |
11
+ | 10 | 1 | Alma |
12
+ And the current value in sequence "dim_contact_contact_key_seq" is 10
13
+ And a database table called "fct_purchases" with the following fields:
14
+ | field_name | field_type |
15
+ | contact_key | INTEGER |
16
+ | amount | TEXT |
17
+ And a "purchases.csv" data file containing:
18
+ """
19
+ user_id,amount
20
+ 1,100
21
+ NA,200
22
+ NA,300
23
+ """
24
+ And the following definition:
25
+ """
26
+ source :purchases do
27
+ field :user_id, String
28
+ field :amount, String
29
+ end
30
+
31
+ source :transformed_purchases do
32
+ field :contact_key, Integer
33
+ field :amount, String
34
+ end
35
+
36
+ transform :purchases => :transformed_purchases do |record|
37
+ record[:contact_key] = insert :contact_key,
38
+ table: :dim_contact,
39
+ record: {contact_key: next_value_in_sequence("dim_contact_contact_key_seq"), user_id: record[:user_id], name: "Unknown contact #{record[:user_id]}"}
40
+ output record
41
+ end
42
+
43
+ import :transformed_purchases do
44
+ into :fct_purchases
45
+ put :contact_key
46
+ put :amount
47
+ end
48
+ """
49
+ When I execute the definition
50
+ Then the process should exit successfully
51
+ And the "fct_purchases" table should contain:
52
+ | contact_key (i) | amount |
53
+ | 11 | 100 |
54
+ | 12 | 200 |
55
+ | 13 | 300 |
56
+ And the "dim_contact" table should contain:
57
+ | contact_key (i) | user_id | name |
58
+ | 10 | 1 | Alma |
59
+ | 11 | 1 | Unknown contact 1 |
60
+ | 12 | NA | Unknown contact NA |
61
+ | 13 | NA | Unknown contact NA |
62
+
63
+
64
+ Scenario: Example use case for the insert
65
+ If a purchase was made by a predefined contact identifier (NA in this case), do not look it up; insert it directly.
66
+ Otherwise use lookup to find or create that contact
67
+
68
+ Given a database table called "dim_contact" with the following fields:
69
+ | field_name | field_type |
70
+ | contact_key | SERIAL |
71
+ | user_id | TEXT |
72
+ | name | TEXT |
73
+ And only the following rows in the "dim_contact" database table:
74
+ | contact_key (i) | user_id | name |
75
+ | 10 | 1 | Alma |
76
+ And the current value in sequence "dim_contact_contact_key_seq" is 10
77
+ And a database table called "fct_purchases" with the following fields:
78
+ | field_name | field_type |
79
+ | contact_key | INTEGER |
80
+ | amount | TEXT |
81
+ And a "purchases.csv" data file containing:
82
+ """
83
+ user_id,amount
84
+ 1,100
85
+ NA,200
86
+ NA,300
87
+ 2,400
88
+ 2,500
89
+ """
90
+ And the following definition:
91
+ """
92
+ source :purchases do
93
+ field :user_id, String
94
+ field :amount, String
95
+ end
96
+
97
+ source :transformed_purchases do
98
+ field :contact_key, Integer
99
+ field :amount, String
100
+ end
101
+
102
+ transform :purchases => :transformed_purchases do |record|
103
+ if record[:user_id] == 'NA'
104
+ record[:contact_key] = insert :contact_key,
105
+ table: :dim_contact,
106
+ record: {contact_key: next_value_in_sequence("dim_contact_contact_key_seq"), user_id: record[:user_id], name: "Unknown contact #{record[:user_id]}"}
107
+ else
108
+ record[:contact_key] = lookup :contact_key,
109
+ from_table: :dim_contact,
110
+ match_column: :user_id,
111
+ to_value: record[:user_id],
112
+ if_not_found_then_insert: {contact_key: next_value_in_sequence("dim_contact_contact_key_seq"), name: "Unknown contact #{record[:user_id]}"}
113
+ end
114
+ output record
115
+ end
116
+
117
+ import :transformed_purchases do
118
+ into :fct_purchases
119
+ put :contact_key
120
+ put :amount
121
+ end
122
+ """
123
+ When I execute the definition
124
+ Then the process should exit successfully
125
+ And the "fct_purchases" table should contain:
126
+ | contact_key (i) | amount |
127
+ | 10 | 100 |
128
+ | 11 | 200 |
129
+ | 12 | 300 |
130
+ | 13 | 400 |
131
+ | 13 | 500 |
132
+ And the "dim_contact" table should contain:
133
+ | contact_key (i) | user_id | name |
134
+ | 10 | 1 | Alma |
135
+ | 11 | NA | Unknown contact NA |
136
+ | 12 | NA | Unknown contact NA |
137
+ | 13 | 2 | Unknown contact 2 |
@@ -0,0 +1,62 @@
1
+ Feature: Import a CSV file into the database with new dimension values inserted when not found during lookup
2
+
3
+ Scenario: Successful import
4
+ Given a database table called "dim_contact" with the following fields:
5
+ | field_name | field_type |
6
+ | contact_key | SERIAL |
7
+ | user_id | TEXT |
8
+ | name | TEXT |
9
+ And only the following rows in the "dim_contact" database table:
10
+ | contact_key (i) | user_id | name |
11
+ | 10 | 1 | Alma |
12
+ And the current value in sequence "dim_contact_contact_key_seq" is 10
13
+ And a database table called "fct_purchases" with the following fields:
14
+ | field_name | field_type |
15
+ | contact_key | INTEGER |
16
+ | amount | TEXT |
17
+ And a "purchases.csv" data file containing:
18
+ """
19
+ user_id,amount
20
+ 1,100
21
+ 2,200
22
+ 2,300
23
+ """
24
+ And the following definition:
25
+ """
26
+ source :purchases do
27
+ field :user_id, String
28
+ field :amount, String
29
+ end
30
+
31
+ source :transformed_purchases do
32
+ field :contact_key, Integer
33
+ field :amount, String
34
+ end
35
+
36
+ transform :purchases => :transformed_purchases do |record|
37
+ record[:contact_key] = lookup :contact_key,
38
+ from_table: :dim_contact,
39
+ match_column: :user_id,
40
+ to_value: record[:user_id],
41
+ if_not_found_then_insert: { contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
42
+ name: "Unknown contact #{record[:user_id]}" }
43
+ output record
44
+ end
45
+
46
+ import :transformed_purchases do
47
+ into :fct_purchases
48
+ put :contact_key
49
+ put :amount
50
+ end
51
+ """
52
+ When I execute the definition
53
+ Then the process should exit successfully
54
+ And the "fct_purchases" table should contain:
55
+ | contact_key (i) | amount |
56
+ | 10 | 100 |
57
+ | 11 | 200 |
58
+ | 11 | 300 |
59
+ And the "dim_contact" table should contain:
60
+ | contact_key (i) | user_id | name |
61
+ | 10 | 1 | Alma |
62
+ | 11 | 2 | Unknown contact 2 |
@@ -0,0 +1,125 @@
1
+ Feature: Import a CSV file into the database with IDs looked up from the database
2
+
3
+ Scenario: Successful import
4
+ Given a database table called "dim_contact" with the following fields:
5
+ | field_name | field_type |
6
+ | contact_key | SERIAL |
7
+ | user_id | TEXT |
8
+ | name | TEXT |
9
+ And only the following rows in the "dim_contact" database table:
10
+ | contact_key (i) | user_id | name |
11
+ | 10 | 1 | Alma |
12
+ | 20 | 2 | Korte |
13
+ And a database table called "fct_purchases" with the following fields:
14
+ | field_name | field_type |
15
+ | contact_key | INTEGER |
16
+ | amount | TEXT |
17
+ And a "purchases.csv" data file containing:
18
+ """
19
+ user_id,amount
20
+ 1,100
21
+ 2,200
22
+ 3,300
23
+ """
24
+ And the following definition:
25
+ """
26
+ source :purchases do
27
+ field :user_id, String
28
+ field :amount, String
29
+ end
30
+
31
+ source :transformed_purchases do
32
+ field :contact_key, Integer
33
+ field :amount, String
34
+ end
35
+
36
+ transform :purchases => :transformed_purchases do |record|
37
+ record[:contact_key] = lookup :contact_key,
38
+ from_table: :dim_contact,
39
+ match_column: :user_id,
40
+ to_value: record[:user_id],
41
+ if_not_found_then: -1
42
+ output record
43
+ end
44
+
45
+ import :transformed_purchases do
46
+ into :fct_purchases
47
+ put :contact_key => :contact_key
48
+ put :amount => :amount
49
+ end
50
+ """
51
+ When I execute the definition
52
+ Then the process should exit successfully
53
+ And the "fct_purchases" table should contain:
54
+ | contact_key (i) | amount |
55
+ | 10 | 100 |
56
+ | 20 | 200 |
57
+ | -1 | 300 |
58
+
59
+
60
+ Scenario: Multiple fields looked up by one key
61
+ Given a database table called "dim_contact" with the following fields:
62
+ | field_name | field_type |
63
+ | contact_key_1 | INTEGER |
64
+ | contact_key_2 | INTEGER |
65
+ | user_id | TEXT |
66
+ | name | TEXT |
67
+ And only the following rows in the "dim_contact" database table:
68
+ | contact_key_1 (i) | contact_key_2 (i) | user_id | name |
69
+ | 10 | 100 | 1 | Alma |
70
+ | 20 | 200 | 2 | Korte |
71
+ And a database table called "fct_purchases" with the following fields:
72
+ | field_name | field_type |
73
+ | contact_key_1 | INTEGER |
74
+ | contact_key_2 | INTEGER |
75
+ | amount | TEXT |
76
+ And a "purchases.csv" data file containing:
77
+ """
78
+ user_id,amount
79
+ 1,100
80
+ 2,200
81
+ 3,300
82
+ """
83
+ And the following definition:
84
+ """
85
+ source :purchases do
86
+ field :user_id, String
87
+ field :amount, String
88
+ end
89
+
90
+ source :transformed_purchases do
91
+ field :contact_key_1, Integer
92
+ field :contact_key_2, Integer
93
+ field :amount, String
94
+ end
95
+
96
+ transform :purchases => :transformed_purchases do |record|
97
+ record[:contact_key_1] = lookup :contact_key_1,
98
+ from_table: :dim_contact,
99
+ match_column: :user_id,
100
+ to_value: record[:user_id],
101
+ if_not_found_then: -1
102
+
103
+ record[:contact_key_2] = lookup :contact_key_2,
104
+ from_table: :dim_contact,
105
+ match_column: :user_id,
106
+ to_value: record[:user_id],
107
+ if_not_found_then: -2
108
+
109
+ output record
110
+ end
111
+
112
+ import :transformed_purchases do
113
+ into :fct_purchases
114
+ put :contact_key_1 => :contact_key_1
115
+ put :contact_key_2 => :contact_key_2
116
+ put :amount => :amount
117
+ end
118
+ """
119
+ When I execute the definition
120
+ Then the process should exit successfully
121
+ And the "fct_purchases" table should contain:
122
+ | contact_key_1 (i) | contact_key_2 (i) | amount |
123
+ | 10 | 100 | 100 |
124
+ | 20 | 200 | 200 |
125
+ | -1 | -2 | 300 |
@@ -0,0 +1,55 @@
1
+ Feature: Import a CSV file into the database with a split transformation
2
+
3
+ Scenario: Successful import
4
+ Given a database table called "dim_product" with the following fields:
5
+ | field_name | field_type |
6
+ | item | TEXT |
7
+ | title | TEXT |
8
+ | category1 | TEXT |
9
+ | category2 | TEXT |
10
+ | category3 | TEXT |
11
+ And a "products.csv" data file containing:
12
+ """
13
+ id,name,category
14
+ JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory > Ultra-subcategory
15
+ CDI-234,Another product name,Smart Insight > Cool stuff | 3dim > 2dim > 1dim
16
+ """
17
+ And the following definition:
18
+ """
19
+ source :products do
20
+ encoding "UTF-8"
21
+ delimiter ','
22
+ field :id, String
23
+ field :name, String
24
+ field :category, String
25
+ end
26
+
27
+ source :transformed_products do
28
+ field :id, String
29
+ field :name, String
30
+ field :main_category, String
31
+ field :sub_category, String
32
+ field :department, String
33
+ end
34
+
35
+ transform :products => :transformed_products do |record|
36
+ record.split_field :category, into: [:category], by: "|"
37
+ record.split_field :category, into: [:main_category, :sub_category, :department], by: ">"
38
+ output record
39
+ end
40
+
41
+ import :transformed_products do
42
+ into :dim_product
43
+ put :id => :item
44
+ put :name => :title
45
+ put :main_category => :category1
46
+ put :sub_category => :category2
47
+ put :department => :category3
48
+ end
49
+ """
50
+ When I execute the definition
51
+ Then the process should exit successfully
52
+ And the "dim_product" table should contain:
53
+ | item | title | category1 | category2 | category3 |
54
+ | JNI-123 | Just a product name | Main category | Subcategory | Sub-subcategory |
55
+ | CDI-234 | Another product name | Smart Insight | Cool stuff | Cool stuff |
@@ -0,0 +1,44 @@
1
+ Feature: Import multiple CSV files into the database without any transformations
2
+
3
+ Scenario: Successful import
4
+ Given a database table called "dim_product" with the following fields:
5
+ | field_name | field_type |
6
+ | item | TEXT |
7
+ | title | TEXT |
8
+ | category | TEXT |
9
+ And a "products1.csv" data file containing:
10
+ """
11
+ id,name,category
12
+ PROD-1,product name 1,Main category > Subcategory > Sub-subcategory
13
+ PROD-2,product name 2,Main category > Subcategory > Sub-subcategory
14
+ """
15
+ And a "products2.csv" data file containing:
16
+ """
17
+ id,name,category
18
+ PROD-3,product name 3,Main category > Subcategory > Sub-subcategory
19
+ PROD-4,product name 4,Main category > Subcategory > Sub-subcategory
20
+ """
21
+ And the following definition:
22
+ """
23
+ source :products do
24
+ file "products*.csv"
25
+ field :id, String
26
+ field :name, String
27
+ field :category, String
28
+ end
29
+
30
+ import :products do
31
+ into :dim_product
32
+ put :id => :item
33
+ put :name => :title
34
+ put :category => :category
35
+ end
36
+ """
37
+ When I execute the definition
38
+ Then the process should exit successfully
39
+ And the "dim_product" table should contain:
40
+ | item | title | category |
41
+ | PROD-1 | product name 1 | Main category > Subcategory > Sub-subcategory |
42
+ | PROD-2 | product name 2 | Main category > Subcategory > Sub-subcategory |
43
+ | PROD-3 | product name 3 | Main category > Subcategory > Sub-subcategory |
44
+ | PROD-4 | product name 4 | Main category > Subcategory > Sub-subcategory |
@@ -0,0 +1,53 @@
1
+ Feature: Import data and assign a load id (audit information) from a sequence to all records
2
+
3
+ Scenario: Successful import
4
+ Given a database table called "dim_product" with the following fields:
5
+ | field_name | field_type |
6
+ | load_id | INTEGER |
7
+ | item | TEXT |
8
+ | title | TEXT |
9
+ And a sequence called "some_sequence" starting from 33
10
+ And a "products.csv" data file containing:
11
+ """
12
+ id,name,category
13
+ JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory > Ultra-subcategory
14
+ CDI-234,Another product name,Smart Insight > Cool stuff | 3dim > 2dim > 1dim
15
+ """
16
+ And the following definition:
17
+ """
18
+ LOAD_ID = sequence("some_sequence").next_value
19
+
20
+ source :products do
21
+ encoding "UTF-8"
22
+ delimiter ','
23
+
24
+ field :id, String
25
+ field :name, String
26
+ end
27
+
28
+ source :transformed_products do
29
+ field :load_id, Integer
30
+
31
+ field :id, String
32
+ field :name, String
33
+ end
34
+
35
+ transform :products => :transformed_products do |record|
36
+ record[:load_id] = LOAD_ID
37
+ output record
38
+ end
39
+
40
+ import :transformed_products do
41
+ into :dim_product
42
+
43
+ put :load_id
44
+ put :id => :item
45
+ put :name => :title
46
+ end
47
+ """
48
+ When I execute the definition
49
+ Then the process should exit successfully
50
+ And the "dim_product" table should contain:
51
+ | load_id (i) | item | title |
52
+ | 34 | JNI-123 | Just a product name |
53
+ | 34 | CDI-234 | Another product name |
@@ -0,0 +1,64 @@
1
+ Feature: Import a CSV file into the database with IDs looked up from multiple columns of the database
2
+
3
+ Scenario: Successful import
4
+ Given a database table called "dim_contact" with the following fields:
5
+ | field_name | field_type |
6
+ | contact_key | SERIAL |
7
+ | user_id_part_1 | TEXT |
8
+ | user_id_part_2 | TEXT |
9
+ | name | TEXT |
10
+ And only the following rows in the "dim_contact" database table:
11
+ | contact_key (i) | user_id_part_1 | user_id_part_2 | name |
12
+ | 11 | 1 | 1 | Alma |
13
+ | 12 | 1 | 2 | Korte |
14
+ | 21 | 2 | 1 | Szilva |
15
+ | 22 | 2 | 2 | Barack |
16
+ And a database table called "fct_purchases" with the following fields:
17
+ | field_name | field_type |
18
+ | contact_key | INTEGER |
19
+ | amount | TEXT |
20
+ And a "purchases.csv" data file containing:
21
+ """
22
+ user_id_1,user_id_2,amount
23
+ 1,1,100
24
+ 1,2,200
25
+ 2,1,300
26
+ 2,2,400
27
+ 3,1,500
28
+ """
29
+ And the following definition:
30
+ """
31
+ source :purchases do
32
+ field :user_id_1, String
33
+ field :user_id_2, String
34
+ field :amount, String
35
+ end
36
+
37
+ source :transformed_purchases do
38
+ field :contact_key, Integer
39
+ field :amount, String
40
+ end
41
+
42
+ transform :purchases => :transformed_purchases do |record|
43
+ record[:contact_key] = lookup :contact_key,
44
+ from_table: :dim_contact,
45
+ match: { :user_id_part_1 => record[:user_id_1], :user_id_part_2 => record[:user_id_2] },
46
+ if_not_found_then: -1
47
+ output record
48
+ end
49
+
50
+ import :transformed_purchases do
51
+ into :fct_purchases
52
+ put :contact_key => :contact_key
53
+ put :amount => :amount
54
+ end
55
+ """
56
+ When I execute the definition
57
+ Then the process should exit successfully
58
+ And the "fct_purchases" table should contain:
59
+ | contact_key (i) | amount |
60
+ | 11 | 100 |
61
+ | 12 | 200 |
62
+ | 21 | 300 |
63
+ | 22 | 400 |
64
+ | -1 | 500 |
@@ -0,0 +1,56 @@
1
+ Feature: Read data from a source file
2
+
3
+ Scenario:
4
+ Given a "forms.csv" data file containing:
5
+ """
6
+ id,name
7
+ 1,Landing form
8
+ 2,Other form
9
+ """
10
+ And a "contacts_extract.csv" data file containing:
11
+ """
12
+ id,created,form_id
13
+ 1,2001-01-01,1
14
+ 2,2002-02-02,2
15
+ 3,2003-03-03,1
16
+ """
17
+ And the following definition:
18
+ """
19
+ source :forms do
20
+ field :id, Integer
21
+ field :name, String
22
+ end
23
+
24
+ source :contacts_extract do
25
+ field :id, Integer
26
+ field :created, String
27
+ field :form_id, Integer
28
+ end
29
+
30
+ source :contacts do
31
+ field :id, Integer
32
+ field :created, String
33
+ field :form, String
34
+ end
35
+
36
+ form_mapping = {}
37
+
38
+ read :forms do |record|
39
+ form_mapping[record[:id]] = record[:name]
40
+ end
41
+
42
+ transform :contacts_extract => :contacts do |record|
43
+ record[:form] = form_mapping[record[:form_id]]
44
+ output record
45
+ end
46
+
47
+ """
48
+ When I execute the definition
49
+ Then the process should exit successfully
50
+ And there should be a "contacts.csv" data file in the upload directory containing:
51
+ """
52
+ id,created,form
53
+ 1,2001-01-01,Landing form
54
+ 2,2002-02-02,Other form
55
+ 3,2003-03-03,Landing form
56
+ """
@@ -0,0 +1,44 @@
1
+ Feature: Remove source files
2
+
3
+ Scenario:
4
+ Given a "contacts_extract_1.csv" data file containing:
5
+ """
6
+ """
7
+ And a "contacts_extract_2.csv" data file containing:
8
+ """
9
+ """
10
+ And a "clicks_extract_1.csv" data file containing:
11
+ """
12
+ """
13
+ And a "products.csv" data file containing:
14
+ """
15
+ """
16
+ And the following definition:
17
+ """
18
+ source :contacts_extract do
19
+ file "contacts_extract_*.csv"
20
+ end
21
+
22
+ source :clicks_extract do
23
+ file "clicks_extract_*.csv"
24
+ end
25
+
26
+ source :products do
27
+ file "products.csv"
28
+ end
29
+
30
+ source :products_transformed do end
31
+
32
+ transform :products => :products_transformed do |record|
33
+ output record
34
+ end
35
+
36
+ remove :contacts_extract, :clicks_extract
37
+ """
38
+ When I execute the definition
39
+ Then the process should exit successfully
40
+ And the upload directory should contain the following files:
41
+ | filename |
42
+ | definition.rb |
43
+ | products.csv |
44
+ | products_transformed.csv |