cranium 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/cranium.gemspec +1 -1
- data/features/import/import_csv_with_always_inserting_new_dimension_keys.feature +62 -1
- data/features/step_definitions/database_table_steps.rb +5 -0
- data/features/step_definitions/file_steps.rb +7 -0
- data/lib/cranium/dimension_manager.rb +5 -1
- data/lib/cranium/test_framework/database_table.rb +6 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 95d417150867a455805bdc4b21372050ee81f8cc
|
4
|
+
data.tar.gz: 6a1ee615b84a1a6eefd83e4590d947e408fb1244
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ee2e4a162601873d76cc96b1d88b02b4e6348019f04a899d68ba6c1723a9d1f678f51314bdc748e30357f7d04d8026330c1d428761c7c65506d46483bc45e3e
|
7
|
+
data.tar.gz: 114072465d737d6d8fd18ea884d7ed3c1f5b26bfc4dbdf3c2da2e6c6fb0994d5922b75502de2c0cbe75bb60042d3a1c90f788258403a580098175e9e7f108c23
|
data/cranium.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = 'cranium'
|
3
|
-
spec.version = '0.2.1'
|
3
|
+
spec.version = '0.3.0'
|
4
4
|
spec.authors = ['Emarsys Technologies']
|
5
5
|
spec.email = ['smart-insight-dev@emarsys.com']
|
6
6
|
spec.description = %q{Provides Extract, Transform and Load functionality for loading data from CSV files to a Greenplum database.}
|
@@ -62,7 +62,7 @@ Feature: Import a CSV file into the database with new dimension values always in
|
|
62
62
|
|
63
63
|
|
64
64
|
Scenario: Example use case for the insert
|
65
|
-
If purchases made by a predefined contact identifier (NA in this case) do not look for it insert
|
65
|
+
If purchases made by a predefined contact identifier (NA in this case) do not look for it insert.
|
66
66
|
Otherwise use lookup to find or create that contact
|
67
67
|
|
68
68
|
Given a database table called "dim_contact" with the following fields:
|
@@ -135,3 +135,64 @@ Feature: Import a CSV file into the database with new dimension values always in
|
|
135
135
|
| 11 | NA | Unknown contact NA |
|
136
136
|
| 12 | NA | Unknown contact NA |
|
137
137
|
| 13 | 2 | Unknown contact 2 |
|
138
|
+
|
139
|
+
|
140
|
+
Scenario: Successful import with a large number of users (contacts) to insert
|
141
|
+
Given a database table called "dim_contact" with the following fields:
|
142
|
+
| field_name | field_type |
|
143
|
+
| contact_key | SERIAL |
|
144
|
+
| user_id | TEXT |
|
145
|
+
| name | TEXT |
|
146
|
+
| load_id | NUMERIC |
|
147
|
+
| load_date | DATE |
|
148
|
+
| is_generated | BOOLEAN |
|
149
|
+
And only the following rows in the "dim_contact" database table:
|
150
|
+
| contact_key (i) | user_id | name |
|
151
|
+
| 10 | 1 | Alma |
|
152
|
+
And the current value in sequence "dim_contact_contact_key_seq" is 10
|
153
|
+
And a database table called "fct_purchases" with the following fields:
|
154
|
+
| field_name | field_type |
|
155
|
+
| contact_key | INTEGER |
|
156
|
+
| amount | TEXT |
|
157
|
+
And a 500_000 lines long "purchases.csv" data file containing rows like:
|
158
|
+
"""
|
159
|
+
user_id,amount
|
160
|
+
NA,100
|
161
|
+
"""
|
162
|
+
And the following definition:
|
163
|
+
"""
|
164
|
+
source :purchases do
|
165
|
+
field :user_id, String
|
166
|
+
field :amount, String
|
167
|
+
end
|
168
|
+
|
169
|
+
source :transformed_purchases do
|
170
|
+
field :contact_key, Integer
|
171
|
+
field :amount, String
|
172
|
+
end
|
173
|
+
|
174
|
+
now = Time.now
|
175
|
+
transform :purchases => :transformed_purchases do |record|
|
176
|
+
record[:contact_key] = insert :contact_key,
|
177
|
+
table: :dim_contact,
|
178
|
+
record: {
|
179
|
+
load_id: 1,
|
180
|
+
load_date: now,
|
181
|
+
contact_key: next_value_in_sequence("dim_contact_contact_key_seq"),
|
182
|
+
user_id: record[:user_id],
|
183
|
+
name: "Unknown contact #{record[:user_id]}",
|
184
|
+
is_generated: true
|
185
|
+
}
|
186
|
+
output record
|
187
|
+
end
|
188
|
+
|
189
|
+
import :transformed_purchases do
|
190
|
+
into :fct_purchases
|
191
|
+
put :contact_key
|
192
|
+
put :amount
|
193
|
+
end
|
194
|
+
"""
|
195
|
+
When I execute the definition
|
196
|
+
Then the process should exit successfully
|
197
|
+
And the "fct_purchases" table should contain 500_000 purchases
|
198
|
+
And the "dim_contact" table should contain 500_001 contacts
|
@@ -38,3 +38,8 @@ Then(/^the "([^"]*)" table should contain:$/) do |table_name, data|
|
|
38
38
|
|
39
39
|
expect(database_table(table_name).content(data.fields)).to match_array expected_data
|
40
40
|
end
|
41
|
+
|
42
|
+
|
43
|
+
Then(/^the "([^"]*)" table should contain ([\d_]+) .+$/) do |table_name, count|
|
44
|
+
expect(database_table(table_name).count).to eq count.to_i
|
45
|
+
end
|
@@ -13,6 +13,13 @@ Given /^an? "([^"]*)" data file containing:$/ do |file_name, content|
|
|
13
13
|
end
|
14
14
|
|
15
15
|
|
16
|
+
Given /^an? ([\d_]+) lines long "([^"]*)" data file containing rows like:$/ do |lines_count, file_name, content|
|
17
|
+
lines = content.split("\n")
|
18
|
+
|
19
|
+
upload_directory.save_file file_name, "#{lines.first}\n" + "#{lines.last}\n" * lines_count.to_i
|
20
|
+
end
|
21
|
+
|
22
|
+
|
16
23
|
Given /^the "([^"]*)" file is deleted$/ do |file_name|
|
17
24
|
upload_directory.delete_file file_name
|
18
25
|
end
|
@@ -36,7 +36,7 @@ class Cranium::DimensionManager
|
|
36
36
|
|
37
37
|
|
38
38
|
def flush
|
39
|
-
db.multi_insert(@rows) unless @rows.empty?
|
39
|
+
db.multi_insert(@rows, slice: INSERT_BATCH_SIZE) unless @rows.empty?
|
40
40
|
@rows = []
|
41
41
|
end
|
42
42
|
|
@@ -44,6 +44,10 @@ class Cranium::DimensionManager
|
|
44
44
|
|
45
45
|
private
|
46
46
|
|
47
|
+
INSERT_BATCH_SIZE = 100_000.freeze
|
48
|
+
|
49
|
+
|
50
|
+
|
47
51
|
def to_multi_key_cache(table_data)
|
48
52
|
Hash[table_data.map { |row| [row[0..-2], row.last] }]
|
49
53
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cranium
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Emarsys Technologies
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pg
|