cranium 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +3 -0
- data/Vagrantfile +24 -0
- data/bin/cranium +9 -0
- data/config/cucumber.yml +9 -0
- data/cranium.gemspec +26 -0
- data/db/setup.sql +8 -0
- data/docker-compose.yml +8 -0
- data/examples/config.rb +14 -0
- data/examples/deduplication.rb +27 -0
- data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb +26 -0
- data/examples/incremental_extract.rb +17 -0
- data/examples/lookup_with_multiple_fields.rb +25 -0
- data/features/archive.feature +49 -0
- data/features/extract/incremental_extract.feature +56 -0
- data/features/extract/simple_extract.feature +85 -0
- data/features/import/import_csv_to_database_as_delta.feature +38 -0
- data/features/import/import_csv_to_database_with_delete_insert_merging.feature +51 -0
- data/features/import/import_csv_to_database_with_truncate_insert.feature +49 -0
- data/features/import/import_csv_to_database_with_update_merging.feature +46 -0
- data/features/import/import_csv_with_always_inserting_new_dimension_keys.feature +137 -0
- data/features/import/import_csv_with_field_lookup_inserting_new_dimension_keys.feature +62 -0
- data/features/import/import_csv_with_field_lookup_transformation.feature +125 -0
- data/features/import/import_csv_with_transformation.feature +55 -0
- data/features/import/import_multiple_csv_files_without_transformations.feature +44 -0
- data/features/import/import_with_load_id_from_sequence.feature +53 -0
- data/features/import/import_with_lookup_from_multiple_fields.feature +64 -0
- data/features/read.feature +56 -0
- data/features/remove.feature +44 -0
- data/features/restore_database_connection.feature +55 -0
- data/features/step_definitions/database_table_steps.rb +40 -0
- data/features/step_definitions/definition_steps.rb +3 -0
- data/features/step_definitions/execution_steps.rb +23 -0
- data/features/step_definitions/file_steps.rb +39 -0
- data/features/support/class_extensions.rb +24 -0
- data/features/support/env.rb +27 -0
- data/features/support/randomize.rb +22 -0
- data/features/support/stop_on_first_error.rb +5 -0
- data/features/transform/deduplication.feature +37 -0
- data/features/transform/empty_transformation.feature +72 -0
- data/features/transform/join.feature +180 -0
- data/features/transform/join_multiple_files_into_one_output_file.feature +46 -0
- data/features/transform/output_rows.feature +70 -0
- data/features/transform/projection.feature +34 -0
- data/features/transform/raw_ruby_transformation.feature +69 -0
- data/features/transform/split_field.feature +39 -0
- data/lib/cranium/application.rb +104 -0
- data/lib/cranium/archiver.rb +36 -0
- data/lib/cranium/attribute_dsl.rb +43 -0
- data/lib/cranium/command_line_options.rb +27 -0
- data/lib/cranium/configuration.rb +33 -0
- data/lib/cranium/data_importer.rb +35 -0
- data/lib/cranium/data_reader.rb +48 -0
- data/lib/cranium/data_transformer.rb +126 -0
- data/lib/cranium/database.rb +36 -0
- data/lib/cranium/definition_registry.rb +21 -0
- data/lib/cranium/dimension_manager.rb +65 -0
- data/lib/cranium/dsl/database_definition.rb +23 -0
- data/lib/cranium/dsl/extract_definition.rb +28 -0
- data/lib/cranium/dsl/import_definition.rb +50 -0
- data/lib/cranium/dsl/source_definition.rb +67 -0
- data/lib/cranium/dsl.rb +100 -0
- data/lib/cranium/extensions/file.rb +7 -0
- data/lib/cranium/extensions/sequel_greenplum.rb +30 -0
- data/lib/cranium/external_table.rb +75 -0
- data/lib/cranium/extract/data_extractor.rb +11 -0
- data/lib/cranium/extract/storage.rb +57 -0
- data/lib/cranium/extract/strategy/base.rb +27 -0
- data/lib/cranium/extract/strategy/incremental.rb +16 -0
- data/lib/cranium/extract/strategy/simple.rb +9 -0
- data/lib/cranium/extract/strategy.rb +7 -0
- data/lib/cranium/extract.rb +7 -0
- data/lib/cranium/import_strategy/base.rb +55 -0
- data/lib/cranium/import_strategy/delete_insert.rb +40 -0
- data/lib/cranium/import_strategy/delta.rb +8 -0
- data/lib/cranium/import_strategy/merge.rb +50 -0
- data/lib/cranium/import_strategy/truncate_insert.rb +19 -0
- data/lib/cranium/import_strategy.rb +9 -0
- data/lib/cranium/logging.rb +15 -0
- data/lib/cranium/profiling.rb +13 -0
- data/lib/cranium/progress_output.rb +37 -0
- data/lib/cranium/sequel/hash.rb +32 -0
- data/lib/cranium/sequel.rb +5 -0
- data/lib/cranium/source_registry.rb +21 -0
- data/lib/cranium/test_framework/cucumber_table.rb +140 -0
- data/lib/cranium/test_framework/database_entity.rb +29 -0
- data/lib/cranium/test_framework/database_sequence.rb +16 -0
- data/lib/cranium/test_framework/database_table.rb +33 -0
- data/lib/cranium/test_framework/upload_directory.rb +39 -0
- data/lib/cranium/test_framework/world.rb +66 -0
- data/lib/cranium/test_framework.rb +10 -0
- data/lib/cranium/transformation/duplication_index.rb +42 -0
- data/lib/cranium/transformation/index.rb +83 -0
- data/lib/cranium/transformation/join.rb +141 -0
- data/lib/cranium/transformation/sequence.rb +42 -0
- data/lib/cranium/transformation.rb +8 -0
- data/lib/cranium/transformation_record.rb +45 -0
- data/lib/cranium.rb +57 -0
- data/rake/test.rake +31 -0
- data/spec/cranium/application_spec.rb +166 -0
- data/spec/cranium/archiver_spec.rb +44 -0
- data/spec/cranium/command_line_options_spec.rb +32 -0
- data/spec/cranium/configuration_spec.rb +31 -0
- data/spec/cranium/data_importer_spec.rb +55 -0
- data/spec/cranium/data_transformer_spec.rb +16 -0
- data/spec/cranium/database_spec.rb +69 -0
- data/spec/cranium/definition_registry_spec.rb +45 -0
- data/spec/cranium/dimension_manager_spec.rb +63 -0
- data/spec/cranium/dsl/database_definition_spec.rb +23 -0
- data/spec/cranium/dsl/extract_definition_spec.rb +76 -0
- data/spec/cranium/dsl/import_definition_spec.rb +153 -0
- data/spec/cranium/dsl/source_definition_spec.rb +84 -0
- data/spec/cranium/dsl_spec.rb +119 -0
- data/spec/cranium/external_table_spec.rb +71 -0
- data/spec/cranium/extract/storage_spec.rb +125 -0
- data/spec/cranium/logging_spec.rb +37 -0
- data/spec/cranium/sequel/hash_spec.rb +56 -0
- data/spec/cranium/source_registry_spec.rb +31 -0
- data/spec/cranium/test_framework/cucumber_table_spec.rb +144 -0
- data/spec/cranium/transformation/duplication_index_spec.rb +75 -0
- data/spec/cranium/transformation/index_spec.rb +178 -0
- data/spec/cranium/transformation/join_spec.rb +43 -0
- data/spec/cranium/transformation/sequence_spec.rb +83 -0
- data/spec/cranium/transformation_record_spec.rb +78 -0
- data/spec/cranium_spec.rb +53 -0
- data/spec/spec_helper.rb +1 -0
- metadata +362 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
Feature: Raw Ruby transformation
|
2
|
+
|
3
|
+
Scenario: A transform block can use the record as a Hash
|
4
|
+
Given a "products.csv" data file containing:
|
5
|
+
"""
|
6
|
+
id,name,category
|
7
|
+
JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory
|
8
|
+
CDI-234,Another product name,Smart Insight > Cool stuff > Scripts
|
9
|
+
"""
|
10
|
+
And the following definition:
|
11
|
+
"""
|
12
|
+
source :products do
|
13
|
+
field :id, String
|
14
|
+
field :name, String
|
15
|
+
field :category, String
|
16
|
+
end
|
17
|
+
|
18
|
+
source :transformed_products do
|
19
|
+
field :item, String
|
20
|
+
field :title, String
|
21
|
+
field :category, String
|
22
|
+
end
|
23
|
+
|
24
|
+
transform :products => :transformed_products do |record|
|
25
|
+
record[:item] = "*#{record[:id]}*"
|
26
|
+
record[:title] = record[:name].chars.first
|
27
|
+
output record
|
28
|
+
end
|
29
|
+
"""
|
30
|
+
When I execute the definition
|
31
|
+
Then the process should exit successfully
|
32
|
+
And there should be a "transformed_products.csv" data file in the upload directory containing:
|
33
|
+
"""
|
34
|
+
item,title,category
|
35
|
+
*JNI-123*,J,Main category > Subcategory > Sub-subcategory
|
36
|
+
*CDI-234*,A,Smart Insight > Cool stuff > Scripts
|
37
|
+
"""
|
38
|
+
|
39
|
+
|
40
|
+
Scenario: Records can be skipped
|
41
|
+
Given a "products.csv" data file containing:
|
42
|
+
"""
|
43
|
+
id
|
44
|
+
1
|
45
|
+
2
|
46
|
+
3
|
47
|
+
"""
|
48
|
+
And the following definition:
|
49
|
+
"""
|
50
|
+
source :products do
|
51
|
+
field :id, Integer
|
52
|
+
end
|
53
|
+
|
54
|
+
source :transformed_products do
|
55
|
+
field :id, Integer
|
56
|
+
end
|
57
|
+
|
58
|
+
transform :products => :transformed_products do |record|
|
59
|
+
output record unless "2" == record[:id]
|
60
|
+
end
|
61
|
+
"""
|
62
|
+
When I execute the definition
|
63
|
+
Then the process should exit successfully
|
64
|
+
And there should be a "transformed_products.csv" data file in the upload directory containing:
|
65
|
+
"""
|
66
|
+
id
|
67
|
+
1
|
68
|
+
3
|
69
|
+
"""
|
@@ -0,0 +1,39 @@
|
|
1
|
+
Feature: Split field
|
2
|
+
|
3
|
+
Scenario: A single field can be split into multiple fields
|
4
|
+
Given a "products.csv" data file containing:
|
5
|
+
"""
|
6
|
+
id,name,category
|
7
|
+
JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory > Ultra-subcategory
|
8
|
+
CDI-234,Another product name,Smart Insight > Cool stuff | 3dim > 2dim > 1dim
|
9
|
+
"""
|
10
|
+
And the following definition:
|
11
|
+
"""
|
12
|
+
source :products do
|
13
|
+
field :item, String
|
14
|
+
field :title, String
|
15
|
+
field :category, String
|
16
|
+
end
|
17
|
+
|
18
|
+
source :transformed_products do
|
19
|
+
field :item, String
|
20
|
+
field :title, String
|
21
|
+
field :main_category, String
|
22
|
+
field :sub_category, String
|
23
|
+
field :department, String
|
24
|
+
end
|
25
|
+
|
26
|
+
transform :products => :transformed_products do |record|
|
27
|
+
record.split_field :category, into: [:category], by: "|"
|
28
|
+
record.split_field :category, into: [:main_category, :sub_category, :department], by: ">"
|
29
|
+
output record
|
30
|
+
end
|
31
|
+
"""
|
32
|
+
When I execute the definition
|
33
|
+
Then the process should exit successfully
|
34
|
+
And there should be a "transformed_products.csv" data file in the upload directory containing:
|
35
|
+
"""
|
36
|
+
item,title,main_category,sub_category,department
|
37
|
+
JNI-123,Just a product name,Main category,Subcategory,Sub-subcategory
|
38
|
+
CDI-234,Another product name,Smart Insight,Cool stuff,Cool stuff
|
39
|
+
"""
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# Application object for a Cranium run. It owns the source registry, the
# parsed command line options and the registered hooks, and it loads the
# process file named by the `load` cranium argument.
class Cranium::Application

  include Cranium::Logging

  attr_reader :sources

  # @param arguments the raw command line arguments; they are handed as-is to
  #   Cranium::CommandLineOptions
  def initialize(arguments)
    @sources = Cranium::SourceRegistry.new
    @hooks = {}

    @options = Cranium::CommandLineOptions.new arguments
  end

  # @return whatever the options object reports as its load arguments
  def load_arguments
    options.load_arguments
  end

  # @return whatever the options object reports as its cranium arguments
  def cranium_arguments
    options.cranium_arguments
  end

  # Registers a source definition in the source registry and immediately
  # asks the registered definition to resolve its files.
  #
  # @param name  name of the source
  # @param block definition block forwarded to the registry
  # @return the result of `resolve_files` on the registered definition
  def register_source(name, &block)
    @sources.register_source(name, &block).resolve_files
  end

  # Validates and loads the process file, then runs the :after hooks.
  # The process exits with status 1 (via the validation helpers) when no file
  # was specified or the file does not exist.
  def run
    process_file = validate_file options.cranium_arguments[:load]

    begin
      load process_file
    rescue Exception => ex
      # Deliberately broad: every failure is logged, and it is always
      # re-raised, so nothing is swallowed here.
      log :error, ex
      raise
    ensure
      apply_hook :after
    end
  end

  # Registers a block under the :after_import hook name.
  def after_import(&block)
    register_hook :after_import, &block
  end

  # Appends a block to the list of hooks stored under `name`.
  #
  # @return [Array] the hooks currently registered under `name`
  def register_hook(name, &block)
    (@hooks[name] ||= []) << block
  end

  # Calls every block registered under `name`, in registration order.
  # Does nothing (and returns nil) when no hook was registered for `name`.
  def apply_hook(name)
    hooks = @hooks[name]
    return if hooks.nil?

    hooks.each { |hook| hook.call }
  end

  private

  attr_reader :options

  # @return the given file name once both checks below have passed
  def validate_file(load_file)
    exit_if_no_file_specified load_file
    exit_if_no_such_file_exists load_file
    load_file
  end

  # Prints an error to stderr and exits with status 1 for a nil or empty name.
  def exit_if_no_file_specified(file)
    return unless file.nil? || file.empty?

    $stderr.puts "ERROR: No file specified"
    exit 1
  end

  # Prints an error to stderr and exits with status 1 when the file is missing.
  def exit_if_no_such_file_exists(file)
    # File.exists? was deprecated and then removed in Ruby 3.2; File.exist?
    # is the supported spelling and behaves the same on older Rubies.
    return if File.exist? file

    $stderr.puts "ERROR: File '#{file}' does not exist"
    exit 1
  end

end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
# File housekeeping for files located in the configured upload path:
# they can be moved into the archive directory or deleted.
module Cranium::Archiver

  # Moves each named file from the upload path into the archive directory,
  # creating that directory first when it does not exist yet. Archived files
  # are renamed to "<timestamp>_<original name>".
  #
  # @param files [Array<String>] file names relative to the upload path
  def self.archive(*files)
    create_archive_directory
    archive_files files
  end

  # Deletes each named file from the upload path.
  #
  # @param files [Array<String>] file names relative to the upload path
  def self.remove(*files)
    files.each do |file_name|
      FileUtils.rm File.join(Cranium.configuration.upload_path, file_name)
    end
  end

  private

  # NOTE: a bare `private` does not apply to methods defined with `def self.`,
  # so the two helpers below are still callable from outside the module.

  # Creates the configured archive directory (including parents) if missing.
  def self.create_archive_directory
    # Dir.exists? was deprecated and then removed in Ruby 3.2; Dir.exist? is
    # the supported spelling.
    FileUtils.mkpath Cranium.configuration.archive_directory unless Dir.exist? Cranium.configuration.archive_directory
  end

  # Moves the files into the archive directory. All files of one call share
  # the same timestamp prefix, taken once before the loop.
  def self.archive_files(files)
    archive_datetime = Time.now.strftime("%Y-%m-%d_%Hh%Mm%Ss")
    files.each do |file_name|
      FileUtils.mv File.join(Cranium.configuration.upload_path, file_name),
                   File.join(Cranium.configuration.archive_directory, "#{archive_datetime}_#{file_name}")
    end
  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Macros that generate combined getter/setter methods for DSL-style classes.
# Each generated method returns the stored value when called without
# arguments and stores a new value when called with arguments.
module Cranium::AttributeDSL

  # Defines `name(*args)`: with no arguments it returns @name; otherwise it
  # stores the first argument in @name.
  def define_attribute(name)
    # __FILE__/__LINE__ make backtraces from the generated method point here.
    class_eval <<-RUBY, __FILE__, __LINE__ + 1
      def #{name}(*args)
        return @#{name} if args.count.zero?

        @#{name} = args.first
      end
    RUBY
  end

  # Defines `name(*args)`: with no arguments it returns @name, or an empty
  # array when @name is unset; otherwise it stores all arguments as an array.
  def define_array_attribute(name)
    class_eval <<-RUBY, __FILE__, __LINE__ + 1
      def #{name}(*args)
        return @#{name} || [] if args.count.zero?

        @#{name} = args
      end
    RUBY
  end

  # Defines `name(*args)`: with no arguments it returns @name coerced to
  # true/false; otherwise it stores the truthiness of the first argument.
  def define_boolean_attribute(name)
    class_eval <<-RUBY, __FILE__, __LINE__ + 1
      def #{name}(*args)
        return !!@#{name} if args.count.zero?

        # Coerce the value that was passed, not the args array itself: an
        # Array is always truthy, so `!!args` made `#{name} false` store true.
        @#{name} = !!args.first
      end
    RUBY
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "slop"
|
2
|
+
|
3
|
+
# Parses the command line with Slop and splits the resulting options into
# those whose name starts with "cranium-" and all the others.
class Cranium::CommandLineOptions

  # @param arguments the raw command line arguments handed to Slop.parse
  def initialize(arguments)
    parsed_options = Slop.parse(arguments, autocreate: true)
    @arguments = parsed_options.to_hash
  end

  # Options whose name starts with "cranium-", keyed by the remainder of the
  # name as a Symbol. The result is memoized.
  #
  # @return [Hash]
  def cranium_arguments
    @cranium_arguments ||= arguments.each_with_object({}) do |(option, value), selected|
      selected[$1.to_sym] = value if option.to_s =~ /\Acranium\-(.*)/
    end
  end

  # Options whose name does not start with "cranium-", with their keys left
  # untouched. The result is memoized.
  #
  # @return [Hash]
  def load_arguments
    @load_arguments ||= arguments.each_with_object({}) do |(option, value), selected|
      next if option.to_s =~ /\Acranium\-(.*)/

      selected[option] = value
    end
  end

  private

  attr_reader :arguments

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# Plain settings holder for a Cranium run.
class Cranium::Configuration

  # Directory name used for the default storage directory (see #storage_directory).
  STORAGE_DIRECTORY_NAME = ".cranium"

  attr_writer :storage_directory

  attr_accessor :archive_directory,
                :greenplum_connection_string,
                :mysql_connection_string,
                :upload_directory,
                :gpfdist_home_directory,
                :gpfdist_url,
                :loggers

  # Starts with an empty logger list; every other setting is unset.
  def initialize
    @loggers = []
  end

  # @return [String] the upload directory joined onto the gpfdist home directory
  def upload_path
    File.join(gpfdist_home_directory, upload_directory)
  end

  # @return the explicitly assigned storage directory, or, when none was
  #   assigned, STORAGE_DIRECTORY_NAME inside the upload path
  def storage_directory
    if @storage_directory.nil?
      File.join(upload_path, STORAGE_DIRECTORY_NAME)
    else
      @storage_directory
    end
  end

end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Runs an import definition with the import strategy that matches its settings.
class Cranium::DataImporter

  include Cranium::Logging

  # Executes the import and the :after_import hooks inside one database
  # transaction, then records the number of imported items as a metric under
  # the definition's name.
  #
  # @param import_definition the definition to import
  def import(import_definition)
    imported_count = 0

    Cranium::Database.connection.transaction do
      strategy = importer_for_definition(import_definition)
      imported_count = strategy.import
      Cranium.application.apply_hook(:after_import)
    end

    record_metric import_definition.name, imported_count.to_s
  end

  private

  # Picks the strategy for the definition: Merge when merge fields are set,
  # DeleteInsert when delete_insert_on is set, TruncateInsert when
  # truncate_insert is set, and Delta otherwise.
  #
  # @raise [StandardError] when more than one of those settings is in use
  def importer_for_definition(import_definition)
    merging = !import_definition.merge_fields.empty?
    delete_inserting = !import_definition.delete_insert_on.empty?
    truncating = import_definition.truncate_insert

    if [merging, delete_inserting, truncating].count(true) > 1
      raise StandardError, "Import should not combine merge_on, delete_insert_on and truncate_insert settings"
    end

    strategy_class =
      if merging
        Cranium::ImportStrategy::Merge
      elsif delete_inserting
        Cranium::ImportStrategy::DeleteInsert
      elsif truncating
        Cranium::ImportStrategy::TruncateInsert
      else
        Cranium::ImportStrategy::Delta
      end

    strategy_class.new(import_definition)
  end

end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'cranium/extensions/file'
|
3
|
+
|
4
|
+
# Reads the CSV files of a source definition and hands every data row to a
# block as a Hash keyed by the source's field names.
class Cranium::DataReader

  # @param source the source definition to read; its `fields` keys are used
  #   as record keys, in order
  def initialize(source)
    @source = source
    @source_field_names = @source.fields.keys
  end

  # Reads every file of the source (looked up inside the configured upload
  # path) and evaluates the block once per data row.
  #
  # @yieldparam record [Hash] field name => raw CSV value
  def read(&block)
    @source.files.each do |input_file|
      read_input_file File.join(Cranium.configuration.upload_path, input_file), block
    end
  end

  private

  # Iterates over one CSV file, skipping its first row, and reports progress
  # through Cranium::ProgressOutput.
  def read_input_file(input_file, read_block)
    Cranium::ProgressOutput.show_progress File.basename(input_file), File.line_count(input_file) do |progress_bar|
      line_number = 0
      # The options must be passed as keywords: since Ruby 3 a positional
      # Hash would be taken for CSV.foreach's `mode` argument.
      CSV.foreach(input_file, **csv_read_options_for(@source)) do |row|
        line_number += 1
        next if line_number == 1 # the first row holds the column headers

        # Values are mapped to field names by position, not by header text.
        record = Hash[@source_field_names.zip row]
        # The block runs with this reader as `self`.
        instance_exec record, &read_block

        progress_bar.inc
      end
    end
  end

  # @return [Hash] CSV parsing options taken from the source definition
  def csv_read_options_for(source_definition)
    {
      encoding: source_definition.encoding,
      col_sep: source_definition.delimiter,
      quote_char: source_definition.quote,
      return_headers: false
    }
  end

end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'cranium/extensions/file'
|
3
|
+
|
4
|
+
# Streams the CSV files of a source definition through a transformation
# block and writes the records the block outputs into the target
# definition's CSV file.
class Cranium::DataTransformer

  # @param source the source definition whose files are read
  # @param target the source definition describing the file to write
  def initialize(source, target)
    @source, @target = source, target
    @index = Cranium::Transformation::Index.new
    @target_fields = @target.fields.keys
    @record = Cranium::TransformationRecord.new @source.fields.keys, @target_fields
  end

  # Runs the block once per data row of every source file. The block is
  # evaluated with this transformer as `self`, which is what makes the
  # private helpers below (`output`, `lookup`, `sequence`, ...) callable
  # from inside it.
  #
  # @yieldparam record [Cranium::TransformationRecord] the current row
  # @raise [StandardError] when the target definition overrides its file name
  def transform(&block)
    raise StandardError, "Source definition '#{@target.name}' cannot overrride the file name because it is a transformation target" if @target.file_name_overriden?

    # Options are passed as keywords: since Ruby 3 a positional Hash is no
    # longer treated as CSV options.
    CSV.open("#{Cranium.configuration.upload_path}/#{@target.file}", "w:#{@target.encoding}", **csv_write_options_for(@target)) do |target_file|
      @target_file = target_file
      @source.files.each do |input_file|
        transform_input_file File.join(Cranium.configuration.upload_path, input_file), block
      end
    end

    @target.resolve_files
  end

  private

  # Feeds one CSV file, minus its first row, through the transformation
  # block and reports progress through Cranium::ProgressOutput.
  def transform_input_file(input_file, transformation_block)
    Cranium::ProgressOutput.show_progress File.basename(input_file), File.line_count(input_file) do |progress_bar|
      line_number = 0
      CSV.foreach(input_file, **csv_read_options_for(@source)) do |row|
        line_number += 1
        next if line_number == 1 # the first row holds the column headers

        # The same record object is reused for every row.
        @record.input_data = row
        instance_exec @record, &transformation_block

        progress_bar.inc
      end
    end
  end

  # @return [Hash] CSV options for the target file; a header row made of the
  #   definition's field names is written
  def csv_write_options_for(source_definition)
    {
      col_sep: source_definition.delimiter,
      quote_char: source_definition.quote,
      write_headers: true,
      headers: source_definition.fields.keys
    }
  end

  # @return [Hash] CSV parsing options taken from the source definition
  def csv_read_options_for(source_definition)
    {
      encoding: source_definition.encoding,
      col_sep: source_definition.delimiter,
      quote_char: source_definition.quote,
      return_headers: false
    }
  end

  # Appends one row to the target file.
  #
  # @param record [Cranium::TransformationRecord, Hash] the values to write
  # @raise [ArgumentError] for any other kind of argument
  def output(record)
    row_data =
      case record
      when Cranium::TransformationRecord
        record.data
      when Hash
        record
      else
        raise ArgumentError, "Cannot write '#{record.class}' to file as CSV record"
      end

    @target_file << prepare_for_output(row_data)
  end

  # Turns a field => value hash into the array of values to write: only
  # target fields are kept, ordered like the target definition's fields, and
  # values that respond to `strip` are stripped.
  #
  # Uses the non-mutating `select` (rather than `keep_if`) so that the
  # caller's hash / the record's data keeps its non-target fields after a
  # row has been written.
  def prepare_for_output(hash)
    hash.
      select { |key, _value| @target_fields.include? key }.
      sort_by { |field, _value| @target_fields.index(field) }.
      map { |_field, value| strip(value) }
  end

  # @return the stripped value, or the value itself if it cannot be stripped
  def strip(value)
    return value unless value.respond_to? :strip
    value.strip
  end

  # @return [Boolean] false when the duplication index for the given fields
  #   reports the current record as a duplicate, true otherwise
  def unique_on_fields?(*fields)
    !Cranium::Transformation::DuplicationIndex[*fields].duplicate?(@record)
  end

  # Delegates to the transformation index's `lookup`.
  def lookup(field_name, settings)
    @index.lookup field_name, settings
  end

  # Delegates to the transformation index's `insert`.
  def insert(field_name, settings)
    @index.insert field_name, settings
  end

  # Delegates to Cranium::Transformation::Sequence.by_name.
  def next_value_in_sequence(name)
    Cranium::Transformation::Sequence.by_name name
  end

  alias_method :sequence, :next_value_in_sequence

end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'sequel/extensions/connection_validator'
|
3
|
+
|
4
|
+
# Access point for Sequel database connections: the default connection built
# from the configured Greenplum connection string, plus named connections
# built from registered database definitions.
module Cranium::Database

  # @return the memoized connection for the configured Greenplum connection string
  def self.connection
    return @connection if @connection

    @connection = setup_connection(Cranium.configuration.greenplum_connection_string)
  end

  # @param name name of a database registered through .register_database
  # @return the memoized connection for that definition's `connect_to` value
  def self.[](name)
    @connections ||= {}

    cached = @connections[name]
    return cached if cached

    @connections[name] = setup_connection(@definitions[name].connect_to)
  end

  # Registers a named database definition; the block is forwarded to the
  # definition registry.
  def self.register_database(name, &block)
    @definitions ||= Cranium::DefinitionRegistry.new(Cranium::DSL::DatabaseDefinition)
    @definitions.register_definition(name, &block)
  end

  private

  # NOTE: a bare `private` does not apply to methods defined with `def self.`,
  # so this helper is still callable from outside the module.

  # Opens a Sequel connection with the configured loggers, loads the
  # connection_validator extension and sets its validation timeout to -1.
  def self.setup_connection(connection_string)
    Sequel.connect(connection_string, loggers: Cranium.configuration.loggers).tap do |database|
      database.extension :connection_validator
      database.pool.connection_validation_timeout = -1
    end
  end

end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Name-indexed store of definition objects that are all built from one class.
class Cranium::DefinitionRegistry

  # @param definition_class class instantiated (with the name) for every
  #   registered definition
  def initialize(definition_class)
    @definitions = {}
    @definition_class = definition_class
  end

  # @return the definition registered under `name`, or nil if there is none
  def [](name)
    @definitions[name]
  end

  # Builds a new definition for `name`, evaluates the block in the context of
  # that definition, and stores it, replacing any earlier definition with the
  # same name.
  #
  # @return the new definition
  def register_definition(name, &block)
    @definitions[name] = @definition_class.new(name).tap do |new_definition|
      new_definition.instance_eval(&block)
    end
  end

end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Collects new rows for one database table and inserts them in bulk when the
# application's :after_import hooks run.
class Cranium::DimensionManager

  attr_reader :rows

  # Returns the manager for a table/key-fields combination, creating it on
  # first use and returning the same instance afterwards.
  def self.for(table_name, key_fields)
    @instances ||= {}

    instance_key = [table_name, key_fields]
    @instances[instance_key] ||= new(table_name, key_fields)
  end

  # Registers an after_import hook on the application that flushes this
  # manager's pending rows.
  def initialize(table_name, key_fields)
    @table_name = table_name
    @key_fields = key_fields
    @rows = []

    Cranium.application.after_import { flush }
  end

  # Queues a row for insertion. Sequence objects among the row's values are
  # replaced, in the row itself, by their next value.
  #
  # @param target_key key whose value is returned
  # @param row [Hash] the row to insert
  # @return the row's value for `target_key`, after sequences were resolved
  # @raise [ArgumentError] when the row has no `target_key` entry
  def insert(target_key, row)
    unless row.has_key?(target_key)
      raise ArgumentError, "Required attribute '#{target_key}' missing"
    end

    @rows.push resolve_sequence_values(row)
    row[target_key]
  end

  # Reads the key fields plus `value_field` from the table.
  #
  # @return [Hash] array of key field values => value of `value_field`
  def create_cache_for_field(value_field)
    selected_columns = @key_fields + [value_field]
    to_multi_key_cache db.select_map(selected_columns)
  end

  # Inserts all pending rows with a single multi_insert call (skipped when
  # nothing is pending), then empties the pending list.
  def flush
    pending_rows = @rows
    db.multi_insert(pending_rows) unless pending_rows.empty?
    @rows = []
  end

  private

  # Every row's last element becomes the value; the elements before it
  # become the (array) key.
  def to_multi_key_cache(table_data)
    table_data.each_with_object({}) do |row, cache|
      cache[row[0..-2]] = row.last
    end
  end

  # Replaces Sequence values in place; returns the same row object.
  def resolve_sequence_values(row)
    row.each_pair do |field, value|
      next unless value.is_a? Cranium::Transformation::Sequence

      row[field] = value.next_value
    end
  end

  # @return the dataset for this manager's table on the default connection
  def db
    Cranium::Database.connection[@table_name]
  end

end
|