cranium 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +21 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +3 -0
- data/Vagrantfile +24 -0
- data/bin/cranium +9 -0
- data/config/cucumber.yml +9 -0
- data/cranium.gemspec +26 -0
- data/db/setup.sql +8 -0
- data/docker-compose.yml +8 -0
- data/examples/config.rb +14 -0
- data/examples/deduplication.rb +27 -0
- data/examples/import_csv_with_field_lookup_inserting_new_dimension_keys.rb +26 -0
- data/examples/incremental_extract.rb +17 -0
- data/examples/lookup_with_multiple_fields.rb +25 -0
- data/features/archive.feature +49 -0
- data/features/extract/incremental_extract.feature +56 -0
- data/features/extract/simple_extract.feature +85 -0
- data/features/import/import_csv_to_database_as_delta.feature +38 -0
- data/features/import/import_csv_to_database_with_delete_insert_merging.feature +51 -0
- data/features/import/import_csv_to_database_with_truncate_insert.feature +49 -0
- data/features/import/import_csv_to_database_with_update_merging.feature +46 -0
- data/features/import/import_csv_with_always_inserting_new_dimension_keys.feature +137 -0
- data/features/import/import_csv_with_field_lookup_inserting_new_dimension_keys.feature +62 -0
- data/features/import/import_csv_with_field_lookup_transformation.feature +125 -0
- data/features/import/import_csv_with_transformation.feature +55 -0
- data/features/import/import_multiple_csv_files_without_transformations.feature +44 -0
- data/features/import/import_with_load_id_from_sequence.feature +53 -0
- data/features/import/import_with_lookup_from_multiple_fields.feature +64 -0
- data/features/read.feature +56 -0
- data/features/remove.feature +44 -0
- data/features/restore_database_connection.feature +55 -0
- data/features/step_definitions/database_table_steps.rb +40 -0
- data/features/step_definitions/definition_steps.rb +3 -0
- data/features/step_definitions/execution_steps.rb +23 -0
- data/features/step_definitions/file_steps.rb +39 -0
- data/features/support/class_extensions.rb +24 -0
- data/features/support/env.rb +27 -0
- data/features/support/randomize.rb +22 -0
- data/features/support/stop_on_first_error.rb +5 -0
- data/features/transform/deduplication.feature +37 -0
- data/features/transform/empty_transformation.feature +72 -0
- data/features/transform/join.feature +180 -0
- data/features/transform/join_multiple_files_into_one_output_file.feature +46 -0
- data/features/transform/output_rows.feature +70 -0
- data/features/transform/projection.feature +34 -0
- data/features/transform/raw_ruby_transformation.feature +69 -0
- data/features/transform/split_field.feature +39 -0
- data/lib/cranium/application.rb +104 -0
- data/lib/cranium/archiver.rb +36 -0
- data/lib/cranium/attribute_dsl.rb +43 -0
- data/lib/cranium/command_line_options.rb +27 -0
- data/lib/cranium/configuration.rb +33 -0
- data/lib/cranium/data_importer.rb +35 -0
- data/lib/cranium/data_reader.rb +48 -0
- data/lib/cranium/data_transformer.rb +126 -0
- data/lib/cranium/database.rb +36 -0
- data/lib/cranium/definition_registry.rb +21 -0
- data/lib/cranium/dimension_manager.rb +65 -0
- data/lib/cranium/dsl/database_definition.rb +23 -0
- data/lib/cranium/dsl/extract_definition.rb +28 -0
- data/lib/cranium/dsl/import_definition.rb +50 -0
- data/lib/cranium/dsl/source_definition.rb +67 -0
- data/lib/cranium/dsl.rb +100 -0
- data/lib/cranium/extensions/file.rb +7 -0
- data/lib/cranium/extensions/sequel_greenplum.rb +30 -0
- data/lib/cranium/external_table.rb +75 -0
- data/lib/cranium/extract/data_extractor.rb +11 -0
- data/lib/cranium/extract/storage.rb +57 -0
- data/lib/cranium/extract/strategy/base.rb +27 -0
- data/lib/cranium/extract/strategy/incremental.rb +16 -0
- data/lib/cranium/extract/strategy/simple.rb +9 -0
- data/lib/cranium/extract/strategy.rb +7 -0
- data/lib/cranium/extract.rb +7 -0
- data/lib/cranium/import_strategy/base.rb +55 -0
- data/lib/cranium/import_strategy/delete_insert.rb +40 -0
- data/lib/cranium/import_strategy/delta.rb +8 -0
- data/lib/cranium/import_strategy/merge.rb +50 -0
- data/lib/cranium/import_strategy/truncate_insert.rb +19 -0
- data/lib/cranium/import_strategy.rb +9 -0
- data/lib/cranium/logging.rb +15 -0
- data/lib/cranium/profiling.rb +13 -0
- data/lib/cranium/progress_output.rb +37 -0
- data/lib/cranium/sequel/hash.rb +32 -0
- data/lib/cranium/sequel.rb +5 -0
- data/lib/cranium/source_registry.rb +21 -0
- data/lib/cranium/test_framework/cucumber_table.rb +140 -0
- data/lib/cranium/test_framework/database_entity.rb +29 -0
- data/lib/cranium/test_framework/database_sequence.rb +16 -0
- data/lib/cranium/test_framework/database_table.rb +33 -0
- data/lib/cranium/test_framework/upload_directory.rb +39 -0
- data/lib/cranium/test_framework/world.rb +66 -0
- data/lib/cranium/test_framework.rb +10 -0
- data/lib/cranium/transformation/duplication_index.rb +42 -0
- data/lib/cranium/transformation/index.rb +83 -0
- data/lib/cranium/transformation/join.rb +141 -0
- data/lib/cranium/transformation/sequence.rb +42 -0
- data/lib/cranium/transformation.rb +8 -0
- data/lib/cranium/transformation_record.rb +45 -0
- data/lib/cranium.rb +57 -0
- data/rake/test.rake +31 -0
- data/spec/cranium/application_spec.rb +166 -0
- data/spec/cranium/archiver_spec.rb +44 -0
- data/spec/cranium/command_line_options_spec.rb +32 -0
- data/spec/cranium/configuration_spec.rb +31 -0
- data/spec/cranium/data_importer_spec.rb +55 -0
- data/spec/cranium/data_transformer_spec.rb +16 -0
- data/spec/cranium/database_spec.rb +69 -0
- data/spec/cranium/definition_registry_spec.rb +45 -0
- data/spec/cranium/dimension_manager_spec.rb +63 -0
- data/spec/cranium/dsl/database_definition_spec.rb +23 -0
- data/spec/cranium/dsl/extract_definition_spec.rb +76 -0
- data/spec/cranium/dsl/import_definition_spec.rb +153 -0
- data/spec/cranium/dsl/source_definition_spec.rb +84 -0
- data/spec/cranium/dsl_spec.rb +119 -0
- data/spec/cranium/external_table_spec.rb +71 -0
- data/spec/cranium/extract/storage_spec.rb +125 -0
- data/spec/cranium/logging_spec.rb +37 -0
- data/spec/cranium/sequel/hash_spec.rb +56 -0
- data/spec/cranium/source_registry_spec.rb +31 -0
- data/spec/cranium/test_framework/cucumber_table_spec.rb +144 -0
- data/spec/cranium/transformation/duplication_index_spec.rb +75 -0
- data/spec/cranium/transformation/index_spec.rb +178 -0
- data/spec/cranium/transformation/join_spec.rb +43 -0
- data/spec/cranium/transformation/sequence_spec.rb +83 -0
- data/spec/cranium/transformation_record_spec.rb +78 -0
- data/spec/cranium_spec.rb +53 -0
- data/spec/spec_helper.rb +1 -0
- metadata +362 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Exercises the raw Ruby `transform` block DSL: inside a transform block the
# current record behaves like a Hash (read and write fields by symbol), and a
# record is only written to the target when the block calls `output`.
Feature: Raw Ruby transformation

  Scenario: A transform block can use the record as a Hash
    Given a "products.csv" data file containing:
      """
      id,name,category
      JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory
      CDI-234,Another product name,Smart Insight > Cool stuff > Scripts
      """
    And the following definition:
      """
      source :products do
        field :id, String
        field :name, String
        field :category, String
      end

      source :transformed_products do
        field :item, String
        field :title, String
        field :category, String
      end

      transform :products => :transformed_products do |record|
        record[:item] = "*#{record[:id]}*"
        record[:title] = record[:name].chars.first
        output record
      end
      """
    When I execute the definition
    Then the process should exit successfully
    And there should be a "transformed_products.csv" data file in the upload directory containing:
      """
      item,title,category
      *JNI-123*,J,Main category > Subcategory > Sub-subcategory
      *CDI-234*,A,Smart Insight > Cool stuff > Scripts
      """


  # Not calling `output` for a record drops it from the target file.
  # NOTE(review): the comparison is against the string "2" — field values
  # apparently arrive as strings here despite the Integer field type; confirm
  # against the transformer implementation.
  Scenario: Records can be skipped
    Given a "products.csv" data file containing:
      """
      id
      1
      2
      3
      """
    And the following definition:
      """
      source :products do
        field :id, Integer
      end

      source :transformed_products do
        field :id, Integer
      end

      transform :products => :transformed_products do |record|
        output record unless "2" == record[:id]
      end
      """
    When I execute the definition
    Then the process should exit successfully
    And there should be a "transformed_products.csv" data file in the upload directory containing:
      """
      id
      1
      3
      """
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Exercises `record.split_field`: a delimited field is split into one or more
# target fields. Based on the expectations below: extra parts beyond the
# target list are dropped, and when there are fewer parts than targets the
# last part appears to be repeated into the remaining targets — confirm
# against the TransformationRecord implementation.
Feature: Split field

  Scenario: A single field can be split into multiple fields
    Given a "products.csv" data file containing:
      """
      id,name,category
      JNI-123,Just a product name,Main category > Subcategory > Sub-subcategory > Ultra-subcategory
      CDI-234,Another product name,Smart Insight > Cool stuff | 3dim > 2dim > 1dim
      """
    And the following definition:
      """
      source :products do
        field :item, String
        field :title, String
        field :category, String
      end

      source :transformed_products do
        field :item, String
        field :title, String
        field :main_category, String
        field :sub_category, String
        field :department, String
      end

      transform :products => :transformed_products do |record|
        record.split_field :category, into: [:category], by: "|"
        record.split_field :category, into: [:main_category, :sub_category, :department], by: ">"
        output record
      end
      """
    When I execute the definition
    Then the process should exit successfully
    And there should be a "transformed_products.csv" data file in the upload directory containing:
      """
      item,title,main_category,sub_category,department
      JNI-123,Just a product name,Main category,Subcategory,Sub-subcategory
      CDI-234,Another product name,Smart Insight,Cool stuff,Cool stuff
      """
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Top-level application object: parses command line arguments, registers data
# sources, loads the user-supplied definition file and runs lifecycle hooks.
class Cranium::Application

  include Cranium::Logging

  # Registry of the data sources defined via #register_source.
  attr_reader :sources



  # arguments - the raw command line argument array (usually ARGV).
  def initialize(arguments)
    @sources = Cranium::SourceRegistry.new
    @hooks = {}

    @options = Cranium::CommandLineOptions.new arguments
  end



  # Arguments that are passed through to the loaded definition file.
  def load_arguments
    options.load_arguments
  end



  # Arguments addressed to cranium itself.
  def cranium_arguments
    options.cranium_arguments
  end



  # Registers a source definition and immediately resolves the files it matches.
  def register_source(name, &block)
    @sources.register_source(name, &block).resolve_files
  end



  # Loads and executes the definition file named by the cranium :load option.
  # Any error is logged and re-raised; :after hooks always run.
  def run
    process_file = validate_file options.cranium_arguments[:load]

    begin
      load process_file
    rescue Exception => ex # rescuing Exception is deliberate: the error is logged and immediately re-raised, so fatal signals still propagate
      log :error, ex
      raise
    ensure
      apply_hook :after
    end
  end



  # Registers a block to run after each import completes.
  def after_import(&block)
    register_hook :after_import, &block
  end



  # Adds a callback block under the given hook name.
  def register_hook(name, &block)
    @hooks[name] ||= []
    @hooks[name] << block
  end



  # Invokes every callback registered under the given hook name, if any.
  def apply_hook(name)
    unless @hooks[name].nil?
      @hooks[name].each do |block|
        block.call
      end
    end
  end



  private

  attr_reader :options



  # Validates the definition file path, exiting the process on failure.
  def validate_file(load_file)
    exit_if_no_file_specified load_file
    exit_if_no_such_file_exists load_file
    load_file
  end



  def exit_if_no_file_specified(file)
    if file.nil? || file.empty?
      $stderr.puts "ERROR: No file specified"
      exit 1
    end
  end



  def exit_if_no_such_file_exists(file)
    # File.exists? was deprecated and removed in Ruby 3.2; File.exist? is the
    # supported spelling on every Ruby version.
    unless File.exist? file
      $stderr.puts "ERROR: File '#{file}' does not exist"
      exit 1
    end
  end

end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
require 'fileutils'

# Moves processed upload files into a timestamped archive directory, or
# deletes them outright.
module Cranium
  module Archiver

    # Archives the given files (names relative to the configured upload path),
    # creating the archive directory on first use.
    def self.archive(*files)
      create_archive_directory
      archive_files files
    end



    # Permanently deletes the given files from the upload directory.
    def self.remove(*files)
      files.each do |file_name|
        FileUtils.rm File.join(Cranium.configuration.upload_path, file_name)
      end
    end



    def self.create_archive_directory
      # Dir.exists? was deprecated and removed in Ruby 3.2; use Dir.exist?.
      FileUtils.mkpath Cranium.configuration.archive_directory unless Dir.exist? Cranium.configuration.archive_directory
    end



    # Moves each file into the archive directory, prefixing its name with a
    # shared timestamp so repeated archives of the same file never collide.
    def self.archive_files(files)
      archive_datetime = Time.now.strftime("%Y-%m-%d_%Hh%Mm%Ss")
      files.each do |file_name|
        FileUtils.mv File.join(Cranium.configuration.upload_path, file_name),
                     File.join(Cranium.configuration.archive_directory, "#{archive_datetime}_#{file_name}")
      end
    end

    # A bare `private` has no effect on singleton (def self.) methods;
    # private_class_method is required to actually hide the helpers.
    private_class_method :create_archive_directory, :archive_files

  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# DSL macro helpers: classes extend this module to gain class-level macros
# that define combined getter/setter attributes — calling the generated
# method with no arguments reads the value, calling it with arguments writes.
module Cranium
  module AttributeDSL

    # Defines a scalar attribute: `name` reads, `name value` writes.
    def define_attribute(name)
      class_eval <<-attribute_method

        def #{name}(*args)
          return @#{name} if args.count.zero?

          @#{name} = args.first
        end

      attribute_method
    end



    # Defines an array attribute: reading returns [] when unset; writing
    # collects all given arguments into an array.
    def define_array_attribute(name)
      class_eval <<-attribute_method

        def #{name}(*args)
          return @#{name} || [] if args.count.zero?

          @#{name} = args
        end

      attribute_method
    end



    # Defines a boolean attribute: reading coerces the stored value to
    # true/false; writing stores the first argument coerced to true/false.
    def define_boolean_attribute(name)
      class_eval <<-attribute_method

        def #{name}(*args)
          return !!@#{name} if args.count.zero?

          # BUG FIX: the setter previously stored `!!args` — args is the
          # (non-empty) argument Array here, which is always truthy, so the
          # attribute could never be set to false. Coerce the first argument.
          @#{name} = !!args.first
        end

      attribute_method
    end

  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require "slop"

# Splits the command line options (parsed by Slop) into arguments addressed
# to cranium itself ("cranium-*" switches) and arguments passed through to
# the loaded definition file.
class Cranium::CommandLineOptions

  # Matches cranium-addressed option keys; group 1 is the name without the
  # "cranium-" prefix. Shared by both partitioning methods so they cannot
  # drift apart.
  CRANIUM_ARGUMENT_PATTERN = /\Acranium-(.*)/.freeze

  def initialize(arguments)
    @arguments = Slop.parse(arguments, autocreate: true).to_hash
  end



  # Hash of cranium-addressed options, keyed by the symbolized option name
  # with the "cranium-" prefix stripped. Memoized.
  def cranium_arguments
    @cranium_arguments ||= Hash[arguments.map do |key, value|
      # Use an explicit MatchData instead of the fragile $1 perl-global.
      match = CRANIUM_ARGUMENT_PATTERN.match(key.to_s)
      [match[1].to_sym, value] if match
    end.compact]
  end



  # Hash of every option not addressed to cranium itself. Memoized.
  def load_arguments
    @load_arguments ||= Hash[arguments.map { |key, value| [key, value] unless CRANIUM_ARGUMENT_PATTERN.match(key.to_s) }.compact]
  end



  private

  attr_reader :arguments

end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Global cranium settings: database connection strings, gpfdist/upload
# locations, logging targets and the extract-metadata storage directory.
class Cranium::Configuration

  # Default name of the metadata directory created under the upload path.
  STORAGE_DIRECTORY_NAME = ".cranium"

  attr_writer :storage_directory
  attr_accessor :archive_directory
  attr_accessor :greenplum_connection_string
  attr_accessor :mysql_connection_string
  attr_accessor :upload_directory
  attr_accessor :gpfdist_home_directory
  attr_accessor :gpfdist_url
  attr_accessor :loggers



  def initialize
    @loggers = []
  end



  # Full path of the upload directory beneath the gpfdist home directory.
  def upload_path
    File.join(gpfdist_home_directory, upload_directory)
  end



  # Explicitly configured storage directory, or the default ".cranium"
  # directory inside the upload path when none was set.
  def storage_directory
    @storage_directory.nil? ? File.join(upload_path, STORAGE_DIRECTORY_NAME) : @storage_directory
  end

end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Imports a source into the database with the strategy implied by the import
# definition, running the whole import inside a single transaction.
class Cranium::DataImporter

  include Cranium::Logging


  # Imports the definition transactionally, fires the :after_import hooks
  # inside the same transaction, then records the imported item count.
  def import(import_definition)
    imported_count = 0
    Cranium::Database.connection.transaction do
      imported_count = importer_for_definition(import_definition).import
      Cranium.application.apply_hook(:after_import)
    end

    record_metric import_definition.name, imported_count.to_s
  end


  private

  # Selects the import strategy for the definition. The merge_on,
  # delete_insert_on and truncate_insert settings are mutually exclusive;
  # with none of them set, a plain delta import is performed.
  def importer_for_definition(import_definition)
    merging = !import_definition.merge_fields.empty?
    delete_inserting = !import_definition.delete_insert_on.empty?
    truncating = import_definition.truncate_insert

    if [merging, delete_inserting, truncating].count(true) > 1
      raise StandardError, "Import should not combine merge_on, delete_insert_on and truncate_insert settings"
    end

    if merging
      Cranium::ImportStrategy::Merge.new(import_definition)
    elsif delete_inserting
      Cranium::ImportStrategy::DeleteInsert.new(import_definition)
    elsif truncating
      Cranium::ImportStrategy::TruncateInsert.new(import_definition)
    else
      Cranium::ImportStrategy::Delta.new(import_definition)
    end
  end

end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
require 'csv'
require 'cranium/extensions/file'

# Streams records from a source definition's CSV files, yielding each data
# row to the supplied block as a field-name => value Hash.
class Cranium::DataReader

  def initialize(source)
    @source = source
    @source_field_names = @source.fields.keys
  end



  # Reads every file belonging to the source in turn, invoking the block
  # once per data record (header rows are skipped).
  def read(&block)
    @source.files.each do |file|
      read_input_file File.join(Cranium.configuration.upload_path, file), block
    end
  end



  private

  # Streams one CSV file, reporting per-row progress.
  def read_input_file(input_file, read_block)
    Cranium::ProgressOutput.show_progress File.basename(input_file), File.line_count(input_file) do |progress_bar|
      row_counter = 0
      CSV.foreach input_file, csv_read_options_for(@source) do |csv_row|
        row_counter += 1
        next if row_counter == 1 # skip the header row

        record = Hash[@source_field_names.zip(csv_row)]
        # Evaluate in this reader's context, mirroring the transformer DSL.
        self.instance_exec record, &read_block

        progress_bar.inc
      end
    end
  end



  # CSV options derived from the source definition's file format settings.
  def csv_read_options_for(source_definition)
    {
      encoding: source_definition.encoding,
      col_sep: source_definition.delimiter,
      quote_char: source_definition.quote,
      return_headers: false
    }
  end

end
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
require 'csv'
require 'cranium/extensions/file'

# Streams records from a source definition's CSV files into a target CSV
# file, applying a user-supplied transformation block to each record. The
# block is instance_exec'd against this transformer, so the DSL helpers
# below (output, lookup, insert, sequence, ...) are available inside it.
class Cranium::DataTransformer

  def initialize(source, target)
    @source, @target = source, target
    @index = Cranium::Transformation::Index.new
    @target_fields = @target.fields.keys
    # One reusable record object; its input data is replaced row by row.
    @record = Cranium::TransformationRecord.new @source.fields.keys, @target_fields
  end



  # Runs the transformation: writes the target header, pipes every source
  # file through the block, then re-resolves the target's file list so it
  # picks up the file just written.
  def transform(&block)
    # (typo fix: error message previously read "overrride")
    raise StandardError, "Source definition '#{@target.name}' cannot override the file name because it is a transformation target" if @target.file_name_overriden?

    CSV.open "#{Cranium.configuration.upload_path}/#{@target.file}", "w:#{@target.encoding}", csv_write_options_for(@target) do |target_file|
      @target_file = target_file
      @source.files.each do |input_file|
        transform_input_file File.join(Cranium.configuration.upload_path, input_file), block
      end
    end

    @target.resolve_files
  end



  private

  # Streams one CSV file through the transformation block with progress output.
  def transform_input_file(input_file, transformation_block)
    Cranium::ProgressOutput.show_progress File.basename(input_file), File.line_count(input_file) do |progress_bar|
      line_number = 0
      CSV.foreach input_file, csv_read_options_for(@source) do |row|
        next if 1 == (line_number += 1) # skip the header row

        @record.input_data = row
        self.instance_exec @record, &transformation_block

        progress_bar.inc
      end
    end
  end



  # CSV options for writing the target file (header row included).
  def csv_write_options_for(source_definition)
    {
      col_sep: source_definition.delimiter,
      quote_char: source_definition.quote,
      write_headers: true,
      headers: source_definition.fields.keys
    }
  end



  # CSV options for reading a source file.
  def csv_read_options_for(source_definition)
    {
      encoding: source_definition.encoding,
      col_sep: source_definition.delimiter,
      quote_char: source_definition.quote,
      return_headers: false
    }
  end



  # DSL: writes a record (TransformationRecord or Hash) to the target file.
  def output(record)
    @target_file << prepare_for_output(case record
                                         when Cranium::TransformationRecord
                                           record.data
                                         when Hash
                                           record
                                         else
                                           raise ArgumentError, "Cannot write '#{record.class}' to file as CSV record"
                                       end)
  end



  # Restricts the record to target fields, orders the values by target field
  # order and strips surrounding whitespace from string-like values.
  def prepare_for_output(hash)
    hash.
        keep_if { |key| @target_fields.include? key }.
        sort_by { |field, _| @target_fields.index(field) }.
        map { |item| item[1] }.
        map { |value| strip(value) }
  end



  def strip(value)
    return value unless value.respond_to? :strip
    value.strip
  end



  # DSL: true when the current record's values are unique over the given
  # fields (duplicates are tracked across the whole run).
  def unique_on_fields?(*fields)
    not Cranium::Transformation::DuplicationIndex[*fields].duplicate? @record
  end



  # DSL: looks up a value for the field via the transformation index.
  def lookup(field_name, settings)
    @index.lookup field_name, settings
  end



  # DSL: inserts a value for the field via the transformation index.
  def insert(field_name, settings)
    @index.insert field_name, settings
  end



  # DSL: returns the next value of the named sequence.
  def next_value_in_sequence(name)
    Cranium::Transformation::Sequence.by_name name
  end



  alias_method :sequence, :next_value_in_sequence

end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
require 'sequel'
require 'sequel/extensions/connection_validator'

# Manages Sequel database connections: the default Greenplum connection plus
# named connections registered through the database DSL.
module Cranium::Database

  # Default connection, lazily created from the configured Greenplum
  # connection string.
  def self.connection
    @connection ||= setup_connection(Cranium.configuration.greenplum_connection_string)
  end



  # Named connection, lazily created from the registered definition's
  # connect_to string.
  def self.[](name)
    @connections ||= {}
    @connections[name] ||= setup_connection(@definitions[name].connect_to)
  end



  # Registers a database definition block under the given name.
  def self.register_database(name, &block)
    @definitions ||= Cranium::DefinitionRegistry.new Cranium::DSL::DatabaseDefinition
    @definitions.register_definition name, &block
  end



  def self.setup_connection(connection_string)
    connection = Sequel.connect connection_string, loggers: Cranium.configuration.loggers
    connection.extension :connection_validator
    # Validate connections at every checkout (-1) so long-running processes
    # transparently survive dropped database connections.
    connection.pool.connection_validation_timeout = -1
    return connection
  end

  # A bare `private` has no effect on singleton (def self.) methods;
  # private_class_method is required to actually hide the helper.
  private_class_method :setup_connection

end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Generic name -> definition store. Definitions of a fixed class are built by
# instance-evaluating a registration block in the new definition's context.
class Cranium::DefinitionRegistry

  # definition_class - class instantiated for every registered definition;
  # its constructor must accept the definition name.
  def initialize(definition_class)
    @definition_class = definition_class
    @definitions = {}
  end



  # Looks up a previously registered definition by name (nil when absent).
  def [](name)
    @definitions[name]
  end



  # Creates a definition, configures it by instance-evaluating the block,
  # stores it under the given name and returns it.
  def register_definition(name, &block)
    @definitions[name] = @definition_class.new(name).tap do |definition|
      definition.instance_eval(&block)
    end
  end

end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Buffers dimension rows and batch-inserts them into their table after each
# import completes.
class Cranium::DimensionManager

  # Rows buffered since the last flush.
  attr_reader :rows



  # Memoized factory: one manager instance per (table name, key fields) pair.
  def self.for(table_name, key_fields)
    @instances ||= {}
    @instances[[table_name, key_fields]] ||= self.new table_name, key_fields
  end



  def initialize(table_name, key_fields)
    @table_name, @key_fields = table_name, key_fields
    @rows = []

    # Write out any buffered rows once the surrounding import has finished.
    Cranium.application.after_import { flush }
  end



  # Buffers a row for insertion and returns the value of its target key
  # (with any Sequence placeholders already resolved in place).
  def insert(target_key, row)
    raise ArgumentError, "Required attribute '#{target_key}' missing" unless row.has_key? target_key

    @rows << resolve_sequence_values(row)
    row[target_key]
  end



  # Builds a { key-field values => value } lookup Hash for the given value
  # field from the table's current contents.
  def create_cache_for_field(value_field)
    to_multi_key_cache(db.select_map(@key_fields + [value_field]))
  end



  # Inserts all buffered rows in a single multi-insert, then clears the buffer.
  def flush
    db.multi_insert(@rows) unless @rows.empty?
    @rows = []
  end



  private

  # Maps each row's leading columns (the key fields) to its last column.
  def to_multi_key_cache(table_data)
    table_data.each_with_object({}) do |table_row, cache|
      cache[table_row[0..-2]] = table_row.last
    end
  end



  # Replaces Sequence placeholder values with their next value, mutating and
  # returning the row.
  def resolve_sequence_values(row)
    row.each do |key, value|
      row[key] = value.next_value if value.is_a? Cranium::Transformation::Sequence
    end
  end



  # Sequel dataset for the managed table.
  def db
    Cranium::Database.connection[@table_name]
  end

end
|