data_forge 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/Gemfile +2 -0
- data/LICENSE +20 -0
- data/README.md +49 -0
- data/Rakefile +16 -0
- data/bin/forge +4 -0
- data/config/cucumber.yml +8 -0
- data/data_forge.gemspec +26 -0
- data/features/accessing_command_line_parameters.feature +52 -0
- data/features/deduplication.feature +49 -0
- data/features/file/file_format_options.feature +146 -0
- data/features/file/has_header_row.feature +62 -0
- data/features/step_definitions/file_steps.rb +8 -0
- data/features/support/env.rb +8 -0
- data/features/transform/output_command.feature +123 -0
- data/features/transform/outputting_to_multiple_files.feature +57 -0
- data/features/transform/overwrite_original_file.feature +37 -0
- data/features/transform/record_transformation.feature +47 -0
- data/lib/data_forge/cli/main.rb +21 -0
- data/lib/data_forge/cli/options.rb +62 -0
- data/lib/data_forge/cli.rb +24 -0
- data/lib/data_forge/dsl/attributes.rb +15 -0
- data/lib/data_forge/dsl/commands.rb +23 -0
- data/lib/data_forge/dsl/helpers.rb +22 -0
- data/lib/data_forge/dsl.rb +9 -0
- data/lib/data_forge/file/csv/csv_record_file_definition.rb +46 -0
- data/lib/data_forge/file/csv/csv_record_file_reader.rb +42 -0
- data/lib/data_forge/file/csv/csv_record_file_writer.rb +62 -0
- data/lib/data_forge/file/csv.rb +13 -0
- data/lib/data_forge/file/record_file_definition.rb +17 -0
- data/lib/data_forge/file/record_file_reader.rb +22 -0
- data/lib/data_forge/file/record_file_writer.rb +32 -0
- data/lib/data_forge/file.rb +36 -0
- data/lib/data_forge/transformation/deduplication.rb +38 -0
- data/lib/data_forge/transformation/ruby_transformation.rb +33 -0
- data/lib/data_forge/transformation/ruby_transformation_context.rb +27 -0
- data/lib/data_forge/transformation/transformation_base.rb +29 -0
- data/lib/data_forge/transformation.rb +10 -0
- data/lib/data_forge/version.rb +3 -0
- data/lib/data_forge.rb +13 -0
- data/spec/data_forge/cli/main_spec.rb +45 -0
- data/spec/data_forge/cli/options_spec.rb +64 -0
- data/spec/data_forge/cli_spec.rb +54 -0
- data/spec/data_forge/dsl/commands_spec.rb +42 -0
- data/spec/data_forge/dsl/helpers_spec.rb +24 -0
- data/spec/data_forge/file/csv/csv_record_file_definition_spec.rb +97 -0
- data/spec/data_forge/file/csv/csv_record_file_reader_spec.rb +78 -0
- data/spec/data_forge/file/csv/csv_record_file_writer_spec.rb +100 -0
- data/spec/data_forge/file/record_file_definition_spec.rb +17 -0
- data/spec/data_forge/file/record_file_reader_spec.rb +15 -0
- data/spec/data_forge/file/record_file_writer_spec.rb +15 -0
- data/spec/data_forge/file_spec.rb +49 -0
- data/spec/data_forge/transformation/deduplication_spec.rb +77 -0
- data/spec/data_forge/transformation/ruby_transformation_context_spec.rb +49 -0
- data/spec/data_forge/transformation/ruby_transformation_spec.rb +71 -0
- data/spec/data_forge_spec.rb +9 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/support/helpers/record_reader_helper.rb +17 -0
- data/spec/support/helpers/record_writer_helper.rb +16 -0
- metadata +218 -0
# Feature spec: a `transform` with a single file argument reads and rewrites
# that same file (in-place transformation).
Feature: Overwriting the original file with a transformation

  If the argument to a `transform` block is a single file (or rather, its name as a symbol) then both the source
  and the target of that transformation will be the specified file. In this case the data in the file will be
  processed and the file overwritten with the transformed data.


  Scenario:
    Given a file named "command_script.rb" with:
      """
      file :items do
        field :name
      end

      transform :items do |record|
        record[:name] = record[:name][0]
        output record
      end
      """
    And a file named "items.csv" with:
      """
      name
      ab
      cd
      ef
      """
    When I run `forge command_script.rb`
    Then the exit status should be 0
    And a file named "items.csv" should exist
    And the file "items.csv" should contain exactly:
      """
      name
      a
      c
      e

      """
# Feature spec: the `transform` block receives each record as a Hash and may
# reshape it freely; `output` writes only the target file's defined fields.
Feature: Transforming the record

  The `transform` block is passed the current record of the file that is being read. This record is a Hash with
  its keys defined in the corresponding `file` definition and its values read from the input file. The record
  can be transformed arbitrarily with pure Ruby code. The `output` command will write the record to file
  using only the keys that are defined in the `file` definition of the transformation's target file.


  Scenario: Using the record as a Hash
    Given a file named "command_script.rb" with:
      """
      file :products do
        field :id
        field :name
        field :main_category
        field :subcategory
      end

      file :transformed_products do
        field :item
        field :title
        field :category
      end

      transform :products, into: :transformed_products do |record|
        record[:item] = record[:id]
        record[:title] = record[:name].upcase
        record[:category] = [record[:main_category], record[:subcategory]].join " > "
        output record
      end
      """
    And a file named "products.csv" with:
      """
      id,name,main_category,subcategory
      IE-123,first product,Main category,Subcategory
      TM-234,second product,Group,Subgroup
      """
    When I run `forge command_script.rb`
    Then the exit status should be 0
    And a file named "transformed_products.csv" should exist
    And the file "transformed_products.csv" should contain exactly:
      """
      item,title,category
      IE-123,FIRST PRODUCT,Main category > Subcategory
      TM-234,SECOND PRODUCT,Group > Subgroup

      """
module DataForge
  module CLI
    # Entry point object for the `forge` command-line executable.
    class Main

      # @param argv   [Array<String>] raw command-line arguments
      # @param stdin  [IO] standard input  (injectable for testing)
      # @param stdout [IO] standard output (injectable for testing)
      # @param stderr [IO] standard error  (injectable for testing)
      # @param kernel [Module] receiver of `exit` (injectable for testing)
      def initialize(argv, stdin=STDIN, stdout=STDOUT, stderr=STDERR, kernel=Kernel)
        @argv, @stdin, @stdout, @stderr, @kernel = argv, stdin, stdout, stderr, kernel
      end



      # Parses the command-line options and, unless a help/version switch
      # suppressed execution, loads the command script.
      #
      # Rescues StandardError — not Exception, which would also swallow
      # SystemExit, SignalException and NoMemoryError — reports the message
      # on stderr and exits with status 1.
      def execute!
        options = CLI.parse_options @argv, @stdout
        load options.command_script if options.execute
      rescue StandardError => e
        @stderr.puts "ERROR: " + e.message
        @kernel.exit 1
      end

    end
  end
end
require 'optparse'

module DataForge
  module CLI
    # Holds the settings extracted from the `forge` command line.
    class Options

      # Parses +args+ (an Array of command-line tokens) and returns a
      # populated Options instance. Help/version switches print to +output+
      # and flag the instance so that no command script gets executed.
      # Exactly one remaining argument (the command script) is required
      # otherwise.
      def self.parse(args, output = STDOUT)
        remaining = args.dup
        result = new

        parser = OptionParser.new
        parser.default_argv = remaining
        parser.banner = "Usage: [bundle exec] forge [options] command_script.rb"

        parser.separator ""
        parser.separator "Options:"

        parser.on("-Uname=value",
                  /^(?<name>\w+)=(?<value>\S+)$/,
                  "User-defined parameter value to be passed to the command script.",
                  "Can be specified multiple times (with a different name).") do |_, name, value|
          result.user_params[name.to_sym] = value
        end

        parser.separator ""
        parser.separator "Common options:"

        parser.on_tail("-h", "--help", "Show this message") do
          output.puts parser
          result.execute = false
        end

        parser.on_tail("-v", "--version", "Show version information") do
          output.puts "DataForge, version #{DataForge::VERSION}"
          result.execute = false
        end

        parser.parse!

        if result.execute
          raise "No command script specified" if remaining.empty?
          raise "More than one command script specified" unless remaining.size == 1
          result.command_script = remaining.first
        end

        result
      end



      attr_accessor :command_script, :execute, :user_params



      # New instances default to executing the (yet unknown) command script
      # with no user-defined parameters.
      def initialize
        @execute = true
        @user_params = {}
      end

    end
  end
end
module DataForge
  # Namespace for the command-line interface of DataForge.
  module CLI

    autoload :Main,    'data_forge/cli/main'
    autoload :Options, 'data_forge/cli/options'


    class << self

      # Name of the command script given on the command line
      # (set by #parse_options).
      attr_reader :command_script

      # Hash of user-defined `-Uname=value` parameters
      # (set by #parse_options).
      attr_reader :user_params



      # Parses +args+, remembers the script name and the user parameters
      # for later lookup (see the COMMAND_SCRIPT / PARAMS pseudo-constants)
      # and returns the Options instance.
      def parse_options(args, stdout)
        options = Options.parse(args, stdout)
        @command_script = options.command_script
        @user_params = options.user_params
        options
      end

    end

  end
end
module DataForge
  module DSL
    # Mix-in that generates combined getter/setter methods in the style of
    # many Ruby DSLs: calling the method without arguments reads the value,
    # calling it with an argument stores it.
    module Attributes

      # Defines such an accessor named +name+, backed by the instance
      # variable of the same name.
      def define_attribute(name)
        ivar = "@#{name}"
        define_method name do |*args|
          if args.empty?
            instance_variable_get ivar
          else
            instance_variable_set ivar, args.first
          end
        end
      end

    end
  end
end
module DataForge
  module DSL
    # The top-level verbs of the command-script DSL.
    module Commands

      # Declares a record file named +name+; the block configures its
      # fields and format options.
      def file(name, &initialization_block)
        File.register_file_definition(name, &initialization_block)
      end



      # Reads +source+, passes every record to +transformation_block+ and
      # writes whatever the block `output`s (target(s) via options[:into]).
      def transform(source, options = {}, &transformation_block)
        transformation = Transformation::RubyTransformation.from_input(source, options, &transformation_block)
        transformation.execute
      end



      # Removes duplicate records from +source+ (target via options[:into],
      # uniqueness fields via options[:using]).
      def deduplicate(source, options = {})
        deduplication = Transformation::Deduplication.from_input(source, options)
        deduplication.execute
      end

    end
  end
end
module DataForge
  module DSL
    # Placeholder for helper methods available inside command scripts.
    module Helpers

    end
  end
end



# Makes the pseudo-constants COMMAND_SCRIPT and PARAMS available to command
# scripts by resolving them at the moment of first reference.
def Object.const_missing(name)
  case name
  when :COMMAND_SCRIPT
    DataForge::CLI.command_script

  when :PARAMS
    DataForge::CLI.user_params

  else
    # Delegate to the default implementation instead of raising a
    # hand-rolled NameError, so unknown constants keep Ruby's standard
    # error message (and any other const_missing hooks still run).
    super
  end
end
module DataForge
  module File
    module CSV
      # Describes a CSV record file: its logical name, the fields of a
      # record and the CSV format options used when reading or writing it.
      class CSVRecordFileDefinition

        # The DSL-style accessors (read without args, write with one arg)
        # are generated by define_attribute, mixed into the singleton class.
        class << self
          include DataForge::DSL::Attributes
        end

        attr_reader :name, :fields
        define_attribute :file_name
        define_attribute :delimiter
        define_attribute :quote
        define_attribute :encoding
        define_attribute :has_header_row

        # `separator` is a readability alias for `delimiter`.
        alias :separator :delimiter



        # @param name [Symbol] logical name of the file; the physical file
        #   name defaults to "<name>.csv" in the working directory
        def initialize(name)
          @name = name
          @file_name = "#{name}.csv"   # interpolation calls #to_s already
          @fields = {}
          @delimiter = ","
          @quote = '"'
          @encoding = "UTF-8"
          @has_header_row = true
        end



        # Declares a field of the record. +type+ is recorded but values are
        # currently read as Strings.
        def field(name, type = String)
          @fields[name] = type
        end



        # Ordered list of the declared field names.
        def field_names
          @fields.keys
        end

      end
    end
  end
end
module DataForge
  module File
    module CSV
      # Reads records from a CSV file according to a CSVRecordFileDefinition.
      class CSVRecordFileReader

        attr_reader :definition, :name, :fields



        # @param definition [CSVRecordFileDefinition] describes the file
        #   to read and its format options
        def initialize(definition)
          @definition = definition
          @name = definition.name
          @fields = definition.field_names
        end



        # Yields every data row of the file as a Hash keyed by the defined
        # field names. A header row (when declared) is skipped.
        def each_record(&block)
          # Options must be splatted: CSV.open takes them as keyword
          # arguments, and Ruby 3 no longer converts a trailing positional
          # Hash into keywords.
          ::CSV.open(definition.file_name, **csv_options) do |csv_file|
            csv_file.shift if definition.has_header_row

            while (row = csv_file.shift)
              block.call Hash[definition.field_names.zip(row)]
            end
          end
        end



        private

        # CSV format options derived from the file definition.
        def csv_options
          {col_sep: definition.delimiter,
           quote_char: definition.quote,
           encoding: definition.encoding,
           return_headers: false}
        end

      end
    end
  end
end
require 'tmpdir'
require 'fileutils'

module DataForge
  module File
    module CSV
      # Writes records to a CSV file according to a CSVRecordFileDefinition.
      #
      # Records are first written to a uniquely named temporary file which
      # replaces the target file on #close, so a transformation can safely
      # read from and write to the same file.
      class CSVRecordFileWriter

        attr_reader :definition, :name, :fields



        # @param definition [CSVRecordFileDefinition] describes the file
        #   to write and its format options
        def initialize(definition)
          @definition = definition
          @name = definition.name
          @fields = definition.field_names
        end



        # Opens the temporary output file. Options must be splatted: CSV.open
        # takes them as keyword arguments, and Ruby 3 no longer converts a
        # trailing positional Hash into keywords.
        def open
          @tempfile_name = tempfile_name
          @csv_file = ::CSV.open(@tempfile_name, "w", **csv_options)
        end



        # Closes the temporary file and moves it over the target file.
        def close
          @csv_file.close
          FileUtils.move @tempfile_name, definition.file_name
        end



        # Writes one record (a Hash); only the defined fields are emitted,
        # in definition order.
        def write(record)
          @csv_file << fields.map { |field| record[field] }
        end



        private

        # Unique temporary file name in the working directory.
        # Dir::Tmpname.make_tmpname was removed in Ruby 2.5, so the name is
        # assembled from the definition name, pid, timestamp and a random
        # suffix instead.
        def tempfile_name
          "#{definition.name}-#{Process.pid}-#{Time.now.strftime('%Y%m%d%H%M%S')}-#{rand(36**8).to_s(36)}.csv"
        end



        # CSV format options derived from the file definition; when a header
        # row is declared, CSV writes it automatically on the first record.
        def csv_options
          options = {col_sep: definition.delimiter,
                     quote_char: definition.quote,
                     encoding: definition.encoding,
                     write_headers: false}

          options.merge!(write_headers: true,
                         headers: definition.field_names) if definition.has_header_row
          options
        end

      end
    end
  end
end
require 'csv'

module DataForge
  module File
    # CSV implementation of the record file interfaces.
    module CSV

      autoload :CSVRecordFileDefinition, 'data_forge/file/csv/csv_record_file_definition'
      autoload :CSVRecordFileReader,     'data_forge/file/csv/csv_record_file_reader'
      autoload :CSVRecordFileWriter,     'data_forge/file/csv/csv_record_file_writer'

    end
  end
end
module DataForge
  module File
    # Factory and interface description for record file definitions.
    module RecordFileDefinition

      # Builds a definition for the file named +name+ and evaluates
      # +initialization_block+ in its context. Currently only CSV files
      # are supported.
      def self.from_input(name, &initialization_block)
        definition = CSV::CSVRecordFileDefinition.new(name)
        definition.instance_eval(&initialization_block)
        definition
      end



      # Interface definition

      attr_reader :name, :fields

    end
  end
end
module DataForge
  module File
    # Factory and interface description for record file readers.
    class RecordFileReader

      # Returns a reader appropriate for +definition+ (currently always a
      # CSV reader).
      def self.for(definition)
        CSV::CSVRecordFileReader.new definition
      end



      # Interface definition

      attr_reader :definition, :fields, :name



      # Yields each record of the file as a Hash. (Interface stub; concrete
      # readers implement this.)
      def each_record(&block)
      end

    end
  end
end
module DataForge
  module File
    # Factory and interface description for record file writers.
    class RecordFileWriter

      # Returns a writer appropriate for +definition+ (currently always a
      # CSV writer).
      def self.for(definition)
        CSV::CSVRecordFileWriter.new definition
      end



      # Interface definition

      attr_reader :definition, :fields, :name



      # Opens the target for writing. (Interface stub.)
      def open
      end



      # Finalizes the written output. (Interface stub.)
      def close
      end



      # Writes one record (a Hash). (Interface stub.)
      def write(record)
      end

    end
  end
end
@@ -0,0 +1,36 @@
|
|
1
|
+
module DataForge
|
2
|
+
module File
|
3
|
+
|
4
|
+
autoload :CSV, 'data_forge/file/csv'
|
5
|
+
autoload :RecordFileDefinition, 'data_forge/file/record_file_definition'
|
6
|
+
autoload :RecordFileReader, 'data_forge/file/record_file_reader'
|
7
|
+
autoload :RecordFileWriter, 'data_forge/file/record_file_writer'
|
8
|
+
|
9
|
+
|
10
|
+
@file_definitions = {}
|
11
|
+
|
12
|
+
class << self
|
13
|
+
|
14
|
+
def register_file_definition(name, &initialization_block)
|
15
|
+
@file_definitions[name] = File::RecordFileDefinition.from_input name, &initialization_block
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
def reader_for(definition_name)
|
21
|
+
raise "Unknown file reference '#{definition_name}'" unless @file_definitions.has_key? definition_name
|
22
|
+
|
23
|
+
RecordFileReader.for @file_definitions[definition_name]
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
def writer_for(definition_name)
|
29
|
+
raise "Unknown file reference '#{definition_name}'" unless @file_definitions.has_key? definition_name
|
30
|
+
|
31
|
+
RecordFileWriter.for @file_definitions[definition_name]
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
require 'set'

module DataForge
  module Transformation
    # Removes duplicate records from a file, where "duplicate" means equal
    # values in a configurable set of fields.
    class Deduplication < TransformationBase

      class << self
        # Builds a Deduplication from the `deduplicate` DSL command.
        #
        # options[:into]  — target file name (defaults to the source file,
        #                   i.e. in-place deduplication)
        # options[:using] — field name(s) defining uniqueness (defaults to
        #                   all fields of the source file)
        def from_input(source_name, options = {})
          reader = File.reader_for source_name
          writer = File.writer_for(options.fetch :into, source_name)
          unique_fields = Array(options.fetch :using, reader.fields)

          new reader, writer, unique_fields
        end
      end



      def initialize(reader, writer, unique_fields)
        @reader, @writer, @unique_fields = reader, writer, unique_fields
        # `require 'set'` above is needed: Set is only autoloaded from
        # Ruby 3.2 on, so relying on it implicitly breaks older Rubies.
        @fingerprints = Set.new
      end



      # Copies the source records to the target, skipping every record
      # whose fingerprint (the values of the unique fields) was seen before.
      def execute
        with_writer @writer do |writer|
          @reader.each_record do |record|
            fingerprint = @unique_fields.map { |field_name| record[field_name] }
            unless @fingerprints.include? fingerprint
              @fingerprints.add fingerprint
              writer.write record
            end
          end
        end
      end

    end
  end
end
module DataForge
  module Transformation
    # Record-by-record transformation driven by a user-supplied Ruby block.
    class RubyTransformation < TransformationBase

      class << self
        # Builds a RubyTransformation from the `transform` DSL command.
        #
        # options[:into] — target file name or names (defaults to the
        #                  source file itself, i.e. in-place transformation)
        def from_input(source_name, options = {}, &block)
          reader = File.reader_for source_name
          target_names = Array(options.fetch :into, source_name)
          writers = target_names.map { |target_name| File.writer_for target_name }

          new reader, writers, &block
        end
      end



      def initialize(record_reader, record_writers, &transformation_block)
        @record_reader = record_reader
        @record_writers = record_writers
        @transformation_block = transformation_block
      end



      # Evaluates the transformation block once per source record inside a
      # RubyTransformationContext, which provides the `output` command.
      def execute
        with_writers @record_writers do |writers|
          context = RubyTransformationContext.new writers
          @record_reader.each_record do |record|
            context.instance_exec record, &@transformation_block
          end
        end
      end

    end
  end
end
module DataForge
  module Transformation
    # Execution context for the `transform` block; provides the `output`
    # command. Instance variables carry an underscore prefix to keep them
    # out of the way of user code evaluated in this context.
    class RubyTransformationContext

      def initialize(writers)
        @_writers_hash = {}
        writers.each { |writer| @_writers_hash[writer.name] = writer }
        @_writer_names = @_writers_hash.keys
        @_default_writer = writers.first
      end



      # Writes +record+ to the transformation's target file(s).
      #
      # options[:to] — name or names of the target file(s); mandatory when
      # the transformation has more than one target.
      def output(record, options = {})
        unless options.has_key? :to
          raise "Missing :to directive for `output` command in multiple file transformation" if @_writers_hash.count > 1
          return @_default_writer.write(record)
        end

        Array(options[:to]).each do |target_writer_name|
          raise "Unknown target file '#{target_writer_name}' for `output` command" unless @_writer_names.include? target_writer_name
          @_writers_hash[target_writer_name].write record
        end
      end

    end
  end
end
module DataForge
  module Transformation
    # Common plumbing for transformations: opening record writers around a
    # unit of work and reliably closing them afterwards.
    class TransformationBase

      protected

      # Opens all +writers+, yields them to the block, and closes every one
      # of them even when the block raises.
      def with_writers(writers)
        writers.each(&:open)
        begin
          yield writers
        ensure
          writers.each(&:close)
        end
      end



      # Convenience variant of #with_writers for a single writer.
      def with_writer(writer)
        with_writers [writer] do |opened|
          yield opened.first
        end
      end

    end
  end
end
module DataForge
  # Namespace for the available transformations.
  module Transformation

    autoload :TransformationBase,        'data_forge/transformation/transformation_base'
    autoload :Deduplication,             'data_forge/transformation/deduplication'
    autoload :RubyTransformation,        'data_forge/transformation/ruby_transformation'
    autoload :RubyTransformationContext, 'data_forge/transformation/ruby_transformation_context'

  end
end