data_forge 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/.gitignore +19 -0
  2. data/.rspec +2 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +20 -0
  6. data/README.md +49 -0
  7. data/Rakefile +16 -0
  8. data/bin/forge +4 -0
  9. data/config/cucumber.yml +8 -0
  10. data/data_forge.gemspec +26 -0
  11. data/features/accessing_command_line_parameters.feature +52 -0
  12. data/features/deduplication.feature +49 -0
  13. data/features/file/file_format_options.feature +146 -0
  14. data/features/file/has_header_row.feature +62 -0
  15. data/features/step_definitions/file_steps.rb +8 -0
  16. data/features/support/env.rb +8 -0
  17. data/features/transform/output_command.feature +123 -0
  18. data/features/transform/outputting_to_multiple_files.feature +57 -0
  19. data/features/transform/overwrite_original_file.feature +37 -0
  20. data/features/transform/record_transformation.feature +47 -0
  21. data/lib/data_forge/cli/main.rb +21 -0
  22. data/lib/data_forge/cli/options.rb +62 -0
  23. data/lib/data_forge/cli.rb +24 -0
  24. data/lib/data_forge/dsl/attributes.rb +15 -0
  25. data/lib/data_forge/dsl/commands.rb +23 -0
  26. data/lib/data_forge/dsl/helpers.rb +22 -0
  27. data/lib/data_forge/dsl.rb +9 -0
  28. data/lib/data_forge/file/csv/csv_record_file_definition.rb +46 -0
  29. data/lib/data_forge/file/csv/csv_record_file_reader.rb +42 -0
  30. data/lib/data_forge/file/csv/csv_record_file_writer.rb +62 -0
  31. data/lib/data_forge/file/csv.rb +13 -0
  32. data/lib/data_forge/file/record_file_definition.rb +17 -0
  33. data/lib/data_forge/file/record_file_reader.rb +22 -0
  34. data/lib/data_forge/file/record_file_writer.rb +32 -0
  35. data/lib/data_forge/file.rb +36 -0
  36. data/lib/data_forge/transformation/deduplication.rb +38 -0
  37. data/lib/data_forge/transformation/ruby_transformation.rb +33 -0
  38. data/lib/data_forge/transformation/ruby_transformation_context.rb +27 -0
  39. data/lib/data_forge/transformation/transformation_base.rb +29 -0
  40. data/lib/data_forge/transformation.rb +10 -0
  41. data/lib/data_forge/version.rb +3 -0
  42. data/lib/data_forge.rb +13 -0
  43. data/spec/data_forge/cli/main_spec.rb +45 -0
  44. data/spec/data_forge/cli/options_spec.rb +64 -0
  45. data/spec/data_forge/cli_spec.rb +54 -0
  46. data/spec/data_forge/dsl/commands_spec.rb +42 -0
  47. data/spec/data_forge/dsl/helpers_spec.rb +24 -0
  48. data/spec/data_forge/file/csv/csv_record_file_definition_spec.rb +97 -0
  49. data/spec/data_forge/file/csv/csv_record_file_reader_spec.rb +78 -0
  50. data/spec/data_forge/file/csv/csv_record_file_writer_spec.rb +100 -0
  51. data/spec/data_forge/file/record_file_definition_spec.rb +17 -0
  52. data/spec/data_forge/file/record_file_reader_spec.rb +15 -0
  53. data/spec/data_forge/file/record_file_writer_spec.rb +15 -0
  54. data/spec/data_forge/file_spec.rb +49 -0
  55. data/spec/data_forge/transformation/deduplication_spec.rb +77 -0
  56. data/spec/data_forge/transformation/ruby_transformation_context_spec.rb +49 -0
  57. data/spec/data_forge/transformation/ruby_transformation_spec.rb +71 -0
  58. data/spec/data_forge_spec.rb +9 -0
  59. data/spec/spec_helper.rb +17 -0
  60. data/spec/support/helpers/record_reader_helper.rb +17 -0
  61. data/spec/support/helpers/record_writer_helper.rb +16 -0
  62. metadata +218 -0
data/lib/data_forge.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'data_forge/version'
2
+
3
+ module DataForge
4
+
5
+ autoload :CLI, 'data_forge/cli'
6
+ autoload :DSL, 'data_forge/dsl'
7
+ autoload :File, 'data_forge/file'
8
+ autoload :Transformation, 'data_forge/transformation'
9
+
10
+ end
11
+
12
+ self.extend DataForge::DSL::Commands,
13
+ DataForge::DSL::Helpers
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::CLI::Main do
4
+
5
+ let(:options) { DataForge::CLI::Options.new }
6
+ let(:args) { double "ARGV" }
7
+ let(:stdout) { double "STDOUT" }
8
+ let(:stderr) { double "STDERR" }
9
+ let(:kernel) { double "Kernel" }
10
+
11
+ subject { described_class.new args, STDIN, stdout, stderr, kernel }
12
+
13
+ before do
14
+ allow(DataForge::CLI).to receive(:parse_options).with(args, stdout).and_return options
15
+ end
16
+
17
+
18
+ describe "#execute!" do
19
+ it "should execute the command script specified in the options" do
20
+ options.command_script = "command_script.rb"
21
+
22
+ expect(subject).to receive(:load).with("command_script.rb")
23
+
24
+ subject.execute!
25
+ end
26
+
27
+ it "should not execute the command script if the options direct to stop execution" do
28
+ options.execute = false
29
+
30
+ expect(subject).not_to receive(:load)
31
+
32
+ subject.execute!
33
+ end
34
+
35
+ it "should output an error message in case of an error" do
36
+ allow(subject).to receive(:load).and_raise "Error message"
37
+
38
+ expect(stderr).to receive(:puts).with "ERROR: Error message"
39
+ expect(kernel).to receive(:exit).with 1
40
+
41
+ subject.execute!
42
+ end
43
+ end
44
+
45
+ end
@@ -0,0 +1,64 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::CLI::Options do
4
+
5
+ let(:stdout) { StringIO.new }
6
+ subject { DataForge::CLI::Options }
7
+
8
+ describe ".parse" do
9
+ context "when parsing the command script parameter" do
10
+ it "should accept a single command script parameter" do
11
+ options = subject.parse(%w[command_script.rb])
12
+
13
+ expect(options.command_script).to eq "command_script.rb"
14
+ end
15
+
16
+ it "should raise an error if no command script is specified" do
17
+ expect { subject.parse(%w[]) }.to raise_error "No command script specified"
18
+ end
19
+
20
+ it "should raise an error if there is more than one command script specified" do
21
+ expect { subject.parse(%w[command_script1.rb command_script2.rb]) }.to raise_error "More than one command script specified"
22
+ end
23
+ end
24
+
25
+
26
+ context "when parsing the --help switch" do
27
+ it "should print the help information" do
28
+ subject.parse(%w[--help], stdout)
29
+
30
+ expect(stdout.string).to include "Usage: [bundle exec] forge [options] command_script.rb"
31
+ end
32
+ end
33
+
34
+
35
+ context "when parsing the --version switch" do
36
+ it "should print the version" do
37
+ subject.parse(%w[--version], stdout)
38
+
39
+ expect(stdout.string).to match /DataForge, version \d+(\.\d+)*/
40
+ end
41
+ end
42
+
43
+
44
+ context "when parsing user-defined parameters" do
45
+ it "should accept a name-value pair as a parameter with the -U switch" do
46
+ options = subject.parse(%w[-Ucustomer=test command_script.rb])
47
+
48
+ expect(options.user_params).to eq(customer: "test")
49
+ end
50
+
51
+ it "should accept multiple user-defined parameters" do
52
+ options = subject.parse(%w[-Ucustomer=test -Udata_file=items.csv command_script.rb])
53
+
54
+ expect(options.user_params).to eq(customer: "test", data_file: "items.csv")
55
+ end
56
+ end
57
+
58
+
59
+ it "should raise an error if an unknown option is specified" do
60
+ expect { subject.parse(%w[--unknown]) }.to raise_error OptionParser::InvalidOption
61
+ end
62
+ end
63
+
64
+ end
@@ -0,0 +1,54 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::CLI do
4
+ let(:options) { DataForge::CLI::Options.new }
5
+ let(:args) { double "ARGV" }
6
+ let(:stdout) { double "STDOUT" }
7
+
8
+ before do
9
+ allow(DataForge::CLI::Options).to receive(:parse).with(args, stdout).and_return options
10
+ end
11
+
12
+ after do
13
+ subject.instance_variable_set :@command_script, nil
14
+ subject.instance_variable_set :@user_params, nil
15
+ end
16
+
17
+
18
+ describe ".parse_options" do
19
+ it "should return the command line options parsed into an Options object" do
20
+ expect(subject.parse_options args, stdout).to eq options
21
+ end
22
+ end
23
+
24
+
25
+ describe ".command_script" do
26
+ it "should be nil by default" do
27
+ expect(subject.command_script).to be_nil
28
+ end
29
+
30
+ it "should return the command script specified in options that were parsed" do
31
+ options.command_script = "command_script.rb"
32
+
33
+ subject.parse_options args, stdout
34
+
35
+ expect(subject.command_script).to eq "command_script.rb"
36
+ end
37
+ end
38
+
39
+
40
+ describe ".user_params" do
41
+ it "should be nil by default" do
42
+ expect(subject.user_params).to be_nil
43
+ end
44
+
45
+ it "should return the user-defined parameters specified in options that were parsed" do
46
+ options.user_params = {p1: "v1", p2: "v2"}
47
+
48
+ subject.parse_options args, stdout
49
+
50
+ expect(subject.user_params).to eq(p1: "v1", p2: "v2")
51
+ end
52
+ end
53
+
54
+ end
@@ -0,0 +1,42 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::DSL::Commands do
4
+
5
+ let(:dsl_object) { Object.new.tap { |object| object.extend DataForge::DSL::Commands } }
6
+ let(:block) { lambda {} }
7
+
8
+ describe "#file" do
9
+ it "should register a file definition" do
10
+ expect(DataForge::File).to receive(:register_file_definition).with(:name) { |&blk| expect(blk).to be block }
11
+
12
+ dsl_object.file :name, &block
13
+ end
14
+ end
15
+
16
+
17
+ describe "#transform" do
18
+ it "should create a file transformation and execute it" do
19
+ transformation = instance_double "DataForge::Transformation::RubyTransformation"
20
+
21
+ allow(DataForge::Transformation::RubyTransformation).to receive(:from_input)
22
+ .with(:source, into: :target) { |&blk| expect(blk).to be block }
23
+ .and_return(transformation)
24
+ expect(transformation).to receive(:execute)
25
+
26
+ dsl_object.transform :source, into: :target, &block
27
+ end
28
+ end
29
+
30
+
31
+ describe "#deduplicate" do
32
+ it "should create a deduplication transformation and execute it" do
33
+ deduplication = instance_double "DataForge::Transformation::Deduplication"
34
+ allow(DataForge::Transformation::Deduplication).to receive(:from_input).with(:items, into: :unique_items, using: :item_id).and_return(deduplication)
35
+
36
+ expect(deduplication).to receive(:execute)
37
+
38
+ dsl_object.deduplicate :items, into: :unique_items, using: :item_id
39
+ end
40
+ end
41
+
42
+ end
@@ -0,0 +1,24 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::DSL::Helpers do
4
+
5
+ subject { Object.new.extend described_class }
6
+
7
+ describe "PARAMS" do
8
+ it "should return the user parameters passed in through the CLI" do
9
+ expect(DataForge::CLI).to receive(:user_params).and_return "user defined parameters"
10
+
11
+ expect(subject.instance_eval { PARAMS }).to eq "user defined parameters"
12
+ end
13
+ end
14
+
15
+
16
+ describe "COMMAND_SCRIPT" do
17
+ it "should return the command script that is currently executing" do
18
+ expect(DataForge::CLI).to receive(:command_script).and_return "command_script.rb"
19
+
20
+ expect(subject.instance_eval { COMMAND_SCRIPT }).to eq "command_script.rb"
21
+ end
22
+ end
23
+
24
+ end
@@ -0,0 +1,97 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::File::CSV::CSVRecordFileDefinition do
4
+
5
+ subject { described_class.new :definition_name }
6
+
7
+ describe "#name" do
8
+ it "should return the name of the definition" do
9
+ expect(subject.name).to eq :definition_name
10
+ end
11
+ end
12
+
13
+
14
+ describe "#field" do
15
+ it "should define a field with a type" do
16
+ subject.field :field1, String
17
+
18
+ expect(subject.fields).to eq(field1: String)
19
+ end
20
+
21
+ context "when called without a type" do
22
+ it "should define a String field" do
23
+ subject.field :field1
24
+
25
+ expect(subject.fields).to eq(field1: String)
26
+ end
27
+ end
28
+ end
29
+
30
+
31
+ describe "#fields" do
32
+ it "should return an empty Hash if no fields are defined" do
33
+ expect(subject.fields).to eq({})
34
+ end
35
+
36
+ it "should return the fields and types (as a Hash) that are defined" do
37
+ subject.field :field1, String
38
+ subject.field :field2, Fixnum
39
+
40
+ expect(subject.fields).to eq(field1: String, field2: Fixnum)
41
+ end
42
+ end
43
+
44
+
45
+ describe "#field_names" do
46
+ it "should return an empty array if no fields are defined" do
47
+ expect(subject.field_names).to eq []
48
+ end
49
+
50
+ it "should return the names of the fields that are defined" do
51
+ subject.field :field1, String
52
+ subject.field :field2, Fixnum
53
+
54
+ expect(subject.field_names).to eq [:field1, :field2]
55
+ end
56
+ end
57
+
58
+
59
+ describe "attributes" do
60
+ {file_name: "definition_name.csv",
61
+ delimiter: ",",
62
+ quote: '"',
63
+ encoding: "UTF-8",
64
+ has_header_row: true}
65
+ .each do |attribute_name, default_value|
66
+
67
+ describe "#{attribute_name}" do
68
+ it "should return or set the attribute value" do
69
+ subject.public_send attribute_name, "new value"
70
+
71
+ expect(subject.public_send(attribute_name)).to eq "new value"
72
+ end
73
+
74
+ context "when not overridden" do
75
+ it "should return the default value of the attribute" do
76
+ expect(subject.send(attribute_name)).to eq default_value
77
+ end
78
+ end
79
+ end
80
+
81
+ end
82
+ end
83
+
84
+
85
+ describe "#separator" do
86
+ it "should be an alias for #delimiter" do
87
+ expect(subject.separator).to eq ","
88
+
89
+ subject.delimiter ";"
90
+ expect(subject.separator).to eq ";"
91
+
92
+ subject.separator "|"
93
+ expect(subject.delimiter).to eq "|"
94
+ end
95
+ end
96
+
97
+ end
@@ -0,0 +1,78 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::File::CSV::CSVRecordFileReader do
4
+
5
+ let(:csv_file) { instance_double "CSV" }
6
+ let(:definition) { instance_double "DataForge::File::CSV::CSVRecordFileDefinition",
7
+ name: :test,
8
+ file_name: "test.csv",
9
+ field_names: [:field1, :field2],
10
+ delimiter: "delimiter",
11
+ quote: "quote",
12
+ encoding: "encoding",
13
+ has_header_row: true }
14
+
15
+ subject { described_class.new definition }
16
+
17
+
18
+ describe "#definition" do
19
+ it "should return the file definition the reader was created for" do
20
+ expect(subject.definition).to eq definition
21
+ end
22
+ end
23
+
24
+
25
+ describe "#name" do
26
+ it "should return the file definition's name" do
27
+ expect(subject.name).to eq :test
28
+ end
29
+ end
30
+
31
+
32
+ describe "#fields" do
33
+ it "should return the file definition's fields" do
34
+ expect(subject.fields).to eq [:field1, :field2]
35
+ end
36
+ end
37
+
38
+
39
+ describe "#each_record" do
40
+ context "when the CSV file has a header row" do
41
+ it "should skip the header row and iterate through all records in the CSV file" do
42
+ expect(CSV).to receive(:open).with("test.csv", {col_sep: "delimiter",
43
+ quote_char: "quote",
44
+ encoding: "encoding",
45
+ return_headers: false}).and_yield csv_file
46
+ allow(csv_file).to receive(:shift).and_return(["field1", "field2"], [1, 2], [3, 4], nil)
47
+
48
+ records = []
49
+ subject.each_record { |record| records << record }
50
+
51
+ expect(records).to eq [{field1: 1, field2: 2}, {field1: 3, field2: 4}]
52
+ end
53
+ end
54
+
55
+
56
+ context "when the CSV file has no header row" do
57
+ let(:definition) { instance_double "DataForge::File::CSV::CSVRecordFileDefinition",
58
+ name: :test,
59
+ file_name: "test.csv",
60
+ field_names: [:field1, :field2],
61
+ delimiter: "delimiter",
62
+ quote: "quote",
63
+ encoding: "encoding",
64
+ has_header_row: false }
65
+
66
+ it "should iterate through all records in the CSV file" do
67
+ allow(CSV).to receive(:open).and_yield csv_file
68
+ allow(csv_file).to receive(:shift).and_return([1, 2], [3, 4], nil)
69
+
70
+ records = []
71
+ subject.each_record { |record| records << record }
72
+
73
+ expect(records).to eq [{field1: 1, field2: 2}, {field1: 3, field2: 4}]
74
+ end
75
+ end
76
+ end
77
+
78
+ end
@@ -0,0 +1,100 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::File::CSV::CSVRecordFileWriter do
4
+
5
+ let(:csv_file) { instance_double "CSV" }
6
+ let(:definition) { instance_double "DataForge::File::CSV::CSVRecordFileDefinition",
7
+ name: :test,
8
+ file_name: "test.csv",
9
+ field_names: [:field1, :field2, :field3],
10
+ delimiter: "delimiter",
11
+ quote: "quote",
12
+ encoding: "encoding",
13
+ has_header_row: true }
14
+
15
+ subject { described_class.new definition }
16
+
17
+ before do
18
+ allow(Dir::Tmpname).to receive(:make_tmpname).with(["test", ".csv"], 1).and_return("generated_tempname.csv")
19
+ allow(CSV).to receive(:open).and_return csv_file
20
+ end
21
+
22
+
23
+ describe "#definition" do
24
+ it "should return the file definition the writer was created for" do
25
+ expect(subject.definition).to eq definition
26
+ end
27
+ end
28
+
29
+
30
+ describe "#name" do
31
+ it "should return the file definition's name" do
32
+ expect(subject.name).to eq :test
33
+ end
34
+ end
35
+
36
+
37
+ describe "#fields" do
38
+ it "should return the file definition's fields" do
39
+ expect(subject.fields).to eq [:field1, :field2, :field3]
40
+ end
41
+ end
42
+
43
+
44
+ describe "#open" do
45
+ it "should open a CSV file for writing with a temporary filename" do
46
+ expect(CSV).to receive(:open).with("generated_tempname.csv", "w", anything)
47
+
48
+ subject.open
49
+ end
50
+
51
+
52
+ it "should use the file definition's settings as CSV options" do
53
+ expect(CSV).to receive(:open).with(anything, "w", {col_sep: "delimiter",
54
+ quote_char: "quote",
55
+ encoding: "encoding",
56
+ write_headers: true,
57
+ headers: [:field1, :field2, :field3]})
58
+
59
+ subject.open
60
+ end
61
+
62
+
63
+ context "when a file has no header" do
64
+ it "should open a CSV file with no header row" do
65
+ allow(definition).to receive(:has_header_row).and_return false
66
+
67
+ expect(CSV).to receive(:open).with(anything, "w", {col_sep: "delimiter",
68
+ quote_char: "quote",
69
+ encoding: "encoding",
70
+ write_headers: false})
71
+
72
+ subject.open
73
+ end
74
+ end
75
+ end
76
+
77
+
78
+ describe "#close" do
79
+ it "should close and rename the open file" do
80
+ subject.open
81
+
82
+ expect(csv_file).to receive :close
83
+ expect(FileUtils).to receive(:move).with("generated_tempname.csv", "test.csv")
84
+
85
+ subject.close
86
+ end
87
+ end
88
+
89
+
90
+ describe "#write" do
91
+ it "should write the specified fields of a hash (in the specified order) as a row into the CSV file" do
92
+ subject.open
93
+
94
+ expect(csv_file).to receive(:<<).with ["a", "b", "c"]
95
+
96
+ subject.write(field3: "c", field1: "a", field4: "d", field2: "b")
97
+ end
98
+ end
99
+
100
+ end
@@ -0,0 +1,17 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::File::RecordFileDefinition do
4
+
5
+ describe ".from_input" do
6
+ it "should instantiate a CSV file definition and initialize it with the initializer block" do
7
+ definition = instance_double "DataForge::File::CSV::CSVRecordFileDefinition"
8
+ initializer_block = lambda {}
9
+
10
+ expect(DataForge::File::CSV::CSVRecordFileDefinition).to receive(:new).with(:test).and_return definition
11
+ expect(definition).to receive(:instance_eval) { |&block| expect(block).to be initializer_block }
12
+
13
+ expect(subject.from_input :test, &initializer_block).to eq definition
14
+ end
15
+ end
16
+
17
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::File::RecordFileReader do
4
+
5
+ describe ".for" do
6
+ it "should return a record reader for the specified file definition" do
7
+ definition = instance_double DataForge::File::CSV::CSVRecordFileDefinition
8
+
9
+ expect(DataForge::File::CSV::CSVRecordFileReader).to receive(:new).with(definition).and_return "record reader"
10
+
11
+ expect(described_class.for definition).to eq "record reader"
12
+ end
13
+ end
14
+
15
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::File::RecordFileWriter do
4
+
5
+ describe ".for" do
6
+ it "should return a record writer for the specified file definition" do
7
+ definition = instance_double DataForge::File::CSV::CSVRecordFileDefinition
8
+
9
+ expect(DataForge::File::CSV::CSVRecordFileWriter).to receive(:new).with(definition).and_return "record writer"
10
+
11
+ expect(described_class.for definition).to eq "record writer"
12
+ end
13
+ end
14
+
15
+ end
@@ -0,0 +1,49 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataForge::File do
4
+
5
+ let(:definition) { instance_double "DataForge::File::RecordFileDefinition" }
6
+
7
+ before do
8
+ allow(DataForge::File::RecordFileDefinition).to receive(:from_input).with(:definition_name).and_return definition
9
+ end
10
+
11
+ after do
12
+ subject.instance_variable_set :@file_definitions, {}
13
+ end
14
+
15
+
16
+ describe ".reader_for" do
17
+ let(:reader) { double "Reader" }
18
+
19
+ it "should return a record reader for the file with the specified name" do
20
+ subject.register_file_definition :definition_name
21
+
22
+ expect(DataForge::File::RecordFileReader).to receive(:for).with(definition).and_return reader
23
+
24
+ expect(subject.reader_for :definition_name).to eq reader
25
+ end
26
+
27
+ it "should raise an error if there is no file registered by the specified name" do
28
+ expect { subject.reader_for :definition_name }.to raise_error "Unknown file reference 'definition_name'"
29
+ end
30
+ end
31
+
32
+
33
+ describe ".writer_for" do
34
+ let(:writer) { double "Writer" }
35
+
36
+ it "should return a record writer for the file with the specified name" do
37
+ subject.register_file_definition :definition_name
38
+
39
+ expect(DataForge::File::RecordFileWriter).to receive(:for).with(definition).and_return writer
40
+
41
+ expect(subject.writer_for :definition_name).to eq writer
42
+ end
43
+
44
+ it "should raise an error if there is no file registered by the specified name" do
45
+ expect { subject.writer_for :definition_name }.to raise_error "Unknown file reference 'definition_name'"
46
+ end
47
+ end
48
+
49
+ end