data_forge 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/.gitignore +19 -0
  2. data/.rspec +2 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +20 -0
  6. data/README.md +49 -0
  7. data/Rakefile +16 -0
  8. data/bin/forge +4 -0
  9. data/config/cucumber.yml +8 -0
  10. data/data_forge.gemspec +26 -0
  11. data/features/accessing_command_line_parameters.feature +52 -0
  12. data/features/deduplication.feature +49 -0
  13. data/features/file/file_format_options.feature +146 -0
  14. data/features/file/has_header_row.feature +62 -0
  15. data/features/step_definitions/file_steps.rb +8 -0
  16. data/features/support/env.rb +8 -0
  17. data/features/transform/output_command.feature +123 -0
  18. data/features/transform/outputting_to_multiple_files.feature +57 -0
  19. data/features/transform/overwrite_original_file.feature +37 -0
  20. data/features/transform/record_transformation.feature +47 -0
  21. data/lib/data_forge/cli/main.rb +21 -0
  22. data/lib/data_forge/cli/options.rb +62 -0
  23. data/lib/data_forge/cli.rb +24 -0
  24. data/lib/data_forge/dsl/attributes.rb +15 -0
  25. data/lib/data_forge/dsl/commands.rb +23 -0
  26. data/lib/data_forge/dsl/helpers.rb +22 -0
  27. data/lib/data_forge/dsl.rb +9 -0
  28. data/lib/data_forge/file/csv/csv_record_file_definition.rb +46 -0
  29. data/lib/data_forge/file/csv/csv_record_file_reader.rb +42 -0
  30. data/lib/data_forge/file/csv/csv_record_file_writer.rb +62 -0
  31. data/lib/data_forge/file/csv.rb +13 -0
  32. data/lib/data_forge/file/record_file_definition.rb +17 -0
  33. data/lib/data_forge/file/record_file_reader.rb +22 -0
  34. data/lib/data_forge/file/record_file_writer.rb +32 -0
  35. data/lib/data_forge/file.rb +36 -0
  36. data/lib/data_forge/transformation/deduplication.rb +38 -0
  37. data/lib/data_forge/transformation/ruby_transformation.rb +33 -0
  38. data/lib/data_forge/transformation/ruby_transformation_context.rb +27 -0
  39. data/lib/data_forge/transformation/transformation_base.rb +29 -0
  40. data/lib/data_forge/transformation.rb +10 -0
  41. data/lib/data_forge/version.rb +3 -0
  42. data/lib/data_forge.rb +13 -0
  43. data/spec/data_forge/cli/main_spec.rb +45 -0
  44. data/spec/data_forge/cli/options_spec.rb +64 -0
  45. data/spec/data_forge/cli_spec.rb +54 -0
  46. data/spec/data_forge/dsl/commands_spec.rb +42 -0
  47. data/spec/data_forge/dsl/helpers_spec.rb +24 -0
  48. data/spec/data_forge/file/csv/csv_record_file_definition_spec.rb +97 -0
  49. data/spec/data_forge/file/csv/csv_record_file_reader_spec.rb +78 -0
  50. data/spec/data_forge/file/csv/csv_record_file_writer_spec.rb +100 -0
  51. data/spec/data_forge/file/record_file_definition_spec.rb +17 -0
  52. data/spec/data_forge/file/record_file_reader_spec.rb +15 -0
  53. data/spec/data_forge/file/record_file_writer_spec.rb +15 -0
  54. data/spec/data_forge/file_spec.rb +49 -0
  55. data/spec/data_forge/transformation/deduplication_spec.rb +77 -0
  56. data/spec/data_forge/transformation/ruby_transformation_context_spec.rb +49 -0
  57. data/spec/data_forge/transformation/ruby_transformation_spec.rb +71 -0
  58. data/spec/data_forge_spec.rb +9 -0
  59. data/spec/spec_helper.rb +17 -0
  60. data/spec/support/helpers/record_reader_helper.rb +17 -0
  61. data/spec/support/helpers/record_writer_helper.rb +16 -0
  62. metadata +218 -0
@@ -0,0 +1,37 @@
1
+ Feature: Overwriting the original file with a transformation
2
+
3
+ If the argument to a `transform` block is a single file (or rather, its name as a symbol) then both the source
4
+ and the target of that transformation will be the specified file. In this case the data in the file will be
5
+ processed and the file overwritten with the transformed data.
6
+
7
+
8
+ Scenario:
9
+ Given a file named "command_script.rb" with:
10
+ """
11
+ file :items do
12
+ field :name
13
+ end
14
+
15
+ transform :items do |record|
16
+ record[:name] = record[:name][0]
17
+ output record
18
+ end
19
+ """
20
+ And a file named "items.csv" with:
21
+ """
22
+ name
23
+ ab
24
+ cd
25
+ ef
26
+ """
27
+ When I run `forge command_script.rb`
28
+ Then the exit status should be 0
29
+ And a file named "items.csv" should exist
30
+ And the file "items.csv" should contain exactly:
31
+ """
32
+ name
33
+ a
34
+ c
35
+ e
36
+
37
+ """
@@ -0,0 +1,47 @@
1
+ Feature: Transforming the record
2
+
3
+ The `transform` block is passed the current record of the file that is being read. This record is a Hash with
4
+ its keys defined in the corresponding `file` definition and its values read from the input file. The record
5
+ can be transformed arbitrarily with pure Ruby code. The `output` command will write the record to file
6
+ using only the keys that are defined in the `file` definition of the transformation's target file.
7
+
8
+
9
+ Scenario: Using the record as a Hash
10
+ Given a file named "command_script.rb" with:
11
+ """
12
+ file :products do
13
+ field :id
14
+ field :name
15
+ field :main_category
16
+ field :subcategory
17
+ end
18
+
19
+ file :transformed_products do
20
+ field :item
21
+ field :title
22
+ field :category
23
+ end
24
+
25
+ transform :products, into: :transformed_products do |record|
26
+ record[:item] = record[:id]
27
+ record[:title] = record[:name].upcase
28
+ record[:category] = [record[:main_category], record[:subcategory]].join " > "
29
+ output record
30
+ end
31
+ """
32
+ And a file named "products.csv" with:
33
+ """
34
+ id,name,main_category,subcategory
35
+ IE-123,first product,Main category,Subcategory
36
+ TM-234,second product,Group,Subgroup
37
+ """
38
+ When I run `forge command_script.rb`
39
+ Then the exit status should be 0
40
+ And a file named "transformed_products.csv" should exist
41
+ And the file "transformed_products.csv" should contain exactly:
42
+ """
43
+ item,title,category
44
+ IE-123,FIRST PRODUCT,Main category > Subcategory
45
+ TM-234,SECOND PRODUCT,Group > Subgroup
46
+
47
+ """
@@ -0,0 +1,21 @@
1
module DataForge
  module CLI
    # Entry point of the `forge` executable: parses the command line and
    # loads the command script named on it.
    class Main

      # @param argv [Array<String>] raw command line arguments
      # @param stdin [IO] input stream (stored, not read by this class)
      # @param stdout [IO] stream handed to the option parser for its output
      # @param stderr [IO] stream that receives error messages
      # @param kernel [#exit] object used to terminate the process on failure
      def initialize(argv, stdin=STDIN, stdout=STDOUT, stderr=STDERR, kernel=Kernel)
        @argv = argv
        @stdin = stdin
        @stdout = stdout
        @stderr = stderr
        @kernel = kernel
      end

      # Parses the options and loads the command script unless the options
      # say that nothing should be executed.
      #
      # Failures are reported on the error stream and turned into exit status 1.
      # ScriptError is rescued explicitly because `load` reports missing or
      # syntactically invalid scripts with LoadError / SyntaxError, which are
      # not StandardErrors. SystemExit, Interrupt and other non-error
      # exceptions are deliberately left to propagate.
      def execute!
        options = CLI.parse_options @argv, @stdout
        load options.command_script if options.execute
      rescue StandardError, ScriptError => e
        @stderr.puts "ERROR: #{e.message}"
        @kernel.exit 1
      end

    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'optparse'
2
+
3
module DataForge
  module CLI
    # Value object holding the result of parsing the `forge` command line.
    class Options

      # command_script: path of the script to run (set only when execute is true)
      # execute:        false when only help or version output was requested
      # user_params:    Hash of the -Uname=value parameters, keyed by Symbol
      attr_accessor :command_script, :execute, :user_params

      def initialize
        @user_params = {}
        @execute = true
      end

      # Parses the command line.
      #
      # @param args [Array<String>] command line arguments (left unmodified)
      # @param output [#puts] receives the help and version texts
      # @return [Options]
      # @raise [RuntimeError] when execution is requested and the number of
      #   command scripts on the command line is not exactly one
      def self.parse(args, output = STDOUT)
        remaining = args.dup
        options = new

        parser = OptionParser.new
        parser.banner = "Usage: [bundle exec] forge [options] command_script.rb"

        parser.separator ""
        parser.separator "Options:"
        parser.on("-Uname=value",
                  /^(?<name>\w+)=(?<value>\S+)$/,
                  "User-defined parameter value to be passed to the command script.",
                  "Can be specified multiple times (with a different name).") do |_, name, value|
          options.user_params[name.to_sym] = value
        end

        parser.separator ""
        parser.separator "Common options:"
        parser.on_tail("-h", "--help", "Show this message") do
          output.puts parser
          options.execute = false
        end
        parser.on_tail("-v", "--version", "Show version information") do
          output.puts "DataForge, version #{DataForge::VERSION}"
          options.execute = false
        end

        # Switches are consumed in place; what is left are the script names.
        parser.parse!(remaining)

        return options unless options.execute

        raise "No command script specified" if remaining.empty?
        raise "More than one command script specified" if remaining.size != 1

        options.command_script = remaining.first
        options
      end

    end
  end
end
@@ -0,0 +1,24 @@
1
module DataForge
  # Command line interface of the gem. Remembers the command script and the
  # user-defined parameters of the most recently parsed command line.
  module CLI

    autoload(:Main, 'data_forge/cli/main')
    autoload(:Options, 'data_forge/cli/options')

    class << self

      # Values taken from the last parse_options call (nil before that).
      attr_reader :command_script, :user_params

      # Parses the command line and stores its command script and user
      # parameters on the module.
      #
      # @param args [Array<String>] command line arguments
      # @param stdout [#puts] stream passed on to the option parser
      # @return the parsed options object
      def parse_options(args, stdout)
        parsed = Options.parse(args, stdout)
        @command_script = parsed.command_script
        @user_params = parsed.user_params
        parsed
      end

    end

  end
end
@@ -0,0 +1,15 @@
1
module DataForge
  module DSL
    # Class-level macro for DSL style attributes whose single method acts as
    # both reader and writer.
    module Attributes

      # Defines an instance method called +name+.
      # Called without arguments it returns the value of the instance variable
      # of the same name; called with arguments it stores the first argument
      # there and returns it.
      #
      # @param name [Symbol, String] attribute name
      def define_attribute(name)
        variable = "@#{name}"

        define_method(name) do |*args|
          if args.empty?
            instance_variable_get variable
          else
            instance_variable_set variable, args.first
          end
        end
      end

    end
  end
end
@@ -0,0 +1,23 @@
1
module DataForge
  module DSL
    # The commands that are available to a command script.
    module Commands

      # Registers a file definition under +name+; the block configures it.
      def file(name, &initialization_block)
        File.register_file_definition(name, &initialization_block)
      end

      # Builds a Ruby transformation for +source+ from the options and the
      # block, then executes it right away.
      def transform(source, options = {}, &transformation_block)
        transformation = Transformation::RubyTransformation.from_input(source, options, &transformation_block)
        transformation.execute
      end

      # Builds a deduplication for +source+ from the options, then executes
      # it right away.
      def deduplicate(source, options = {})
        deduplication = Transformation::Deduplication.from_input(source, options)
        deduplication.execute
      end

    end
  end
end
@@ -0,0 +1,22 @@
1
module DataForge
  module DSL
    # Namespace for the command script helpers. The helpers themselves are
    # the COMMAND_SCRIPT and PARAMS pseudo-constants resolved below.
    module Helpers

    end
  end
end

# Resolves the COMMAND_SCRIPT and PARAMS pseudo-constants on every lookup
# from the values stored on DataForge::CLI.
#
# Every other missing constant is handed to the default implementation, so
# the resulting NameError carries the constant name (NameError#name) and the
# standard message instead of a hand-built one.
#
# @param name [Symbol] name of the constant that could not be found
# @raise [NameError] for any constant other than COMMAND_SCRIPT and PARAMS
def Object.const_missing(name)
  case name
  when :COMMAND_SCRIPT then DataForge::CLI.command_script
  when :PARAMS then DataForge::CLI.user_params
  else super
  end
end
@@ -0,0 +1,9 @@
1
module DataForge
  # Namespace of the command script DSL. Its parts are registered for
  # autoloading, so each file is required on first reference of its constant.
  module DSL

    autoload(:Attributes, 'data_forge/dsl/attributes')
    autoload(:Commands, 'data_forge/dsl/commands')
    autoload(:Helpers, 'data_forge/dsl/helpers')

  end
end
@@ -0,0 +1,46 @@
1
module DataForge
  module File
    module CSV
      # Describes a CSV file: where it lives, how it is formatted and which
      # fields its records have.
      class CSVRecordFileDefinition

        # Brings the define_attribute macro into the class body.
        extend DataForge::DSL::Attributes

        attr_reader :name, :fields

        # Combined reader/writer attributes, e.g. `delimiter ";"` inside a
        # definition block and `definition.delimiter` when reading.
        [:file_name, :delimiter, :quote, :encoding, :has_header_row].each do |attribute|
          define_attribute attribute
        end

        alias_method :separator, :delimiter

        # @param name [Symbol, String] name of the definition; the default
        #   file name is derived from it by appending ".csv"
        def initialize(name)
          @name = name
          @fields = {}
          @file_name = "#{name}.csv"
          @delimiter = ","
          @quote = '"'
          @encoding = "UTF-8"
          @has_header_row = true
        end

        # Declares a field of the file.
        #
        # @param name [Symbol] field name
        # @param type [Class] recorded next to the name (String by default)
        def field(name, type = String)
          @fields[name] = type
        end

        # @return [Array] the declared field names, in declaration order
        def field_names
          fields.keys
        end

      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module DataForge
  module File
    module CSV
      # Reads the records of the CSV file described by a file definition.
      class CSVRecordFileReader

        attr_reader :definition, :name, :fields

        # @param definition file definition supplying the file name, the
        #   format settings and the field names
        def initialize(definition)
          @definition = definition
          @name = definition.name
          @fields = definition.field_names
        end

        # Yields every record of the file as a Hash that maps the defined
        # field names to the values of the row, by position. The first row
        # is skipped when the definition says the file has a header row.
        #
        # @yieldparam record [Hash]
        # @return [Enumerator] when called without a block
        def each_record(&block)
          return enum_for(:each_record) unless block

          # The options must be passed as keywords: CSV.open's second
          # positional parameter is the file mode, so a positional Hash is
          # not treated as options on Ruby 3.
          ::CSV.open(definition.file_name, **csv_options) do |csv_file|
            csv_file.shift if definition.has_header_row

            csv_file.each do |row|
              block.call Hash[definition.field_names.zip row]
            end
          end
        end

        private

        # Options for ::CSV.open derived from the file definition.
        def csv_options
          {col_sep: definition.delimiter,
           quote_char: definition.quote,
           encoding: definition.encoding,
           return_headers: false}
        end

      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'tmpdir'
2
+
3
module DataForge
  module File
    module CSV
      # Writes records into the CSV file described by a file definition.
      # The data goes into a temporary file first, which replaces the target
      # file when the writer is closed; this is what allows a transformation
      # to read from and write to the same file.
      class CSVRecordFileWriter

        attr_reader :definition, :name, :fields

        # @param definition file definition supplying the file name, the
        #   format settings and the field names
        def initialize(definition)
          @definition = definition
          @name = definition.name
          @fields = definition.field_names
        end

        # Opens a fresh temporary file for writing.
        def open
          @tempfile_name = tempfile_name
          # The options must be passed as keywords: a positional Hash after
          # the mode is not treated as options on Ruby 3.
          @csv_file = ::CSV.open(@tempfile_name, "w", **csv_options)
        end

        # Closes the temporary file and moves it over the target file.
        def close
          @csv_file.close
          FileUtils.move @tempfile_name, definition.file_name
        end

        # Appends one row holding the record's values for the defined fields,
        # in definition order; other keys of the record are ignored.
        #
        # @param record [Hash]
        def write(record)
          @csv_file << fields.map { |field| record[field] }
        end

        private

        # Builds a relative name (hence in the working directory) of the form
        # "<name><yyyymmdd>-<pid>-<random>-1.csv". It is assembled by hand
        # because Dir::Tmpname.make_tmpname, which used to produce this
        # format, is no longer available in current Ruby versions.
        def tempfile_name
          date = Time.now.strftime("%Y%m%d")
          random = rand(0x100000000).to_s(36)
          "#{definition.name}#{date}-#{Process.pid}-#{random}-1.csv"
        end

        # Options for ::CSV.open derived from the file definition. The field
        # names are written as header row only if the definition asks for one.
        def csv_options
          options = {col_sep: definition.delimiter,
                     quote_char: definition.quote,
                     encoding: definition.encoding,
                     write_headers: false}

          if definition.has_header_row
            options[:write_headers] = true
            options[:headers] = definition.field_names
          end

          options
        end

      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'csv'
2
+
3
module DataForge
  module File
    # CSV flavour of the record file abstractions. The classes are registered
    # for autoloading, so each file is required on first reference.
    module CSV

      autoload(:CSVRecordFileDefinition, 'data_forge/file/csv/csv_record_file_definition')
      autoload(:CSVRecordFileReader, 'data_forge/file/csv/csv_record_file_reader')
      autoload(:CSVRecordFileWriter, 'data_forge/file/csv/csv_record_file_writer')

    end
  end
end
@@ -0,0 +1,17 @@
1
module DataForge
  module File
    # Factory and interface description for file definitions.
    module RecordFileDefinition

      # Creates a CSV file definition called +name+ and evaluates the block,
      # if there is one, in the context of the new definition.
      #
      # The block is optional: without the guard a definition given without
      # a block would fail inside instance_eval with an unrelated
      # ArgumentError.
      #
      # @param name [Symbol, String] name of the definition
      # @return the configured definition
      def self.from_input(name, &initialization_block)
        definition = CSV::CSVRecordFileDefinition.new(name)
        definition.instance_eval(&initialization_block) if initialization_block
        definition
      end

      # Interface definition

      attr_reader :name, :fields

    end
  end
end
@@ -0,0 +1,22 @@
1
module DataForge
  module File
    # Factory for record readers; also spells out the reader interface.
    class RecordFileReader

      class << self
        # @param definition file definition to read with
        # @return a CSV record reader for the definition
        def for(definition)
          CSV::CSVRecordFileReader.new(definition)
        end
      end

      # Interface definition

      attr_reader :definition, :fields, :name

      # Intentionally empty here; part of the interface description.
      def each_record(&block); end

    end
  end
end
@@ -0,0 +1,32 @@
1
module DataForge
  module File
    # Factory for record writers; also spells out the writer interface.
    class RecordFileWriter

      class << self
        # @param definition file definition to write with
        # @return a CSV record writer for the definition
        def for(definition)
          CSV::CSVRecordFileWriter.new(definition)
        end
      end

      # Interface definition

      attr_reader :definition, :fields, :name

      # The three methods below are intentionally empty here; they are part
      # of the interface description.
      def open; end

      def close; end

      def write(record); end

    end
  end
end
@@ -0,0 +1,36 @@
1
module DataForge
  # Registry of the file definitions declared by the command script, and
  # factory of readers and writers for them.
  module File

    autoload(:CSV, 'data_forge/file/csv')
    autoload(:RecordFileDefinition, 'data_forge/file/record_file_definition')
    autoload(:RecordFileReader, 'data_forge/file/record_file_reader')
    autoload(:RecordFileWriter, 'data_forge/file/record_file_writer')

    # Registered definitions by name.
    @file_definitions = {}

    class << self

      # Builds a file definition from the block and stores it under +name+,
      # replacing any definition registered under that name before.
      #
      # @return the new definition
      def register_file_definition(name, &initialization_block)
        definition = File::RecordFileDefinition.from_input(name, &initialization_block)
        @file_definitions[name] = definition
      end

      # @return a record reader for the definition registered as +definition_name+
      # @raise [RuntimeError] if no such definition is registered
      def reader_for(definition_name)
        definition = fetch_definition(definition_name)
        RecordFileReader.for definition
      end

      # @return a record writer for the definition registered as +definition_name+
      # @raise [RuntimeError] if no such definition is registered
      def writer_for(definition_name)
        definition = fetch_definition(definition_name)
        RecordFileWriter.for definition
      end

      private

      # Looks up a registered definition, failing for unknown names.
      def fetch_definition(definition_name)
        @file_definitions.fetch(definition_name) do
          raise "Unknown file reference '#{definition_name}'"
        end
      end

    end
  end
end
@@ -0,0 +1,38 @@
1
require 'set' # Set is used for the fingerprints and is not loaded by default on every Ruby version

module DataForge
  module Transformation
    # Copies the records of a file, keeping only the first record for every
    # combination of values of the chosen fields.
    class Deduplication < TransformationBase

      class << self
        # Builds a deduplication from the arguments of the `deduplicate` command.
        #
        # @param source_name name of the file definition to read from
        # @param options [Hash]
        #   :into  - name of the file definition to write to (default: the source)
        #   :using - field name(s) that identify a record (default: all fields
        #            of the reader)
        def from_input(source_name, options = {})
          reader = File.reader_for source_name
          writer = File.writer_for(options.fetch :into, source_name)
          unique_fields = Array(options.fetch :using, reader.fields)

          new reader, writer, unique_fields
        end
      end

      # @param reader record reader of the source
      # @param writer record writer of the target
      # @param unique_fields [Array] names of the fields that identify a record
      def initialize(reader, writer, unique_fields)
        @reader = reader
        @writer = writer
        @unique_fields = unique_fields
        @fingerprints = Set.new
      end

      # Writes every record whose values for the unique fields have not been
      # seen before; later records with the same values are dropped.
      def execute
        with_writer @writer do |writer|
          @reader.each_record do |record|
            fingerprint = @unique_fields.map { |field_name| record[field_name] }
            # Set#add? returns nil when the fingerprint is already known.
            writer.write record if @fingerprints.add? fingerprint
          end
        end
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
module DataForge
  module Transformation
    # Transformation whose logic is a block of Ruby code that is run once
    # for every record of the source file.
    class RubyTransformation < TransformationBase

      # Builds a transformation from the arguments of the `transform` command.
      #
      # @param source_name name of the file definition to read from
      # @param options [Hash]
      #   :into - name(s) of the file definition(s) to write to
      #           (default: the source)
      def self.from_input(source_name, options = {}, &block)
        reader = File.reader_for(source_name)
        target_names = Array(options.fetch(:into, source_name))
        writers = target_names.map { |target_name| File.writer_for(target_name) }

        new(reader, writers, &block)
      end

      # @param record_reader record reader of the source
      # @param record_writers [Array] record writers of the targets
      def initialize(record_reader, record_writers, &transformation_block)
        @record_reader = record_reader
        @record_writers = record_writers
        @transformation_block = transformation_block
      end

      # Runs the block for every record of the source. The block is evaluated
      # on one context object shared by all records of the run and receives
      # the record as its argument.
      def execute
        with_writers(@record_writers) do |writers|
          context = RubyTransformationContext.new(writers)

          @record_reader.each_record do |record|
            context.instance_exec(record, &@transformation_block)
          end
        end
      end

    end
  end
end
@@ -0,0 +1,27 @@
1
module DataForge
  module Transformation
    # Object that `transform` blocks are evaluated on; it provides the
    # `output` command to them.
    class RubyTransformationContext

      # @param writers [Array] record writers of the transformation; the
      #   first one is used when `output` is called without :to
      def initialize(writers)
        @_writer_names = writers.map(&:name)
        @_writers_hash = Hash[@_writer_names.zip(writers)]
        @_default_writer = writers.first
      end

      # Writes the record to the target file(s).
      #
      # @param record [Hash] record to write
      # @param options [Hash]
      #   :to - name or names of the target files; may only be left out when
      #         the transformation has a single target
      # @raise [RuntimeError] for an unknown target, or when :to is missing
      #   although there are several targets
      def output(record, options = {})
        unless options.has_key? :to
          raise "Missing :to directive for `output` command in multiple file transformation" if @_writers_hash.count > 1

          return @_default_writer.write(record)
        end

        Array(options[:to]).each do |target_writer_name|
          unless @_writer_names.include? target_writer_name
            raise "Unknown target file '#{target_writer_name}' for `output` command"
          end

          @_writers_hash[target_writer_name].write record
        end
      end

    end
  end
end
@@ -0,0 +1,29 @@
1
module DataForge
  module Transformation
    # Base class of the transformations; provides the open/close handling
    # of record writers.
    class TransformationBase

      protected

      # Opens all writers, yields them, and closes them again even if the
      # block raises.
      #
      # Opening happens inside the protected section and the writers that
      # were opened are tracked, so when one writer fails to open, the ones
      # opened before it are still closed instead of being left open. A
      # writer that was never opened is never closed.
      #
      # @param writers [Array] objects responding to #open and #close
      # @yieldparam writers [Array] the same writers, all opened
      # @return the value of the block
      def with_writers(writers)
        opened = []
        begin
          writers.each do |writer|
            writer.open
            opened << writer
          end

          yield writers
        ensure
          opened.each { |writer| writer.close }
        end
      end

      # Opens the writer, yields it, and closes it again even if the block
      # raises. The writer is not closed if opening it failed.
      #
      # @param writer object responding to #open and #close
      # @return the value of the block
      def with_writer(writer)
        writer.open
        begin
          yield writer
        ensure
          writer.close
        end
      end

    end
  end
end
@@ -0,0 +1,10 @@
1
module DataForge
  # Namespace of the transformations. The classes are registered for
  # autoloading, so each file is required on first reference.
  module Transformation

    autoload(:Deduplication, 'data_forge/transformation/deduplication')
    autoload(:RubyTransformation, 'data_forge/transformation/ruby_transformation')
    autoload(:TransformationBase, 'data_forge/transformation/transformation_base')
    autoload(:RubyTransformationContext, 'data_forge/transformation/ruby_transformation_context')

  end
end
@@ -0,0 +1,3 @@
1
module DataForge
  # Version of the gem.
  VERSION = '0.1'
end