data_forge 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/.gitignore +19 -0
  2. data/.rspec +2 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +2 -0
  5. data/LICENSE +20 -0
  6. data/README.md +49 -0
  7. data/Rakefile +16 -0
  8. data/bin/forge +4 -0
  9. data/config/cucumber.yml +8 -0
  10. data/data_forge.gemspec +26 -0
  11. data/features/accessing_command_line_parameters.feature +52 -0
  12. data/features/deduplication.feature +49 -0
  13. data/features/file/file_format_options.feature +146 -0
  14. data/features/file/has_header_row.feature +62 -0
  15. data/features/step_definitions/file_steps.rb +8 -0
  16. data/features/support/env.rb +8 -0
  17. data/features/transform/output_command.feature +123 -0
  18. data/features/transform/outputting_to_multiple_files.feature +57 -0
  19. data/features/transform/overwrite_original_file.feature +37 -0
  20. data/features/transform/record_transformation.feature +47 -0
  21. data/lib/data_forge/cli/main.rb +21 -0
  22. data/lib/data_forge/cli/options.rb +62 -0
  23. data/lib/data_forge/cli.rb +24 -0
  24. data/lib/data_forge/dsl/attributes.rb +15 -0
  25. data/lib/data_forge/dsl/commands.rb +23 -0
  26. data/lib/data_forge/dsl/helpers.rb +22 -0
  27. data/lib/data_forge/dsl.rb +9 -0
  28. data/lib/data_forge/file/csv/csv_record_file_definition.rb +46 -0
  29. data/lib/data_forge/file/csv/csv_record_file_reader.rb +42 -0
  30. data/lib/data_forge/file/csv/csv_record_file_writer.rb +62 -0
  31. data/lib/data_forge/file/csv.rb +13 -0
  32. data/lib/data_forge/file/record_file_definition.rb +17 -0
  33. data/lib/data_forge/file/record_file_reader.rb +22 -0
  34. data/lib/data_forge/file/record_file_writer.rb +32 -0
  35. data/lib/data_forge/file.rb +36 -0
  36. data/lib/data_forge/transformation/deduplication.rb +38 -0
  37. data/lib/data_forge/transformation/ruby_transformation.rb +33 -0
  38. data/lib/data_forge/transformation/ruby_transformation_context.rb +27 -0
  39. data/lib/data_forge/transformation/transformation_base.rb +29 -0
  40. data/lib/data_forge/transformation.rb +10 -0
  41. data/lib/data_forge/version.rb +3 -0
  42. data/lib/data_forge.rb +13 -0
  43. data/spec/data_forge/cli/main_spec.rb +45 -0
  44. data/spec/data_forge/cli/options_spec.rb +64 -0
  45. data/spec/data_forge/cli_spec.rb +54 -0
  46. data/spec/data_forge/dsl/commands_spec.rb +42 -0
  47. data/spec/data_forge/dsl/helpers_spec.rb +24 -0
  48. data/spec/data_forge/file/csv/csv_record_file_definition_spec.rb +97 -0
  49. data/spec/data_forge/file/csv/csv_record_file_reader_spec.rb +78 -0
  50. data/spec/data_forge/file/csv/csv_record_file_writer_spec.rb +100 -0
  51. data/spec/data_forge/file/record_file_definition_spec.rb +17 -0
  52. data/spec/data_forge/file/record_file_reader_spec.rb +15 -0
  53. data/spec/data_forge/file/record_file_writer_spec.rb +15 -0
  54. data/spec/data_forge/file_spec.rb +49 -0
  55. data/spec/data_forge/transformation/deduplication_spec.rb +77 -0
  56. data/spec/data_forge/transformation/ruby_transformation_context_spec.rb +49 -0
  57. data/spec/data_forge/transformation/ruby_transformation_spec.rb +71 -0
  58. data/spec/data_forge_spec.rb +9 -0
  59. data/spec/spec_helper.rb +17 -0
  60. data/spec/support/helpers/record_reader_helper.rb +17 -0
  61. data/spec/support/helpers/record_writer_helper.rb +16 -0
  62. metadata +218 -0
@@ -0,0 +1,37 @@
1
+ Feature: Overwriting the original file with a transformation
2
+
3
+ If the argument to a `transform` block is a single file (or rather, its name as a symbol) then both the source
4
+ and the target of that transformation will be the specified file. In this case the data in the file will be
5
+ processed and the file overwritten with the transformed data.
6
+
7
+
8
+ Scenario:
9
+ Given a file named "command_script.rb" with:
10
+ """
11
+ file :items do
12
+ field :name
13
+ end
14
+
15
+ transform :items do |record|
16
+ record[:name] = record[:name][0]
17
+ output record
18
+ end
19
+ """
20
+ And a file named "items.csv" with:
21
+ """
22
+ name
23
+ ab
24
+ cd
25
+ ef
26
+ """
27
+ When I run `forge command_script.rb`
28
+ Then the exit status should be 0
29
+ And a file named "items.csv" should exist
30
+ And the file "items.csv" should contain exactly:
31
+ """
32
+ name
33
+ a
34
+ c
35
+ e
36
+
37
+ """
@@ -0,0 +1,47 @@
1
+ Feature: Transforming the record
2
+
3
+ The `transform` block is passed the current record of the file that is being read. This record is a Hash with
4
+ its keys defined in the corresponding `file` definition and its values read from the input file. The record
5
+ can be transformed arbitrarily with pure Ruby code. The `output` command will write the record to file
6
+ using only the keys that are defined in the `file` definition of the transformation's target file.
7
+
8
+
9
+ Scenario: Using the record as a Hash
10
+ Given a file named "command_script.rb" with:
11
+ """
12
+ file :products do
13
+ field :id
14
+ field :name
15
+ field :main_category
16
+ field :subcategory
17
+ end
18
+
19
+ file :transformed_products do
20
+ field :item
21
+ field :title
22
+ field :category
23
+ end
24
+
25
+ transform :products, into: :transformed_products do |record|
26
+ record[:item] = record[:id]
27
+ record[:title] = record[:name].upcase
28
+ record[:category] = [record[:main_category], record[:subcategory]].join " > "
29
+ output record
30
+ end
31
+ """
32
+ And a file named "products.csv" with:
33
+ """
34
+ id,name,main_category,subcategory
35
+ IE-123,first product,Main category,Subcategory
36
+ TM-234,second product,Group,Subgroup
37
+ """
38
+ When I run `forge command_script.rb`
39
+ Then the exit status should be 0
40
+ And a file named "transformed_products.csv" should exist
41
+ And the file "transformed_products.csv" should contain exactly:
42
+ """
43
+ item,title,category
44
+ IE-123,FIRST PRODUCT,Main category > Subcategory
45
+ TM-234,SECOND PRODUCT,Group > Subgroup
46
+
47
+ """
@@ -0,0 +1,21 @@
1
module DataForge
  module CLI
    # Entry point of the command line interface: parses the arguments and
    # loads the command script they name.
    class Main

      # argv   - the command line arguments.
      # stdin  - input stream (stored, not used by this class).
      # stdout - stream handed to the option parser for its output.
      # stderr - stream error messages are written to.
      # kernel - object that receives `exit`; replaceable so the exit call
      #          can be intercepted.
      def initialize(argv, stdin=STDIN, stdout=STDOUT, stderr=STDERR, kernel=Kernel)
        @argv, @stdin, @stdout, @stderr, @kernel = argv, stdin, stdout, stderr, kernel
      end

      # Parses the options and, when they ask for execution, loads the
      # command script.
      #
      # Ordinary errors (StandardError) and script loading problems
      # (ScriptError: LoadError, SyntaxError, ...) are reported on stderr as
      # "ERROR: <message>" followed by `exit 1` on the kernel object.
      #
      # Other exceptions are deliberately not rescued: SystemExit (a script
      # calling `exit`) and Interrupt must propagate instead of being
      # reported as errors.
      def execute!
        options = CLI.parse_options @argv, @stdout
        load options.command_script if options.execute
      rescue StandardError, ScriptError => e
        @stderr.puts "ERROR: " + e.message
        @kernel.exit 1
      end

    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'optparse'
2
+
3
module DataForge
  module CLI
    # Value object holding the result of parsing the `forge` command line.
    class Options

      # Parses +args+ (left untouched; a copy is consumed) and returns an
      # Options instance.
      #
      # Help and version requests are printed to +output+ and switch
      # `execute` off. Otherwise exactly one non-option argument, the command
      # script, must remain or a RuntimeError is raised.
      def self.parse(args, output = STDOUT)
        remaining = args.dup
        parsed = new

        parser = OptionParser.new
        parser.banner = "Usage: [bundle exec] forge [options] command_script.rb"

        parser.separator ""
        parser.separator "Options:"

        # -Uname=value: collected into user_params under the symbolized name.
        parser.on("-Uname=value",
                  /^(?<name>\w+)=(?<value>\S+)$/,
                  "User-defined parameter value to be passed to the command script.",
                  "Can be specified multiple times (with a different name).") do |_, param_name, param_value|
          parsed.user_params[param_name.to_sym] = param_value
        end

        parser.separator ""
        parser.separator "Common options:"

        parser.on_tail("-h", "--help", "Show this message") do
          output.puts parser
          parsed.execute = false
        end

        parser.on_tail("-v", "--version", "Show version information") do
          output.puts "DataForge, version #{DataForge::VERSION}"
          parsed.execute = false
        end

        # Destructive parse: recognised options are removed from `remaining`.
        parser.parse!(remaining)

        return parsed unless parsed.execute

        raise "No command script specified" if remaining.empty?
        raise "More than one command script specified" if remaining.size != 1

        parsed.command_script = remaining.first
        parsed
      end

      attr_accessor :command_script, :execute, :user_params

      # Starts out with execution enabled and no user parameters.
      def initialize
        @execute = true
        @user_params = {}
      end

    end
  end
end
@@ -0,0 +1,24 @@
1
module DataForge
  # Command line interface namespace. Remembers the command script and the
  # user parameters of the most recent option parse.
  module CLI

    autoload(:Main, 'data_forge/cli/main')
    autoload(:Options, 'data_forge/cli/options')

    class << self

      attr_reader :command_script, :user_params

      # Parses +args+ through Options.parse (passing +stdout+ along), stores
      # the resulting command script and user parameters on this module and
      # returns the parsed options object.
      def parse_options(args, stdout)
        parsed = Options.parse(args, stdout)
        @command_script = parsed.command_script
        @user_params = parsed.user_params
        parsed
      end

    end

  end
end
@@ -0,0 +1,15 @@
1
module DataForge
  module DSL
    # Macro for DSL-style attributes: a single method that reads the value
    # when called without arguments and writes it when called with one.
    module Attributes

      # Defines an instance method +name+ backed by the instance variable of
      # the same name. Without arguments it returns the current value; with
      # arguments it stores the first one and returns it.
      def define_attribute(name)
        define_method(name) do |*values|
          if values.empty?
            instance_variable_get("@#{name}")
          else
            instance_variable_set("@#{name}", values.first)
          end
        end
      end

    end
  end
end
@@ -0,0 +1,23 @@
1
module DataForge
  module DSL
    # The top-level commands available in a command script.
    module Commands

      # Registers a file definition under +name+; the block is handed to the
      # file registry together with the name.
      def file(name, &initialization_block)
        File.register_file_definition(name, &initialization_block)
      end

      # Builds a Ruby transformation for +source+ from the given options and
      # block, then runs it right away.
      def transform(source, options = {}, &transformation_block)
        transformation = Transformation::RubyTransformation.from_input(source, options, &transformation_block)
        transformation.execute
      end

      # Builds a deduplication for +source+ from the given options, then runs
      # it right away.
      def deduplicate(source, options = {})
        deduplication = Transformation::Deduplication.from_input(source, options)
        deduplication.execute
      end

    end
  end
end
@@ -0,0 +1,22 @@
1
module DataForge
  module DSL
    # Intentionally empty. Loading this file installs the
    # Object.const_missing hook defined below.
    module Helpers

    end
  end
end


# Resolves the pseudo-constants available to command scripts:
#
#   COMMAND_SCRIPT - the command script recorded by DataForge::CLI
#   PARAMS         - the user parameters recorded by DataForge::CLI
#
# Both are looked up on every reference, so they always reflect the current
# state of DataForge::CLI.
#
# Any other name is handed to the default implementation via `super`, so the
# resulting NameError carries the missing constant's name and receiver, and
# other const_missing hooks further up the chain still run.
def Object.const_missing(name)
  case name
  when :COMMAND_SCRIPT
    DataForge::CLI.command_script

  when :PARAMS
    DataForge::CLI.user_params

  else
    super
  end
end
@@ -0,0 +1,9 @@
1
module DataForge
  # Namespace of the command script DSL; its parts are loaded lazily.
  module DSL

    {
      Attributes: 'data_forge/dsl/attributes',
      Commands: 'data_forge/dsl/commands',
      Helpers: 'data_forge/dsl/helpers'
    }.each { |constant, path| autoload constant, path }

  end
end
@@ -0,0 +1,46 @@
1
module DataForge
  module File
    module CSV
      # Describes a CSV file: its name, its fields and the format options
      # used when reading or writing it.
      class CSVRecordFileDefinition

        # Brings the `define_attribute` macro into the class body.
        extend DataForge::DSL::Attributes

        attr_reader :name, :fields

        # Format options, each readable and writable through one DSL-style
        # method (see DataForge::DSL::Attributes#define_attribute).
        [:file_name, :delimiter, :quote, :encoding, :has_header_row].each do |attribute|
          define_attribute attribute
        end

        alias_method :separator, :delimiter

        # Sets the defaults: "<name>.csv" as file name, comma delimiter,
        # double quote as quote character, UTF-8 encoding, header row present.
        def initialize(name)
          @name = name
          @fields = {}
          @file_name = "#{name}.csv"
          @delimiter = ","
          @quote = '"'
          @encoding = "UTF-8"
          @has_header_row = true
        end

        # Declares a field; +type+ is stored alongside the name.
        def field(name, type = String)
          @fields[name] = type
        end

        # Names of the declared fields, in declaration order.
        def field_names
          @fields.keys
        end

      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module DataForge
  module File
    module CSV
      # Reads the records of the CSV file described by a file definition.
      class CSVRecordFileReader

        attr_reader :definition, :name, :fields

        # definition - object providing name, field_names, file_name,
        #              delimiter, quote, encoding and has_header_row.
        def initialize(definition)
          @definition = definition
          @name = definition.name
          @fields = definition.field_names
        end

        # Yields every data row of the file as a Hash that maps the
        # definition's field names (in order) to the row's values.
        # The first row is skipped when the definition has a header row.
        #
        # Returns an Enumerator over the records when called without a block.
        def each_record(&block)
          return enum_for(:each_record) unless block

          # The options must be splatted: passed as a positional Hash they
          # would be bound to CSV.open's `mode` parameter on Ruby 3.
          ::CSV.open(definition.file_name, **csv_options) do |csv_file|
            csv_file.shift if definition.has_header_row

            until (row = csv_file.shift).nil?
              block.call Hash[definition.field_names.zip row]
            end
          end
        end

        private

        # CSV options derived from the file definition.
        def csv_options
          {col_sep: definition.delimiter,
           quote_char: definition.quote,
           encoding: definition.encoding,
           return_headers: false}
        end

      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'tmpdir'
2
+
3
module DataForge
  module File
    module CSV
      # Writes records to the CSV file described by a file definition.
      # Output goes to a temporary file first; `close` moves it over the
      # definition's file name.
      class CSVRecordFileWriter

        attr_reader :definition, :name, :fields

        # definition - object providing name, field_names, file_name,
        #              delimiter, quote, encoding and has_header_row.
        def initialize(definition)
          @definition = definition
          @name = definition.name
          @fields = definition.field_names
        end

        # Opens a fresh temporary CSV file for writing.
        def open
          @tempfile_name = tempfile_name
          # The options must be splatted: a positional Hash after the mode
          # is not accepted as keyword options on Ruby 3.
          @csv_file = ::CSV.open(@tempfile_name, "w", **csv_options)
        end

        # Closes the temporary file and moves it to the definition's file
        # name, replacing whatever was there.
        def close
          @csv_file.close
          FileUtils.move @tempfile_name, definition.file_name
        end

        # Appends one row holding the record's values for this writer's
        # fields, in field order. Other keys of the record are ignored.
        def write(record)
          @csv_file << fields.map { |field| record[field] }
        end

        private

        # Relative temporary file name of the form
        # "<name><yyyymmdd>-<pid>-<random>-1.csv".
        #
        # Built inline because Dir::Tmpname.make_tmpname, which produced
        # names of this shape, is no longer available in current Ruby.
        def tempfile_name
          date = Time.now.strftime("%Y%m%d")
          random = rand(0x100000000).to_s(36)
          "#{definition.name}#{date}-#{Process.pid}-#{random}-1.csv"
        end

        # CSV options derived from the file definition; the field names are
        # written as a header row when the definition asks for one.
        def csv_options
          options = {col_sep: definition.delimiter,
                     quote_char: definition.quote,
                     encoding: definition.encoding,
                     write_headers: false}

          options.merge!({write_headers: true,
                          headers: definition.field_names}) if definition.has_header_row
          options
        end

      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'csv'
2
+
3
module DataForge
  module File
    # CSV implementations of the record file definition, reader and writer;
    # each is loaded lazily.
    module CSV

      {
        CSVRecordFileDefinition: 'data_forge/file/csv/csv_record_file_definition',
        CSVRecordFileReader: 'data_forge/file/csv/csv_record_file_reader',
        CSVRecordFileWriter: 'data_forge/file/csv/csv_record_file_writer'
      }.each { |constant, path| autoload constant, path }

    end
  end
end
@@ -0,0 +1,17 @@
1
module DataForge
  module File
    # Factory for record file definitions, plus the reader interface that
    # definitions are expected to expose.
    module RecordFileDefinition

      # Creates a CSV record file definition called +name+ and evaluates the
      # optional block in the context of the new definition.
      #
      # The block may be omitted; the definition is then returned as created
      # (previously a missing block failed with an unrelated ArgumentError
      # raised by instance_eval).
      def self.from_input(name, &initialization_block)
        definition = CSV::CSVRecordFileDefinition.new(name)
        definition.instance_eval(&initialization_block) if initialization_block
        definition
      end

      # Interface definition

      attr_reader :name, :fields

    end
  end
end
@@ -0,0 +1,22 @@
1
module DataForge
  module File
    # Factory for record file readers and declaration of the interface a
    # reader provides.
    class RecordFileReader

      class << self
        # Returns a CSV reader for the given file definition.
        def for(definition)
          CSV::CSVRecordFileReader.new(definition)
        end
      end

      # Interface definition

      attr_reader :definition, :fields, :name

      # Interface placeholder: does nothing here.
      def each_record(&block)
      end

    end
  end
end
@@ -0,0 +1,32 @@
1
module DataForge
  module File
    # Factory for record file writers and declaration of the interface a
    # writer provides.
    class RecordFileWriter

      class << self
        # Returns a CSV writer for the given file definition.
        def for(definition)
          CSV::CSVRecordFileWriter.new(definition)
        end
      end

      # Interface definition

      attr_reader :definition, :fields, :name

      # Interface placeholder: does nothing here.
      def open
      end

      # Interface placeholder: does nothing here.
      def close
      end

      # Interface placeholder: does nothing here.
      def write(record)
      end

    end
  end
end
@@ -0,0 +1,36 @@
1
module DataForge
  # Registry of the file definitions declared by a command script, and the
  # place to obtain readers and writers for them.
  module File

    autoload(:CSV, 'data_forge/file/csv')
    autoload(:RecordFileDefinition, 'data_forge/file/record_file_definition')
    autoload(:RecordFileReader, 'data_forge/file/record_file_reader')
    autoload(:RecordFileWriter, 'data_forge/file/record_file_writer')

    # Registered definitions, keyed by the name they were registered under.
    @file_definitions = {}

    class << self

      # Builds a file definition from +name+ and the block, stores it under
      # +name+ (replacing any earlier one) and returns it.
      def register_file_definition(name, &initialization_block)
        definition = File::RecordFileDefinition.from_input(name, &initialization_block)
        @file_definitions[name] = definition
      end

      # Returns a reader for the definition registered as +definition_name+.
      # Raises a RuntimeError for an unknown name.
      def reader_for(definition_name)
        definition = fetch_definition(definition_name)
        RecordFileReader.for(definition)
      end

      # Returns a writer for the definition registered as +definition_name+.
      # Raises a RuntimeError for an unknown name.
      def writer_for(definition_name)
        definition = fetch_definition(definition_name)
        RecordFileWriter.for(definition)
      end

      private

      # Looks up a registered definition or raises when there is none.
      def fetch_definition(definition_name)
        @file_definitions.fetch(definition_name) do
          raise "Unknown file reference '#{definition_name}'"
        end
      end

    end
  end
end
@@ -0,0 +1,38 @@
1
# Set is used for the fingerprints below and is not loaded automatically on
# every supported Ruby, so it is required explicitly.
require 'set'

module DataForge
  module Transformation
    # Copies records from a reader to a writer, keeping only the first record
    # for every distinct combination of the "unique" field values.
    class Deduplication < TransformationBase

      class << self
        # Builds a deduplication for the file registered as +source_name+.
        #
        # options:
        #   :into  - name of the target file (defaults to the source itself).
        #   :using - field name or list of field names that identify a
        #            record (defaults to all fields of the source reader).
        def from_input(source_name, options = {})
          reader = File.reader_for source_name
          writer = File.writer_for(options.fetch :into, source_name)
          unique_fields = Array(options.fetch :using, reader.fields)

          new reader, writer, unique_fields
        end
      end

      def initialize(reader, writer, unique_fields)
        @reader, @writer, @unique_fields = reader, writer, unique_fields
        # Value combinations of the unique fields that were already written.
        @fingerprints = Set.new
      end

      # Reads every record and writes those whose unique field values have
      # not been seen before.
      def execute
        with_writer @writer do |writer|
          @reader.each_record do |record|
            fingerprint = @unique_fields.map { |field_name| record[field_name] }
            # Set#add? returns nil when the fingerprint is already present.
            writer.write record if @fingerprints.add?(fingerprint)
          end
        end
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
module DataForge
  module Transformation
    # Runs a user supplied block for every record of a source file; the
    # block writes its results through the `output` command of the context.
    class RubyTransformation < TransformationBase

      class << self
        # Builds a transformation reading the file registered as
        # +source_name+. options[:into] names the target file or files and
        # defaults to the source itself.
        def from_input(source_name, options = {}, &block)
          reader = File.reader_for(source_name)
          target_names = Array(options.fetch(:into, source_name))
          writers = target_names.map { |target_name| File.writer_for(target_name) }

          new(reader, writers, &block)
        end
      end

      def initialize(record_reader, record_writers, &transformation_block)
        @record_reader = record_reader
        @record_writers = record_writers
        @transformation_block = transformation_block
      end

      # Evaluates the transformation block once per record, in the context of
      # a single RubyTransformationContext created for the writers.
      def execute
        with_writers(@record_writers) do |writers|
          context = RubyTransformationContext.new(writers)

          @record_reader.each_record do |record|
            context.instance_exec(record, &@transformation_block)
          end
        end
      end

    end
  end
end
@@ -0,0 +1,27 @@
1
module DataForge
  module Transformation
    # Evaluation context of a `transform` block; provides the `output`
    # command.
    class RubyTransformationContext

      # writers - the writers of the transformation; each responds to `name`
      #           and `write`. The first one is the default target.
      def initialize(writers)
        @_writer_names = writers.map(&:name)
        @_writers_hash = Hash[@_writer_names.zip(writers)]
        @_default_writer = writers.first
      end

      # Writes +record+ to the writers named by options[:to] (one name or a
      # list of names). Without :to the record goes to the default writer,
      # which is only allowed when there is a single writer.
      #
      # Raises a RuntimeError for an unknown target name, or when :to is
      # missing although several writers exist.
      def output(record, options = {})
        unless options.has_key?(:to)
          raise "Missing :to directive for `output` command in multiple file transformation" if @_writers_hash.count > 1
          return @_default_writer.write(record)
        end

        Array(options[:to]).each do |target_writer_name|
          unless @_writer_names.include?(target_writer_name)
            raise "Unknown target file '#{target_writer_name}' for `output` command"
          end

          @_writers_hash[target_writer_name].write(record)
        end
      end

    end
  end
end
@@ -0,0 +1,29 @@
1
module DataForge
  module Transformation
    # Shared helpers for transformations: run a block with writers opened,
    # and close them afterwards whatever happens in the block.
    class TransformationBase

      protected

      # Opens all +writers+, yields them, and closes every one of them when
      # the block finishes or raises. Returns the value of the block.
      #
      # A failure while closing one writer no longer prevents the remaining
      # writers from being closed; the first such error is raised once all
      # of them have been attempted.
      def with_writers(writers)
        writers.each { |writer| writer.open }
        begin
          yield writers
        ensure
          close_all writers
        end
      end

      # Single-writer form of with_writers: opens +writer+, yields it, and
      # closes it when the block finishes or raises.
      def with_writer(writer)
        with_writers([writer]) { yield writer }
      end

      private

      # Closes every writer; raises the first StandardError met, after all
      # writers were given the chance to close.
      def close_all(writers)
        first_error = nil

        writers.each do |writer|
          begin
            writer.close
          rescue StandardError => e
            first_error ||= e
          end
        end

        raise first_error if first_error
      end

    end
  end
end
@@ -0,0 +1,10 @@
1
module DataForge
  # Namespace of the available transformations; each is loaded lazily.
  module Transformation

    {
      Deduplication: 'data_forge/transformation/deduplication',
      RubyTransformation: 'data_forge/transformation/ruby_transformation',
      TransformationBase: 'data_forge/transformation/transformation_base',
      RubyTransformationContext: 'data_forge/transformation/ruby_transformation_context'
    }.each { |constant, path| autoload constant, path }

  end
end
@@ -0,0 +1,3 @@
1
module DataForge
  # Version of the gem. Frozen so that the shared constant cannot be
  # modified in place by accident.
  VERSION = "0.1".freeze
end