arc-furnace 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +127 -0
  8. data/Rakefile +9 -0
  9. data/arc-furnace.gemspec +30 -0
  10. data/lib/arc-furnace.rb +12 -0
  11. data/lib/arc-furnace/abstract_join.rb +53 -0
  12. data/lib/arc-furnace/all_fields_csv_sink.rb +68 -0
  13. data/lib/arc-furnace/binary_key_merging_hash.rb +38 -0
  14. data/lib/arc-furnace/block_transform.rb +18 -0
  15. data/lib/arc-furnace/block_unfold.rb +18 -0
  16. data/lib/arc-furnace/csv_sink.rb +21 -0
  17. data/lib/arc-furnace/csv_source.rb +33 -0
  18. data/lib/arc-furnace/csv_to_hash_with_duplicate_headers.rb +19 -0
  19. data/lib/arc-furnace/dsl.rb +6 -0
  20. data/lib/arc-furnace/enumerator_source.rb +36 -0
  21. data/lib/arc-furnace/error_handler.rb +27 -0
  22. data/lib/arc-furnace/excel_source.rb +38 -0
  23. data/lib/arc-furnace/fixed_column_csv_sink.rb +37 -0
  24. data/lib/arc-furnace/hash.rb +41 -0
  25. data/lib/arc-furnace/inner_join.rb +27 -0
  26. data/lib/arc-furnace/logging_error_handler.rb +0 -0
  27. data/lib/arc-furnace/merging_hash.rb +41 -0
  28. data/lib/arc-furnace/multi_csv_source.rb +50 -0
  29. data/lib/arc-furnace/node.rb +5 -0
  30. data/lib/arc-furnace/nodes.rb +19 -0
  31. data/lib/arc-furnace/outer_join.rb +14 -0
  32. data/lib/arc-furnace/pipeline.rb +158 -0
  33. data/lib/arc-furnace/private_attr.rb +28 -0
  34. data/lib/arc-furnace/sink.rb +21 -0
  35. data/lib/arc-furnace/source.rb +40 -0
  36. data/lib/arc-furnace/suffixed_fixed_column_csv_sink.rb +18 -0
  37. data/lib/arc-furnace/transform.rb +28 -0
  38. data/lib/arc-furnace/unfold.rb +45 -0
  39. data/lib/arc-furnace/version.rb +3 -0
  40. metadata +182 -0
@@ -0,0 +1,21 @@
1
require 'arc-furnace/sink'
# Fix: this file uses CSV.open but never required the stdlib 'csv'.
# It only worked when another file (csv_source.rb) happened to load
# first; nodes.rb requires csv_sink before csv_source, so loading the
# gem could raise NameError. Require it explicitly.
require 'csv'

module ArcFurnace
  # Sink that writes each delivered row to a CSV file with a fixed,
  # ordered set of columns.
  class CSVSink < Sink
    private_attr_reader :csv, :fields

    # @param filename [String] path the CSV file is written to
    # @param fields [Array] ordered column ids; also written as the header row
    # @param encoding [String] output encoding, defaults to UTF-8
    # @param force_quotes [Boolean] quote every cell when true
    def initialize(filename: , fields: , encoding: 'UTF-8', force_quotes: false)
      @csv = CSV.open(filename, 'wb', encoding: encoding, headers: true, force_quotes: force_quotes)
      @fields = fields
      # Header row is written eagerly at construction time.
      csv << fields
    end

    # Flush and close the underlying file handle.
    def finalize
      csv.close
    end

    # Write one row. Fields absent from the hash become nil (blank cells),
    # so every emitted row has the same width as the header.
    def row(hash)
      csv << fields.map { |field_id| hash[field_id] }
    end
  end
end
@@ -0,0 +1,33 @@
1
require 'arc-furnace/source'
require 'arc-furnace/csv_to_hash_with_duplicate_headers'
require 'csv'

module ArcFurnace
  # Streams one hash per row out of a single CSV file. Repeated headers
  # are folded into arrays by CSVToHashWithDuplicateHeaders.
  class CSVSource < Source
    include CSVToHashWithDuplicateHeaders
    private_attr_reader :csv, :file
    attr_reader :value

    def initialize(filename: , encoding: 'UTF-8')
      @file = File.open(filename, encoding: encoding)
      @csv = CSV.new(file, encoding: encoding, headers: true).each
      advance
    end

    # True once every row has been consumed.
    def empty?
      !value
    end

    # Load the next row into #value. On end-of-file the handle is closed,
    # the iterator discarded, and #value stays nil from then on.
    def advance
      @value =
        if csv
          begin
            csv_to_hash_with_duplicates(csv.next)
          rescue StopIteration
            file.close
            @csv = nil
            nil
          end
        end
    end
  end
end
@@ -0,0 +1,19 @@
1
module ArcFurnace
  # Converts a CSV row (anything whose #each yields [column, value]
  # pairs, e.g. CSV::Row) into a plain hash, collecting the values of
  # repeated headers into arrays.
  module CSVToHashWithDuplicateHeaders
    # @param row [#each] yields [column, value] pairs
    # @return [Hash] column => value for unique headers, or
    #   column => [values] when a header occurs more than once.
    #   Pairs whose value is nil are skipped entirely.
    def csv_to_hash_with_duplicates(row)
      result = {}
      row.each do |column, value|
        next if value.nil?
        existing_value = result[column]
        result[column] =
          if existing_value
            # Fix: was ActiveSupport's Array.wrap, a hidden dependency.
            # For the values seen here (non-nil scalars or a prior array)
            # this stdlib form is equivalent.
            (existing_value.is_a?(Array) ? existing_value : [existing_value]) + [value]
          else
            value
          end
      end
      result
    end
  end
end
@@ -0,0 +1,6 @@
1
require 'arc-furnace/pipeline'

# Backwards-compatibility: the DSL class was renamed to Pipeline; this
# constant alias keeps code that still references ArcFurnace::DSL working.
module ArcFurnace
  DSL = Pipeline
end
@@ -0,0 +1,36 @@
1
require 'arc-furnace/source'

module ArcFurnace
  # Abstract source backed by a Ruby Enumerator. Subclasses provide the
  # enumerator via #build_enumerator; this class pulls one value at a
  # time and signals exhaustion through #empty?.
  class EnumeratorSource < Source

    private_attr_reader :enumerator
    attr_reader :value

    def initialize
      @enumerator = build_enumerator
      advance
    end

    # True once the enumerator has been exhausted.
    def empty?
      !value
    end

    # Advance to the next element. When the enumerator raises
    # StopIteration it is discarded and #value stays nil permanently.
    def advance
      return @value = nil unless enumerator
      @value =
        begin
          enumerator.next
        rescue StopIteration
          @enumerator = nil
          nil
        end
    end

    protected

    # Subclasses must return the Enumerator that yields this source's rows.
    def build_enumerator
      raise "Unimplemented!"
    end
  end
end
@@ -0,0 +1,27 @@
1
# NOTE(review): 'arc-furnace/source' does not appear to be used by this
# class — possibly a leftover; left in place in case load order depends on it.
require 'arc-furnace/source'

module ArcFurnace
  # Default error handler: every callback is a silent no-op. Pipelines
  # can substitute their own handler (see Pipeline::DSLInstance) to log
  # or raise on data problems. Subclass and override as needed.
  class ErrorHandler

    # Called during a join operation when a source row is missing a value for the join key.
    def missing_join_key(source_row:, node_id:)
      # no-op by default; override to report or raise
    end

    # Called during a join operation when the hash is missing a value for the join key.
    def missing_hash_key(key:, source_row:, node_id:)
      # no-op by default; override to report or raise
    end

    # Called when a hash node is missing a primary key during the build process.
    def missing_primary_key(source_row:, node_id:)
      # no-op by default; override to report or raise
    end

    # Called when a hash node has duplicate source rows for the same key.
    def duplicate_primary_key(duplicate_row:, key:, node_id:)
      # no-op by default; override to report or raise
    end

  end
end
@@ -0,0 +1,38 @@
1
require 'arc-furnace/enumerator_source'
require 'roo'

module ArcFurnace
  # Source that reads rows from an .xlsx workbook via the roo gem.
  # Row 1 is treated as the header row; data rows start at row 2.
  class ExcelSource < EnumeratorSource

    # NOTE(review): :enumerator and :value are already declared by
    # EnumeratorSource; redeclaring them here is redundant but harmless.
    private_attr_reader :excel, :enumerator
    attr_reader :value

    def initialize(filename: )
      @excel = Roo::Excelx.new(filename)
      super()
    end

    # Close the underlying workbook, if one was opened.
    # NOTE(review): nothing in this file calls #close automatically —
    # presumably the owner is expected to call it; confirm against callers.
    def close
      @excel.close if @excel
    end

    # Build an Enumerator yielding one { header => value } hash per data
    # row. Cells whose coerced value is blank are omitted from the hash.
    def build_enumerator
      header_row = excel.row(1)

      last_row_index = excel.last_row
      current_row_index = 2

      Enumerator.new do |yielder|
        until current_row_index > last_row_index
          # each_with_object(...) without a block returns an Enumerator;
          # chaining each_with_index over it ultimately returns the memo
          # hash, so `row` is the accumulated { header => value } hash.
          row = header_row.each_with_object(::Hash.new).each_with_index do |(header, result), index|
            value = excel.cell(current_row_index, index + 1)
            # Non-string cells are re-read via excelx_value (roo's raw
            # string form) before stringify/strip. #try and #blank? come
            # from ActiveSupport — assumed loaded; TODO confirm.
            coerced_value = (value.is_a?(String) ? value : excel.excelx_value(current_row_index, index + 1)).try(:to_s).try(:strip)
            result[header] = coerced_value unless coerced_value.blank?
          end
          current_row_index += 1
          yielder << row
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
require 'arc-furnace/sink'
# Fix: this file uses CSV.open but never required the stdlib 'csv';
# it only worked when another file had already loaded it.
require 'csv'

module ArcFurnace
  # Sink writing CSV where one logical column may occupy a fixed number
  # of physical columns (e.g. "image" => 3 emits three "image" columns).
  # Rows are truncated/padded so every output row has the same width.
  class FixedColumnCSVSink < Sink
    private_attr_reader :fields, :csv

    # Expects filename to a filename to open the csv
    # Expects fields to a hash of Column name => column count
    def initialize(filename: , fields: , encoding: 'UTF-8', force_quotes: false)
      @csv = CSV.open(filename, 'wb', encoding: encoding, headers: true, force_quotes: force_quotes)
      @fields = fields
      write_header
    end

    # Emit the header row: each column name repeated `count` times.
    def write_header
      csv << fields.each_with_object([]) do |(key, count), result|
        count.times { result << key }
      end
    end

    # Flush and close the underlying file handle.
    def finalize
      csv.close
    end

    # Write one row: each column's values (scalar or array, via
    # ActiveSupport's Array.wrap) are truncated to `count` entries and
    # padded with nils to exactly `count` cells.
    def row(hash)
      row = []
      fields.each do |column_name, count|
        values = Array.wrap(hash[column_name])
        (values.slice(0, count) || []).each do |value|
          row << value
        end
        # Negative counts are a no-op for Integer#times, so over-full
        # columns (already truncated above) add no padding.
        (count - values.length).times { row << nil }
      end
      csv << row
    end
  end
end
@@ -0,0 +1,41 @@
1
require 'arc-furnace/node'

module ArcFurnace
  # Builds an in-memory index of rows keyed by key_column, draining its
  # source completely during #prepare. A later row sharing a key
  # overwrites the earlier one; the error handler is told about both
  # duplicate and missing keys.
  class Hash < Node
    attr_reader :key_column
    private_attr_reader :source, :hash

    def initialize(source: , key_column:)
      @source = source
      @key_column = key_column
      @hash = {}
    end

    # Pass a block that accepts two arguments: the join key for each
    # value, and the value itself.
    def each(&block)
      hash.each(&block)
    end

    # Consume the source, indexing each row under its key column value.
    def prepare
      until source.empty?
        row = source.row
        key = row[key_column]
        unless key
          error_handler.missing_primary_key(source_row: row, node_id: node_id)
          next
        end
        error_handler.duplicate_primary_key(duplicate_row: row, key: key, node_id: node_id) if hash.include?(key)
        hash[key] = row
      end
    end

    # Fetch the row indexed under primary_key, or nil when absent.
    def get(primary_key)
      hash[primary_key]
    end

  end
end
@@ -0,0 +1,27 @@
1
require 'arc-furnace/abstract_join'

module ArcFurnace
  # Inner join between a hash and a source: only source rows with a
  # matching hash entry are produced. The source row is merged "into"
  # the hash entry, so source values win when keys collide.
  #
  # Example:
  #   Source row        { id: "foo", key1: "boo", key2: "bar" }
  #   Matching hash row { id: "foo", key1: "bar", key3: "baz" }
  #   Result row        { id: "foo", key1: "boo", key2: "bar", key3: "baz" }
  class InnerJoin < AbstractJoin

    # Pull source rows until one joins successfully (merge_source_row
    # returns truthy) or the source is exhausted (#value becomes nil).
    def advance
      @value = source.row
      until value.nil? || merge_source_row(value)
        @value = source.row
      end
    end

  end
end
File without changes
@@ -0,0 +1,41 @@
1
# Fix: this file subclasses ::ArcFurnace::Hash but never required it, so
# requiring 'arc-furnace/merging_hash' on its own raised NameError.
require 'arc-furnace/hash'

module ArcFurnace
  # This allows one to merge multiple rows into one such as:
  # key, value
  # 1, foo
  # 1, bar
  # 2, baz
  # Results in:
  # 1 => { key => 1, value: [foo, bar] }
  # 2 => { key => 2, value: baz }
  class MergingHash < ::ArcFurnace::Hash
    # NOTE(review): redundant — the parent class already declares these
    # readers; kept for byte-compatibility of the class body.
    private_attr_reader :source, :hash

    # Drain the source, merging rows that share a key. The first value
    # seen for a column is stored as-is; later values promote the entry
    # to an array and append. The key column itself is never arrayified.
    # NOTE(review): unlike the parent, rows with a nil key are silently
    # dropped (no error_handler callback) — confirm this is intended.
    def prepare
      loop do
        break if source.empty?
        row = source.row
        row_key = row[key_column]
        if row_key
          row_entry = hash[row_key] ||= {}
          row.each do |column, values|
            existing_column_values = row_entry[column]
            if existing_column_values && column != key_column
              if existing_column_values.is_a?(Array)
                # Already an array: append in place.
                existing_column_values.concat(Array.wrap(values))
              else
                # Promote the existing scalar to an array, then append.
                new_row_entry = Array.wrap(existing_column_values)
                new_row_entry.concat(Array.wrap(values))
                row_entry[column] = new_row_entry
              end
            else
              # First sighting of this column (or the key column): store
              # a defensive copy, skipping nils.
              unless values.nil?
                row_entry[column] = values.dup
              end
            end
          end
        end
      end
    end

  end
end
@@ -0,0 +1,50 @@
1
require 'arc-furnace/source'
require 'arc-furnace/csv_to_hash_with_duplicate_headers'
require 'csv'

module ArcFurnace
  # Source that streams rows from several CSV files in sequence, as if
  # they were one file. Files are consumed in the order given; each
  # file's own header row is used for its rows.
  class MultiCSVSource < Source
    include CSVToHashWithDuplicateHeaders
    private_attr_reader :csv, :file, :filenames, :encoding
    attr_reader :value

    def initialize(filenames: , encoding: 'UTF-8')
      @encoding = encoding
      # Reversed so #pop (cheap, O(1)) yields files in original order.
      @filenames = filenames.reverse
      open_next_file
    end

    # Is this source empty?
    def empty?
      !value
    end

    # Load the next row into #value, rolling over to the next file when
    # the current one is exhausted.
    def advance
      advance_in_current_file || open_next_file
    end

    private

    # Read one row from the current file into @value; nil at its EOF.
    # Returns the row (truthy) or nil, which #advance uses to decide
    # whether to roll over to the next file.
    def advance_in_current_file
      @value =
        begin
          csv_to_hash_with_duplicates(csv.next)
        rescue StopIteration
          nil
        end
      value
    end

    # Close the current file and open the next one, recursing past any
    # files that turn out to be empty. When no filenames remain, returns
    # nil and leaves @value nil (set by the caller's failed advance, or
    # never set when the initial list is empty).
    def open_next_file
      file.close if file
      @file = nil
      if filenames.empty?
        nil
      else
        @file = File.open(filenames.pop, encoding: encoding)
        @csv = CSV.new(file, encoding: encoding, headers: true).each
        advance_in_current_file || open_next_file
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module ArcFurnace
  # Base class for every element of a pipeline graph. Carries the two
  # attributes the pipeline wires in after construction (see
  # Pipeline::DSLInstance#prepare).
  class Node
    # Handler notified of data problems (missing/duplicate keys).
    attr_accessor :error_handler
    # Identifier of this node within the pipeline definition.
    attr_accessor :node_id
  end
end
@@ -0,0 +1,19 @@
1
# Convenience manifest: loads every core node type with one require.
# NOTE(review): excel_source is deliberately absent — presumably because
# it depends on the optional 'roo' gem; require
# 'arc-furnace/excel_source' explicitly when needed. TODO confirm.
require 'arc-furnace/all_fields_csv_sink'
require 'arc-furnace/binary_key_merging_hash'
require 'arc-furnace/block_transform'
require 'arc-furnace/block_unfold'
require 'arc-furnace/csv_sink'
require 'arc-furnace/csv_source'
require 'arc-furnace/enumerator_source'
require 'arc-furnace/fixed_column_csv_sink'
require 'arc-furnace/hash'
require 'arc-furnace/inner_join'
require 'arc-furnace/merging_hash'
require 'arc-furnace/multi_csv_source'
require 'arc-furnace/node'
require 'arc-furnace/outer_join'
require 'arc-furnace/sink'
require 'arc-furnace/source'
require 'arc-furnace/suffixed_fixed_column_csv_sink'
require 'arc-furnace/transform'
require 'arc-furnace/unfold'
@@ -0,0 +1,14 @@
1
require 'arc-furnace/abstract_join'

module ArcFurnace
  # Outer join: every source row is produced, even when no hash entry
  # matches the join key (in which case the row passes through unmerged).
  class OuterJoin < AbstractJoin

    # Fetch the next source row; merge hash data into it when present.
    def advance
      row = source.row
      @value = row
      merge_source_row(row) unless row.nil?
    end

  end
end
@@ -0,0 +1,158 @@
1
require 'eigenclass'
require 'arc-furnace/nodes'
require 'arc-furnace/error_handler'

module ArcFurnace
  # Class-level DSL for declaring a transformation graph: sources feed
  # hashes, joins, transforms and unfolds, which feed exactly one sink.
  # Node definitions are stored as factories (lambdas) and instantiated
  # per call to .instance, so one Pipeline class can be run many times.
  class Pipeline

    eattr_accessor :sink_node, :sink_source, :intermediates_map
    @intermediates_map = {}

    # Ensure that subclasses don't overwrite the parent's transform
    # node definitions
    def self.inherited(subclass)
      subclass.intermediates_map = intermediates_map.dup
    end

    # Define the sink for this transformation. Only a single sink may be
    # specified per transformation. The sink is delivered a hash per row or
    # entity, and feeds them from the graph of nodes above it.
    def self.sink(type: , source:, params:)
      if sink_node
        raise 'Sink already defined!'
      end

      @sink_node = -> do
        type.new(resolve_parameters(params))
      end
      @sink_source = source
    end

    # Define a hash node, processing all rows from its source and caching
    # them in-memory.
    def self.hash_node(name, type: ArcFurnace::Hash, params:)
      define_intermediate(name, type: type, params: params)
    end

    # A source that has row semantics, delivering a hash per row (or per
    # entity) for the source.
    def self.source(name, type:, params:)
      raise "Source #{type} is not a Source!" unless type <= Source
      define_intermediate(name, type: type, params: params)
    end

    # Define an inner join node where rows from the source are dropped
    # if an associated entity is not found in the hash for the join key
    def self.inner_join(name, type: ArcFurnace::InnerJoin, params:)
      define_intermediate(name, type: type, params: params)
    end

    # Define an outer join node where rows from the source are kept
    # even if an associated entity is not found in the hash for the join key
    def self.outer_join(name, type: ArcFurnace::OuterJoin, params:)
      define_intermediate(name, type: type, params: params)
    end

    # Define a node that transforms rows. By default you get a BlockTransform
    # (and when this metaprogramming method is passed a block) that will be passed
    # a hash for each row. The result of the block becomes the row for the next
    # downstream node.
    def self.transform(name, type: BlockTransform, params: {}, &block)
      if block
        params[:block] = block
      end
      raise "Transform #{type} is not a Transform!" unless type <= Transform
      define_intermediate(name, type: type, params: params)
    end

    # Define a node that unfolds rows. By default you get a BlockUnfold
    # (and when this metaprogramming method is passed a block) that will be passed
    # a hash for each row. The result of the block becomes the set of rows for the next
    # downstream node.
    def self.unfold(name, type: BlockUnfold, params: {}, &block)
      if block
        params[:block] = block
      end
      raise "Unfold #{type} is not an Unfold!" unless type <= Unfold
      define_intermediate(name, type: type, params: params)
    end

    # Create an instance to run a transformation, passing the parameters to
    # instantiate the transform instance with. The resulting class instance
    # will have a single public method--#execute, which will perform the
    # transformation.
    def self.instance(params = {})
      # Fix: splat the hash onto DSLInstance's keyword parameters. Passing
      # the hash positionally relied on implicit hash-to-kwargs conversion,
      # which raises ArgumentError under Ruby 3's kwargs separation.
      DSLInstance.new(self, **params)
    end

    # Register a lazily-evaluated factory for a named intermediate node.
    def self.define_intermediate(name, type:, params:)
      intermediates_map[name] = -> do
        type.new(resolve_parameters(params))
      end
    end
    # Fix: a bare `private` has no effect on singleton methods; this is
    # the effective way to hide the class-level factory helper, matching
    # the original intent.
    private_class_method :define_intermediate

    # Runtime companion for one execution: builds node instances, wires
    # the error handler, prepares hash nodes, then streams rows to the sink.
    class DSLInstance
      attr_reader :sink_node, :sink_source, :intermediates_map, :params, :dsl_class, :error_handler

      def initialize(dsl_class, error_handler: ErrorHandler.new, **params)
        @dsl_class = dsl_class
        @params = params
        @intermediates_map = {}
        @error_handler = error_handler
      end

      # Build, prepare, and run the pipeline to completion.
      def execute
        build
        prepare
        run
      end

      private

      # Stream rows from the sink's source into the sink, then finalize.
      def run
        while (row = sink_source.row)
          sink_node.row(row)
        end
        sink_node.finalize
      end

      # Wire node ids and the error handler into every node, then let
      # each prepare (hash nodes drain their sources here).
      def prepare
        intermediates_map.each do |node_id, instance|
          instance.error_handler = error_handler
          instance.node_id = node_id
          instance.prepare
        end
        sink_node.prepare
      end

      # Instantiate every declared node by evaluating its factory in the
      # context of this instance, so resolve_parameters sees our params.
      def build
        dsl_class.intermediates_map.each do |key, instance|
          intermediates_map[key] = instance_exec(&instance) if instance
        end
        @sink_node = instance_exec(&dsl_class.sink_node)
        @sink_source = intermediates_map[dsl_class.sink_source]
      end

      # Resolve declared params: Symbol values are looked up as runtime
      # params or intermediate nodes; nil values are resolved by key.
      def resolve_parameters(params_to_resolve)
        params_to_resolve.each_with_object({}) do |(key, value), result|
          result[key] =
            if value.is_a?(Symbol)
              # Allow resolution of intermediates
              resolve_parameter(value)
            elsif value.nil?
              resolve_parameter(key)
            else
              value
            end
        end
      end

      # Look up a key in runtime params first, then intermediates; raise
      # when neither knows it.
      def resolve_parameter(key)
        self.params[key] || self.intermediates_map[key] || (raise "Unknown key #{key}!")
      end

    end
  end
end