arc-furnace 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +127 -0
  8. data/Rakefile +9 -0
  9. data/arc-furnace.gemspec +30 -0
  10. data/lib/arc-furnace.rb +12 -0
  11. data/lib/arc-furnace/abstract_join.rb +53 -0
  12. data/lib/arc-furnace/all_fields_csv_sink.rb +68 -0
  13. data/lib/arc-furnace/binary_key_merging_hash.rb +38 -0
  14. data/lib/arc-furnace/block_transform.rb +18 -0
  15. data/lib/arc-furnace/block_unfold.rb +18 -0
  16. data/lib/arc-furnace/csv_sink.rb +21 -0
  17. data/lib/arc-furnace/csv_source.rb +33 -0
  18. data/lib/arc-furnace/csv_to_hash_with_duplicate_headers.rb +19 -0
  19. data/lib/arc-furnace/dsl.rb +6 -0
  20. data/lib/arc-furnace/enumerator_source.rb +36 -0
  21. data/lib/arc-furnace/error_handler.rb +27 -0
  22. data/lib/arc-furnace/excel_source.rb +38 -0
  23. data/lib/arc-furnace/fixed_column_csv_sink.rb +37 -0
  24. data/lib/arc-furnace/hash.rb +41 -0
  25. data/lib/arc-furnace/inner_join.rb +27 -0
  26. data/lib/arc-furnace/logging_error_handler.rb +0 -0
  27. data/lib/arc-furnace/merging_hash.rb +41 -0
  28. data/lib/arc-furnace/multi_csv_source.rb +50 -0
  29. data/lib/arc-furnace/node.rb +5 -0
  30. data/lib/arc-furnace/nodes.rb +19 -0
  31. data/lib/arc-furnace/outer_join.rb +14 -0
  32. data/lib/arc-furnace/pipeline.rb +158 -0
  33. data/lib/arc-furnace/private_attr.rb +28 -0
  34. data/lib/arc-furnace/sink.rb +21 -0
  35. data/lib/arc-furnace/source.rb +40 -0
  36. data/lib/arc-furnace/suffixed_fixed_column_csv_sink.rb +18 -0
  37. data/lib/arc-furnace/transform.rb +28 -0
  38. data/lib/arc-furnace/unfold.rb +45 -0
  39. data/lib/arc-furnace/version.rb +3 -0
  40. metadata +182 -0
@@ -0,0 +1,21 @@
1
require 'arc-furnace/sink'
require 'csv' # CSV is used below but was never required in this file

module ArcFurnace
  # Sink that writes each delivered row to a CSV file with a fixed,
  # ordered set of columns.
  class CSVSink < Sink
    private_attr_reader :csv, :fields

    # @param filename [String] path of the CSV file to create
    # @param fields [Array] ordered column ids; also written as the header row
    # @param encoding [String] output encoding (default UTF-8)
    # @param force_quotes [Boolean] quote every cell when true
    def initialize(filename: , fields: , encoding: 'UTF-8', force_quotes: false)
      @csv = CSV.open(filename, 'wb', encoding: encoding, headers: true, force_quotes: force_quotes)
      @fields = fields
      csv << fields
    end

    # Flush and close the underlying file.
    def finalize
      csv.close
    end

    # Write one row; fields missing from the hash become empty cells.
    def row(hash)
      csv << fields.map { |field_id| hash[field_id] }
    end
  end
end
@@ -0,0 +1,33 @@
1
require 'arc-furnace/source'
require 'arc-furnace/csv_to_hash_with_duplicate_headers'
require 'csv'

module ArcFurnace
  # Streams rows out of a single CSV file, exposing one hash per row
  # (with duplicate-header values collected into arrays).
  class CSVSource < Source
    include CSVToHashWithDuplicateHeaders
    private_attr_reader :csv, :file
    attr_reader :value

    def initialize(filename: , encoding: 'UTF-8')
      @file = File.open(filename, encoding: encoding)
      @csv = CSV.new(file, encoding: encoding, headers: true).each
      advance
    end

    # Is this source empty?
    def empty?
      !value
    end

    # Read the next row into #value; on end-of-file the file is closed,
    # the iterator dropped, and #value becomes nil.
    def advance
      @value =
        if csv
          begin
            csv_to_hash_with_duplicates(csv.next)
          rescue StopIteration
            file.close
            @csv = nil
            nil
          end
        end
    end
  end
end
@@ -0,0 +1,19 @@
1
module ArcFurnace
  # Converts a CSV::Row (or any enumerable of [column, value] pairs) into a
  # plain Hash, collecting values for duplicated headers into arrays rather
  # than silently keeping only one of them.
  module CSVToHashWithDuplicateHeaders
    # @param row [#each] yields (column, value) pairs
    # @return [Hash] column => value, or column => [values...] when the
    #   column appears more than once; nil cells are skipped entirely
    def csv_to_hash_with_duplicates(row)
      result = {}
      row.each do |column, value|
        next if value.nil?
        existing_value = result[column]
        result[column] =
          if existing_value.nil?
            value
          elsif existing_value.is_a?(Array)
            existing_value + [ value ]
          else
            # Replaces the original's ActiveSupport Array.wrap call with a
            # dependency-free equivalent (cell values are scalars or arrays).
            [ existing_value, value ]
          end
      end
      result
    end
  end
end
@@ -0,0 +1,6 @@
1
require 'arc-furnace/pipeline'

module ArcFurnace
  # DSL was renamed to Pipeline; this constant alias keeps code written
  # against the old name working.
  DSL = Pipeline
end
@@ -0,0 +1,36 @@
1
require 'arc-furnace/source'

module ArcFurnace
  # Base class for sources backed by a Ruby Enumerator. Subclasses implement
  # #build_enumerator; this class handles advancing and end-of-stream.
  class EnumeratorSource < Source

    private_attr_reader :enumerator
    attr_reader :value

    def initialize
      @enumerator = build_enumerator
      advance
    end

    # Is this source empty?
    def empty?
      !value
    end

    # Pull the next row into #value; when the enumerator is exhausted,
    # drop it and leave #value nil.
    def advance
      @value =
        if enumerator
          begin
            enumerator.next
          rescue StopIteration
            @enumerator = nil
            nil
          end
        end
    end

    protected

    # Subclass hook: return the Enumerator that yields rows.
    def build_enumerator
      raise "Unimplemented!"
    end
  end
end
@@ -0,0 +1,27 @@
1
require 'arc-furnace/source'

module ArcFurnace
  # Default, do-nothing error handler. Pipelines call these hooks when rows
  # cannot be joined or indexed; subclass and override to log or raise.
  class ErrorHandler

    # Called during a join operation when a source row is missing a value
    # for the join key.
    def missing_join_key(source_row:, node_id:)
      # intentionally a no-op
    end

    # Called during a join operation when the hash has no entry for the
    # source row's join key.
    def missing_hash_key(key:, source_row:, node_id:)
      # intentionally a no-op
    end

    # Called when a hash node encounters a row without its primary key
    # during the build process.
    def missing_primary_key(source_row:, node_id:)
      # intentionally a no-op
    end

    # Called when a hash node sees a second row for an already-indexed key.
    def duplicate_primary_key(duplicate_row:, key:, node_id:)
      # intentionally a no-op
    end

  end
end
@@ -0,0 +1,38 @@
1
require 'arc-furnace/enumerator_source'
require 'roo'

module ArcFurnace
  # Streams rows from an Excel (xlsx) worksheet via the Roo gem. Row 1 is
  # treated as the header row; each later row becomes a Hash keyed by those
  # headers, with blank cells omitted.
  class ExcelSource < EnumeratorSource

    private_attr_reader :excel, :enumerator
    attr_reader :value

    def initialize(filename: )
      @excel = Roo::Excelx.new(filename)
      super()
    end

    # Release the underlying spreadsheet handle.
    def close
      @excel.close if @excel
    end

    # Build an Enumerator yielding one Hash per data row (rows 2..last_row).
    def build_enumerator
      headers = excel.row(1)
      last_row = excel.last_row
      row_index = 2

      Enumerator.new do |yielder|
        while row_index <= last_row
          row = headers.each_with_object(::Hash.new).each_with_index do |(header, result), column_offset|
            cell = excel.cell(row_index, column_offset + 1)
            # Non-string cells go through excelx_value for the raw value;
            # everything is normalized to a stripped string.
            coerced = (cell.is_a?(String) ? cell : excel.excelx_value(row_index, column_offset + 1)).try(:to_s).try(:strip)
            result[header] = coerced unless coerced.blank?
          end
          row_index += 1
          yielder << row
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
require 'arc-furnace/sink'
require 'csv' # CSV is used below but was never required in this file

module ArcFurnace
  # Sink that writes a CSV where a logical column may repeat a fixed number
  # of times (e.g. "image, image, image"), padding with nils when a row has
  # fewer values than the column allows and truncating when it has more.
  class FixedColumnCSVSink < Sink
    private_attr_reader :fields, :csv

    # @param filename [String] path of the CSV file to create
    # @param fields [Hash] column name => number of physical columns
    # @param encoding [String] output encoding (default UTF-8)
    # @param force_quotes [Boolean] quote every cell when true
    def initialize(filename: , fields: , encoding: 'UTF-8', force_quotes: false)
      @csv = CSV.open(filename, 'wb', encoding: encoding, headers: true, force_quotes: force_quotes)
      @fields = fields
      write_header
    end

    # Emit the header row, repeating each column name `count` times.
    def write_header
      csv << fields.each_with_object([]) do |(key, count), result|
        count.times { result << key }
      end
    end

    # Flush and close the underlying file.
    def finalize
      csv.close
    end

    # Write one row, spreading multi-valued cells across the repeated
    # physical columns and padding unused repetitions with nil.
    def row(hash)
      row = []
      fields.each do |column_name, count|
        values = Array.wrap(hash[column_name])
        (values.slice(0, count) || []).each do |value|
          row << value
        end
        (count - values.length).times { row << nil }
      end
      csv << row
    end
  end
end
@@ -0,0 +1,41 @@
1
require 'arc-furnace/node'

module ArcFurnace
  # Builds an in-memory index of a source keyed by a single column. Rows
  # sharing a key overwrite earlier ones (the error handler is notified of
  # the duplicate first).
  class Hash < Node
    attr_reader :key_column
    private_attr_reader :source, :hash

    def initialize(source: , key_column:)
      @source = source
      @key_column = key_column
      @hash = {}
    end

    # Pass a block that accepts two arguments: the join key for each value,
    # and the value itself.
    def each(&block)
      hash.each(&block)
    end

    # Drain the source, indexing each row by its key column. Rows without a
    # key and duplicated keys are reported to the error handler; a duplicate
    # keeps the last row seen.
    def prepare
      until source.empty?
        row = source.row
        key = row[key_column]
        if key
          error_handler.duplicate_primary_key(duplicate_row: row, key: key, node_id: node_id) if hash.include?(key)
          hash[key] = row
        else
          error_handler.missing_primary_key(source_row: row, node_id: node_id)
        end
      end
    end

    # Fetch the row indexed under primary_key, or nil when absent.
    def get(primary_key)
      hash[primary_key]
    end

  end
end
@@ -0,0 +1,27 @@
1
require 'arc-furnace/abstract_join'

module ArcFurnace
  # Perform a join between a hash and a source, only producing rows
  # from the source that match a row from the hash. The resulting row
  # merges the source "into" the hash: values from the source that share
  # keys with the hash row overwrite the hash row's values.
  #
  # Example:
  #   Source row        { id: "foo", key1: "boo", key2: "bar" }
  #   Matching hash row { id: "foo", key1: "bar", key3: "baz" }
  #   Result row:       { id: "foo", key1: "boo", key2: "bar", key3: "baz" }
  class InnerJoin < AbstractJoin

    # Advance to the next source row that joins successfully, skipping rows
    # without a hash match. Leaves #value nil when the source is exhausted.
    def advance
      loop do
        @value = source.row
        break if value.nil? || merge_source_row(value)
      end
    end

  end
end
File without changes
@@ -0,0 +1,41 @@
1
module ArcFurnace
  # Collapses multiple source rows sharing a key into one entry,
  # accumulating repeated column values into arrays. For example:
  #   key, value
  #   1, foo
  #   1, bar
  #   2, baz
  # Results in:
  #   1 => { key => 1, value: [foo, bar] }
  #   2 => { key => 2, value: baz }
  class MergingHash < ::ArcFurnace::Hash
    private_attr_reader :source, :hash

    # Drain the source, merging every row into the entry for its key.
    # Rows lacking a key are silently dropped. On collision a column's
    # values accumulate into an array; the key column itself never does.
    def prepare
      until source.empty?
        row = source.row
        row_key = row[key_column]
        next unless row_key
        row_entry = hash[row_key] ||= {}
        row.each do |column, values|
          existing = row_entry[column]
          if existing && column != key_column
            if existing.is_a?(Array)
              # Already accumulating: append in place.
              existing.concat(Array.wrap(values))
            else
              # First collision: promote the scalar to an array.
              merged = Array.wrap(existing)
              merged.concat(Array.wrap(values))
              row_entry[column] = merged
            end
          elsif !values.nil?
            # First value for this column (or the key column): copy it in.
            row_entry[column] = values.dup
          end
        end
      end
    end

  end
end
@@ -0,0 +1,50 @@
1
require 'arc-furnace/source'
require 'arc-furnace/csv_to_hash_with_duplicate_headers'
require 'csv'

module ArcFurnace
  # Streams rows from a sequence of CSV files as if they were one source,
  # opening each file lazily as the previous one is exhausted.
  class MultiCSVSource < Source
    include CSVToHashWithDuplicateHeaders
    private_attr_reader :csv, :file, :filenames, :encoding
    attr_reader :value

    def initialize(filenames: , encoding: 'UTF-8')
      @encoding = encoding
      # Stored reversed so #pop yields the files in their original order.
      @filenames = filenames.reverse
      open_next_file
    end

    # Is this source empty?
    def empty?
      !value
    end

    # Advance within the current file, rolling over to the next file when
    # the current one runs out of rows.
    def advance
      advance_in_current_file || open_next_file
    end

    private

    # Read the next row of the current file into #value; nil at EOF.
    def advance_in_current_file
      @value =
        begin
          csv_to_hash_with_duplicates(csv.next)
        rescue StopIteration
          nil
        end
      value
    end

    # Close the current file (if any) and begin reading the next one,
    # recursing past any files that turn out to be empty.
    def open_next_file
      file.close if file
      @file = nil
      return nil if filenames.empty?
      @file = File.open(filenames.pop, encoding: encoding)
      @csv = CSV.new(file, encoding: encoding, headers: true).each
      advance_in_current_file || open_next_file
    end
  end
end
@@ -0,0 +1,5 @@
1
module ArcFurnace
  # Base class for every pipeline participant. The pipeline assigns each
  # node its error handler and an identifying node_id during #prepare.
  class Node
    attr_accessor :error_handler, :node_id
  end
end
@@ -0,0 +1,19 @@
1
+ require 'arc-furnace/all_fields_csv_sink'
2
+ require 'arc-furnace/binary_key_merging_hash'
3
+ require 'arc-furnace/block_transform'
4
+ require 'arc-furnace/block_unfold'
5
+ require 'arc-furnace/csv_sink'
6
+ require 'arc-furnace/csv_source'
7
+ require 'arc-furnace/enumerator_source'
8
+ require 'arc-furnace/fixed_column_csv_sink'
9
+ require 'arc-furnace/hash'
10
+ require 'arc-furnace/inner_join'
11
+ require 'arc-furnace/merging_hash'
12
+ require 'arc-furnace/multi_csv_source'
13
+ require 'arc-furnace/node'
14
+ require 'arc-furnace/outer_join'
15
+ require 'arc-furnace/sink'
16
+ require 'arc-furnace/source'
17
+ require 'arc-furnace/suffixed_fixed_column_csv_sink'
18
+ require 'arc-furnace/transform'
19
+ require 'arc-furnace/unfold'
@@ -0,0 +1,14 @@
1
require 'arc-furnace/abstract_join'

module ArcFurnace
  # Join that keeps every source row: when no hash entry matches the join
  # key, the source row passes through unmerged.
  class OuterJoin < AbstractJoin

    # Take the next source row and merge any matching hash entry into it;
    # a nil row means the source is exhausted.
    def advance
      @value = source.row
      merge_source_row(value) unless value.nil?
    end

  end
end
@@ -0,0 +1,158 @@
1
require 'eigenclass'
require 'arc-furnace/nodes'
require 'arc-furnace/error_handler'

module ArcFurnace
  # Declarative description of a transformation graph: sources feed hashes,
  # joins, transforms and unfolds, terminating in a single sink. The class
  # methods record node factories; #instance builds a runnable DSLInstance.
  class Pipeline

    eattr_accessor :sink_node, :sink_source, :intermediates_map
    @intermediates_map = {}

    # Ensure that subclasses don't overwrite the parent's transform
    # node definitions
    def self.inherited(subclass)
      subclass.intermediates_map = intermediates_map.dup
    end

    # Define the sink for this transformation. Only a single sink may be
    # specified per transformation. The sink is delivered a hash per row or
    # entity, and feeds them from the graph of nodes above it.
    def self.sink(type: , source:, params:)
      if sink_node
        raise 'Sink already defined!'
      end

      @sink_node = -> do
        type.new(resolve_parameters(params))
      end
      @sink_source = source
    end

    # Define a hash node, processing all rows from it's source and caching them
    # in-memory.
    def self.hash_node(name, type: ArcFurnace::Hash, params:)
      define_intermediate(name, type: type, params: params)
    end

    # A source that has row semantics, delivering a hash per row (or per entity)
    # for the source.
    def self.source(name, type:, params:)
      raise "Source #{type} is not a Source!" unless type <= Source
      define_intermediate(name, type: type, params: params)
    end

    # Define an inner join node where rows from the source are dropped
    # if an associated entity is not found in the hash for the join key
    def self.inner_join(name, type: ArcFurnace::InnerJoin, params:)
      define_intermediate(name, type: type, params: params)
    end

    # Define an outer join node where rows from the source are kept
    # even if an associated entity is not found in the hash for the join key
    def self.outer_join(name, type: ArcFurnace::OuterJoin, params:)
      define_intermediate(name, type: type, params: params)
    end

    # Define a node that transforms rows. By default you get a BlockTransform
    # (and when this metaprogramming method is passed a block) that will be passed
    # a hash for each row. The result of the block becomes the row for the next
    # downstream node.
    def self.transform(name, type: BlockTransform, params: {}, &block)
      if block
        params[:block] = block
      end
      raise "Transform #{type} is not a Transform!" unless type <= Transform
      define_intermediate(name, type: type, params: params)
    end

    # Define a node that unfolds rows. By default you get a BlockUnfold
    # (and when this metaprogramming method is passed a block) that will be passed
    # a hash for each row. The result of the block becomes the set of rows for the next
    # downstream node.
    def self.unfold(name, type: BlockUnfold, params: {}, &block)
      if block
        params[:block] = block
      end
      raise "Unfold #{type} is not an Unfold!" unless type <= Unfold
      define_intermediate(name, type: type, params: params)
    end

    # Create an instance to run a transformation, passing the parameters to
    # instantiate the transform instance with. The resulting class instance
    # will have a single public method--#execute, which will perform the
    # transformation.
    def self.instance(params = {})
      # Double-splat the hash: DSLInstance#initialize declares keyword
      # arguments (**params), so under Ruby 3 keyword separation passing the
      # hash positionally would raise ArgumentError.
      DSLInstance.new(self, **params)
    end

    private

    # NOTE(review): `private` above does not affect singleton (class)
    # methods, so define_intermediate is effectively public; kept as-is to
    # avoid changing visibility callers may rely on.
    def self.define_intermediate(name, type:, params:)
      intermediates_map[name] = -> do
        type.new(resolve_parameters(params))
      end
    end

    # Runtime counterpart of a Pipeline class: instantiates every declared
    # node, wires in the error handler, and pumps rows from the sink's
    # source into the sink.
    class DSLInstance
      attr_reader :sink_node, :sink_source, :intermediates_map, :params, :dsl_class, :error_handler

      def initialize(dsl_class, error_handler: ErrorHandler.new, **params)
        @dsl_class = dsl_class
        @params = params
        @intermediates_map = {}
        @error_handler = error_handler
      end

      # Build the node graph, prepare every node, then run rows to the sink.
      def execute
        build
        prepare
        run
      end

      private

      # Pump rows from the sink's source until exhausted, then finalize.
      def run
        while (row = sink_source.row)
          sink_node.row(row)
        end
        sink_node.finalize
      end

      # Assign the error handler and node id to every node, then let each
      # node (and finally the sink) prepare itself.
      def prepare
        intermediates_map.each do |node_id, instance|
          instance.error_handler = error_handler
          instance.node_id = node_id
          instance.prepare
        end
        sink_node.prepare
      end

      # Instantiate every declared node factory in the context of this
      # instance so factories can resolve instance-level parameters.
      def build
        dsl_class.intermediates_map.each do |key, instance|
          intermediates_map[key] = instance_exec(&instance) if instance
        end
        @sink_node = instance_exec(&dsl_class.sink_node)
        @sink_source = intermediates_map[dsl_class.sink_source]
      end

      # Resolve a node's params hash: symbol values are looked up as instance
      # params or intermediate nodes, nil values resolve by their own key,
      # and anything else passes through verbatim.
      def resolve_parameters(params_to_resolve)
        params_to_resolve.each_with_object({}) do |(key, value), result|
          result[key] =
            if value.is_a?(Symbol)
              # Allow resolution of intermediates
              resolve_parameter(value)
            elsif value.nil?
              resolve_parameter(key)
            else
              value
            end
        end
      end

      # Look up a single key among instance params, then intermediates.
      def resolve_parameter(key)
        self.params[key] || self.intermediates_map[key] || (raise "Unknown key #{key}!")
      end

    end
  end
end