arc-furnace 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -5
- data/lib/arc-furnace/block_filter.rb +18 -0
- data/lib/arc-furnace/enumerator_source.rb +1 -1
- data/lib/arc-furnace/filter.rb +37 -0
- data/lib/arc-furnace/nodes.rb +2 -0
- data/lib/arc-furnace/pipeline.rb +38 -26
- data/lib/arc-furnace/source.rb +6 -4
- data/lib/arc-furnace/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ab49afa557ed851dd866c6dd1486bde91062890
|
4
|
+
data.tar.gz: f66fa651a0e3ac6df2c3824d5daecfe1244859c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 236b5f55f9914dc37a89579cc543304880af745d49096186e94397e6eca5ea41024437bb16461e5c457786e47fbc3db67fed268a424f72b7c7c057be026d796e
|
7
|
+
data.tar.gz: 6833e0e9e9e84442ce686cdd68e55efdd3c8535ec57b7fe225c10f2811e390e5af02525086ab90cac8ee158533fe619f7984f4f0cb834fda25f2dd57670ae694
|
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# ArcFurnace
|
2
|
+
[][gem]
|
2
3
|
[][travis]
|
3
4
|
|
5
|
+
[gem]: https://rubygems.org/gems/arc-furnace
|
4
6
|
[travis]: http://travis-ci.org/salsify/arc-furnace
|
5
7
|
|
6
8
|
ArcFurnace melts, melds, and transforms your scrap data into perfectly crafted data for ingest into applications,
|
@@ -71,7 +73,7 @@ require a stream of data (`Hash`, `Transform`, `Join`, `Sink`) will have one.
|
|
71
73
|
|
72
74
|
#### Hashes
|
73
75
|
|
74
|
-
A `Hash` provides indexed access to a `Source` but pre-computing the index based on a key. The processing happens during the
|
76
|
+
A `Hash` provides indexed access to a `Source` by pre-computing the index based on a key. The processing happens during the
|
75
77
|
prepare stage of pipeline processing. Hashes have a simple interface, `#get(primary_key)`, to requesting data. Hashes
|
76
78
|
are almost exclusively used as inputs to one side of joins.
|
77
79
|
|
@@ -82,6 +84,12 @@ key is the key that the hash was rolled-up on, however, the `key_column` option
|
|
82
84
|
may override this. Note the default join is an inner join, which will drop source rows if the hash does not contain
|
83
85
|
a matching row.
|
84
86
|
|
87
|
+
#### Filters
|
88
|
+
|
89
|
+
A `Filter` acts as a source, however, takes a source as an input and determines whether to pass each row to
|
90
|
+
the next downstream node by calling the `#filter` method on itself. There is an associated `BlockFilter` and
|
91
|
+
sugar on `Pipeline` to make this easy.
|
92
|
+
|
85
93
|
#### Transforms
|
86
94
|
|
87
95
|
A `Transform` acts as a source, however, takes a source as an input and transforms each input. The `BlockTransform` and
|
@@ -100,7 +108,7 @@ subscribe to the `#row(hash)` interface--each output row is passed to this method
|
|
100
108
|
|
101
109
|
### General pipeline development process
|
102
110
|
|
103
|
-
1. Define a source. Choose an existing `Source` implementation in this library (`CSVSource` or `ExcelSource`),
|
111
|
+
1. Define a source. Choose an existing `Source` implementation in this library (`CSVSource` or `ExcelSource`),
|
104
112
|
extend the `EnumeratorSource`, or implement the `row()` method for a new source.
|
105
113
|
2. Define any transformations, or joins. This may cause you to revisit #1.
|
106
114
|
3. Define the sink. This is generally custom, or, may be one of the provided `CSVSink` types.
|
@@ -114,9 +122,8 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
114
122
|
|
115
123
|
## TODOs
|
116
124
|
|
117
|
-
1. Add
|
118
|
-
2. Add
|
119
|
-
3. Add sugar to define a `BlockTransform` on a `Source` definition in a `Pipeline`.
|
125
|
+
1. Add examples for `ErrorHandler` interface.
|
126
|
+
2. Add sugar to define a `BlockTransform` on a `Source` definition in a `Pipeline`.
|
120
127
|
|
121
128
|
## Contributing
|
122
129
|
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'arc-furnace/filter'
|
2
|
+
|
3
|
+
module ArcFurnace
|
4
|
+
class BlockFilter < Filter
|
5
|
+
private_attr_reader :block
|
6
|
+
|
7
|
+
def initialize(source:, block:)
|
8
|
+
raise 'Must specify a block' if block.nil?
|
9
|
+
@block = block
|
10
|
+
super(source: source)
|
11
|
+
end
|
12
|
+
|
13
|
+
def filter(row)
|
14
|
+
block.call(row)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'arc-furnace/source'
|
2
|
+
|
3
|
+
# Filters limit rows to downstream nodes. They act just like Enumerable#filter:
|
4
|
+
# when the #filter method returns true, the row is passed downstream. when
|
5
|
+
# it returns false, the row is skipped.
|
6
|
+
module ArcFurnace
|
7
|
+
class Filter < Source
|
8
|
+
|
9
|
+
private_attr_reader :source
|
10
|
+
attr_reader :value
|
11
|
+
|
12
|
+
def initialize(source:)
|
13
|
+
@source = source
|
14
|
+
advance
|
15
|
+
end
|
16
|
+
|
17
|
+
# Given a row from the source, tell if it should be passed down to the next
|
18
|
+
# node downstream from this node.
|
19
|
+
#
|
20
|
+
# This method must return a boolean
|
21
|
+
def filter(row)
|
22
|
+
raise "Unimplemented"
|
23
|
+
end
|
24
|
+
|
25
|
+
def empty?
|
26
|
+
value.nil? && source.empty?
|
27
|
+
end
|
28
|
+
|
29
|
+
def advance
|
30
|
+
loop do
|
31
|
+
@value = source.row
|
32
|
+
break if value.nil? || filter(value)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|
data/lib/arc-furnace/nodes.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
require 'arc-furnace/all_fields_csv_sink'
|
2
2
|
require 'arc-furnace/binary_key_merging_hash'
|
3
|
+
require 'arc-furnace/block_filter'
|
3
4
|
require 'arc-furnace/block_transform'
|
4
5
|
require 'arc-furnace/block_unfold'
|
5
6
|
require 'arc-furnace/csv_sink'
|
6
7
|
require 'arc-furnace/csv_source'
|
7
8
|
require 'arc-furnace/enumerator_source'
|
9
|
+
require 'arc-furnace/filter'
|
8
10
|
require 'arc-furnace/fixed_column_csv_sink'
|
9
11
|
require 'arc-furnace/hash'
|
10
12
|
require 'arc-furnace/inner_join'
|
data/lib/arc-furnace/pipeline.rb
CHANGED
@@ -23,58 +23,70 @@ module ArcFurnace
|
|
23
23
|
end
|
24
24
|
|
25
25
|
@sink_node = -> do
|
26
|
-
type.new(resolve_parameters(params))
|
26
|
+
type.new(resolve_parameters(:sink, params))
|
27
27
|
end
|
28
28
|
@sink_source = source
|
29
29
|
end
|
30
30
|
|
31
31
|
# Define a hash node, processing all rows from it's source and caching them
|
32
32
|
# in-memory.
|
33
|
-
def self.hash_node(
|
34
|
-
define_intermediate(
|
33
|
+
def self.hash_node(node_id, type: ArcFurnace::Hash, params:)
|
34
|
+
define_intermediate(node_id, type: type, params: params)
|
35
35
|
end
|
36
36
|
|
37
37
|
# A source that has row semantics, delivering a hash per row (or per entity)
|
38
38
|
# for the source.
|
39
|
-
def self.source(
|
39
|
+
def self.source(node_id, type:, params:)
|
40
40
|
raise "Source #{type} is not a Source!" unless type <= Source
|
41
|
-
define_intermediate(
|
41
|
+
define_intermediate(node_id, type: type, params: params)
|
42
42
|
end
|
43
43
|
|
44
44
|
# Define an inner join node where rows from the source are dropped
|
45
45
|
# if an associated entity is not found in the hash for the join key
|
46
|
-
def self.inner_join(
|
47
|
-
define_intermediate(
|
46
|
+
def self.inner_join(node_id, type: ArcFurnace::InnerJoin, params:)
|
47
|
+
define_intermediate(node_id, type: type, params: params)
|
48
48
|
end
|
49
49
|
|
50
50
|
# Define an outer join node where rows from the source are kept
|
51
51
|
# even if an associated entity is not found in the hash for the join key
|
52
|
-
def self.outer_join(
|
53
|
-
define_intermediate(
|
52
|
+
def self.outer_join(node_id, type: ArcFurnace::OuterJoin, params:)
|
53
|
+
define_intermediate(node_id, type: type, params: params)
|
54
54
|
end
|
55
55
|
|
56
56
|
# Define a node that transforms rows. By default you get a BlockTransform
|
57
57
|
# (and when this metaprogramming method is passed a block) that will be passed
|
58
58
|
# a hash for each row. The result of the block becomes the row for the next
|
59
59
|
# downstream node.
|
60
|
-
def self.transform(
|
61
|
-
if
|
60
|
+
def self.transform(node_id, type: BlockTransform, params: {}, &block)
|
61
|
+
if block_given? && type <= BlockTransform
|
62
62
|
params[:block] = block
|
63
63
|
end
|
64
64
|
raise "Transform #{type} is not a Transform!" unless type <= Transform
|
65
|
-
define_intermediate(
|
65
|
+
define_intermediate(node_id, type: type, params: params)
|
66
66
|
end
|
67
67
|
|
68
|
-
# Define a node that unfolds rows. By default you get a
|
68
|
+
# Define a node that unfolds rows. By default you get a BlockUnfold
|
69
69
|
# (and when this metaprogramming method is passed a block) that will be passed
|
70
70
|
# a hash for each row. The result of the block becomes the set of rows for the next
|
71
71
|
# downstream node.
|
72
|
-
def self.unfold(
|
73
|
-
if
|
72
|
+
def self.unfold(node_id, type: BlockUnfold, params: {}, &block)
|
73
|
+
if block_given? && type <= BlockUnfold
|
74
74
|
params[:block] = block
|
75
75
|
end
|
76
76
|
raise "Unfold #{type} is not an Unfold!" unless type <= Unfold
|
77
|
-
define_intermediate(
|
77
|
+
define_intermediate(node_id, type: type, params: params)
|
78
|
+
end
|
79
|
+
|
80
|
+
# Define a node that filters rows. By default you get a BlockFilter
|
81
|
+
# (and when this metaprogramming method is passed a block) that will be passed
|
82
|
+
# a hash for each row. The result of the block determines if a given row
|
83
|
+
# flows to a downstream node
|
84
|
+
def self.filter(node_id, type: BlockFilter, params: {}, &block)
|
85
|
+
if block_given? && type <= BlockFilter
|
86
|
+
params[:block] = block
|
87
|
+
end
|
88
|
+
raise "Filter #{type} is not a Filter!" unless type <= Filter
|
89
|
+
define_intermediate(node_id, type: type, params: params)
|
78
90
|
end
|
79
91
|
|
80
92
|
# Create an instance to run a transformation, passing the parameters to
|
@@ -82,18 +94,18 @@ module ArcFurnace
|
|
82
94
|
# will have a single public method--#execute, which will perform the
|
83
95
|
# transformation.
|
84
96
|
def self.instance(params = {})
|
85
|
-
|
97
|
+
PipelineInstance.new(self, params)
|
86
98
|
end
|
87
99
|
|
88
100
|
private
|
89
101
|
|
90
|
-
def self.define_intermediate(
|
91
|
-
intermediates_map[
|
92
|
-
type.new(resolve_parameters(params))
|
102
|
+
def self.define_intermediate(node_id, type:, params:)
|
103
|
+
intermediates_map[node_id] = -> do
|
104
|
+
type.new(resolve_parameters(node_id, params))
|
93
105
|
end
|
94
106
|
end
|
95
107
|
|
96
|
-
class
|
108
|
+
class PipelineInstance
|
97
109
|
attr_reader :sink_node, :sink_source, :intermediates_map, :params, :dsl_class, :error_handler
|
98
110
|
|
99
111
|
def initialize(dsl_class, error_handler: ErrorHandler.new, **params)
|
@@ -135,22 +147,22 @@ module ArcFurnace
|
|
135
147
|
@sink_source = intermediates_map[dsl_class.sink_source]
|
136
148
|
end
|
137
149
|
|
138
|
-
def resolve_parameters(params_to_resolve)
|
150
|
+
def resolve_parameters(node_id, params_to_resolve)
|
139
151
|
params_to_resolve.each_with_object({}) do |(key, value), result|
|
140
152
|
result[key] =
|
141
153
|
if value.is_a?(Symbol)
|
142
154
|
# Allow resolution of intermediates
|
143
|
-
resolve_parameter(value)
|
155
|
+
resolve_parameter(node_id, value)
|
144
156
|
elsif value.nil?
|
145
|
-
resolve_parameter(key)
|
157
|
+
resolve_parameter(node_id, key)
|
146
158
|
else
|
147
159
|
value
|
148
160
|
end
|
149
161
|
end
|
150
162
|
end
|
151
163
|
|
152
|
-
def resolve_parameter(key)
|
153
|
-
self.params[key] || self.intermediates_map[key] || (raise "Unknown key #{key}!")
|
164
|
+
def resolve_parameter(node_id, key)
|
165
|
+
self.params[key] || self.intermediates_map[key] || (raise "When processing node #{node_id}: Unknown key #{key}!")
|
154
166
|
end
|
155
167
|
|
156
168
|
end
|
data/lib/arc-furnace/source.rb
CHANGED
@@ -4,6 +4,8 @@ module ArcFurnace
|
|
4
4
|
class Source < Node
|
5
5
|
extend Forwardable
|
6
6
|
|
7
|
+
# Called to prepare anything this source needs to do before providing rows.
|
8
|
+
# For instance, opening a source file or database connection.
|
7
9
|
def prepare
|
8
10
|
|
9
11
|
end
|
@@ -17,23 +19,23 @@ module ArcFurnace
|
|
17
19
|
|
18
20
|
# Is this source empty?
|
19
21
|
def empty?
|
20
|
-
|
22
|
+
raise 'Unimplemented'
|
21
23
|
end
|
22
24
|
|
23
25
|
# The current value this source points at
|
24
26
|
# This is generally the only method required to implement a source.
|
25
27
|
def value
|
26
|
-
|
28
|
+
raise 'Unimplemented'
|
27
29
|
end
|
28
30
|
|
29
|
-
# Close the source
|
31
|
+
# Close the source. Called by the framework at the end of processing.
|
30
32
|
def close
|
31
33
|
|
32
34
|
end
|
33
35
|
|
34
36
|
# Advance this source by one. #advance specifies no return value contract
|
35
37
|
def advance
|
36
|
-
|
38
|
+
raise 'Unimplemented'
|
37
39
|
end
|
38
40
|
|
39
41
|
end
|
data/lib/arc-furnace/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arc-furnace
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Daniel Spangenberger
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-10-
|
12
|
+
date: 2015-10-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: msgpack
|
@@ -129,6 +129,7 @@ files:
|
|
129
129
|
- lib/arc-furnace/abstract_join.rb
|
130
130
|
- lib/arc-furnace/all_fields_csv_sink.rb
|
131
131
|
- lib/arc-furnace/binary_key_merging_hash.rb
|
132
|
+
- lib/arc-furnace/block_filter.rb
|
132
133
|
- lib/arc-furnace/block_transform.rb
|
133
134
|
- lib/arc-furnace/block_unfold.rb
|
134
135
|
- lib/arc-furnace/csv_sink.rb
|
@@ -138,6 +139,7 @@ files:
|
|
138
139
|
- lib/arc-furnace/enumerator_source.rb
|
139
140
|
- lib/arc-furnace/error_handler.rb
|
140
141
|
- lib/arc-furnace/excel_source.rb
|
142
|
+
- lib/arc-furnace/filter.rb
|
141
143
|
- lib/arc-furnace/fixed_column_csv_sink.rb
|
142
144
|
- lib/arc-furnace/hash.rb
|
143
145
|
- lib/arc-furnace/inner_join.rb
|