arc-furnace 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +127 -0
- data/Rakefile +9 -0
- data/arc-furnace.gemspec +30 -0
- data/lib/arc-furnace.rb +12 -0
- data/lib/arc-furnace/abstract_join.rb +53 -0
- data/lib/arc-furnace/all_fields_csv_sink.rb +68 -0
- data/lib/arc-furnace/binary_key_merging_hash.rb +38 -0
- data/lib/arc-furnace/block_transform.rb +18 -0
- data/lib/arc-furnace/block_unfold.rb +18 -0
- data/lib/arc-furnace/csv_sink.rb +21 -0
- data/lib/arc-furnace/csv_source.rb +33 -0
- data/lib/arc-furnace/csv_to_hash_with_duplicate_headers.rb +19 -0
- data/lib/arc-furnace/dsl.rb +6 -0
- data/lib/arc-furnace/enumerator_source.rb +36 -0
- data/lib/arc-furnace/error_handler.rb +27 -0
- data/lib/arc-furnace/excel_source.rb +38 -0
- data/lib/arc-furnace/fixed_column_csv_sink.rb +37 -0
- data/lib/arc-furnace/hash.rb +41 -0
- data/lib/arc-furnace/inner_join.rb +27 -0
- data/lib/arc-furnace/logging_error_handler.rb +0 -0
- data/lib/arc-furnace/merging_hash.rb +41 -0
- data/lib/arc-furnace/multi_csv_source.rb +50 -0
- data/lib/arc-furnace/node.rb +5 -0
- data/lib/arc-furnace/nodes.rb +19 -0
- data/lib/arc-furnace/outer_join.rb +14 -0
- data/lib/arc-furnace/pipeline.rb +158 -0
- data/lib/arc-furnace/private_attr.rb +28 -0
- data/lib/arc-furnace/sink.rb +21 -0
- data/lib/arc-furnace/source.rb +40 -0
- data/lib/arc-furnace/suffixed_fixed_column_csv_sink.rb +18 -0
- data/lib/arc-furnace/transform.rb +28 -0
- data/lib/arc-furnace/unfold.rb +45 -0
- data/lib/arc-furnace/version.rb +3 -0
- metadata +182 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 38e438f57e6350ddce2937cf2ef03d6d93ea20a8
|
4
|
+
data.tar.gz: 17ee74aba7edef53647c2a2e4efe6b4d3bda477f
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ee8d88b55edd486dbd496d2831096ed53eaad124e6043629747b9503bd6473267c1f18086edae0425152b08638fc7ed4961cfecc79b72a92d709768c49316b25
|
7
|
+
data.tar.gz: 39114078eab439a660c83479b9251b0e9f1690aa8e0110b51a4dc061aee5f7d64421189c69f0705349d95339b14e7e5b118124b1822a7c3e747446313c585ec7
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Daniel Spangenberger
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
# ArcFurnace
|
2
|
+
[][travis]
|
3
|
+
|
4
|
+
[travis]: http://travis-ci.org/salsify/arc-furnace
|
5
|
+
|
6
|
+
ArcFurnace melts, melds, and transforms your scrap data into perfectly crafted data for ingest into applications,
|
7
|
+
analysis, or whatnot. ArcFurnace simplifies simple ETL (Extract, Transform, Load) tasks for small to medium sets of data
|
8
|
+
using a programmatic DSL interface. Here's an example:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
class Transform < ArcFurnace::Pipeline
|
12
|
+
|
13
|
+
source :marketing_info_csv, type: ArcFurnace::CSVSource, params: { filename: :marketing_filename }
|
14
|
+
|
15
|
+
transform :marketing_info_source, params: { source: :marketing_info_csv } do |row|
|
16
|
+
row.delete('Name')
|
17
|
+
row
|
18
|
+
end
|
19
|
+
|
20
|
+
source :product_attributes,
|
21
|
+
type: ArcFurnace::MultiCSVSource,
|
22
|
+
params: { filenames: :product_attribute_filenames }
|
23
|
+
|
24
|
+
hash_node :marketing_info,
|
25
|
+
params: {
|
26
|
+
key_column: :primary_key,
|
27
|
+
source: :marketing_info_source
|
28
|
+
}
|
29
|
+
|
30
|
+
outer_join :join_results,
|
31
|
+
params: {
|
32
|
+
source: :product_attributes,
|
33
|
+
hash: :marketing_info
|
34
|
+
}
|
35
|
+
|
36
|
+
sink type: ArcFurnace::AllFieldsCSVSink,
|
37
|
+
source: :join_results,
|
38
|
+
params: { filename: :destination_name }
|
39
|
+
|
40
|
+
end
|
41
|
+
```
|
42
|
+
|
43
|
+
## Installation
|
44
|
+
|
45
|
+
Add this line to your application's Gemfile:
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
gem 'arc-furnace', github: 'salsify/arc-furnace'
|
49
|
+
```
|
50
|
+
|
51
|
+
And then execute:
|
52
|
+
|
53
|
+
$ bundle
|
54
|
+
|
55
|
+
## Usage
|
56
|
+
|
57
|
+
ArcFurnace provides a few concepts useful to extracting and transforming data.
|
58
|
+
|
59
|
+
### Node Types Available
|
60
|
+
|
61
|
+
#### Pipelines
|
62
|
+
|
63
|
+
Pipelines define a complete transformation and define a directed, acyclic graph of
|
64
|
+
operations that define how data is transformed. Each type of node in a `Pipeline` is defined below, but
|
65
|
+
a Pipeline defines the network of nodes that transform data.
|
66
|
+
|
67
|
+
#### Sources
|
68
|
+
|
69
|
+
A `Source` provides values to a `Pipeline`. A `Pipeline` may have many sources. Essentially, any nodes that
|
70
|
+
require a stream of data (`Hash`, `Transform`, `Join`, `Sink`) will have one.
|
71
|
+
|
72
|
+
#### Hashes
|
73
|
+
|
74
|
+
A `Hash` provides indexed access to a `Source` by pre-computing the index based on a key. The processing happens during the
|
75
|
+
prepare stage of pipeline processing. Hashes have a simple interface, `#get(primary_key)`, to requesting data. Hashes
|
76
|
+
are almost exclusively used as inputs to one side of joins.
|
77
|
+
|
78
|
+
#### Joins
|
79
|
+
|
80
|
+
An `InnerJoin` or an `OuterJoin` joins two sources of data (one must be a `Hash`) based upon a key. By default the join
|
81
|
+
key is the key that the hash was rolled-up on, however, the `key_column` option on both `InnerJoin` and `OuterJoin`
|
82
|
+
may override this. Note the default join is an inner join, which will drop source rows if the hash does not contain
|
83
|
+
a matching row.
|
84
|
+
|
85
|
+
#### Transforms
|
86
|
+
|
87
|
+
A `Transform` acts as a source, however, takes a source as an input and transforms each input. The `BlockTransform` and
|
88
|
+
associated sugar in the `transform` method of `Pipeline` make this very easy (see the example above).
|
89
|
+
|
90
|
+
#### Unfolds
|
91
|
+
|
92
|
+
An `Unfold` acts as a source, however, takes a source as an input and produces multiple rows for that source as an output.
|
93
|
+
A common case for this is splitting rows into multiple rows depending upon their keys. The `BlockUnfold` and associated
|
94
|
+
sugar in the `unfold` method of `Pipeline` make this fairly easy (see `pipeline_spec.rb`).
|
95
|
+
|
96
|
+
#### Sinks
|
97
|
+
|
98
|
+
Each `Pipeline` has a single sink. Pipelines must produce data somewhere, and that data goes to a sink. Sinks
|
99
|
+
subscribe to the `#row(hash)` interface--each output row is passed to this method for handling.
|
100
|
+
|
101
|
+
### General pipeline development process
|
102
|
+
|
103
|
+
1. Define a source. Choose an existing `Source` implementation in this library (`CSVSource` or `ExcelSource`),
|
104
|
+
extend the `EnumeratorSource`, or implement the `row()` method for a new source.
|
105
|
+
2. Define any transformations, or joins. This may cause you to revisit #1.
|
106
|
+
3. Define the sink. This is generally custom, or, may be one of the provided `CSVSink` types.
|
107
|
+
4. Roll it together in a `Pipeline`.
|
108
|
+
|
109
|
+
## Development
|
110
|
+
|
111
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
112
|
+
|
113
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
114
|
+
|
115
|
+
## TODOs
|
116
|
+
|
117
|
+
1. Add a `filter` node and implementation to `Pipeline`
|
118
|
+
2. Add examples for `ErrorHandler` interface.
|
119
|
+
3. Add sugar to define a `BlockTransform` on a `Source` definition in a `Pipeline`.
|
120
|
+
|
121
|
+
## Contributing
|
122
|
+
|
123
|
+
1. Fork it ( https://github.com/[my-github-username]/arc-furnace/fork )
|
124
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
125
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
126
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
127
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/arc-furnace.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'arc-furnace/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "arc-furnace"
|
8
|
+
spec.version = ArcFurnace::VERSION
|
9
|
+
spec.authors = ["Daniel Spangenberger", "Brian Tenggren"]
|
10
|
+
spec.email = ["dan@salsify.com"]
|
11
|
+
|
12
|
+
spec.summary = %q{Melds and transforms data from multiple sources into a single stream}
|
13
|
+
spec.description = %q{An ETL library for Ruby that performs the basic actions of ETL: extract, transform, and load. Easily extensible.}
|
14
|
+
spec.homepage = "http://github.com/salsify/arc-furnace"
|
15
|
+
spec.license = "MIT"
|
16
|
+
|
17
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
+
spec.bindir = "exe"
|
19
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
+
spec.require_paths = ["lib"]
|
21
|
+
|
22
|
+
spec.add_dependency 'msgpack', '~> 0.6'
|
23
|
+
spec.add_dependency 'activesupport', '>= 3.2'
|
24
|
+
spec.add_dependency 'eigenclass', '~> 2'
|
25
|
+
spec.add_dependency 'roo', '>= 2.1'
|
26
|
+
|
27
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
28
|
+
spec.add_development_dependency 'rspec', '~> 3'
|
29
|
+
spec.add_development_dependency 'ice_nine', '>= 0.11'
|
30
|
+
end
|
data/lib/arc-furnace.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'arc-furnace/version'
|
2
|
+
|
3
|
+
require 'active_support'
|
4
|
+
require 'active_support/core_ext'
|
5
|
+
|
6
|
+
require 'arc-furnace/csv_to_hash_with_duplicate_headers'
|
7
|
+
require 'arc-furnace/private_attr'
|
8
|
+
|
9
|
+
require 'arc-furnace/dsl'
|
10
|
+
require 'arc-furnace/error_handler'
|
11
|
+
require 'arc-furnace/nodes'
|
12
|
+
require 'arc-furnace/pipeline'
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'arc-furnace/source'
|
2
|
+
|
3
|
+
module ArcFurnace
|
4
|
+
class AbstractJoin < Source
|
5
|
+
private_attr_reader :hash, :source, :key_column
|
6
|
+
attr_reader :value
|
7
|
+
|
8
|
+
# The source is a source, the hash is a hash, and one can optionally
|
9
|
+
# pass the key column to get the primary key for each source entity, the
|
10
|
+
# default is equijoin semantics--the key of the hash is used.
|
11
|
+
def initialize(source: , hash:, key_column: nil)
|
12
|
+
if source.is_a?(::ArcFurnace::Source) && hash.is_a?(::ArcFurnace::Hash)
|
13
|
+
@hash = hash
|
14
|
+
@source = source
|
15
|
+
@key_column = key_column || hash.key_column
|
16
|
+
else
|
17
|
+
raise 'Must be passed one Hash and one Source!'
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def prepare
|
22
|
+
advance
|
23
|
+
end
|
24
|
+
|
25
|
+
def advance
|
26
|
+
raise "Unimplemented!"
|
27
|
+
end
|
28
|
+
|
29
|
+
delegate empty?: :source
|
30
|
+
|
31
|
+
protected
|
32
|
+
|
33
|
+
def merge_source_row(source_row)
|
34
|
+
key = source_row[key_column]
|
35
|
+
if key
|
36
|
+
if hash_value = hash.get(key)
|
37
|
+
hash_value = hash_value.deep_dup
|
38
|
+
source_row.each do |key, value|
|
39
|
+
hash_value[key] = value
|
40
|
+
end
|
41
|
+
@value = hash_value
|
42
|
+
true
|
43
|
+
else
|
44
|
+
error_handler.missing_hash_key(source_row: source_row, key: key, node_id: node_id)
|
45
|
+
false
|
46
|
+
end
|
47
|
+
else
|
48
|
+
error_handler.missing_join_key(source_row: source_row, node_id: node_id)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'arc-furnace/sink'
|
2
|
+
require 'msgpack'
|
3
|
+
|
4
|
+
module ArcFurnace
|
5
|
+
class AllFieldsCSVSink < Sink
|
6
|
+
private_attr_reader :csv, :fields, :tmp_file, :packer, :fields, :field_mappings
|
7
|
+
|
8
|
+
def initialize(filename: , encoding: 'UTF-8')
|
9
|
+
@tmp_file = Tempfile.new('intermediate_results', encoding: 'binary')
|
10
|
+
@packer = MessagePack::Packer.new(tmp_file)
|
11
|
+
@csv = CSV.open(filename, 'wb', encoding: encoding, headers: true)
|
12
|
+
@fields = {}
|
13
|
+
end
|
14
|
+
|
15
|
+
def prepare(fields = nil)
|
16
|
+
|
17
|
+
end
|
18
|
+
|
19
|
+
def finalize
|
20
|
+
packer.flush
|
21
|
+
tmp_file.rewind
|
22
|
+
|
23
|
+
write_header_row!
|
24
|
+
|
25
|
+
unpacker = MessagePack::Unpacker.new(tmp_file)
|
26
|
+
unpacker.each do |hash|
|
27
|
+
write_row(hash)
|
28
|
+
end
|
29
|
+
|
30
|
+
csv.close
|
31
|
+
end
|
32
|
+
|
33
|
+
def row(hash)
|
34
|
+
update_field_counts(hash)
|
35
|
+
packer.write(hash)
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def write_header_row!
|
41
|
+
header_row = []
|
42
|
+
fields.each do |key, count|
|
43
|
+
count.times { header_row << key }
|
44
|
+
end
|
45
|
+
csv << header_row
|
46
|
+
end
|
47
|
+
|
48
|
+
def write_row(hash)
|
49
|
+
row = []
|
50
|
+
fields.each do |key, count|
|
51
|
+
values = Array.wrap(hash[key])
|
52
|
+
(values.slice(0, count) || []).each do |value|
|
53
|
+
row << value
|
54
|
+
end
|
55
|
+
(count - values.length).times { row << nil }
|
56
|
+
end
|
57
|
+
csv << row
|
58
|
+
end
|
59
|
+
|
60
|
+
def update_field_counts(hash)
|
61
|
+
hash.each do |key, values|
|
62
|
+
value_count = Array.wrap(values).size
|
63
|
+
existing_value_count = fields[key] || 0
|
64
|
+
fields[key] = value_count if value_count > existing_value_count
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'arc-furnace/hash'
|
2
|
+
|
3
|
+
module ArcFurnace
|
4
|
+
# This allows one to merge multiple rows into one such as:
|
5
|
+
# key, attribute, value
|
6
|
+
# 1, value1, foo
|
7
|
+
# 1, value1, bar
|
8
|
+
# 1, value2, baz
|
9
|
+
# Results in:
|
10
|
+
# 1 => { value1 => [foo, bar], value2 => baz }
|
11
|
+
class BinaryKeyMergingHash < ::ArcFurnace::Hash
|
12
|
+
private_attr_reader :source, :hash, :secondary_key, :value_key
|
13
|
+
|
14
|
+
def initialize(source: , primary_key:, secondary_key:, value_key:)
|
15
|
+
super(source: source, key_column: primary_key)
|
16
|
+
@secondary_key = secondary_key
|
17
|
+
@value_key = value_key
|
18
|
+
end
|
19
|
+
|
20
|
+
def prepare
|
21
|
+
loop do
|
22
|
+
break if source.empty?
|
23
|
+
row = source.row
|
24
|
+
row_key = row[key_column]
|
25
|
+
second_key = row[secondary_key]
|
26
|
+
value = row[value_key]
|
27
|
+
if row_key && second_key && value
|
28
|
+
row_entry = hash[row_key] ||= {}
|
29
|
+
value_arr = row_entry[second_key] ||= []
|
30
|
+
value_arr.concat(Array.wrap(value))
|
31
|
+
else
|
32
|
+
error_handler.missing_primary_key(source_row: row, node_id: node_id)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'arc-furnace/transform'
|
2
|
+
|
3
|
+
module ArcFurnace
|
4
|
+
class BlockTransform < Transform
|
5
|
+
private_attr_reader :block
|
6
|
+
|
7
|
+
def initialize(source:, block:)
|
8
|
+
raise 'Must specify a block' if block.nil?
|
9
|
+
@block = block
|
10
|
+
super(source: source)
|
11
|
+
end
|
12
|
+
|
13
|
+
def transform(row)
|
14
|
+
block.call(row)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'arc-furnace/unfold'
|
2
|
+
|
3
|
+
module ArcFurnace
|
4
|
+
class BlockUnfold < Unfold
|
5
|
+
private_attr_reader :block
|
6
|
+
|
7
|
+
def initialize(source:, block:)
|
8
|
+
raise 'Must specify a block' if block.nil?
|
9
|
+
@block = block
|
10
|
+
super(source: source)
|
11
|
+
end
|
12
|
+
|
13
|
+
def unfold(row)
|
14
|
+
block.call(row)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
end
|