arc-furnace 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +6 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +127 -0
- data/Rakefile +9 -0
- data/arc-furnace.gemspec +30 -0
- data/lib/arc-furnace.rb +12 -0
- data/lib/arc-furnace/abstract_join.rb +53 -0
- data/lib/arc-furnace/all_fields_csv_sink.rb +68 -0
- data/lib/arc-furnace/binary_key_merging_hash.rb +38 -0
- data/lib/arc-furnace/block_transform.rb +18 -0
- data/lib/arc-furnace/block_unfold.rb +18 -0
- data/lib/arc-furnace/csv_sink.rb +21 -0
- data/lib/arc-furnace/csv_source.rb +33 -0
- data/lib/arc-furnace/csv_to_hash_with_duplicate_headers.rb +19 -0
- data/lib/arc-furnace/dsl.rb +6 -0
- data/lib/arc-furnace/enumerator_source.rb +36 -0
- data/lib/arc-furnace/error_handler.rb +27 -0
- data/lib/arc-furnace/excel_source.rb +38 -0
- data/lib/arc-furnace/fixed_column_csv_sink.rb +37 -0
- data/lib/arc-furnace/hash.rb +41 -0
- data/lib/arc-furnace/inner_join.rb +27 -0
- data/lib/arc-furnace/logging_error_handler.rb +0 -0
- data/lib/arc-furnace/merging_hash.rb +41 -0
- data/lib/arc-furnace/multi_csv_source.rb +50 -0
- data/lib/arc-furnace/node.rb +5 -0
- data/lib/arc-furnace/nodes.rb +19 -0
- data/lib/arc-furnace/outer_join.rb +14 -0
- data/lib/arc-furnace/pipeline.rb +158 -0
- data/lib/arc-furnace/private_attr.rb +28 -0
- data/lib/arc-furnace/sink.rb +21 -0
- data/lib/arc-furnace/source.rb +40 -0
- data/lib/arc-furnace/suffixed_fixed_column_csv_sink.rb +18 -0
- data/lib/arc-furnace/transform.rb +28 -0
- data/lib/arc-furnace/unfold.rb +45 -0
- data/lib/arc-furnace/version.rb +3 -0
- metadata +182 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 38e438f57e6350ddce2937cf2ef03d6d93ea20a8
+  data.tar.gz: 17ee74aba7edef53647c2a2e4efe6b4d3bda477f
+SHA512:
+  metadata.gz: ee8d88b55edd486dbd496d2831096ed53eaad124e6043629747b9503bd6473267c1f18086edae0425152b08638fc7ed4961cfecc79b72a92d709768c49316b25
+  data.tar.gz: 39114078eab439a660c83479b9251b0e9f1690aa8e0110b51a4dc061aee5f7d64421189c69f0705349d95339b14e7e5b118124b1822a7c3e747446313c585ec7
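To check these digests against a downloaded copy, a minimal Ruby sketch (assuming the `.gem` archive has been untarred so `metadata.gz` and `data.tar.gz` sit in the current directory):

```ruby
require 'digest'

# A .gem file is a tar archive containing metadata.gz and data.tar.gz;
# compare the published digests above with locally computed ones.
puts Digest::SHA1.file('metadata.gz').hexdigest     # expect 38e438f5...
puts Digest::SHA512.file('data.tar.gz').hexdigest   # expect 39114078...
```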
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Daniel Spangenberger
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,127 @@
+# ArcFurnace
+[![Build Status](https://travis-ci.org/salsify/arc-furnace.svg?branch=master)][travis]
+
+[travis]: http://travis-ci.org/salsify/arc-furnace
+
+ArcFurnace melts, melds, and transforms your scrap data into perfectly crafted data for ingest into applications,
+analysis, or whatnot. ArcFurnace simplifies ETL (Extract, Transform, Load) tasks for small to medium sets of data
+using a programmatic DSL interface. Here's an example:
+
+```ruby
+class Transform < ArcFurnace::Pipeline
+
+  source :marketing_info_csv, type: ArcFurnace::CSVSource, params: { filename: :marketing_filename }
+
+  transform :marketing_info_source, params: { source: :marketing_info_csv } do |row|
+    row.delete('Name')
+    row
+  end
+
+  source :product_attributes,
+         type: ArcFurnace::MultiCSVSource,
+         params: { filenames: :product_attribute_filenames }
+
+  hash_node :marketing_info,
+            params: {
+              key_column: :primary_key,
+              source: :marketing_info_source
+            }
+
+  outer_join :join_results,
+             params: {
+               source: :product_attributes,
+               hash: :marketing_info
+             }
+
+  sink type: ArcFurnace::AllFieldsCSVSink,
+       source: :join_results,
+       params: { filename: :destination_name }
+
+end
+```
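To make the data flow concrete, here is a sketch with hypothetical rows (not shipped with the gem) showing what the `outer_join` above produces for one product:

```ruby
# One row from each source, keyed on 'primary_key':
marketing_row = { 'primary_key' => 'sku-1', 'Description' => 'Blue widget' }  # marketing_info_csv
attribute_row = { 'primary_key' => 'sku-1', 'Color' => 'blue' }               # product_attributes

# The hash_node indexes marketing rows by key; the join then overlays each
# product_attributes row onto its matching marketing row, yielding:
# { 'primary_key' => 'sku-1', 'Description' => 'Blue widget', 'Color' => 'blue' }
```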
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'arc-furnace', github: 'salsify/arc-furnace'
+```
+
+And then execute:
+
+    $ bundle
+
+## Usage
+
+ArcFurnace provides a few concepts useful for extracting and transforming data.
+
+### Node Types Available
+
+#### Pipelines
+
+A `Pipeline` defines a complete transformation as a directed, acyclic graph of
+operations describing how data is transformed. Each type of node in a `Pipeline` is defined below;
+the `Pipeline` itself defines the network of nodes that transform data.
+
+#### Sources
+
+A `Source` provides values to a `Pipeline`. A `Pipeline` may have many sources. Essentially, any nodes that
+require a stream of data (`Hash`, `Transform`, `Join`, `Sink`) will have one.
+
+#### Hashes
+
+A `Hash` provides indexed access to a `Source` by pre-computing the index based on a key. The processing happens during the
+prepare stage of pipeline processing. Hashes have a simple interface, `#get(primary_key)`, for requesting data. Hashes
+are almost exclusively used as inputs to one side of joins.
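For example, a sketch (hypothetical data) of the `#get` interface against the `marketing_info` hash from the example above:

```ruby
# After the prepare stage has built the index on 'primary_key':
marketing_info.get('sku-1')    # => { 'primary_key' => 'sku-1', 'Description' => 'Blue widget' }
marketing_info.get('unknown')  # => nil; no row was indexed under that key
```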
+
+#### Joins
+
+An `InnerJoin` or an `OuterJoin` joins two sources of data (one must be a `Hash`) based upon a key. By default the join
+key is the key that the hash was rolled up on; however, the `key_column` option on both `InnerJoin` and `OuterJoin`
+may override this. Note the default join is an inner join, which will drop source rows if the hash does not contain
+a matching row.
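A sketch of the difference, with hypothetical rows and a hash indexed on `'id'` containing only `{ 'id' => '1', 'color' => 'blue' }`:

```ruby
# Source row with a matching hash entry: both joins merge hash and source columns.
# { 'id' => '1', 'size' => 'S' }  =>  { 'id' => '1', 'color' => 'blue', 'size' => 'S' }
#
# Source row with no matching hash entry:
# { 'id' => '2', 'size' => 'M' }  =>  dropped by InnerJoin;
#                                     passed through unmerged by OuterJoin.
```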
+
+#### Transforms
+
+A `Transform` acts as a source, but takes another source as input and transforms each row of that input. The `BlockTransform` and
+associated sugar in the `transform` method of `Pipeline` make this very easy (see the example above).
+
+#### Unfolds
+
+An `Unfold` acts as a source, but takes another source as input and produces multiple rows for each input row.
+A common case for this is splitting rows into multiple rows depending upon their keys. The `BlockUnfold` and associated
+sugar in the `unfold` method of `Pipeline` make this fairly easy (see `pipeline_spec.rb`).
+
+#### Sinks
+
+Each `Pipeline` has a single sink. Pipelines must produce data somewhere, and that data goes to a sink. Sinks
+subscribe to the `#row(hash)` interface: each output row is passed to this method for handling.
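As a sketch of that interface, a minimal in-memory sink (hypothetical; it assumes subclassing `ArcFurnace::Sink` and overriding `#row` is enough, mirroring what `AllFieldsCSVSink` in this gem does):

```ruby
# Collects every output row in memory; handy for tests or small runs.
class ArraySink < ArcFurnace::Sink
  attr_reader :rows

  def initialize
    @rows = []
  end

  # The pipeline hands each output row to this method.
  def row(hash)
    rows << hash
  end
end
```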
+
+### General pipeline development process
+
+1. Define a source. Choose an existing `Source` implementation in this library (`CSVSource` or `ExcelSource`),
+extend the `EnumeratorSource`, or implement the `row()` method for a new source (see the sketch after this list).
+2. Define any transformations, or joins. This may cause you to revisit #1.
+3. Define the sink. This is generally custom, or may be one of the provided `CSVSink` types.
+4. Roll it together in a `Pipeline`.
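As referenced in step 1, a sketch of a hand-rolled source (hypothetical), built on the two methods the nodes in this gem call on their sources: `#row` to fetch the next row and `#empty?` to signal exhaustion:

```ruby
# Serves a fixed array of row hashes; each #row call consumes one.
class StaticSource < ArcFurnace::Source
  def initialize(rows:)
    @rows = rows.dup
  end

  def empty?
    @rows.empty?
  end

  def row
    @rows.shift
  end
end
```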
+
+## Development
+
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
+
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+## TODOs
+
+1. Add a `filter` node and implementation to `Pipeline`.
+2. Add examples for the `ErrorHandler` interface.
+3. Add sugar to define a `BlockTransform` on a `Source` definition in a `Pipeline`.
+
+## Contributing
+
+1. Fork it ( https://github.com/[my-github-username]/arc-furnace/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request
data/Rakefile
ADDED
data/arc-furnace.gemspec
ADDED
@@ -0,0 +1,30 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'arc-furnace/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "arc-furnace"
+  spec.version       = ArcFurnace::VERSION
+  spec.authors       = ["Daniel Spangenberger", "Brian Tenggren"]
+  spec.email         = ["dan@salsify.com"]
+
+  spec.summary       = %q{Melds and transforms data from multiple sources into a single stream}
+  spec.description   = %q{An ETL library for Ruby that performs the basic actions of ETL: extract, transform, and load. Easily extensible.}
+  spec.homepage      = "http://github.com/salsify/arc-furnace"
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency 'msgpack', '~> 0.6'
+  spec.add_dependency 'activesupport', '>= 3.2'
+  spec.add_dependency 'eigenclass', '~> 2'
+  spec.add_dependency 'roo', '>= 2.1'
+
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rspec', '~> 3'
+  spec.add_development_dependency 'ice_nine', '>= 0.11'
+end
data/lib/arc-furnace.rb
ADDED
@@ -0,0 +1,12 @@
+require 'arc-furnace/version'
+
+require 'active_support'
+require 'active_support/core_ext'
+
+require 'arc-furnace/csv_to_hash_with_duplicate_headers'
+require 'arc-furnace/private_attr'
+
+require 'arc-furnace/dsl'
+require 'arc-furnace/error_handler'
+require 'arc-furnace/nodes'
+require 'arc-furnace/pipeline'
data/lib/arc-furnace/abstract_join.rb
ADDED
@@ -0,0 +1,53 @@
+require 'arc-furnace/source'
+
+module ArcFurnace
+  class AbstractJoin < Source
+    private_attr_reader :hash, :source, :key_column
+    attr_reader :value
+
+    # The source is a source, the hash is a hash, and one can optionally
+    # pass the key column to get the primary key for each source entity, the
+    # default is equijoin semantics--the key of the hash is used.
+    def initialize(source: , hash:, key_column: nil)
+      if source.is_a?(::ArcFurnace::Source) && hash.is_a?(::ArcFurnace::Hash)
+        @hash = hash
+        @source = source
+        @key_column = key_column || hash.key_column
+      else
+        raise 'Must be passed one Hash and one Source!'
+      end
+    end
+
+    def prepare
+      advance
+    end
+
+    def advance
+      raise "Unimplemented!"
+    end
+
+    delegate empty?: :source
+
+    protected
+
+    def merge_source_row(source_row)
+      key = source_row[key_column]
+      if key
+        if hash_value = hash.get(key)
+          hash_value = hash_value.deep_dup
+          source_row.each do |key, value|
+            hash_value[key] = value
+          end
+          @value = hash_value
+          true
+        else
+          error_handler.missing_hash_key(source_row: source_row, key: key, node_id: node_id)
+          false
+        end
+      else
+        error_handler.missing_join_key(source_row: source_row, node_id: node_id)
+      end
+    end
+
+  end
+end
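A quick illustration (hypothetical data) of `merge_source_row` above: the matching hash row is deep-duplicated, then the source row's columns are overlaid, so source values win where both sides share a column:

```ruby
# hash.get('k1')    => { 'id' => 'k1', 'color' => 'blue', 'size' => 'S' }
# source_row        => { 'id' => 'k1', 'size' => 'M' }
# resulting #value  => { 'id' => 'k1', 'color' => 'blue', 'size' => 'M' }
```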
data/lib/arc-furnace/all_fields_csv_sink.rb
ADDED
@@ -0,0 +1,68 @@
+require 'arc-furnace/sink'
+require 'msgpack'
+
+module ArcFurnace
+  class AllFieldsCSVSink < Sink
+    private_attr_reader :csv, :fields, :tmp_file, :packer, :fields, :field_mappings
+
+    def initialize(filename: , encoding: 'UTF-8')
+      @tmp_file = Tempfile.new('intermediate_results', encoding: 'binary')
+      @packer = MessagePack::Packer.new(tmp_file)
+      @csv = CSV.open(filename, 'wb', encoding: encoding, headers: true)
+      @fields = {}
+    end
+
+    def prepare(fields = nil)
+
+    end
+
+    def finalize
+      packer.flush
+      tmp_file.rewind
+
+      write_header_row!
+
+      unpacker = MessagePack::Unpacker.new(tmp_file)
+      unpacker.each do |hash|
+        write_row(hash)
+      end
+
+      csv.close
+    end
+
+    def row(hash)
+      update_field_counts(hash)
+      packer.write(hash)
+    end
+
+    private
+
+    def write_header_row!
+      header_row = []
+      fields.each do |key, count|
+        count.times { header_row << key }
+      end
+      csv << header_row
+    end
+
+    def write_row(hash)
+      row = []
+      fields.each do |key, count|
+        values = Array.wrap(hash[key])
+        (values.slice(0, count) || []).each do |value|
+          row << value
+        end
+        (count - values.length).times { row << nil }
+      end
+      csv << row
+    end
+
+    def update_field_counts(hash)
+      hash.each do |key, values|
+        value_count = Array.wrap(values).size
+        existing_value_count = fields[key] || 0
+        fields[key] = value_count if value_count > existing_value_count
+      end
+    end
+  end
+end
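The design choice worth noting here: the sink cannot write a CSV header until it has seen every row, since any row may widen a field, so `row` buffers each hash to a MessagePack temp file while tracking the maximum value count per field, and `finalize` replays the buffer once the header is known. A sketch with hypothetical rows:

```ruby
# row 1: { 'id' => '1', 'tag' => ['a', 'b'] }   # 'tag' holds two values
# row 2: { 'id' => '2', 'tag' => 'c' }
#
# fields becomes { 'id' => 1, 'tag' => 2 }, so the output is:
#   id,tag,tag
#   1,a,b
#   2,c,          <- the missing second 'tag' is padded with nil
```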
data/lib/arc-furnace/binary_key_merging_hash.rb
ADDED
@@ -0,0 +1,38 @@
+require 'arc-furnace/hash'
+
+module ArcFurnace
+  # This allows one to merge multiple rows into one such as:
+  # key, attribute, value
+  # 1, value1, foo
+  # 1, value1, bar
+  # 1, value2, baz
+  # Results in:
+  # 1 => { value1 => [foo, bar], value2 => baz }
+  class BinaryKeyMergingHash < ::ArcFurnace::Hash
+    private_attr_reader :source, :hash, :secondary_key, :value_key
+
+    def initialize(source: , primary_key:, secondary_key:, value_key:)
+      super(source: source, key_column: primary_key)
+      @secondary_key = secondary_key
+      @value_key = value_key
+    end
+
+    def prepare
+      loop do
+        break if source.empty?
+        row = source.row
+        row_key = row[key_column]
+        second_key = row[secondary_key]
+        value = row[value_key]
+        if row_key && second_key && value
+          row_entry = hash[row_key] ||= {}
+          value_arr = row_entry[second_key] ||= []
+          value_arr.concat(Array.wrap(value))
+        else
+          error_handler.missing_primary_key(source_row: row, node_id: node_id)
+        end
+      end
+    end
+
+  end
+end
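A usage sketch for the initializer above (the source is hypothetical); note that, per `prepare`, values always accumulate into arrays, even when a secondary key occurs only once:

```ruby
merged = ArcFurnace::BinaryKeyMergingHash.new(
  source: properties_source,  # hypothetical source yielding rows like
                              # { 'key' => '1', 'attribute' => 'value1', 'value' => 'foo' }
  primary_key: 'key',
  secondary_key: 'attribute',
  value_key: 'value'
)
merged.prepare
merged.get('1')  # => { 'value1' => ['foo', 'bar'], 'value2' => ['baz'] }
```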
data/lib/arc-furnace/block_transform.rb
ADDED
@@ -0,0 +1,18 @@
+require 'arc-furnace/transform'
+
+module ArcFurnace
+  class BlockTransform < Transform
+    private_attr_reader :block
+
+    def initialize(source:, block:)
+      raise 'Must specify a block' if block.nil?
+      @block = block
+      super(source: source)
+    end
+
+    def transform(row)
+      block.call(row)
+    end
+
+  end
+end
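Pipelines normally construct this through the `transform` sugar shown in the README, but a direct construction sketch (hypothetical upstream source) looks like:

```ruby
strip_name = ArcFurnace::BlockTransform.new(
  source: upstream_source,  # hypothetical source of row hashes
  block: ->(row) { row.delete('Name'); row }
)
```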
data/lib/arc-furnace/block_unfold.rb
ADDED
@@ -0,0 +1,18 @@
+require 'arc-furnace/unfold'
+
+module ArcFurnace
+  class BlockUnfold < Unfold
+    private_attr_reader :block
+
+    def initialize(source:, block:)
+      raise 'Must specify a block' if block.nil?
+      @block = block
+      super(source: source)
+    end
+
+    def unfold(row)
+      block.call(row)
+    end
+
+  end
+end
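Likewise, a direct construction sketch for unfolds (hypothetical source; this assumes the block returns an array of row hashes, which the `Unfold` base class then emits one at a time):

```ruby
split = ArcFurnace::BlockUnfold.new(
  source: upstream_source,  # hypothetical source of row hashes
  block: ->(row) { row.map { |key, value| { 'field' => key, 'value' => value } } }
)
```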