arc-furnace 0.1.0

Files changed (40)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +127 -0
  8. data/Rakefile +9 -0
  9. data/arc-furnace.gemspec +30 -0
  10. data/lib/arc-furnace.rb +12 -0
  11. data/lib/arc-furnace/abstract_join.rb +53 -0
  12. data/lib/arc-furnace/all_fields_csv_sink.rb +68 -0
  13. data/lib/arc-furnace/binary_key_merging_hash.rb +38 -0
  14. data/lib/arc-furnace/block_transform.rb +18 -0
  15. data/lib/arc-furnace/block_unfold.rb +18 -0
  16. data/lib/arc-furnace/csv_sink.rb +21 -0
  17. data/lib/arc-furnace/csv_source.rb +33 -0
  18. data/lib/arc-furnace/csv_to_hash_with_duplicate_headers.rb +19 -0
  19. data/lib/arc-furnace/dsl.rb +6 -0
  20. data/lib/arc-furnace/enumerator_source.rb +36 -0
  21. data/lib/arc-furnace/error_handler.rb +27 -0
  22. data/lib/arc-furnace/excel_source.rb +38 -0
  23. data/lib/arc-furnace/fixed_column_csv_sink.rb +37 -0
  24. data/lib/arc-furnace/hash.rb +41 -0
  25. data/lib/arc-furnace/inner_join.rb +27 -0
  26. data/lib/arc-furnace/logging_error_handler.rb +0 -0
  27. data/lib/arc-furnace/merging_hash.rb +41 -0
  28. data/lib/arc-furnace/multi_csv_source.rb +50 -0
  29. data/lib/arc-furnace/node.rb +5 -0
  30. data/lib/arc-furnace/nodes.rb +19 -0
  31. data/lib/arc-furnace/outer_join.rb +14 -0
  32. data/lib/arc-furnace/pipeline.rb +158 -0
  33. data/lib/arc-furnace/private_attr.rb +28 -0
  34. data/lib/arc-furnace/sink.rb +21 -0
  35. data/lib/arc-furnace/source.rb +40 -0
  36. data/lib/arc-furnace/suffixed_fixed_column_csv_sink.rb +18 -0
  37. data/lib/arc-furnace/transform.rb +28 -0
  38. data/lib/arc-furnace/unfold.rb +45 -0
  39. data/lib/arc-furnace/version.rb +3 -0
  40. metadata +182 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 38e438f57e6350ddce2937cf2ef03d6d93ea20a8
+   data.tar.gz: 17ee74aba7edef53647c2a2e4efe6b4d3bda477f
+ SHA512:
+   metadata.gz: ee8d88b55edd486dbd496d2831096ed53eaad124e6043629747b9503bd6473267c1f18086edae0425152b08638fc7ed4961cfecc79b72a92d709768c49316b25
+   data.tar.gz: 39114078eab439a660c83479b9251b0e9f1690aa8e0110b51a4dc061aee5f7d64421189c69f0705349d95339b14e7e5b118124b1822a7c3e747446313c585ec7
data/.gitignore ADDED
@@ -0,0 +1,9 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
+ language: ruby
+ sudo: false
+ rvm:
+ - 2.1.6
+ - 2.2.3
+ - jruby-9000
data/Gemfile ADDED
@@ -0,0 +1,3 @@
+ source 'https://rubygems.org'
+
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2015 Daniel Spangenberger
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,127 @@
+ # ArcFurnace
+ [![Build Status](https://travis-ci.org/salsify/arc-furnace.svg?branch=master)][travis]
+
+ [travis]: http://travis-ci.org/salsify/arc-furnace
+
+ ArcFurnace melts, melds, and transforms your scrap data into perfectly crafted data for ingest into applications,
+ analysis, or whatnot. ArcFurnace simplifies ETL (Extract, Transform, Load) tasks for small to medium sets of data
+ using a programmatic DSL interface. Here's an example:
+
+ ```ruby
+ class Transform < ArcFurnace::Pipeline
+
+   source :marketing_info_csv, type: ArcFurnace::CSVSource, params: { filename: :marketing_filename }
+
+   transform :marketing_info_source, params: { source: :marketing_info_csv } do |row|
+     row.delete('Name')
+     row
+   end
+
+   source :product_attributes,
+          type: ArcFurnace::MultiCSVSource,
+          params: { filenames: :product_attribute_filenames }
+
+   hash_node :marketing_info,
+             params: {
+               key_column: :primary_key,
+               source: :marketing_info_source
+             }
+
+   outer_join :join_results,
+              params: {
+                source: :product_attributes,
+                hash: :marketing_info
+              }
+
+   sink type: ArcFurnace::AllFieldsCSVSink,
+        source: :join_results,
+        params: { filename: :destination_name }
+
+ end
+ ```
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'arc-furnace', github: 'salsify/arc-furnace'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ ## Usage
+
+ ArcFurnace provides a few concepts useful for extracting and transforming data.
+
+ ### Node Types Available
+
+ #### Pipelines
+
+ A `Pipeline` defines a complete transformation: a directed, acyclic graph of operations that describes
+ how data flows from sources to a sink. Each type of node in a `Pipeline` is described below, but the
+ `Pipeline` itself defines the network of nodes that transform the data.
+
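+ The example `Transform` class above only declares this graph. This README does not show how a declared
+ `Pipeline` is instantiated and run, so the following is only a hedged sketch: it assumes the pipeline is
+ built with a params hash supplying the symbols referenced in each node's `params` (such as
+ `:marketing_filename`) and then executed; the actual entry point in this version of the gem may differ.
+
+ ```ruby
+ # Hypothetical invocation: these method names are assumptions, not documented API.
+ instance = Transform.instance(
+   marketing_filename: 'marketing.csv',
+   product_attribute_filenames: ['attrs_a.csv', 'attrs_b.csv'],
+   destination_name: 'joined_output.csv'
+ )
+ instance.execute
+ ```
+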
+ #### Sources
+
+ A `Source` provides rows of data to a `Pipeline`. A `Pipeline` may have many sources. Essentially, any node that
+ requires a stream of data (`Hash`, `Transform`, `Join`, `Sink`) takes one as input.
+
+ #### Hashes
+
+ A `Hash` provides indexed access to a `Source` by pre-computing an index based on a key. The index is built during the
+ prepare stage of pipeline processing. Hashes expose a simple interface, `#get(primary_key)`, for requesting data, and
+ are almost exclusively used as the input to one side of a join.
+
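+ One of the bundled hashes, `BinaryKeyMergingHash` (added later in this changeset), rolls several rows up
+ into a single entry per primary key. The following is an illustrative, untested sketch of the roll-up
+ described in that class's own comment; it assumes a `CSVSource` can be constructed directly with a
+ `filename:` argument and that `prepare` may be called by hand rather than by a `Pipeline`.
+
+ ```ruby
+ require 'arc-furnace'
+
+ # attributes.csv is assumed to contain:
+ #   key,attribute,value
+ #   1,value1,foo
+ #   1,value1,bar
+ #   1,value2,baz
+ source = ArcFurnace::CSVSource.new(filename: 'attributes.csv')
+
+ hash = ArcFurnace::BinaryKeyMergingHash.new(
+   source: source,
+   primary_key: 'key',
+   secondary_key: 'attribute',
+   value_key: 'value'
+ )
+
+ # A Pipeline normally prepares every node; here both nodes are prepared by hand for illustration.
+ source.prepare
+ hash.prepare
+ hash.get('1') # => { 'value1' => ['foo', 'bar'], 'value2' => ['baz'] }
+ ```
+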
+ #### Joins
+
+ An `InnerJoin` or an `OuterJoin` joins two sources of data (one of which must be a `Hash`) based upon a key. By default the
+ join key is the key the hash was rolled up on; the `key_column` option on both `InnerJoin` and `OuterJoin`
+ may override this. Note that the default join is an inner join, which drops source rows when the hash does not contain
+ a matching row.
+
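+ For example, joining on a column other than the hash's own key might look like the following sketch
+ (the `'sku'` column is hypothetical, and it assumes `key_column` is passed through `params` to the
+ join's constructor just like `source` and `hash`):
+
+ ```ruby
+ # Match each product row to marketing info via its 'sku' column instead of the hash key.
+ outer_join :join_on_sku,
+            params: {
+              source: :product_attributes,
+              hash: :marketing_info,
+              key_column: 'sku'
+            }
+ ```
+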
+ #### Transforms
+
+ A `Transform` acts as a source, but takes another source as input and transforms each row it produces. The `BlockTransform` and
+ the associated sugar in the `transform` method of `Pipeline` make this very easy (see the example above).
+
+ #### Unfolds
+
+ An `Unfold` also acts as a source, but takes another source as input and may produce multiple output rows for each input row.
+ A common case is splitting a single row into several rows depending upon its keys. The `BlockUnfold` and the associated
+ sugar in the `unfold` method of `Pipeline` make this fairly easy (see `pipeline_spec.rb`).
+
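+ As a hedged sketch of what such an unfold might look like (assuming `unfold` takes the same arguments as
+ `transform` above and that the block receives a row and returns an array of rows; the column names are
+ hypothetical):
+
+ ```ruby
+ # Split one row holding several 'image_*' columns into one output row per image.
+ unfold :image_rows, params: { source: :product_attributes } do |row|
+   row.keys.grep(/\Aimage_/).map do |image_key|
+     { 'id' => row['id'], 'image' => row[image_key] }
+   end
+ end
+ ```
+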
+ #### Sinks
+
+ Each `Pipeline` has a single sink. Pipelines must produce data somewhere, and that data goes to a sink. Sinks
+ implement the `#row(hash)` interface--each output row is passed to this method for handling.
+
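+ A custom sink can therefore be quite small. Below is a minimal, untested sketch; it assumes
+ `ArcFurnace::Sink` can be subclassed directly and that `prepare` and `finalize` are optional hooks
+ called before and after the rows are delivered (mirroring `AllFieldsCSVSink` later in this changeset).
+
+ ```ruby
+ require 'arc-furnace/sink'
+
+ # Collects every output row in memory; handy for tests or small pipelines.
+ class InMemorySink < ArcFurnace::Sink
+   attr_reader :rows
+
+   def initialize
+     @rows = []
+   end
+
+   def prepare(fields = nil)
+     # Nothing to set up.
+   end
+
+   def row(hash)
+     rows << hash
+   end
+
+   def finalize
+     # Nothing to flush; rows are already in memory.
+   end
+ end
+ ```
+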
+ ### General pipeline development process
+
+ 1. Define a source. Choose an existing `Source` implementation in this library (`CSVSource` or `ExcelSource`),
+    extend the `EnumeratorSource`, or implement the `row()` method for a new source (see the sketch after this list).
+ 2. Define any transformations or joins. This may cause you to revisit #1.
+ 3. Define the sink. This is generally custom, or may be one of the provided `CSVSink` types.
+ 4. Roll it together in a `Pipeline`.
+
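+ Judging from how other nodes in this changeset consume a source (`BinaryKeyMergingHash#prepare` calls
+ `source.row` until `source.empty?`), a hand-rolled source needs at least those two methods. A minimal,
+ untested sketch follows; the exact base-class contract in `source.rb` may require more.
+
+ ```ruby
+ require 'arc-furnace/source'
+
+ # Serves rows from an in-memory array, one row per call to #row.
+ class ArraySource < ArcFurnace::Source
+   def initialize(rows:)
+     @rows = rows.dup
+   end
+
+   def empty?
+     @rows.empty?
+   end
+
+   def row
+     @rows.shift
+   end
+ end
+ ```
+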
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## TODOs
+
+ 1. Add a `filter` node and implementation to `Pipeline`.
+ 2. Add examples for the `ErrorHandler` interface.
+ 3. Add sugar to define a `BlockTransform` on a `Source` definition in a `Pipeline`.
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/[my-github-username]/arc-furnace/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,9 @@
+ require 'bundler/setup'
+ Bundler::GemHelper.install_tasks
+
+ require 'rspec/core/rake_task'
+ RSpec::Core::RakeTask.new(:spec) do |task|
+   task.verbose = false
+ end
+
+ task default: :spec
data/arc-furnace.gemspec ADDED
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'arc-furnace/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "arc-furnace"
+   spec.version = ArcFurnace::VERSION
+   spec.authors = ["Daniel Spangenberger", "Brian Tenggren"]
+   spec.email = ["dan@salsify.com"]
+
+   spec.summary = %q{Melds and transforms data from multiple sources into a single stream}
+   spec.description = %q{An ETL library for Ruby that performs the basic actions of ETL: extract, transform, and load. Easily extensible.}
+   spec.homepage = "http://github.com/salsify/arc-furnace"
+   spec.license = "MIT"
+
+   spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir = "exe"
+   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency 'msgpack', '~> 0.6'
+   spec.add_dependency 'activesupport', '>= 3.2'
+   spec.add_dependency 'eigenclass', '~> 2'
+   spec.add_dependency 'roo', '>= 2.1'
+
+   spec.add_development_dependency 'rake', '~> 10.0'
+   spec.add_development_dependency 'rspec', '~> 3'
+   spec.add_development_dependency 'ice_nine', '>= 0.11'
+ end
data/lib/arc-furnace.rb ADDED
@@ -0,0 +1,12 @@
+ require 'arc-furnace/version'
+
+ require 'active_support'
+ require 'active_support/core_ext'
+
+ require 'arc-furnace/csv_to_hash_with_duplicate_headers'
+ require 'arc-furnace/private_attr'
+
+ require 'arc-furnace/dsl'
+ require 'arc-furnace/error_handler'
+ require 'arc-furnace/nodes'
+ require 'arc-furnace/pipeline'
data/lib/arc-furnace/abstract_join.rb ADDED
@@ -0,0 +1,53 @@
+ require 'arc-furnace/source'
+
+ module ArcFurnace
+   class AbstractJoin < Source
+     private_attr_reader :hash, :source, :key_column
+     attr_reader :value
+
+     # The source is a source, the hash is a hash, and one can optionally
+     # pass the key column to get the primary key for each source entity, the
+     # default is equijoin semantics--the key of the hash is used.
+     def initialize(source: , hash:, key_column: nil)
+       if source.is_a?(::ArcFurnace::Source) && hash.is_a?(::ArcFurnace::Hash)
+         @hash = hash
+         @source = source
+         @key_column = key_column || hash.key_column
+       else
+         raise 'Must be passed one Hash and one Source!'
+       end
+     end
+
+     def prepare
+       advance
+     end
+
+     def advance
+       raise "Unimplemented!"
+     end
+
+     delegate empty?: :source
+
+     protected
+
+     def merge_source_row(source_row)
+       key = source_row[key_column]
+       if key
+         if hash_value = hash.get(key)
+           hash_value = hash_value.deep_dup
+           source_row.each do |key, value|
+             hash_value[key] = value
+           end
+           @value = hash_value
+           true
+         else
+           error_handler.missing_hash_key(source_row: source_row, key: key, node_id: node_id)
+           false
+         end
+       else
+         error_handler.missing_join_key(source_row: source_row, node_id: node_id)
+       end
+     end
+
+   end
+ end
data/lib/arc-furnace/all_fields_csv_sink.rb ADDED
@@ -0,0 +1,68 @@
+ require 'arc-furnace/sink'
+ require 'msgpack'
+
+ module ArcFurnace
+   class AllFieldsCSVSink < Sink
+     private_attr_reader :csv, :fields, :tmp_file, :packer, :fields, :field_mappings
+
+     def initialize(filename: , encoding: 'UTF-8')
+       @tmp_file = Tempfile.new('intermediate_results', encoding: 'binary')
+       @packer = MessagePack::Packer.new(tmp_file)
+       @csv = CSV.open(filename, 'wb', encoding: encoding, headers: true)
+       @fields = {}
+     end
+
+     def prepare(fields = nil)
+
+     end
+
+     def finalize
+       packer.flush
+       tmp_file.rewind
+
+       write_header_row!
+
+       unpacker = MessagePack::Unpacker.new(tmp_file)
+       unpacker.each do |hash|
+         write_row(hash)
+       end
+
+       csv.close
+     end
+
+     def row(hash)
+       update_field_counts(hash)
+       packer.write(hash)
+     end
+
+     private
+
+     def write_header_row!
+       header_row = []
+       fields.each do |key, count|
+         count.times { header_row << key }
+       end
+       csv << header_row
+     end
+
+     def write_row(hash)
+       row = []
+       fields.each do |key, count|
+         values = Array.wrap(hash[key])
+         (values.slice(0, count) || []).each do |value|
+           row << value
+         end
+         (count - values.length).times { row << nil }
+       end
+       csv << row
+     end
+
+     def update_field_counts(hash)
+       hash.each do |key, values|
+         value_count = Array.wrap(values).size
+         existing_value_count = fields[key] || 0
+         fields[key] = value_count if value_count > existing_value_count
+       end
+     end
+   end
+ end
data/lib/arc-furnace/binary_key_merging_hash.rb ADDED
@@ -0,0 +1,38 @@
+ require 'arc-furnace/hash'
+
+ module ArcFurnace
+   # This allows one to merge multiple rows into one such as:
+   # key, attribute, value
+   # 1, value1, foo
+   # 1, value1, bar
+   # 1, value2, baz
+   # Results in:
+   # 1 => { value1 => [foo, bar], value2 => baz }
+   class BinaryKeyMergingHash < ::ArcFurnace::Hash
+     private_attr_reader :source, :hash, :secondary_key, :value_key
+
+     def initialize(source: , primary_key:, secondary_key:, value_key:)
+       super(source: source, key_column: primary_key)
+       @secondary_key = secondary_key
+       @value_key = value_key
+     end
+
+     def prepare
+       loop do
+         break if source.empty?
+         row = source.row
+         row_key = row[key_column]
+         second_key = row[secondary_key]
+         value = row[value_key]
+         if row_key && second_key && value
+           row_entry = hash[row_key] ||= {}
+           value_arr = row_entry[second_key] ||= []
+           value_arr.concat(Array.wrap(value))
+         else
+           error_handler.missing_primary_key(source_row: row, node_id: node_id)
+         end
+       end
+     end
+
+   end
+ end
data/lib/arc-furnace/block_transform.rb ADDED
@@ -0,0 +1,18 @@
+ require 'arc-furnace/transform'
+
+ module ArcFurnace
+   class BlockTransform < Transform
+     private_attr_reader :block
+
+     def initialize(source:, block:)
+       raise 'Must specify a block' if block.nil?
+       @block = block
+       super(source: source)
+     end
+
+     def transform(row)
+       block.call(row)
+     end
+
+   end
+ end
data/lib/arc-furnace/block_unfold.rb ADDED
@@ -0,0 +1,18 @@
+ require 'arc-furnace/unfold'
+
+ module ArcFurnace
+   class BlockUnfold < Unfold
+     private_attr_reader :block
+
+     def initialize(source:, block:)
+       raise 'Must specify a block' if block.nil?
+       @block = block
+       super(source: source)
+     end
+
+     def unfold(row)
+       block.call(row)
+     end
+
+   end
+ end