arc-furnace 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +2 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +127 -0
  8. data/Rakefile +9 -0
  9. data/arc-furnace.gemspec +30 -0
  10. data/lib/arc-furnace.rb +12 -0
  11. data/lib/arc-furnace/abstract_join.rb +53 -0
  12. data/lib/arc-furnace/all_fields_csv_sink.rb +68 -0
  13. data/lib/arc-furnace/binary_key_merging_hash.rb +38 -0
  14. data/lib/arc-furnace/block_transform.rb +18 -0
  15. data/lib/arc-furnace/block_unfold.rb +18 -0
  16. data/lib/arc-furnace/csv_sink.rb +21 -0
  17. data/lib/arc-furnace/csv_source.rb +33 -0
  18. data/lib/arc-furnace/csv_to_hash_with_duplicate_headers.rb +19 -0
  19. data/lib/arc-furnace/dsl.rb +6 -0
  20. data/lib/arc-furnace/enumerator_source.rb +36 -0
  21. data/lib/arc-furnace/error_handler.rb +27 -0
  22. data/lib/arc-furnace/excel_source.rb +38 -0
  23. data/lib/arc-furnace/fixed_column_csv_sink.rb +37 -0
  24. data/lib/arc-furnace/hash.rb +41 -0
  25. data/lib/arc-furnace/inner_join.rb +27 -0
  26. data/lib/arc-furnace/logging_error_handler.rb +0 -0
  27. data/lib/arc-furnace/merging_hash.rb +41 -0
  28. data/lib/arc-furnace/multi_csv_source.rb +50 -0
  29. data/lib/arc-furnace/node.rb +5 -0
  30. data/lib/arc-furnace/nodes.rb +19 -0
  31. data/lib/arc-furnace/outer_join.rb +14 -0
  32. data/lib/arc-furnace/pipeline.rb +158 -0
  33. data/lib/arc-furnace/private_attr.rb +28 -0
  34. data/lib/arc-furnace/sink.rb +21 -0
  35. data/lib/arc-furnace/source.rb +40 -0
  36. data/lib/arc-furnace/suffixed_fixed_column_csv_sink.rb +18 -0
  37. data/lib/arc-furnace/transform.rb +28 -0
  38. data/lib/arc-furnace/unfold.rb +45 -0
  39. data/lib/arc-furnace/version.rb +3 -0
  40. metadata +182 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 38e438f57e6350ddce2937cf2ef03d6d93ea20a8
4
+ data.tar.gz: 17ee74aba7edef53647c2a2e4efe6b4d3bda477f
5
+ SHA512:
6
+ metadata.gz: ee8d88b55edd486dbd496d2831096ed53eaad124e6043629747b9503bd6473267c1f18086edae0425152b08638fc7ed4961cfecc79b72a92d709768c49316b25
7
+ data.tar.gz: 39114078eab439a660c83479b9251b0e9f1690aa8e0110b51a4dc061aee5f7d64421189c69f0705349d95339b14e7e5b118124b1822a7c3e747446313c585ec7
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ sudo: false
3
+ rvm:
4
+ - 2.1.6
5
+ - 2.2.3
6
+ - jruby-9000
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Daniel Spangenberger
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,127 @@
1
+ # ArcFurnace
2
+ [![Build Status](https://travis-ci.org/salsify/arc-furnace.svg?branch=master)][travis]
3
+
4
+ [travis]: http://travis-ci.org/salsify/arc-furnace
5
+
6
+ ArcFurnace melts, melds, and transforms your scrap data into perfectly crafted data for ingest into applications,
7
+ analysis, or whatnot. ArcFurnace simplifies simple ETL (Extract, Transform, Load) tasks for small to medium sets of data
8
+ using a programmatic DSL interface. Here's an example:
9
+
10
+ ```ruby
11
+ class Transform < ArcFurnace::Pipeline
12
+
13
+ source :marketing_info_csv, type: ArcFurnace::CSVSource, params: { filename: :marketing_filename }
14
+
15
+ transform :marketing_info_source, params: { source: :marketing_info_csv } do |row|
16
+ row.delete('Name')
17
+ row
18
+ end
19
+
20
+ source :product_attributes,
21
+ type: ArcFurnace::MultiCSVSource,
22
+ params: { filenames: :product_attribute_filenames }
23
+
24
+ hash_node :marketing_info,
25
+ params: {
26
+ key_column: :primary_key,
27
+ source: :marketing_info_source
28
+ }
29
+
30
+ outer_join :join_results,
31
+ params: {
32
+ source: :product_attributes,
33
+ hash: :marketing_info
34
+ }
35
+
36
+ sink type: ArcFurnace::AllFieldsCSVSink,
37
+ source: :join_results,
38
+ params: { filename: :destination_name }
39
+
40
+ end
41
+ ```
42
+
43
+ ## Installation
44
+
45
+ Add this line to your application's Gemfile:
46
+
47
+ ```ruby
48
+ gem 'arc-furnace', github: 'salsify/arc-furnace'
49
+ ```
50
+
51
+ And then execute:
52
+
53
+ $ bundle
54
+
55
+ ## Usage
56
+
57
+ ArcFurnace provides a few concepts useful to extracting and transforming data.
58
+
59
+ ### Node Types Available
60
+
61
+ #### Pipelines
62
+
63
+ Pipelines define a complete transformation and define a directed, acyclic graph of
64
+ operations that define how data is transformed. Each type of node in a `Pipeline` is defined below, but
65
+ a Pipeline defines the network of nodes that transform data.
66
+
67
+ #### Sources
68
+
69
+ A `Source` provides values to a `Pipeline`. A `Pipeline` may have many sources. Essentially, any nodes that
70
+ require a stream of data (`Hash`, `Transform`, `Join`, `Sink`) will have one.
71
+
72
+ #### Hashes
73
+
74
+ A `Hash` provides indexed access to a `Source` by pre-computing the index based on a key. The processing happens during the
75
+ prepare stage of pipeline processing. Hashes have a simple interface, `#get(primary_key)`, for requesting data. Hashes
76
+ are almost exclusively used as inputs to one side of joins.
77
+
78
+ #### Joins
79
+
80
+ An `InnerJoin` or an `OuterJoin` join two sources of data (one must be a `Hash`) based upon a key. By default the join
81
+ key is the key that the hash was rolled-up on, however, the `key_column` option on both `InnerJoin` and `OuterJoin`
82
+ may override this. Note the default join is an inner join, which will drop source rows if the hash does not contain
83
+ a matching row.
84
+
85
+ #### Transforms
86
+
87
+ A `Transform` acts as a source, however, takes a source as an input and transforms each input. The `BlockTransform` and
88
+ associated sugar in the `transform` method of `Pipeline` make this very easy (see the example above).
89
+
90
+ #### Unfolds
91
+
92
+ An `Unfold` acts as a source, however, takes a source as an input and produces multiple rows for that source as an output.
93
+ A common case for this is splitting rows into multiple rows depending upon their keys. The `BlockUnfold` and associated
94
+ sugar in the `unfold` method of `Pipeline` make this fairly easy (see `pipeline_spec.rb`).
95
+
96
+ #### Sinks
97
+
98
+ Each `Pipeline` has a single sink. Pipelines must produce data somewhere, and that data goes to a sink. Sinks
99
+ subscribe to the `#row(hash)` interface--each output row is passed to this method for handling.
100
+
101
+ ### General pipeline development process
102
+
103
+ 1. Define a source. Choose an existing `Source` implementation in this library (`CSVSource` or `ExcelSource`),
104
+ extend the `EnumeratorSource`, or implement the `row()` method for a new source.
105
+ 2. Define any transformations, or joins. This may cause you to revisit #1.
106
+ 3. Define the sink. This is generally custom, or, may be one of the provided `CSVSink` types.
107
+ 4. Roll it together in a `Pipeline`.
108
+
109
+ ## Development
110
+
111
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
112
+
113
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
114
+
115
+ ## TODOs
116
+
117
+ 1. Add a `filter` node and implementation to `Pipeline`
118
+ 2. Add examples for `ErrorHandler` interface.
119
+ 3. Add sugar to define a `BlockTransform` on a `Source` definition in a `Pipeline`.
120
+
121
+ ## Contributing
122
+
123
+ 1. Fork it ( https://github.com/[my-github-username]/arc-furnace/fork )
124
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
125
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
126
+ 4. Push to the branch (`git push origin my-new-feature`)
127
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ require 'bundler/setup'
2
+ Bundler::GemHelper.install_tasks
3
+
4
+ require 'rspec/core/rake_task'
5
+ RSpec::Core::RakeTask.new(:spec) do |task|
6
+ task.verbose = false
7
+ end
8
+
9
+ task default: :spec
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'arc-furnace/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "arc-furnace"
8
+ spec.version = ArcFurnace::VERSION
9
+ spec.authors = ["Daniel Spangenberger", "Brian Tenggren"]
10
+ spec.email = ["dan@salsify.com"]
11
+
12
+ spec.summary = %q{Melds and transforms data from multiple sources into a single stream}
13
+ spec.description = %q{An ETL library for Ruby that performs the basic actions of ETL: extract, transform, and load. Easily extensible.}
14
+ spec.homepage = "http://github.com/salsify/arc-furnace"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_dependency 'msgpack', '~> 0.6'
23
+ spec.add_dependency 'activesupport', '>= 3.2'
24
+ spec.add_dependency 'eigenclass', '~> 2'
25
+ spec.add_dependency 'roo', '>= 2.1'
26
+
27
+ spec.add_development_dependency 'rake', '~> 10.0'
28
+ spec.add_development_dependency 'rspec', '~> 3'
29
+ spec.add_development_dependency 'ice_nine', '>= 0.11'
30
+ end
@@ -0,0 +1,12 @@
1
+ require 'arc-furnace/version'
2
+
3
+ require 'active_support'
4
+ require 'active_support/core_ext'
5
+
6
+ require 'arc-furnace/csv_to_hash_with_duplicate_headers'
7
+ require 'arc-furnace/private_attr'
8
+
9
+ require 'arc-furnace/dsl'
10
+ require 'arc-furnace/error_handler'
11
+ require 'arc-furnace/nodes'
12
+ require 'arc-furnace/pipeline'
@@ -0,0 +1,53 @@
1
+ require 'arc-furnace/source'
2
+
3
+ module ArcFurnace
4
+ class AbstractJoin < Source
5
+ private_attr_reader :hash, :source, :key_column
6
+ attr_reader :value
7
+
8
+ # The source is a source, the hash is a hash, and one can optionally
9
+ # pass the key column to get the primary key for each source entity, the
10
+ # default is equijoin semantics--the key of the hash is used.
11
+ def initialize(source: , hash:, key_column: nil)
12
+ if source.is_a?(::ArcFurnace::Source) && hash.is_a?(::ArcFurnace::Hash)
13
+ @hash = hash
14
+ @source = source
15
+ @key_column = key_column || hash.key_column
16
+ else
17
+ raise 'Must be passed one Hash and one Source!'
18
+ end
19
+ end
20
+
21
+ def prepare
22
+ advance
23
+ end
24
+
25
+ def advance
26
+ raise "Unimplemented!"
27
+ end
28
+
29
+ delegate empty?: :source
30
+
31
+ protected
32
+
33
+ def merge_source_row(source_row)
34
+ key = source_row[key_column]
35
+ if key
36
+ if hash_value = hash.get(key)
37
+ hash_value = hash_value.deep_dup
38
+ source_row.each do |key, value|
39
+ hash_value[key] = value
40
+ end
41
+ @value = hash_value
42
+ true
43
+ else
44
+ error_handler.missing_hash_key(source_row: source_row, key: key, node_id: node_id)
45
+ false
46
+ end
47
+ else
48
+ error_handler.missing_join_key(source_row: source_row, node_id: node_id)
49
+ end
50
+ end
51
+
52
+ end
53
+ end
@@ -0,0 +1,68 @@
1
+ require 'arc-furnace/sink'
2
+ require 'msgpack'
3
+
4
+ module ArcFurnace
5
+ class AllFieldsCSVSink < Sink
6
+ private_attr_reader :csv, :fields, :tmp_file, :packer, :fields, :field_mappings
7
+
8
+ def initialize(filename: , encoding: 'UTF-8')
9
+ @tmp_file = Tempfile.new('intermediate_results', encoding: 'binary')
10
+ @packer = MessagePack::Packer.new(tmp_file)
11
+ @csv = CSV.open(filename, 'wb', encoding: encoding, headers: true)
12
+ @fields = {}
13
+ end
14
+
15
+ def prepare(fields = nil)
16
+
17
+ end
18
+
19
+ def finalize
20
+ packer.flush
21
+ tmp_file.rewind
22
+
23
+ write_header_row!
24
+
25
+ unpacker = MessagePack::Unpacker.new(tmp_file)
26
+ unpacker.each do |hash|
27
+ write_row(hash)
28
+ end
29
+
30
+ csv.close
31
+ end
32
+
33
+ def row(hash)
34
+ update_field_counts(hash)
35
+ packer.write(hash)
36
+ end
37
+
38
+ private
39
+
40
+ def write_header_row!
41
+ header_row = []
42
+ fields.each do |key, count|
43
+ count.times { header_row << key }
44
+ end
45
+ csv << header_row
46
+ end
47
+
48
+ def write_row(hash)
49
+ row = []
50
+ fields.each do |key, count|
51
+ values = Array.wrap(hash[key])
52
+ (values.slice(0, count) || []).each do |value|
53
+ row << value
54
+ end
55
+ (count - values.length).times { row << nil }
56
+ end
57
+ csv << row
58
+ end
59
+
60
+ def update_field_counts(hash)
61
+ hash.each do |key, values|
62
+ value_count = Array.wrap(values).size
63
+ existing_value_count = fields[key] || 0
64
+ fields[key] = value_count if value_count > existing_value_count
65
+ end
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,38 @@
1
+ require 'arc-furnace/hash'
2
+
3
+ module ArcFurnace
4
+ # This allows one to merge multiple rows into one such as:
5
+ # key, attribute, value
6
+ # 1, value1, foo
7
+ # 1, value1, bar
8
+ # 1, value2, baz
9
+ # Results in:
10
+ # 1 => { value1 => [foo, bar], value2 => baz }
11
+ class BinaryKeyMergingHash < ::ArcFurnace::Hash
12
+ private_attr_reader :source, :hash, :secondary_key, :value_key
13
+
14
+ def initialize(source: , primary_key:, secondary_key:, value_key:)
15
+ super(source: source, key_column: primary_key)
16
+ @secondary_key = secondary_key
17
+ @value_key = value_key
18
+ end
19
+
20
+ def prepare
21
+ loop do
22
+ break if source.empty?
23
+ row = source.row
24
+ row_key = row[key_column]
25
+ second_key = row[secondary_key]
26
+ value = row[value_key]
27
+ if row_key && second_key && value
28
+ row_entry = hash[row_key] ||= {}
29
+ value_arr = row_entry[second_key] ||= []
30
+ value_arr.concat(Array.wrap(value))
31
+ else
32
+ error_handler.missing_primary_key(source_row: row, node_id: node_id)
33
+ end
34
+ end
35
+ end
36
+
37
+ end
38
+ end
@@ -0,0 +1,18 @@
1
+ require 'arc-furnace/transform'
2
+
3
+ module ArcFurnace
4
+ class BlockTransform < Transform
5
+ private_attr_reader :block
6
+
7
+ def initialize(source:, block:)
8
+ raise 'Must specify a block' if block.nil?
9
+ @block = block
10
+ super(source: source)
11
+ end
12
+
13
+ def transform(row)
14
+ block.call(row)
15
+ end
16
+
17
+ end
18
+ end
@@ -0,0 +1,18 @@
1
+ require 'arc-furnace/unfold'
2
+
3
+ module ArcFurnace
4
+ class BlockUnfold < Unfold
5
+ private_attr_reader :block
6
+
7
+ def initialize(source:, block:)
8
+ raise 'Must specify a block' if block.nil?
9
+ @block = block
10
+ super(source: source)
11
+ end
12
+
13
+ def unfold(row)
14
+ block.call(row)
15
+ end
16
+
17
+ end
18
+ end