datamancer 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5a09ebee71e5c2b54c0b790ae4dc2165e4e9e45a
4
+ data.tar.gz: eabae34cb0d1b34ce072796f3b0edab7117325e2
5
+ SHA512:
6
+ metadata.gz: 84af02d3f38e7e861086bf268eb6670ef8528f6b70035d2bba6f3098b99dd897b29b727ae1037ba80abad412ff34ae543eec4e8c57ed71f4a74a939f35be4b15
7
+ data.tar.gz: 4c88628302bca9be9089ee2a6c1be2991c06201d66ce91bd88a62247b6ad45b64ff0cc90403dce9c1ebd0dcabb6fb807be58f52133394c0f57e5771473d558c2
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ .ruby-version
7
+ Gemfile.lock
8
+ InstalledFiles
9
+ _yardoc
10
+ coverage
11
+ doc/
12
+ lib/bundler/man
13
+ pkg
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in datamancer.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Matías Battocchia
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # Datamancer
2
+
3
+ **The Datamancer** is a form of sorcerer whose spells deal with data manipulation between databases.
4
+
5
+ Data targets (sources and destinations) can be **databases** supported by ActiveRecord and **CSV files**. Multiple targets can be present in a single ETL process.
6
+
7
+ Datamancer relies on bulk SQL reading and writing and does not instantiate ActiveRecord objects; ActiveRecord is used for the sole purpose of connecting to databases.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'datamancer'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install datamancer
22
+
23
+ ## Usage
24
+
25
+ *Please see the specs, for now.*
26
+
27
+ ## Contributing
28
+
29
+ 1. Fork it
30
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
31
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
32
+ 4. Push to the branch (`git push origin my-new-feature`)
33
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'datamancer/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "datamancer"
8
+ spec.version = Datamancer::VERSION
9
+ spec.authors = ["Matías Battocchia"]
10
+ spec.email = ["matias@riseup.net"]
11
+ spec.description = %q{A magical extract, transform, load (ETL) library for data integration.}
12
+ spec.summary = %q{}
13
+ spec.homepage = "https://github.com/matiasbattocchia/datamancer"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "rspec"
24
+ spec.add_development_dependency "activerecord"
25
+ spec.add_development_dependency "sqlite3"
26
+ spec.add_development_dependency "activerecord-jdbcsqlite3-adapter"
27
+ end
@@ -0,0 +1,183 @@
1
+ module Datamancer
2
+
3
# Reads rows from a CSV file or a database table and returns them as an
# array of hashes keyed by field name (symbols).
#
# args[:from]      - String path of a CSV file, or a Hash handed to
#                    ActiveRecord::Base.establish_connection.
# args[:separator] - CSV column separator (defaults to ',').
# args[:table]     - database table name (falls back to args[:from][:table]).
# args[:exclude]   - when truthy, only fields declared in the block are kept;
#                    otherwise every source column is included by default.
#
# Inside the optional block, `field(name, actions = {}, &block)` declares a
# field; actions[:map] names the source column when it differs from `name`.
# Each value is passed through field_actions; a row holding the :reject
# marker is printed to stdout and dropped.
#
# Raises ArgumentError when the source (or the database table) is missing or
# of an unsupported type, and MissingField when a declared field does not
# exist in the source.
def extract(args)
  unless args.is_a?(Hash) && args[:from]
    raise ArgumentError, 'Extract requires a source, i.e. extract(from: source)'
  end

  source = args[:from]

  case source
  when String
    # TODO: The first row (headers row) is dropped; dropping more initial rows should be an option.
    # TODO: Test the separator option.
    csv = CSV.read(source, col_sep: (args[:separator] || ','))

    # An empty file has no header row; treat it as having no columns.
    headers = csv.shift || []
  when Hash
    ::ActiveRecord::Base.establish_connection(source)
    db = ::ActiveRecord::Base.connection

    # TODO: Test this.
    table = args[:table] || source[:table]

    unless table
      raise ArgumentError,
            'Extract requires a database table, i.e. extract(from: source, table: table_name)'
    end

    headers = db.columns(table).map(&:name)
  else
    # Fail early instead of hitting a NoMethodError on nil headers below.
    raise ArgumentError,
          'Extract source must be a CSV file path (String) or a database configuration (Hash)'
  end

  @fields  = {}
  @actions = {}

  # By default every source column becomes a field of the same name.
  headers.each { |header| @fields[header.to_sym] = header } unless args[:exclude]

  # The reason behind default_actions is the possibility to
  # write reject_if: nil with the DSL.
  default_actions = {reject_if: :ñil, reject_unless: :ñil}

  define_singleton_method :field do |name, actions = {}, &block|
    # Merge first so the caller's hash is never modified.
    actions = default_actions.merge(actions)
    actions[:type] ||= actions[:type_default]

    # Headers are strings, so a symbol passed as :map must be converted too.
    mapping = (actions[:map] || name).to_s

    unless headers.include?(mapping)
      raise MissingField, "Required field '#{mapping}' was not found in '#{source}'"
    end

    field = name.to_sym

    # A mapped column is exposed under its new name only.
    @fields.delete(mapping.to_sym) if actions[:map]
    @fields[field]  = mapping
    @actions[field] = actions.merge(block: block)
  end

  yield if block_given?

  # From here on @fields maps each field name to its column index in a row.
  rows = case source
         when String
           @fields.each do |field, mapping|
             @fields[field] = headers.find_index(mapping)
           end

           csv
         when Hash
           columns = @fields.map { |field, mapping| "#{mapping} AS #{field}" }.join(', ')

           @fields.keys.each_with_index do |field, index|
             @fields[field] = index
           end

           db.select_rows("SELECT #{columns} FROM #{table}")
         end

  converted = rows.map do |array_row|
    hash_row = {}

    @fields.each do |field, index|
      hash_row[field] = field_actions(field, array_row[index], @actions[field])
    end

    if hash_row.has_value?(:reject)
      # Rejected rows are reported on stdout and left out of the result.
      puts hash_row
      nil
    else
      hash_row
    end
  end

  converted.compact
end
98
+
99
# Applies the per-field actions declared in extract's `field` DSL to a single
# value and returns the resulting value, or :reject when validation fails.
#
# field   - field name (not used by the current implementation).
# value   - raw value read from the source.
# actions - Hash of actions, or nil/false to return the value untouched.
#           Recognised keys: :block, :strip, :type, :type_default,
#           :empty_default, :default, :reject_if, :reject_unless.
#
# Steps run in this order: block, strip, cast, default, validation.
def field_actions(field, value, actions)
  return value unless actions

  # TODO: Revisit the order of actions.

  ## Block-passing ##
  # TODO: Test this.
  value = actions[:block].call(value) if actions[:block]

  ## Stripping ##
  # TODO: Test this.
  # Non-destructive strip: leaves the source row intact and works on frozen strings.
  value = value.strip if actions[:strip] && value.respond_to?(:strip)

  ## Casting ##
  # Indexes and :type_default are not good friends.
  # (Because of join while transforming.)
  # TODO: Test this.
  # TODO: Better data types support (Array, BigDecimal, Boolean, Hash, Range,
  #       Regexp, Date, DateTime, Time, TimeWithZone are not handled yet).
  if value || actions[:type_default]
    case actions[:type].to_s
    when 'Complex'  then value = value.to_c
    when 'Float'    then value = value.to_f
    when 'Integer'  then value = value.to_i
    when 'Rational' then value = value.to_r
    when 'String'   then value = value.to_s
    when 'Symbol'   then value = value.to_sym
    end
  end

  ## Default value ##
  # TODO: Test this.
  # Only values that can be empty (strings, arrays, ...) are checked, so
  # numbers produced by the cast above no longer raise NoMethodError.
  blank = actions[:empty_default] && value.respond_to?(:empty?) && value.empty?
  value = actions[:default] if value.nil? || blank

  ## Validation ##
  # TODO: Test this. Test to not reject nil by default.
  # :ñil is the "not set" sentinel; it lets callers pass reject_if: nil
  # explicitly. A missing key is treated exactly like the sentinel.
  reject_if     = actions.fetch(:reject_if, :ñil)
  reject_unless = actions.fetch(:reject_unless, :ñil)

  if reject_if == value || (reject_unless != :ñil && reject_unless != value)
    value = :reject
  end

  value
end
183
+ end
@@ -0,0 +1,89 @@
1
+ module Datamancer
2
+
3
# Writes rows (an enumerable of hashes) to a CSV file or a database table.
#
# input         - rows to write; each row is a Hash of field => value.
# args[:to]     - String path of a CSV file, or a Hash handed to
#                 ActiveRecord::Base.establish_connection.
# args[:table]  - database table name (falls back to args[:to][:table]).
# args[:append] - when truthy, existing data is kept; otherwise the CSV file
#                 is overwritten / the table is emptied first.
# args[:exclude]- when truthy, only fields declared in the block are written.
# args[:batch]  - number of rows per INSERT statement (defaults to 1000).
#
# Inside the optional block, `field(name, options = {})` selects a field;
# options[:map] writes it under a different column name.
#
# Raises ArgumentError when the destination (or the database table) is
# missing, and MissingField when a declared field is absent from a row.
def load(input, args)
  unless args.is_a?(Hash) && args[:to]
    raise ArgumentError, 'Load requires a destination, i.e. load(data, to: destination)'
  end

  ## INSERTS ##

  define_singleton_method :field do |name, options = {}|
    name = name.to_sym

    unless @input_row.include?(name)
      raise MissingField, "Required field '#{name}' was not found in '#{args[:to]}'"
    end

    @output_row[options[:map] || name] = @input_row[name]

    # In inclusive mode the row was copied, so drop the pre-mapping key.
    @output_row.delete(name) if !args[:exclude] && options[:map]
  end

  # Column names come from the last processed row; with no rows there are
  # none (previously this read a nil or stale @output_row).
  columns = []

  inserts = input.map do |row|
    @input_row  = row
    @output_row = args[:exclude] ? {} : row.dup

    yield if block_given?

    columns = @output_row.keys
    @output_row.values
  end

  ## LOAD ##

  # TODO: Set 'w' or 'w+' for CSV writing.

  if args[:to].is_a?(String)
    path = args[:to]

    # When appending to a file that already has content, the header row is
    # already there and must not be repeated in the middle of the data.
    has_content = args[:append] && File.exist?(path) && !File.zero?(path)

    CSV.open(path, args[:append] ? 'a' : 'w') do |csv|
      csv << columns unless has_content || columns.empty?

      inserts.each { |insert| csv << insert }
    end
  else
    ::ActiveRecord::Base.establish_connection(args[:to])

    # TODO: Test this.
    table = args[:table] || args[:to][:table]

    unless table
      raise ArgumentError,
            'Load requires a database table, i.e. load(to: destination, table: table_name)'
    end

    connection = ::ActiveRecord::Base.connection
    connection.delete("DELETE FROM #{table}") unless args[:append]

    batch_size = args[:batch] || 1000
    pre_query  = "INSERT INTO #{table} (#{columns.join(',')}) VALUES "

    # Batches are written in input order. Values go through the adapter's
    # quoting so embedded quotes are escaped and nil becomes NULL.
    inserts.each_slice(batch_size) do |batch|
      tuples = batch.map do |insert|
        "(#{insert.map { |value| connection.quote(value) }.join(',')})"
      end

      connection.execute(pre_query + tuples.join(','))
    end

    nil
  end
end
89
+ end
@@ -0,0 +1,121 @@
1
+ module Datamancer
2
+
3
# Inner-joins two collections of hashes on a shared attribute.
#
# left, right - enumerables of Hash rows keyed by symbols.
# attribute   - name of the join key (String or Symbol).
#
# Rows whose join key is nil (or false) never match and are left out.
# Returns an Array with one merged hash per matching left/right pair, in the
# order the keys first appear in `left`; on key clashes the right-hand value
# wins (Hash#merge).
def join(left, right, attribute)
  key = attribute.to_sym

  left_groups  = left.select  { |tuple| tuple[key] }.group_by { |tuple| tuple[key] }
  right_groups = right.select { |tuple| tuple[key] }.group_by { |tuple| tuple[key] }

  left_groups.flat_map do |value, left_group|
    # fetch avoids creating empty groups for keys that only exist on the left.
    right_group = right_groups.fetch(value, [])

    left_group.flat_map do |left_tuple|
      right_group.map { |right_tuple| left_tuple.merge(right_tuple) }
    end
  end
end
35
+
36
# Builds a new array of rows from `input`, optionally joined with another
# data set and reshaped by the DSL calls made in the given block.
#
# input          - enumerable of Hash rows keyed by symbols.
# args[:join]    - second data set to inner-join with (requires args[:on]).
# args[:on]      - attribute to join on.
# args[:exclude] - when truthy, output rows start empty and only hold what
#                  the block adds; otherwise each row starts as a copy.
#
# DSL available inside the block (the block runs once per row):
#   <field_name>                  - getter for the current row's value
#   field(name, value = nil, *a)  - keeps/overwrites a field; a Symbol value
#                                   is sent as a message to the current value
#   del_field(name)               - removes a field from the output row
#   new_field(name, value)        - adds a field that is not in the input
#
# Field names are taken from the first input row. Raises ArgumentError when
# :join is given without :on, MissingField / ExistingField from the DSL.
def transform(input, args = {})
  if args[:join]
    unless args[:on]
      raise ArgumentError,
            'Transform requires a join attribute, i.e. transform(data, join: other_data, on: attribute)'
    end

    input = join(input, args[:join], args[:on])
  end

  # An empty input has no first row; treat it as having no fields.
  first_row    = input.first || {}
  known_fields = first_row.keys.map { |key| key.downcase.to_s }

  # TODO: Method-overriding safeguard.

  first_row.each_key do |key|
    # Some methods applied to fields might modify the original fields.
    # Fields could be duplicated (@input_row[key].dup) should this become
    # a common problem.
    define_singleton_method(key.downcase) { @input_row[key] }
  end

  # Existence checks use the input's own field names rather than
  # respond_to?, which also matched every public method of the receiver
  # (so `new_field :hash, ...` was refused and `field :object_id` accepted).
  define_singleton_method :field do |name, value = nil, *extra|
    unless known_fields.include?(name.to_s)
      raise MissingField, "Required field '#{name}' was not found"
    end

    @output_row[name.to_sym] =
      if value.is_a?(Symbol)
        send(name).send(value, *extra)
      else
        value || send(name)
      end
  end

  define_singleton_method :del_field do |name|
    unless known_fields.include?(name.to_s)
      raise MissingField, "Filtered field '#{name}' was not found"
    end

    @output_row.delete(name.to_sym)
  end

  define_singleton_method :new_field do |name, value|
    if known_fields.include?(name.to_s)
      raise ExistingField, "New field '#{name}' already exists"
    end

    @output_row[name.to_sym] = value
  end

  input.map do |row|
    @input_row  = row
    @output_row = args[:exclude] ? {} : @input_row.dup

    yield if block_given?

    @output_row
  end
end
92
+
93
# Groups rows by the dimensions chosen in the block and adds up the chosen
# facts within each group.
#
# input - enumerable of Hash rows keyed by symbols.
#
# The block runs once per row; inside it `dim(name)` marks a grouping field
# and `fact(name)` marks a field whose values are combined with `+`.
# Returns an Array with one hash per distinct dimension combination, holding
# the dimensions merged with the accumulated facts.
def aggregate(input)
  define_singleton_method(:dim)  { |name| @dimensions[name.to_sym] = @row[name.to_sym] }
  define_singleton_method(:fact) { |name| @facts[name.to_sym] = @row[name.to_sym] }

  # One running-totals hash per distinct set of dimension values.
  totals = Hash.new { |store, dims| store[dims] = {} }

  input.each do |record|
    @row, @dimensions, @facts = record, {}, {}

    yield if block_given?

    totals[@dimensions].merge!(@facts) { |_key, running, addend| running + addend }
  end

  totals.map { |dims, sums| dims.merge(sums) }
end
121
+ end
@@ -0,0 +1,3 @@
1
+ module Datamancer
2
+ VERSION = "0.0.2"
3
+ end
data/lib/datamancer.rb ADDED
@@ -0,0 +1,56 @@
1
+ require 'datamancer/version'
2
+ require 'datamancer/extract'
3
+ require 'datamancer/transform'
4
+ require 'datamancer/load'
5
+
6
# keyvalue relies on YAML, which nothing else in the library loads.
require 'yaml'

module Datamancer
  # Raised when a field named in the DSL does not exist in the data.
  class MissingField < StandardError; end

  # Raised when a field to be created already exists in the data.
  class ExistingField < StandardError; end

  # Loads a YAML file and returns its parsed content (e.g. a Hash for a
  # key-value document).
  #
  # file - path of the YAML file.
  #
  # NOTE(review): the file is parsed with YAML.load_file; only use it on
  # trusted files.
  def keyvalue(file)
    YAML.load_file(file)
  end
end
14
+
15
+ # TODO: Decide on extract() default values issue; string?
16
+
17
+ # TODO: Case insensitive and regex field-mapping.
18
+
19
+ # TODO: Better errors and tests for them.
20
+
21
+ # TODO: ETL by batch.
22
+
23
+ # TODO: Methods alias.
24
+ # field => f
25
+ # unfield => u
26
+
27
+ # TODO: Field inclusion policy.
28
+ #
29
+ # OPTIONS
30
+ #
31
+ # :include_all (DEFAULT)
32
+ # :exclude_all
33
+ #
34
+ # METHODS
35
+ #
36
+ # unfield(name)
37
+ # field(name, value = nil)
38
+ #
39
+ # CASES
40
+ #
41
+ # unfield(name) => include all, exclude 'name'
42
+ # field(name) => exclude all, include 'name'
43
+ # field(name, value) => include 'name'
44
+ #
45
+ # unfield(name1) and field(name2) => include all, exclude 'name1', include 'name2'
46
+ # unfield(name1) and field(name2, value) => include all, exclude 'name1', include 'name2'
47
+ # field(name1) and field(name2, value) => exclude all, include 'name1', include 'name2'
48
+ #
49
+ # unfield(name1) and field(name2) and field(name3, value) => include all, exclude 'name1', include 'name2', include 'name3'
50
+ #
51
+ # USAGE
52
+ #
53
+ # Basically :include_all is passed explicitly to be used along with field(name) (which otherwise would cause a
54
+ # general exclusion of fields) for documentation purposes.
55
+ #
56
+ # :exclude_all is useful for changing transform()'s default inclusive behaviour when using field(name, value) alone.
@@ -0,0 +1,3 @@
1
+ name,age
2
+ Foo,27
3
+ Bar,42
Binary file
@@ -0,0 +1,5 @@
1
+ name,some_id
2
+ "Foo",1
3
+ "Bar",2
4
+ "Baz",2
5
+ "Foobar",
@@ -0,0 +1,5 @@
1
+ age,some_id
2
+ 0,
3
+ 27,1
4
+ 33,1
5
+ 42,2
@@ -0,0 +1,2 @@
1
+ Argentina: '+54'
2
+ Brazil: '+55'
@@ -0,0 +1,3 @@
1
+ name,age
2
+ "Foo",27
3
+ "Bar",42
Binary file
@@ -0,0 +1,6 @@
1
+ store,coffee_cups,name,croissants
2
+ "A",1,"Foo",0
3
+ A,3,Bar,2
4
+ A,5,Bar,4
5
+ B,7,Foo,6
6
+ B,9,Foo,8
@@ -0,0 +1,155 @@
1
+ require 'spec_helper'
2
+
3
+ describe Datamancer do
4
+
5
+ context 'EXTRACT spell' do
6
+
7
+ sources = {'CSV file' => $dir + '/source.csv',
8
+ 'database' => {adapter: 'sqlite3', database: $dir + '/source.sqlite3', table: 'users'}}
9
+
10
+ sources.each do |source_type, source|
11
+
12
+ context "against a #{source_type.upcase}" do
13
+
14
+ before(:all) do
15
+ @source = source
16
+ @data = [{name: 'Foo', age: 27}, {name: 'Bar', age: 42}]
17
+ end
18
+
19
+ after(:all) do
20
+ ActiveRecord::Base.connection.close
21
+ end if source_type == 'database'
22
+
23
+
24
+ it 'reads data from source implicitly' do
25
+ expect(
26
+ extract(from: @source)
27
+ ).to eq(@data)
28
+
29
+ expect(
30
+ extract(from: @source) do
31
+ field :name
32
+ end
33
+ ).to eq(@data)
34
+
35
+ expect(
36
+ extract(from: @source) do
37
+ field :name
38
+ field :age
39
+ end
40
+ ).to eq(@data)
41
+ end
42
+
43
+
44
+ it 'reads data from source explicitly' do
45
+ expect(
46
+
47
+ extract(from: @source, exclude: true) do
48
+ field :name
49
+ end
50
+
51
+ ).to eq([{name: 'Foo'}, {name: 'Bar'}])
52
+ end
53
+
54
+
55
+ it 'raises an exception if data source is missing' do
56
+ expect {
57
+
58
+ extract(@source)
59
+
60
+ }.to raise_error(ArgumentError, 'Extract requires a source, i.e. extract(from: source)')
61
+ end
62
+
63
+
64
+ it 'raises an exception if a required field is missing' do
65
+
66
+ # TODO: Better explanation for this error.
67
+
68
+ # source = case source_type
69
+ # when 'CSV file' then "#{@source} file"
70
+ # when 'database' then "#{@source[:database]}.#{@source[:table]} table"
71
+ # end
72
+
73
+ expect {
74
+
75
+ extract(from: @source) do
76
+ field :surname
77
+ end
78
+
79
+ }.to raise_error(MissingField,
80
+ "Required field 'surname' was not found in '#{@source}'")
81
+ end
82
+
83
+
84
+ it 'lets fields to be strings' do
85
+ expect(
86
+
87
+ extract(from: @source) do
88
+ field 'name'
89
+ field 'age'
90
+ end
91
+
92
+ ).to eq(@data)
93
+ end
94
+
95
+
96
+ it 'maps fields' do
97
+ expect(
98
+
99
+ extract(from: @source) do
100
+ field :years_old, map: 'age'
101
+ end
102
+
103
+ ).to eq([{name: 'Foo', years_old: 27}, {name: 'Bar', years_old: 42}])
104
+ end
105
+
106
+
107
+ it 'casts types on fields' do
108
+ expect(
109
+
110
+ extract(from: @source) do
111
+ field :age, type: Integer
112
+ end
113
+
114
+ ).to eq([{name: 'Foo', age: 27}, {name: 'Bar', age: 42}])
115
+ end
116
+
117
+ end
118
+ end
119
+
120
+ # TODO: Validations
121
+
122
+ # exclusion
123
+ # inclusion
124
+ # format
125
+ # presence
126
+ # uniqueness (?)
127
+ # length (?)
128
+ # numericality (?)
129
+ # custom validations (?)
130
+
131
+ # When a field validation fails, the row gets dropped.
132
+ # This action is logged and optionally the process halted.
133
+ # Validations should accept options (:allow_nil, :if, etc).
134
+
135
+ # it 'validates fields' do
136
+
137
+ # pending 'Not now...'
138
+
139
+ # expect(
140
+
141
+ # extract(from: @csv_file) do
142
+ # field :name
143
+ # field :age, reject_if: '27'
144
+ # end
145
+
146
+ # ).to eq([{name: 'Bar', age: '42'}])
147
+ # end
148
+
149
+ it 'imports YAML key-value tables' do
150
+ @yml_file = $dir + '/simple.yml'
151
+ expect(keyvalue(@yml_file)).to eq({'Argentina' => '+54', 'Brazil' => '+55'})
152
+ end
153
+
154
+ end
155
+ end
data/spec/load_spec.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'spec_helper'
2
+
3
+ describe Datamancer do
4
+
5
+ context 'LOAD spell' do
6
+
7
+ destinations = {'CSV file' => $dir + '/destination.csv',
8
+ 'database' => {adapter: 'sqlite3', database: $dir + '/destination.sqlite3', table: 'users'}}
9
+
10
+ destinations.each do |destination_type, destination|
11
+
12
+ context "against a #{destination_type.upcase}" do
13
+
14
+ before(:all) do
15
+ @destination = destination
16
+ @data = [{name: 'Foo', age: 27}, {name: 'Bar', age: 42}]
17
+
18
+ if destination_type == 'database'
19
+ ActiveRecord::Base.establish_connection(destination)
20
+ ActiveRecord::Base.connection.delete('DELETE FROM users')
21
+ end
22
+ end
23
+
24
+ after(:all) do
25
+ ActiveRecord::Base.connection.close
26
+ end if destination_type == 'database'
27
+
28
+
29
+ it 'writes data to destination implicitly' do
30
+ load(@data, to: @destination, append: false)
31
+
32
+ expect(extract from: @destination).to eq(@data)
33
+
34
+ load(@data, to: @destination, append: false) do
35
+ field :name
36
+ end
37
+
38
+ expect(extract from: @destination).to eq(@data)
39
+
40
+ load(@data, to: @destination, append: false) do
41
+ field :name
42
+ field :age
43
+ end
44
+
45
+ expect(extract from: @destination).to eq(@data)
46
+ end
47
+
48
+
49
+ it 'appends data to destination'
50
+
51
+
52
+ it 'writes data to destination explicitly' do
53
+ load(@data, to: @destination, append: false, exclude: true) do
54
+ field :name
55
+ end
56
+
57
+ expect(extract from: @destination).to eq(
58
+ case destination_type
59
+ when 'CSV file'
60
+ [{name: 'Foo'}, {name: 'Bar'}]
61
+ when 'database'
62
+
63
+ # TODO: Fix this empty DB columns thing.
64
+
65
+ [{name: 'Foo', age: nil}, {name: 'Bar', age: nil}]
66
+ end)
67
+ end
68
+
69
+
70
+ it 'raises an exception if data destination is missing' do
71
+ expect {
72
+
73
+ load(@data, @destination)
74
+
75
+ }.to raise_error(ArgumentError, 'Load requires a destination, i.e. load(data, to: destination)')
76
+ end
77
+
78
+
79
+ it 'raises an exception if a required field is missing' do
80
+ expect {
81
+
82
+ load(@data, to: @destination) do
83
+ field :surname
84
+ end
85
+
86
+ }.to raise_error(MissingField,
87
+ "Required field 'surname' was not found in '#{@destination}'")
88
+ end
89
+
90
+
91
+ it 'lets fields to be strings' do
92
+ load(@data, to: @destination, append: false) do
93
+ field 'name'
94
+ field 'age'
95
+ end
96
+
97
+ expect(extract from: @destination).to eq(@data)
98
+ end
99
+
100
+
101
+ it 'maps fields' do
102
+ data = [{name: 'Foo', years_old: '27'}, {name: 'Bar', years_old: '42'}]
103
+
104
+ load(data, to: @destination, append: false) do
105
+ field :years_old, map: 'age'
106
+ end
107
+
108
+ expect(extract from: @destination).to eq(@data)
109
+ end
110
+
111
+ end
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,13 @@
1
+ require 'datamancer'
2
+ require 'csv'
3
+ require 'active_record'
4
+
5
+ if defined? JRUBY_VERSION
6
+ require 'activerecord-jdbc-adapter'
7
+ else
8
+ require 'sqlite3'
9
+ end
10
+
11
+ include Datamancer
12
+
13
+ $dir = __dir__ + '/data'
@@ -0,0 +1,231 @@
1
+ require 'spec_helper'
2
+
3
+ describe Datamancer do
4
+
5
+ context 'TRANSFORMATION spell' do
6
+
7
+ before(:all) do
8
+ csv_file = $dir + '/source.csv'
9
+ @data = extract from: csv_file
10
+ end
11
+
12
+
13
+ it 'passes fields implicitly' do
14
+ expect(
15
+ transform(@data)
16
+ ).to eq(@data)
17
+
18
+ expect(
19
+ transform(@data) do
20
+ field :name
21
+ end
22
+ ).to eq(@data)
23
+ end
24
+
25
+
26
+ it 'passes fields explicitly' do
27
+
28
+ # TODO: What happens when no field is passed?
29
+
30
+ expect(
31
+
32
+ transform(@data, exclude: true) do
33
+ field :name
34
+ end
35
+
36
+ ).to eq([{name: 'Foo'}, {name: 'Bar'}])
37
+ end
38
+
39
+
40
+ it 'filters fields explicitly' do
41
+
42
+ # TODO: What happens when all fields are deleted?
43
+
44
+ expect(
45
+
46
+ transform(@data) do
47
+ del_field :age
48
+ end
49
+
50
+ ).to eq([{name: 'Foo'}, {name: 'Bar'}])
51
+ end
52
+
53
+
54
+ it 'raises an exception if a field is missing' do
55
+ expect {
56
+
57
+ transform(@data) do
58
+ field :surname
59
+ end
60
+
61
+ }.to raise_error(MissingField,
62
+ "Required field 'surname' was not found")
63
+
64
+ expect {
65
+
66
+ transform(@data) do
67
+ del_field :surname
68
+ end
69
+
70
+ }.to raise_error(MissingField,
71
+ "Filtered field 'surname' was not found")
72
+ end
73
+
74
+
75
+ it 'changes fields through *true* expressions' do
76
+ expect(
77
+
78
+ transform(@data) do
79
+ field :name, false
80
+ field :age, true
81
+ end
82
+
83
+ ).to eq([{name: 'Foo', age: true}, {name: 'Bar', age: true}])
84
+ end
85
+
86
+
87
+ it 'creates fields' do
88
+ expect(
89
+ transform(@data) do
90
+ new_field :new, true
91
+ end
92
+ ).to eq([{name: 'Foo', age: '27', new: true}, {name: 'Bar', age: '42', new: true}])
93
+
94
+ # TODO: A better explanation for this error.
95
+
96
+ expect{
97
+ transform(@data) do
98
+ new_field :new
99
+ end
100
+ }.to raise_error(ArgumentError)
101
+
102
+ expect {
103
+ transform(@data) do
104
+ new_field :name, true
105
+ end
106
+ }.to raise_error(ExistingField,
107
+ "New field 'name' already exists")
108
+ end
109
+
110
+
111
+ it 'lets field names to be strings and allows field methods to co-occur' do
112
+ expect(
113
+
114
+ transform(@data) do
115
+ field 'name'
116
+ del_field 'age'
117
+ new_field 'new', true
118
+ end
119
+
120
+ ).to eq([{name: 'Foo', new: true}, {name: 'Bar', new: true}])
121
+ end
122
+
123
+
124
+ it 'changes fields sending messages as symbols' do
125
+ expect(
126
+ transform(@data) do
127
+ field :name, :slice, 0
128
+ field :age, :to_i
129
+ end
130
+ ).to eq([{name: 'F', age: 27}, {name: 'B', age: 42}])
131
+
132
+ expect(
133
+ transform(@data) do
134
+ field :name, 'slice', 0
135
+ field :age, 'to_i'
136
+ end
137
+ ).to eq([{name: 'slice', age: 'to_i'}, {name: 'slice', age: 'to_i'}])
138
+ end
139
+
140
+
141
+ it 'generates field getters per row' do
142
+ expect(
143
+
144
+ transform(@data) do
145
+ new_field :namage, name.downcase + age
146
+ end
147
+
148
+ ).to eq([{name: 'Foo', age: '27', namage: 'foo27'}, {name: 'Bar', age: '42', namage: 'bar42'}])
149
+ end
150
+
151
+
152
+ context 'combines records by' do
153
+
154
+ before(:all) do
155
+ @left_data = extract from: $dir + '/left_source.csv'
156
+ @right_data = extract from: $dir + '/right_source.csv'
157
+ end
158
+
159
+
160
+ it 'inner join' do
161
+
162
+ # TODO: A better explanation for this error.
163
+
164
+ expect {
165
+ transform(@left_data, join: @right_data)
166
+ }.to raise_error(ArgumentError)
167
+
168
+ expect(
169
+ transform(@left_data, join: @right_data, on: 'some_id') do
170
+ del_field :some_id
171
+ new_field :namage, name.downcase + age
172
+ end
173
+ ).to eq([{name: 'Foo', age: '27', namage: 'foo27'},
174
+ {name: 'Foo', age: '33', namage: 'foo33'},
175
+ {name: 'Bar', age: '42', namage: 'bar42'},
176
+ {name: 'Baz', age: '42', namage: 'baz42'}])
177
+ end
178
+
179
+
180
+ it 'left outer join'
181
+ it 'right outer join'
182
+ it 'full outer join'
183
+
184
+ end
185
+ end
186
+
187
+
188
+ context 'aggregation' do
189
+
190
+ before(:all) do
191
+ csv_file = $dir + '/source2.csv'
192
+ @data = extract from: csv_file do
193
+ field :store
194
+ field :coffee_cups, type: Integer
195
+ field :name
196
+ field :croissants, type: Integer
197
+ end
198
+ end
199
+
200
+ it 'projects data to dimensions summing facts' do
201
+
202
+ # TODO: Facts must be additionable. Maybe raise an error?
203
+ # TODO: What happens with bad formatted data?
204
+
205
+ expect(
206
+
207
+ aggregate(@data) do
208
+ dim :name
209
+ fact :coffee_cups
210
+ fact :croissants
211
+ end
212
+
213
+ ).to eq([{name: 'Foo', coffee_cups: 17, croissants: 14},
214
+ {name: 'Bar', coffee_cups: 8, croissants: 6}])
215
+ end
216
+
217
+
218
+ it 'lets fields to be strings' do
219
+ expect(
220
+
221
+ aggregate(@data) do
222
+ dim 'name'
223
+ fact 'coffee_cups'
224
+ end
225
+
226
+ ).to eq([{name: 'Foo', coffee_cups: 17},
227
+ {name: 'Bar', coffee_cups: 8}])
228
+ end
229
+
230
+ end
231
+ end
metadata ADDED
@@ -0,0 +1,163 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: datamancer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Matías Battocchia
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-11-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: activerecord
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: sqlite3
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: activerecord-jdbcsqlite3-adapter
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: A magical extract, transform, load (ETL) library for data integration.
98
+ email:
99
+ - matias@riseup.net
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - .gitignore
105
+ - Gemfile
106
+ - LICENSE.txt
107
+ - README.md
108
+ - Rakefile
109
+ - datamancer.gemspec
110
+ - lib/datamancer.rb
111
+ - lib/datamancer/extract.rb
112
+ - lib/datamancer/load.rb
113
+ - lib/datamancer/transform.rb
114
+ - lib/datamancer/version.rb
115
+ - spec/data/destination.csv
116
+ - spec/data/destination.sqlite3
117
+ - spec/data/left_source.csv
118
+ - spec/data/right_source.csv
119
+ - spec/data/simple.yml
120
+ - spec/data/source.csv
121
+ - spec/data/source.sqlite3
122
+ - spec/data/source2.csv
123
+ - spec/extract_spec.rb
124
+ - spec/load_spec.rb
125
+ - spec/spec_helper.rb
126
+ - spec/transform_spec.rb
127
+ homepage: https://github.com/matiasbattocchia/datamancer
128
+ licenses:
129
+ - MIT
130
+ metadata: {}
131
+ post_install_message:
132
+ rdoc_options: []
133
+ require_paths:
134
+ - lib
135
+ required_ruby_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - '>='
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - '>='
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ requirements: []
146
+ rubyforge_project:
147
+ rubygems_version: 2.0.2
148
+ signing_key:
149
+ specification_version: 4
150
+ summary: ''
151
+ test_files:
152
+ - spec/data/destination.csv
153
+ - spec/data/destination.sqlite3
154
+ - spec/data/left_source.csv
155
+ - spec/data/right_source.csv
156
+ - spec/data/simple.yml
157
+ - spec/data/source.csv
158
+ - spec/data/source.sqlite3
159
+ - spec/data/source2.csv
160
+ - spec/extract_spec.rb
161
+ - spec/load_spec.rb
162
+ - spec/spec_helper.rb
163
+ - spec/transform_spec.rb