assimilate 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .rspec
19
+ .irb_history
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in assimilate.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,8 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
# Re-run specs when a spec file, a library file, or the spec helper changes.
guard 'rspec', :version => 2 do
  # A changed spec re-runs itself.
  watch(%r{^spec/.+_spec\.rb$})
  # lib/assimilate/foo.rb maps to spec/lib/foo_spec.rb. Only the leading
  # "assimilate/" directory is stripped; the pattern is anchored and uses a
  # literal slash so other occurrences of "assimilate" in the path survive.
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1].sub(%r{\Aassimilate/}, '')}_spec.rb" }
  # A changed spec helper re-runs the whole suite.
  watch('spec/spec_helper.rb') { "spec" }
end
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Jason May
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # Assimilate
2
+
3
+ Ingest updates from CSV and apply to set of persistent hashes
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'assimilate'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install assimilate
18
+
19
+ ## Usage
20
+
21
+ assimilate --config repo.yml filename
22
+
23
+ or
24
+
25
+ > require 'assimilate'
26
+ > catalog = Assimilate::Catalog.new(:config => configfile)
27
+ > catalog.start_batch(:domain => domain, :filename => filename, :datestamp => datestamp, :idfield => idfield)
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create new Pull Request
36
+
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ # task :default => :spec
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/assimilate/version', __FILE__)
3
+
4
# Gem packaging metadata for assimilate.
Gem::Specification.new do |gem|
  gem.name          = "assimilate"
  gem.version       = Assimilate::VERSION
  gem.authors       = ["Jason May"]
  gem.email         = ["jmay@pobox.com"]
  gem.description   = %q{Ingest updates from CSV and apply to set of hashes}
  gem.summary       = %q{Review & incorporate changes to a repository of persistent hashes in mongodb.}
  gem.homepage      = ""
  # The repository ships an MIT LICENSE file; declare it so it shows up in the gem metadata.
  gem.licenses      = ["MIT"]
  gem.rubyforge_project = "assimilate"

  # Split on the *input* record separator ($/, a newline). The output record
  # separator ($\) is nil by default, which makes String#split fall back to
  # splitting on any whitespace and breaks file names that contain spaces.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "mongo", "~> 1.6.0"
  gem.add_dependency "bson_ext", "~> 1.6.0"
  gem.add_dependency 'activesupport', "~> 3.2.0"

  gem.add_development_dependency "rspec", "~> 2.9.0"
  gem.add_development_dependency "guard-rspec", "~> 0.7.0"
  gem.add_development_dependency "ruby_gntp", "~> 0.3.4"
end
@@ -0,0 +1,139 @@
1
module Assimilate
  # Raised by Batch#load_baseline when the stored catalog already holds more
  # than one record for the same id within a domain.
  class CorruptDataError < StandardError
  end

  # Raised by Batch#record_batch when a batch with the same domain and
  # datestamp has already been recorded.
  class DuplicateImportError < StandardError
  end

  # One import run against a catalog.
  #
  # Records are fed in with #<<, classified against the records already stored
  # for the domain (the "baseline") as adds, changes or no-ops, and written to
  # the catalog with #commit. #stats reports the classification counts.
  class Batch
    attr_reader :domain, :idfield, :datestamp

    INSERT_BATCH_SIZE = 1000 # default batch size for bulk loading into mongo

    # args - Hash with:
    #   :catalog   - object exposing #catalog and #batches collections and #domainkey
    #   :domain    - domain (namespace of identifiers) this batch belongs to
    #   :datestamp - identifier of this batch; stamped on removed records
    #   :idfield   - name of the attribute that identifies a record
    #   :filename  - source file name, stored with the batch record
    #
    # Raises Assimilate::CorruptDataError if the baseline has duplicate keys.
    def initialize(args)
      @catalog = args[:catalog]
      @domain = args[:domain]
      @datestamp = args[:datestamp]
      @idfield = args[:idfield]
      @filename = args[:filename]

      load_baseline

      # Initialised here (not lazily in #<<) so that #stats and #commit also
      # work for a batch that received no records at all.
      @seen = Hash.new(0)
      @noops = []
      @changes = []
      @adds = []
      @deletes = []
    end

    # Read every stored record for this domain and index it by id.
    #
    # NOTE(review): records already flagged with _dt_removed are not filtered
    # out here, so they are part of the baseline too.
    def load_baseline
      stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
      @baseline = stored_records.each_with_object({}) do |rec, h|
        key = rec[@idfield]
        if h.include?(key)
          raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
        end
        h[key] = rec
      end
    end

    # Baseline record for +key+ without internal (underscore-prefixed)
    # attributes, or nil when the key is not in the baseline.
    def stripped_record_for(key)
      @baseline[key] && @baseline[key].select { |k, _v| k !~ /^_/ }
    end

    # Classify one inbound record as add, change or no-op.
    #
    # record - anything responding to #to_hash (the specs feed CSV::Row objects).
    def <<(record)
      hash = record.to_hash
      key = hash[@idfield]
      @seen[key] += 1
      current_record = stripped_record_for(key)
      if current_record
        if current_record == hash
          @noops << hash
        else
          @changes << hash
        end
      else
        @adds << hash
      end
    end

    # compute anything needed before we can write updates to permanent store
    # * find records that have been deleted (in the baseline, not seen in this batch)
    # * count, per field, how many changed records differ in that field
    def resolve
      @deleted_keys = @baseline.keys - @seen.keys

      @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec, h|
        key = rec[idfield]
        # Hash#diff comes from ActiveSupport core extensions.
        diffs = rec.diff(stripped_record_for(key))
        diffs.keys.each do |f|
          h[f] += 1
        end
      end
    end

    # Summary counts for this batch.
    #
    # :final_count is baseline plus adds; deletions are not subtracted because
    # #apply_deletes only flags records and leaves them in the catalog.
    def stats
      resolve
      {
        :baseline_count => @baseline.size,
        :final_count => @baseline.size + @adds.count,
        :adds_count => @adds.count,
        :deletes_count => @deleted_keys.count,
        :updates_count => @changes.count,
        :unchanged_count => @noops.count,
        :updated_fields => @updated_field_counts
      }
    end

    # write the updates to the catalog
    #
    # Raises Assimilate::DuplicateImportError (via #record_batch) before any
    # catalog record is touched if this domain/datestamp was already imported.
    def commit
      resolve
      record_batch
      apply_deletes
      apply_inserts
      apply_updates
    end

    # Register this batch in the batches collection, refusing duplicates.
    def record_batch
      raise(Assimilate::DuplicateImportError, "duplicate batch") if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
      @catalog.batches.insert({
        'domain' => @domain,
        'datestamp' => @datestamp,
        'filename' => @filename
      })
    end

    # Flag every baseline record missing from this batch with the batch
    # datestamp in _dt_removed; the record itself stays in the catalog.
    def apply_deletes
      @deleted_keys.each do |key|
        @catalog.catalog.update(
          {
            @catalog.domainkey => domain,
            idfield => key
          },
          {"$set" => {:_dt_removed => datestamp}}
        )
      end
    end

    # Bulk-insert the new records, INSERT_BATCH_SIZE at a time.
    def apply_inserts
      @adds.each_slice(INSERT_BATCH_SIZE) do |slice|
        # mongo insert can't handle CSV::Row objects, must be converted to regular hashes
        @catalog.catalog.insert(decorate(slice))
      end
    end

    # Overwrite the fields of every changed record with the inbound values.
    def apply_updates
      @changes.each do |rec|
        @catalog.catalog.update(
          {
            @catalog.domainkey => domain,
            idfield => rec[idfield]
          },
          {"$set" => rec}
        )
      end
    end

    # Tag each record with the domain key (in place) and return plain hashes.
    def decorate(records)
      records.map do |r|
        r[@catalog.domainkey] = @domain
        r.to_hash
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require "yaml"
2
+
3
+ # Records in each catalog acquire the following internal attributes:
4
+ # _id Unique ID, assigned by mongo
5
+ # _[domain] Domain key, specified with :domainkey attribute when initializing catalog
6
+ # _dt_first_seen Batch datestamp reference for when this record was first captured
7
+ # _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
8
+ # _dt_last_update Batch datestamp reference for when this record was most recently altered
9
+ # _dt_removed Batch datestamp reference for when this record was removed from input
10
+ #
11
+ # Inbound records must not have attributes named with leading underscores.
12
+ #
13
+ # A "domain" here is a namespace of identifiers.
14
+
15
module Assimilate
  # Handle on the mongo collections that hold the catalog records and the
  # record of imported batches, configured from a YAML file.
  class Catalog
    attr_reader :catalog, :batches, :domainkey

    # args - Hash with :config, the path of a YAML file providing the keys
    #        'db', 'catalog', 'batch' and 'domain'.
    def initialize(args)
      # load_file opens and closes the file itself; the previous
      # YAML.load(File.open(...)) left the file handle open.
      @config = YAML.load_file(args[:config])

      @db = Mongo::Connection.new.db(@config['db'])
      @catalog = @db.collection(@config['catalog'])
      @batches = @db.collection(@config['batch'])
      @domainkey = @config['domain']
      @domainkey = "_#{@domainkey}" unless @domainkey =~ /^_/ # enforce leading underscore on internal attributes
    end

    # Start a new import batch bound to this catalog. +args+ is passed on to
    # Assimilate::Batch.new with :catalog set to this object.
    def start_batch(args)
      Assimilate::Batch.new(args.merge(:catalog => self))
    end

    # First record matching +params+ with internal (underscore-prefixed)
    # attributes removed, or nil when nothing matches.
    def where(params)
      record = @catalog.find(params).first
      record && record.select { |k, _v| k !~ /^_/ }
    end

    # Number of records that have not been flagged with _dt_removed.
    def active_count
      @catalog.find("_dt_removed" => nil).count
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module Assimilate
2
+ VERSION = "0.0.1"
3
+ end
data/lib/assimilate.rb ADDED
@@ -0,0 +1,11 @@
1
+ require "mongo"
2
+ require "active_support/core_ext" # needed for Hash#diff
3
+
4
+ require_relative "assimilate/version"
5
+
6
+ require_relative "assimilate/catalog"
7
+ require_relative "assimilate/batch"
8
+
9
+ module Assimilate
10
+ # Your code goes here...
11
+ end
@@ -0,0 +1,6 @@
1
+ ID,name,title
2
+ 1,George Washington,President
3
+ 2,John Adams,Vice President
4
+ 3,Benjamin Franklin,Sage
5
+ 4,Aaron Burr,Duelist
6
+ 5,Alexander Hamilton,Financier
@@ -0,0 +1,5 @@
1
+ ---
2
+ db: assimilate-test
3
+ catalog: forefathers
4
+ batch: files
5
+ domain: resource
@@ -0,0 +1,6 @@
1
+ ID,name,title
2
+ 1,George Washington,President
3
+ 2,John Adams,Vice President
4
+ 3,Benjamin Franklin,Ambassador
5
+ 5,Alexander Hamilton,Financier
6
+ 7,Thomas Jefferson,Anti-Federalist
@@ -0,0 +1,92 @@
1
+ # batch import tests
2
+
3
+ require "spec_helper"
4
+
5
# End-to-end import specs; they run against the mongo database named in
# spec/data/test.yml and wipe its collections.
describe "importing file" do
  before(:all) do
    @catalog = Assimilate::Catalog.new(:config => fixture_path("test.yml"))
    reset_catalog
  end

  # Absolute path of a file in spec/data.
  def fixture_path(name)
    File.dirname(__FILE__) + "/../data/#{name}"
  end

  # Empty both the catalog and the batch collections.
  def reset_catalog
    @catalog.catalog.remove
    @catalog.batches.remove
  end

  # Feed every row of a fixture CSV through a new batch and commit it.
  # Leaves the batch in @batcher and the parsed rows in @records.
  def import_data(datestamp, filename = "batch_input.csv")
    @batcher = @catalog.start_batch(domain: 'testdata', datestamp: datestamp, idfield: 'ID')

    @records = CSV.read(fixture_path(filename), :headers => true)
    @records.each do |rec|
      @batcher << rec
    end
    @batcher.commit
  end

  describe "into empty catalog" do
    before :each do
      reset_catalog
      import_data("123")
    end

    it "should return correct import stats" do
      @batcher.stats.should == {
        :baseline_count => 0,
        :final_count => 5,
        :adds_count => 5,
        :deletes_count => 0,
        :updates_count => 0,
        :unchanged_count => 0,
        :updated_fields => {}
      }
    end

    it "should load the records verbatim" do
      @catalog.catalog.count.should == @records.count
      example = @records[rand(@records.count)]
      @catalog.where('_resource' => 'testdata', 'ID' => example['ID']).should == example.to_hash
    end

    it "should refuse to do a duplicate import" do
      lambda {import_data("123")}.should raise_error(Assimilate::DuplicateImportError)
    end

    it "should do all no-ops when importing identical data" do
      lambda {import_data("234")}.should_not raise_error
      @batcher.stats.should == {
        :baseline_count => 5,
        :final_count => 5,
        :adds_count => 0,
        :deletes_count => 0,
        :updates_count => 0,
        :unchanged_count => 5,
        :updated_fields => {}
      }
      @catalog.catalog.count.should == @records.count
    end
  end

  describe "into existing catalog" do
    # Both imports run once for the group. Importing "345" before every
    # example would raise DuplicateImportError as soon as a second example is
    # added, because the catalog is only reset once here.
    before(:all) do
      reset_catalog
      import_data("123")
      import_data("345", "updates.csv")
    end

    it "should recognize changes" do
      @batcher.stats.should == {
        :baseline_count => 5,
        :final_count => 6,
        :adds_count => 1,
        :deletes_count => 1,
        :updates_count => 1,
        :unchanged_count => 3,
        :updated_fields => {'title' => 1}
      }
      @catalog.active_count.should == @records.count
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require 'rspec/autorun'
2
+ require "tempfile"
3
+ require "csv"
4
+
5
+ require File.expand_path('../../lib/assimilate', __FILE__)
6
+
7
+ RSpec.configure do |config|
8
+ end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: assimilate
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason May
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-04-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mongo
16
+ requirement: &2157149420 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 1.6.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *2157149420
25
+ - !ruby/object:Gem::Dependency
26
+ name: bson_ext
27
+ requirement: &2157148260 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 1.6.0
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *2157148260
36
+ - !ruby/object:Gem::Dependency
37
+ name: activesupport
38
+ requirement: &2157147300 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 3.2.0
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *2157147300
47
+ - !ruby/object:Gem::Dependency
48
+ name: rspec
49
+ requirement: &2157146720 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 2.9.0
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *2157146720
58
+ - !ruby/object:Gem::Dependency
59
+ name: guard-rspec
60
+ requirement: &2157146120 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ~>
64
+ - !ruby/object:Gem::Version
65
+ version: 0.7.0
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *2157146120
69
+ - !ruby/object:Gem::Dependency
70
+ name: ruby_gntp
71
+ requirement: &2156851680 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 0.3.4
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *2156851680
80
+ description: Ingest updates from CSV and apply to set of hashes
81
+ email:
82
+ - jmay@pobox.com
83
+ executables: []
84
+ extensions: []
85
+ extra_rdoc_files: []
86
+ files:
87
+ - .gitignore
88
+ - Gemfile
89
+ - Guardfile
90
+ - LICENSE
91
+ - README.md
92
+ - Rakefile
93
+ - assimilate.gemspec
94
+ - lib/assimilate.rb
95
+ - lib/assimilate/batch.rb
96
+ - lib/assimilate/catalog.rb
97
+ - lib/assimilate/version.rb
98
+ - spec/data/batch_input.csv
99
+ - spec/data/test.yml
100
+ - spec/data/updates.csv
101
+ - spec/lib/batch_spec.rb
102
+ - spec/spec_helper.rb
103
+ homepage: ''
104
+ licenses: []
105
+ post_install_message:
106
+ rdoc_options: []
107
+ require_paths:
108
+ - lib
109
+ required_ruby_version: !ruby/object:Gem::Requirement
110
+ none: false
111
+ requirements:
112
+ - - ! '>='
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project: assimilate
123
+ rubygems_version: 1.8.10
124
+ signing_key:
125
+ specification_version: 3
126
+ summary: Review & incorporate changes to a repository of persistent hashes in mongodb.
127
+ test_files:
128
+ - spec/data/batch_input.csv
129
+ - spec/data/test.yml
130
+ - spec/data/updates.csv
131
+ - spec/lib/batch_spec.rb
132
+ - spec/spec_helper.rb
133
+ has_rdoc: