assimilate 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .rspec
19
+ .irb_history
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in assimilate.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,8 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
# Re-run specs automatically while files change during development.
guard 'rspec', :version => 2 do
  # A changed spec file re-runs itself.
  watch(%r{^spec/.+_spec\.rb$})

  # A changed library file re-runs its spec. Specs live flat under spec/lib,
  # so only the leading "assimilate/" directory is stripped from the path.
  # The pattern is anchored and uses a literal slash so that it cannot eat
  # an arbitrary character or match in the middle of a path.
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1].sub(%r{\Aassimilate/}, '')}_spec.rb" }

  # A changed spec helper re-runs the whole suite.
  watch('spec/spec_helper.rb') { "spec" }
end
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Jason May
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # Assimilate
2
+
3
+ Ingest updates from CSV files and apply them to a set of persistent hashes stored in MongoDB.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'assimilate'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install assimilate
18
+
19
+ ## Usage
20
+
21
+ assimilate --config repo.yml filename
22
+
23
+ or
24
+
25
+ > require 'assimilate'
26
+ > catalog = Assimilate::Catalog.new(:config => configfile)
27
+ > catalog.start_batch(:domain => domain, :filename => filename, :datestamp => datestamp, :idfield => idfield)
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create new Pull Request
36
+
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ # task :default => :spec
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/assimilate/version', __FILE__)
3
+
4
# Gem specification for assimilate. The version constant comes from
# lib/assimilate/version.rb, which is loaded by the require above.
Gem::Specification.new do |gem|
  gem.name              = "assimilate"
  gem.version           = Assimilate::VERSION
  gem.authors           = ["Jason May"]
  gem.email             = ["jmay@pobox.com"]
  gem.description       = %q{Ingest updates from CSV and apply to set of hashes}
  gem.summary           = %q{Review & incorporate changes to a repository of persistent hashes in mongodb.}
  gem.homepage          = ""
  gem.license           = "MIT" # matches the LICENSE file shipped with the gem
  gem.rubyforge_project = "assimilate"

  # Split on newlines only. `$\` is nil by default, which makes String#split
  # break on any whitespace and so mangles tracked paths containing spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies.
  gem.add_dependency "mongo", "~> 1.6.0"
  gem.add_dependency "bson_ext", "~> 1.6.0"
  gem.add_dependency 'activesupport', "~> 3.2.0"

  # Development-only dependencies.
  gem.add_development_dependency "rspec", "~> 2.9.0"
  gem.add_development_dependency "guard-rspec", "~> 0.7.0"
  gem.add_development_dependency "ruby_gntp", "~> 0.3.4"
end
@@ -0,0 +1,139 @@
1
module Assimilate
  # Raised by Batch#record_batch when a batch with the same domain and
  # datestamp has already been recorded.
  class DuplicateImportError < StandardError
  end

  # Raised by Batch#load_baseline when the stored catalog holds more than one
  # record for the same key within a domain.
  class CorruptDataError < StandardError
  end

  # One import run against a catalog.
  #
  # Incoming records are pushed with #<<, where each one is classified against
  # the records already stored for the domain (the "baseline") as an add, a
  # change or a no-op. #stats reports the outcome; #commit writes it.
  class Batch
    INSERT_BATCH_SIZE = 1000 # default batch size for bulk loading into mongo

    attr_reader :domain, :idfield, :datestamp

    # args - Hash with the keys:
    #   :catalog   - object exposing #catalog and #batches collections and #domainkey
    #   :domain    - value stored under the catalog's domain key for these records
    #   :datestamp - identifier of this batch; also written to _dt_removed
    #   :idfield   - name of the field holding each record's key
    #   :filename  - stored with the batch record (optional)
    #
    # Raises CorruptDataError if the baseline contains duplicate keys.
    def initialize(args)
      @catalog = args[:catalog]
      @domain = args[:domain]
      @datestamp = args[:datestamp]
      @idfield = args[:idfield]
      @filename = args[:filename]

      # Initialised up front so that a batch which never receives a record can
      # still be resolved, reported on and committed.
      @seen = Hash.new(0)
      @noops = []
      @changes = []
      @adds = []
      @deleted_keys = []
      @updated_field_counts = Hash.new(0)

      load_baseline
    end

    # Reads every stored record of this domain and indexes it by key.
    #
    # Raises CorruptDataError when two stored records share a key.
    def load_baseline
      stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
      @baseline = stored_records.each_with_object({}) do |rec, by_key|
        key = rec[@idfield]
        if by_key.key?(key)
          raise CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
        end
        by_key[key] = rec
      end
    end

    # Returns the baseline record for +key+ without its internal attributes
    # (names starting with an underscore), or nil when the key is not stored.
    def stripped_record_for(key)
      stored = @baseline[key]
      stored && stored.select { |name, _value| name !~ /^_/ }
    end

    # Classifies one incoming record (anything responding to #to_hash) as an
    # add, a change or a no-op relative to the baseline.
    #
    # NOTE(review): a key pushed twice in one batch is counted in @seen but is
    # not rejected, so the record is classified twice — confirm whether input
    # files may legitimately repeat a key.
    def <<(record)
      hash = record.to_hash
      key = hash[@idfield]
      @seen[key] += 1

      current_record = stripped_record_for(key)
      if current_record.nil?
        @adds << hash
      elsif current_record == hash
        @noops << hash
      else
        @changes << hash
      end
    end

    # compute anything needed before we can write updates to permanent store
    # * find records that have been deleted (in the baseline, absent from input)
    # * count, per field name, how many changed records differ in that field
    def resolve
      @deleted_keys = @baseline.keys - @seen.keys

      @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec, counts|
        changed_fields(rec, stripped_record_for(rec[idfield])).each do |field|
          counts[field] += 1
        end
      end
    end

    # Returns a Hash of counters describing this batch. :final_count is the
    # baseline size plus adds; deleted records are not subtracted from it.
    def stats
      resolve
      {
        :baseline_count => @baseline.size,
        :final_count => @baseline.size + @adds.count,
        :adds_count => @adds.count,
        :deletes_count => @deleted_keys.count,
        :updates_count => @changes.count,
        :unchanged_count => @noops.count,
        :updated_fields => @updated_field_counts
      }
    end

    # write the updates to the catalog
    #
    # The batch is recorded first, so a duplicate import raises
    # DuplicateImportError before any catalog record is touched.
    def commit
      resolve
      record_batch
      apply_deletes
      apply_inserts
      apply_updates
    end

    # Stores this batch's domain, datestamp and filename in the batches
    # collection.
    #
    # Raises DuplicateImportError if the same domain/datestamp is already there.
    def record_batch
      if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
        raise DuplicateImportError, "duplicate batch"
      end

      @catalog.batches.insert({
        'domain' => @domain,
        'datestamp' => @datestamp,
        'filename' => @filename
      })
    end

    # Marks each deleted record with this batch's datestamp in _dt_removed;
    # the record itself stays in the catalog.
    def apply_deletes
      @deleted_keys.each do |key|
        @catalog.catalog.update(
          { @catalog.domainkey => domain, idfield => key },
          { "$set" => { :_dt_removed => datestamp } }
        )
      end
    end

    # Inserts the added records in slices of INSERT_BATCH_SIZE.
    def apply_inserts
      @adds.each_slice(INSERT_BATCH_SIZE) do |slice|
        # mongo insert can't handle CSV::Row objects, must be converted to regular hashes
        @catalog.catalog.insert(decorate(slice))
      end
    end

    # Overwrites the fields of each changed record with the incoming values.
    def apply_updates
      @changes.each do |rec|
        @catalog.catalog.update(
          { @catalog.domainkey => domain, idfield => rec[idfield] },
          { "$set" => rec }
        )
      end
    end

    # Returns plain-Hash copies of +records+ tagged with this batch's domain.
    # The given records are left untouched.
    def decorate(records)
      records.map do |r|
        r.to_hash.merge(@catalog.domainkey => @domain)
      end
    end

    private

    # Names of the fields that differ between the incoming and the stored
    # record: keys of +incoming+ whose stored value is different, followed by
    # keys present only in +previous+. This is the key set that ActiveSupport
    # 3.2's Hash#diff returns for incoming.diff(previous).
    def changed_fields(incoming, previous)
      differing = incoming.reject { |name, value| previous[name] == value }.keys
      only_stored = previous.keys.reject { |name| incoming.key?(name) }
      differing | only_stored
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require "yaml"
2
+
3
+ # Records in each catalog acquire the following internal attributes:
4
+ # _id Unique ID, assigned by mongo
5
+ # _[domain] Domain key; its name comes from the 'domain' entry of the catalog's YAML config (a leading underscore is added if missing)
6
+ # _dt_first_seen Batch datestamp reference for when this record was first captured
7
+ # _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
8
+ # _dt_last_update Batch datestamp reference for when this record was most recently altered
9
+ # _dt_removed Batch datestamp reference for when this record was removed from input
10
+ #
11
+ # Inbound records must not have attributes named with leading underscores.
12
+ #
13
+ # A "domain" here is a namespace of identifiers.
14
+
15
module Assimilate
  # A catalog of persistent records plus the collection that tracks which
  # batches have been imported. Both are mongo collections named in a YAML
  # config file.
  class Catalog
    attr_reader :catalog, :batches, :domainkey

    # args - Hash with the key:
    #   :config - path of a YAML file with the entries 'db', 'catalog',
    #             'batch' and 'domain'
    def initialize(args)
      # load_file closes the file once parsed; the previous File.open handle
      # was never closed.
      @config = YAML.load_file(args[:config])

      @db = Mongo::Connection.new.db(@config['db'])
      @catalog = @db.collection(@config['catalog'])
      @batches = @db.collection(@config['batch'])
      @domainkey = @config['domain']
      @domainkey = "_#{@domainkey}" unless @domainkey =~ /^_/ # enforce leading underscore on internal attributes
    end

    # Creates a Batch bound to this catalog. +args+ is passed through to
    # Assimilate::Batch.new with :catalog set to self.
    def start_batch(args)
      Assimilate::Batch.new(args.merge(:catalog => self))
    end

    # Returns the first record matching +params+ without its internal
    # attributes (names starting with an underscore), or nil when nothing
    # matches.
    def where(params)
      record = @catalog.find(params).first
      record && record.select { |name, _value| name !~ /^_/ }
    end

    # Number of records that have no _dt_removed marker.
    def active_count
      @catalog.find("_dt_removed" => nil).count
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace module of the assimilate gem.
module Assimilate
  # Version of the gem; the gemspec reads this constant.
  VERSION = '0.0.1'
end
data/lib/assimilate.rb ADDED
@@ -0,0 +1,11 @@
1
+ require "mongo"
2
+ require "active_support/core_ext" # needed for Hash#diff
3
+
4
+ require_relative "assimilate/version"
5
+
6
+ require_relative "assimilate/catalog"
7
+ require_relative "assimilate/batch"
8
+
9
+ module Assimilate
10
+ # Namespace for the gem; Assimilate::Catalog and Assimilate::Batch are loaded by the requires above.
11
+ end
@@ -0,0 +1,6 @@
1
+ ID,name,title
2
+ 1,George Washington,President
3
+ 2,John Adams,Vice President
4
+ 3,Benjamin Franklin,Sage
5
+ 4,Aaron Burr,Duelist
6
+ 5,Alexander Hamilton,Financier
@@ -0,0 +1,5 @@
1
+ ---
2
+ db: assimilate-test
3
+ catalog: forefathers
4
+ batch: files
5
+ domain: resource
@@ -0,0 +1,6 @@
1
+ ID,name,title
2
+ 1,George Washington,President
3
+ 2,John Adams,Vice President
4
+ 3,Benjamin Franklin,Ambassador
5
+ 5,Alexander Hamilton,Financier
6
+ 7,Thomas Jefferson,Anti-Federalist
@@ -0,0 +1,92 @@
1
+ # batch import tests
2
+
3
+ require "spec_helper"
4
+
5
# End-to-end import specs. They use the catalog configured in
# spec/data/test.yml and the CSV fixtures next to it.
describe "importing file" do
  before(:all) do
    config_path = File.dirname(__FILE__) + "/../data/test.yml"
    @catalog = Assimilate::Catalog.new(:config => config_path)
    reset_catalog
  end

  # Empties both the record collection and the batch collection.
  def reset_catalog
    [@catalog.catalog, @catalog.batches].each { |collection| collection.remove }
  end

  # Runs one complete batch: reads the CSV fixture, feeds every row to a new
  # batch and commits it. Leaves the batch in @batcher and the rows in @records.
  def import_data(datestamp, filename = "batch_input.csv")
    @batcher = @catalog.start_batch(:domain => 'testdata', :datestamp => datestamp, :idfield => 'ID')

    csv_path = File.dirname(__FILE__) + "/../data/#{filename}"
    @records = CSV.read(csv_path, :headers => true)
    @records.each { |row| @batcher << row }
    @batcher.commit
  end

  describe "into empty catalog" do
    before(:each) do
      reset_catalog
      import_data("123")
    end

    it "should return correct import stats" do
      expected = {
        :baseline_count => 0,
        :final_count => 5,
        :adds_count => 5,
        :deletes_count => 0,
        :updates_count => 0,
        :unchanged_count => 0,
        :updated_fields => {}
      }
      @batcher.stats.should == expected
    end

    it "should load the records verbatim" do
      @catalog.catalog.count.should == @records.count

      # spot-check one randomly chosen row
      example = @records[rand(@records.count)]
      found = @catalog.where('_resource' => 'testdata', 'ID' => example['ID'])
      found.should == example.to_hash
    end

    it "should refuse to do a duplicate import" do
      expect { import_data("123") }.to raise_error(Assimilate::DuplicateImportError)
    end

    it "should do all no-ops when importing identical data" do
      expect { import_data("234") }.to_not raise_error

      expected = {
        :baseline_count => 5,
        :final_count => 5,
        :adds_count => 0,
        :deletes_count => 0,
        :updates_count => 0,
        :unchanged_count => 5,
        :updated_fields => {}
      }
      @batcher.stats.should == expected
      @catalog.catalog.count.should == @records.count
    end
  end

  describe "into existing catalog" do
    # seed the catalog once for the whole group ...
    before(:all) do
      reset_catalog
      import_data("123")
    end

    # ... then apply the update file before every example
    before(:each) do
      import_data("345", "updates.csv")
    end

    it "should recognize changes" do
      expected = {
        :baseline_count => 5,
        :final_count => 6,
        :adds_count => 1,
        :deletes_count => 1,
        :updates_count => 1,
        :unchanged_count => 3,
        :updated_fields => { 'title' => 1 }
      }
      @batcher.stats.should == expected
      @catalog.active_count.should == @records.count
    end
  end
end
@@ -0,0 +1,8 @@
1
+ require 'rspec/autorun'
2
+ require "tempfile"
3
+ require "csv"
4
+
5
+ require File.expand_path('../../lib/assimilate', __FILE__)
6
+
7
+ RSpec.configure do |config|
8
+ end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: assimilate
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason May
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-04-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mongo
16
+ requirement: &2157149420 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 1.6.0
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *2157149420
25
+ - !ruby/object:Gem::Dependency
26
+ name: bson_ext
27
+ requirement: &2157148260 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 1.6.0
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *2157148260
36
+ - !ruby/object:Gem::Dependency
37
+ name: activesupport
38
+ requirement: &2157147300 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 3.2.0
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *2157147300
47
+ - !ruby/object:Gem::Dependency
48
+ name: rspec
49
+ requirement: &2157146720 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: 2.9.0
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *2157146720
58
+ - !ruby/object:Gem::Dependency
59
+ name: guard-rspec
60
+ requirement: &2157146120 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ~>
64
+ - !ruby/object:Gem::Version
65
+ version: 0.7.0
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *2157146120
69
+ - !ruby/object:Gem::Dependency
70
+ name: ruby_gntp
71
+ requirement: &2156851680 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 0.3.4
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *2156851680
80
+ description: Ingest updates from CSV and apply to set of hashes
81
+ email:
82
+ - jmay@pobox.com
83
+ executables: []
84
+ extensions: []
85
+ extra_rdoc_files: []
86
+ files:
87
+ - .gitignore
88
+ - Gemfile
89
+ - Guardfile
90
+ - LICENSE
91
+ - README.md
92
+ - Rakefile
93
+ - assimilate.gemspec
94
+ - lib/assimilate.rb
95
+ - lib/assimilate/batch.rb
96
+ - lib/assimilate/catalog.rb
97
+ - lib/assimilate/version.rb
98
+ - spec/data/batch_input.csv
99
+ - spec/data/test.yml
100
+ - spec/data/updates.csv
101
+ - spec/lib/batch_spec.rb
102
+ - spec/spec_helper.rb
103
+ homepage: ''
104
+ licenses: []
105
+ post_install_message:
106
+ rdoc_options: []
107
+ require_paths:
108
+ - lib
109
+ required_ruby_version: !ruby/object:Gem::Requirement
110
+ none: false
111
+ requirements:
112
+ - - ! '>='
113
+ - !ruby/object:Gem::Version
114
+ version: '0'
115
+ required_rubygems_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project: assimilate
123
+ rubygems_version: 1.8.10
124
+ signing_key:
125
+ specification_version: 3
126
+ summary: Review & incorporate changes to a repository of persistent hashes in mongodb.
127
+ test_files:
128
+ - spec/data/batch_input.csv
129
+ - spec/data/test.yml
130
+ - spec/data/updates.csv
131
+ - spec/lib/batch_spec.rb
132
+ - spec/spec_helper.rb
133
+ has_rdoc: