assimilate 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/Guardfile +8 -0
- data/LICENSE +22 -0
- data/README.md +36 -0
- data/Rakefile +8 -0
- data/assimilate.gemspec +26 -0
- data/lib/assimilate/batch.rb +139 -0
- data/lib/assimilate/catalog.rb +39 -0
- data/lib/assimilate/version.rb +3 -0
- data/lib/assimilate.rb +11 -0
- data/spec/data/batch_input.csv +6 -0
- data/spec/data/test.yml +5 -0
- data/spec/data/updates.csv +6 -0
- data/spec/lib/batch_spec.rb +92 -0
- data/spec/spec_helper.rb +8 -0
- metadata +133 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# A sample Guardfile
# More info at https://github.com/guard/guard#readme

# Watch rules for the rspec guard.
guard 'rspec', :version => 2 do
  # Spec files themselves.
  watch(%r{^spec/.+_spec\.rb$})

  # Library files map to a spec path under spec/lib. Only the leading
  # "assimilate/" directory is dropped from the captured path; the pattern is
  # anchored and applied once so that a later "assimilate_..." fragment in a
  # file name is left intact.
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1].sub(%r{\Aassimilate/}, '')}_spec.rb" }

  # The shared spec helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') { "spec" }
end
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Jason May
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Assimilate
|
2
|
+
|
3
|
+
Ingest updates from CSV and apply to set of persistent hashes
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'assimilate'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install assimilate
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
assimilate --config repo.yml filename
|
22
|
+
|
23
|
+
or
|
24
|
+
|
25
|
+
> require 'assimilate'
|
26
|
+
> catalog = Assimilate::Catalog.new(:config => configfile)
|
27
|
+
> catalog.start_batch(:domain => domain, :filename => filename, :datestamp => datestamp, :idfield => idfield)
|
28
|
+
|
29
|
+
## Contributing
|
30
|
+
|
31
|
+
1. Fork it
|
32
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
33
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
34
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
35
|
+
5. Create new Pull Request
|
36
|
+
|
data/Rakefile
ADDED
data/assimilate.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/assimilate/version', __FILE__)

# Gem specification for assimilate: identity, packaged files and dependencies.
Gem::Specification.new do |gem|
  gem.authors = ["Jason May"]
  gem.email = ["jmay@pobox.com"]
  gem.description = %q{Ingest updates from CSV and apply to set of hashes}
  gem.summary = %q{Review & incorporate changes to a repository of persistent hashes in mongodb.}
  gem.homepage = ""
  gem.rubyforge_project = "assimilate"

  # One entry per line of `git ls-files` output. Splitting on "\n" instead of
  # $\ (the output record separator, nil unless reassigned, which makes split
  # break on any whitespace) keeps paths that contain spaces in one piece.
  gem.files = `git ls-files`.split("\n")
  gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.name = "assimilate"
  gem.require_paths = ["lib"]
  gem.version = Assimilate::VERSION

  # Runtime dependencies.
  gem.add_dependency "mongo", "~> 1.6.0"
  gem.add_dependency "bson_ext", "~> 1.6.0"
  gem.add_dependency 'activesupport', "~> 3.2.0"

  # Development-only dependencies.
  gem.add_development_dependency "rspec", "~> 2.9.0"
  gem.add_development_dependency "guard-rspec", "~> 0.7.0"
  gem.add_development_dependency "ruby_gntp", "~> 0.3.4"
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# NOTE(review): assumes Assimilate is a module (as in the usual gem skeleton)
# -- confirm against lib/assimilate/version.rb. Opening the namespace here lets
# this file load regardless of require order.
module Assimilate
  # Raised by Batch#load_baseline when the stored records contain more than one
  # entry for the same key within a domain. No definition of this error is
  # visible next to Batch, so it is declared here unless one already exists.
  unless const_defined?(:CorruptDataError, false)
    class CorruptDataError < StandardError
    end
  end

  # Raised by Batch#record_batch when a batch with the same domain and
  # datestamp has already been recorded.
  class DuplicateImportError < StandardError
  end

  # One import run against a catalog. Incoming records are pushed with #<<,
  # classified against the records already stored for the domain (added,
  # changed, unchanged), and written out by #commit.
  class Batch
    INSERT_BATCH_SIZE = 1000 # default batch size for bulk loading into mongo

    attr_reader :domain, :idfield, :datestamp

    # args - Hash with :catalog (object exposing #catalog, #batches and
    #        #domainkey), :domain, :datestamp, :idfield and :filename.
    #
    # Raises Assimilate::CorruptDataError if the stored records for the domain
    # contain a duplicated key.
    def initialize(args)
      @catalog = args[:catalog]
      @domain = args[:domain]
      @datestamp = args[:datestamp]
      @idfield = args[:idfield]
      @filename = args[:filename]

      # Created up front so #resolve, #stats and #commit also work on a batch
      # that never received a record.
      @seen = Hash.new(0)
      @noops = []
      @changes = []
      @adds = []
      @deletes = []

      load_baseline
    end

    # Reads every stored record for this domain and indexes it by the id field.
    #
    # Raises Assimilate::CorruptDataError when two stored records share a key.
    def load_baseline
      stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
      @baseline = stored_records.each_with_object({}) do |rec, h|
        key = rec[@idfield]
        if h.key?(key)
          raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
        end
        h[key] = rec
      end
    end

    # Returns the stored record for key without its underscore-prefixed
    # (internal) attributes, or nil when the key is not in the baseline.
    def stripped_record_for(key)
      @baseline[key] && @baseline[key].select { |k, _v| k !~ /^_/ }
    end

    # Classifies one incoming record (anything responding to #to_hash) as an
    # add, a change or a no-op, and counts its key as seen.
    def <<(record)
      hash = record.to_hash
      key = hash[@idfield]
      @seen[key] += 1

      current_record = stripped_record_for(key)
      if current_record
        if current_record == hash
          @noops << hash
        else
          @changes << hash
        end
      else
        @adds << hash
      end
    end

    # compute anything needed before we can write updates to permanent store
    # * find records that have been deleted
    # * count, per field, how many changed records touch that field
    def resolve
      @deleted_keys = @baseline.keys - @seen.keys

      @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec, counts|
        changed_fields(rec, stripped_record_for(rec[idfield])).each do |f|
          counts[f] += 1
        end
      end
    end

    # Returns a Hash summarising the batch. :final_count is the baseline size
    # plus the adds; records flagged as removed are not subtracted from it.
    def stats
      resolve
      {
        :baseline_count => @baseline.size,
        :final_count => @baseline.size + @adds.count,
        :adds_count => @adds.count,
        :deletes_count => @deleted_keys.count,
        :updates_count => @changes.count,
        :unchanged_count => @noops.count,
        :updated_fields => @updated_field_counts
      }
    end

    # write the updates to the catalog
    # The steps run one after another; no rollback is performed here if a later
    # step fails.
    def commit
      resolve
      record_batch
      apply_deletes
      apply_inserts
      apply_updates
    end

    # Records this batch in the batches collection.
    #
    # Raises Assimilate::DuplicateImportError when a batch with the same domain
    # and datestamp is already recorded.
    def record_batch
      raise(Assimilate::DuplicateImportError, "duplicate batch") if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
      @catalog.batches.insert({
        'domain' => @domain,
        'datestamp' => @datestamp,
        'filename' => @filename
      })
    end

    # Flags every baseline record missing from the input with the batch
    # datestamp in _dt_removed; the stored record itself is kept.
    def apply_deletes
      @deleted_keys.each do |key|
        @catalog.catalog.update(
          {
            @catalog.domainkey => domain,
            idfield => key
          },
          {"$set" => {:_dt_removed => datestamp}}
        )
      end
    end

    # Inserts the new records in slices of INSERT_BATCH_SIZE.
    def apply_inserts
      @adds.each_slice(INSERT_BATCH_SIZE) do |slice|
        # mongo insert can't handle CSV::Row objects, must be converted to regular hashes
        @catalog.catalog.insert(decorate(slice))
      end
    end

    # Overwrites the fields of each changed record with the incoming values.
    def apply_updates
      @changes.each do |rec|
        @catalog.catalog.update(
          {
            @catalog.domainkey => domain,
            idfield => rec[idfield]
          },
          {"$set" => rec}
        )
      end
    end

    # Returns plain-hash copies of records with the domain key added. The
    # records passed in are left unmodified.
    def decorate(records)
      records.map do |r|
        r.to_hash.merge(@catalog.domainkey => @domain)
      end
    end

    private

    # Names of the fields that differ between an incoming record and its stored
    # counterpart: fields whose incoming value differs from the stored one,
    # followed by stored fields absent from the incoming record.
    def changed_fields(incoming, stored)
      altered = incoming.keys.reject { |f| stored[f] == incoming[f] }
      dropped = stored.keys.reject { |f| incoming.key?(f) }
      altered + dropped
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require "yaml"
|
2
|
+
|
3
|
+
# Records in each catalog acquire the following internal attributes:
|
4
|
+
# _id Unique ID, assigned by mongo
|
5
|
+
# _[domain] Domain key, specified with :domainkey attribute when initializing catalog
|
6
|
+
# _dt_first_seen Batch datestamp reference for when this record was first captured
|
7
|
+
# _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
|
8
|
+
# _dt_last_update Batch datestamp reference for when this record was most recently altered
|
9
|
+
# _dt_removed Batch datestamp reference for when this record was removed from input
|
10
|
+
#
|
11
|
+
# Inbound records must not have attributes named with leading underscores.
|
12
|
+
#
|
13
|
+
# A "domain" here is a namespace of identifiers.
|
14
|
+
|
15
|
+
# NOTE(review): assumes Assimilate is a module (as in the usual gem skeleton)
# -- confirm against lib/assimilate/version.rb.
module Assimilate
  # Access to the mongo collections named in a YAML config file: one holding
  # the catalog records, one holding the record of imported batches.
  class Catalog
    attr_reader :catalog, :batches, :domainkey

    # args - Hash with :config, the path of a YAML file providing the keys
    #        'db', 'catalog', 'batch' and 'domain'.
    def initialize(args)
      # load_file opens and closes the file itself, so no handle is left open.
      @config = YAML.load_file(args[:config])

      @db = Mongo::Connection.new.db(@config['db'])
      @catalog = @db.collection(@config['catalog'])
      @batches = @db.collection(@config['batch'])
      @domainkey = @config['domain']
      @domainkey = "_#{@domainkey}" unless @domainkey =~ /^_/ # enforce leading underscore on internal attributes
    end

    # Starts a Batch bound to this catalog. args is passed through to
    # Assimilate::Batch.new with :catalog set to self.
    def start_batch(args)
      Assimilate::Batch.new(args.merge(:catalog => self))
    end

    # Returns the first record matching params with its underscore-prefixed
    # (internal) attributes removed, or nil when nothing matches.
    def where(params)
      found = @catalog.find(params).first
      found && found.select { |k, _v| k !~ /^_/ }
    end

    # Number of records that carry no _dt_removed value.
    def active_count
      @catalog.find("_dt_removed" => nil).count
    end
  end
end
|
data/lib/assimilate.rb
ADDED
data/spec/data/test.yml
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# batch import tests
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
# End-to-end import specs: CSV fixtures are pushed through a Batch and the
# resulting stats and catalog contents are checked.
describe "importing file" do
  before(:all) do
    @catalog = Assimilate::Catalog.new(:config => File.dirname(__FILE__) + "/../data/test.yml")
    reset_catalog
  end

  # Empties both the catalog and the batches collections.
  def reset_catalog
    @catalog.catalog.remove
    @catalog.batches.remove
  end

  # Reads a CSV fixture from spec/data, feeds every row to a new batch for the
  # 'testdata' domain and commits it. Leaves the batch in @batcher and the
  # parsed rows in @records.
  def import_data(datestamp, filename = "batch_input.csv")
    @batcher = @catalog.start_batch(domain: 'testdata', datestamp: datestamp, idfield: 'ID')

    @records = CSV.read(File.dirname(__FILE__) + "/../data/#{filename}", :headers => true)
    @records.each do |rec|
      @batcher << rec
    end
    @batcher.commit
  end

  describe "into empty catalog" do
    before :each do
      reset_catalog
      import_data("123")
    end

    it "should return correct import stats" do
      @batcher.stats.should == {
        :baseline_count => 0,
        :final_count => 5,
        :adds_count => 5,
        :deletes_count => 0,
        :updates_count => 0,
        :unchanged_count => 0,
        :updated_fields => {}
      }
    end

    it "should load the records verbatim" do
      @catalog.catalog.count.should == @records.count
      # Every imported row is checked (not one picked at random) so that a
      # failure is reproducible from run to run.
      @records.each do |row|
        @catalog.where('_resource' => 'testdata', 'ID' => row['ID']).should == row.to_hash
      end
    end

    it "should refuse to do a duplicate import" do
      lambda {import_data("123")}.should raise_error(Assimilate::DuplicateImportError)
    end

    it "should do all no-ops when importing identical data" do
      lambda {import_data("234")}.should_not raise_error
      @batcher.stats.should == {
        :baseline_count => 5,
        :final_count => 5,
        :adds_count => 0,
        :deletes_count => 0,
        :updates_count => 0,
        :unchanged_count => 5,
        :updated_fields => {}
      }
      @catalog.catalog.count.should == @records.count
    end
  end

  describe "into existing catalog" do
    # Both imports run once for the group: re-importing the fixed "345"
    # datestamp before every example would raise DuplicateImportError as soon
    # as a second example is added.
    before(:all) do
      reset_catalog
      import_data("123")
      import_data("345", "updates.csv")
    end

    it "should recognize changes" do
      @batcher.stats.should == {
        :baseline_count => 5,
        :final_count => 6,
        :adds_count => 1,
        :deletes_count => 1,
        :updates_count => 1,
        :unchanged_count => 3,
        :updated_fields => {'title' => 1}
      }
      @catalog.active_count.should == @records.count
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: assimilate
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jason May
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-04-26 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: mongo
|
16
|
+
requirement: &2157149420 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.6.0
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *2157149420
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bson_ext
|
27
|
+
requirement: &2157148260 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.6.0
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *2157148260
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: activesupport
|
38
|
+
requirement: &2157147300 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 3.2.0
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *2157147300
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rspec
|
49
|
+
requirement: &2157146720 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.9.0
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *2157146720
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: guard-rspec
|
60
|
+
requirement: &2157146120 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ~>
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 0.7.0
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *2157146120
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: ruby_gntp
|
71
|
+
requirement: &2156851680 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ~>
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 0.3.4
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *2156851680
|
80
|
+
description: Ingest updates from CSV and apply to set of hashes
|
81
|
+
email:
|
82
|
+
- jmay@pobox.com
|
83
|
+
executables: []
|
84
|
+
extensions: []
|
85
|
+
extra_rdoc_files: []
|
86
|
+
files:
|
87
|
+
- .gitignore
|
88
|
+
- Gemfile
|
89
|
+
- Guardfile
|
90
|
+
- LICENSE
|
91
|
+
- README.md
|
92
|
+
- Rakefile
|
93
|
+
- assimilate.gemspec
|
94
|
+
- lib/assimilate.rb
|
95
|
+
- lib/assimilate/batch.rb
|
96
|
+
- lib/assimilate/catalog.rb
|
97
|
+
- lib/assimilate/version.rb
|
98
|
+
- spec/data/batch_input.csv
|
99
|
+
- spec/data/test.yml
|
100
|
+
- spec/data/updates.csv
|
101
|
+
- spec/lib/batch_spec.rb
|
102
|
+
- spec/spec_helper.rb
|
103
|
+
homepage: ''
|
104
|
+
licenses: []
|
105
|
+
post_install_message:
|
106
|
+
rdoc_options: []
|
107
|
+
require_paths:
|
108
|
+
- lib
|
109
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
110
|
+
none: false
|
111
|
+
requirements:
|
112
|
+
- - ! '>='
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: '0'
|
115
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
requirements: []
|
122
|
+
rubyforge_project: assimilate
|
123
|
+
rubygems_version: 1.8.10
|
124
|
+
signing_key:
|
125
|
+
specification_version: 3
|
126
|
+
summary: Review & incorporate changes to a repository of persistent hashes in mongodb.
|
127
|
+
test_files:
|
128
|
+
- spec/data/batch_input.csv
|
129
|
+
- spec/data/test.yml
|
130
|
+
- spec/data/updates.csv
|
131
|
+
- spec/lib/batch_spec.rb
|
132
|
+
- spec/spec_helper.rb
|
133
|
+
has_rdoc:
|