assimilate 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -17,3 +17,4 @@ test/version_tmp
17
17
  tmp
18
18
  .rspec
19
19
  .irb_history
20
+ dev.yml
data/bin/assimilate ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative "../lib/assimilate"
4
+
5
+ assimilator = Assimilate::Command.new
6
+ command, options, filenames = assimilator.parse
7
+
8
+ assimilator.execute(command, options, filenames)
@@ -14,6 +14,7 @@ class Assimilate::Batch
14
14
  @changes = []
15
15
  @adds = []
16
16
  @deletes = []
17
+ @resolved = false
17
18
  end
18
19
 
19
20
  def load_baseline
@@ -52,14 +53,18 @@ class Assimilate::Batch
52
53
  # compute anything needed before we can write updates to permanent store
53
54
  # * find records that have been deleted
54
55
  def resolve
55
- @deleted_keys = @baseline.keys - @seen.keys
56
-
57
- @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec,h|
58
- key = rec[idfield]
59
- diffs = rec.diff(stripped_record_for(key))
60
- diffs.keys.each do |f|
61
- h[f] += 1
56
+ if !@resolved
57
+ @deleted_keys = @baseline.keys - @seen.keys
58
+
59
+ @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec,h|
60
+ key = rec[idfield]
61
+ diffs = rec.diff(stripped_record_for(key))
62
+ diffs.keys.each do |f|
63
+ h[f] += 1
64
+ end
62
65
  end
66
+
67
+ @resolved = true
63
68
  end
64
69
  end
65
70
 
@@ -69,6 +74,7 @@ class Assimilate::Batch
69
74
  :baseline_count => @baseline.size,
70
75
  :final_count => @baseline.size + @adds.count,
71
76
  :adds_count => @adds.count,
77
+ :new_ids => @adds.map {|rec| rec[idfield]},
72
78
  :deletes_count => @deleted_keys.count,
73
79
  :updates_count => @changes.count,
74
80
  :unchanged_count => @noops.count,
@@ -86,7 +92,9 @@ class Assimilate::Batch
86
92
  end
87
93
 
88
94
  def record_batch
89
- raise(Assimilate::DuplicateImportError, "duplicate batch") if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
95
+ raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
96
+ raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find('domain' => @domain, 'filename' => @filename).to_a.any?
97
+
90
98
  @catalog.batches.insert({
91
99
  'domain' => @domain,
92
100
  'datestamp' => @datestamp,
@@ -137,3 +145,6 @@ end
137
145
 
138
146
  class Assimilate::DuplicateImportError < StandardError
139
147
  end
148
+
149
+ class Assimilate::CorruptDataError < StandardError
150
+ end
@@ -29,6 +29,10 @@ class Assimilate::Catalog
29
29
  Assimilate::Batch.new(args.merge(:catalog => self))
30
30
  end
31
31
 
32
+ def extend_data(args)
33
+ Assimilate::Extender.new(args.merge(:catalog => self))
34
+ end
35
+
32
36
  def where(params)
33
37
  @catalog.find(params).first.select {|k,v| k !~ /^_/}
34
38
  end
@@ -0,0 +1,117 @@
1
+ require "optparse"
2
+
3
+ class Assimilate::Command
4
+ attr_reader :command, :options
5
+
6
+ def initialize
7
+ @options = {}
8
+ @parser = OptionParser.new do |opts|
9
+ opts.banner = "Usage: assimilate [command] [options]"
10
+
11
+ opts.on("--config FILENAME", String, "Catalog database configuration file") do |f|
12
+ @options[:config] = f
13
+ end
14
+
15
+ opts.on("--id FIELDNAME", String, "Field name to be used for record identifier") do |f|
16
+ @options[:idfield] = f
17
+ end
18
+
19
+ opts.on("--commit", "Commit changes to database") do
20
+ @options[:commit] = true
21
+ end
22
+
23
+ opts.on("--key FIELDNAME", String, "(*extend* only) Hash key to store extended attributes under") do |f|
24
+ @options[:key] = f
25
+ end
26
+
27
+ opts.on("--datestamp DATESTRING", String, "(*load* only) Datestamp to record for file batch operation") do |s|
28
+ @options[:datestamp] = s
29
+ end
30
+
31
+ opts.on("--domain STRING", String, "Domain value to apply to each record") do |s|
32
+ @options[:domain] = s
33
+ end
34
+ end
35
+ end
36
+
37
+ def parse(argv = ARGV)
38
+ @command = argv.shift
39
+ filenames = @parser.parse(argv)
40
+
41
+ raise OptionParser::MissingArgument, "missing config" unless options[:config]
42
+ raise OptionParser::MissingArgument, "missing idfield" unless options[:idfield]
43
+ raise OptionParser::MissingArgument, "missing domain" unless options[:domain]
44
+ raise "missing filename" unless filenames.any?
45
+
46
+ # argv remnants are filenames
47
+ [@command, @options, filenames]
48
+ end
49
+
50
+ def execute(command, options, filenames = nil)
51
+ filename = filenames.first
52
+
53
+ case command
54
+ when 'load'
55
+ raise OptionParser::MissingArgument, "missing datestamp" unless options[:datestamp]
56
+
57
+ results = Assimilate.load(filename, options)
58
+ logmessage(command, options, results)
59
+
60
+ when 'extend'
61
+ raise OptionParser::MissingArgument, "missing keyfield" unless options[:key]
62
+
63
+ results = Assimilate.extend_data(filename, options)
64
+ logmessage(command, options, results)
65
+
66
+
67
+ else
68
+ raise "unknown command #{command}"
69
+ end
70
+ end
71
+
72
+ def logmessage(command, options, results)
73
+ $stderr.puts <<-EOT
74
+ * assimilate #{command} (#{options.keys.join(', ')})
75
+ EOT
76
+
77
+ case command
78
+ when 'load'
79
+ $stderr.puts <<-EOT
80
+ Original record count: #{results[:baseline_count]}
81
+ Final record count: #{results[:final_count]}
82
+ Unchanged records: #{results[:unchanged_count]}
83
+ New records: #{results[:adds_count]} (#{results[:new_ids].take(10).join(',')})
84
+ Deletes: #{results[:deletes_count]}
85
+ Updates: #{results[:updates_count]}
86
+ EOT
87
+ if results[:updated_fields].any?
88
+ $stderr.puts <<-EOT
89
+ Counts by field:
90
+ EOT
91
+ results[:updated_fields].each do |k,v|
92
+ $stderr.puts <<-EOT
93
+ #{k}: #{v}
94
+ EOT
95
+ end
96
+ end
97
+ when 'extend'
98
+ $stderr.puts <<-EOT
99
+ Original record count: #{results[:baseline_count]}
100
+ Final record count: #{results[:final_count]}
101
+ New identifiers: #{results[:adds_count]} #{options[:idfield]} (#{results[:new_ids].take(10).join(',')})
102
+ Distinct ids: #{results[:distinct_ids]}
103
+ Unchanged records: #{results[:unchanged_count]}
104
+ Updates: #{results[:updates_count]}
105
+ EOT
106
+ if results[:updated_fields].any?
107
+ results[:updated_fields].each do |k,v|
108
+ $stderr.puts <<-EOT
109
+ #{options[:key]}.#{k}: #{v}
110
+ EOT
111
+ end
112
+ end
113
+ else
114
+ $stderr.puts results.inspect
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,103 @@
1
+ class Assimilate::Extender
2
+ attr_reader :domain, :idfield, :keyfield
3
+
4
+ def initialize(args)
5
+ @catalog = args[:catalog]
6
+ @domain = args[:domain]
7
+ @idfield = args[:idfield]
8
+ @filename = args[:filename]
9
+ @keyfield = args[:key]
10
+
11
+ load_baseline
12
+
13
+ @noops = []
14
+ @changes = []
15
+ @adds = []
16
+ @deletes = []
17
+ end
18
+
19
+ def load_baseline
20
+ stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
21
+ @baseline = stored_records.each_with_object({}) do |rec, h|
22
+ key = rec[@idfield]
23
+ if h.include?(key)
24
+ raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
25
+ end
26
+ h[key] = rec
27
+ end
28
+ end
29
+
30
+ def <<(record)
31
+ @seen ||= Hash.new(0)
32
+
33
+ hash = record.to_hash
34
+ key = hash[@idfield]
35
+ data = hash.reject {|k,v| k == idfield}
36
+ # @seen[key] = data
37
+ current_record = @baseline[key]
38
+ if current_record
39
+ if current_record[@keyfield] == data
40
+ @noops << key
41
+ @seen[key] = {}
42
+ else
43
+ @changes << key
44
+ @seen[key] = data
45
+ end
46
+ else
47
+ @adds << key
48
+ @seen[key] = data
49
+ end
50
+ end
51
+
52
+ def stats
53
+ {
54
+ :baseline_count => @baseline.size,
55
+ :final_count => @baseline.size + @adds.count,
56
+ :distinct_ids => @seen.size,
57
+ :adds_count => @adds.count,
58
+ :new_ids => @adds,
59
+ :updates_count => @changes.count,
60
+ :updated_fields => @seen.each_with_object(Hash.new(0)) {|(k,hash),memo| hash.each {|k,v| memo[k] += 1}},
61
+ :unchanged_count => @noops.count
62
+ }
63
+ end
64
+
65
+ # write all the changes to the catalog
66
+ def commit
67
+ apply_inserts
68
+ apply_updates
69
+ end
70
+
71
+ # an "insert" here means a record for which we have extended data
72
+ # but does not appear in the current catalog, so we need to create
73
+ # a stub entry.
74
+ def apply_inserts
75
+ @adds.each do |key|
76
+ data = @seen[key]
77
+ @catalog.catalog.insert(
78
+ @catalog.domainkey => domain,
79
+ idfield => key,
80
+ keyfield => data
81
+ )
82
+ end
83
+ end
84
+
85
+ # "update" means store the extended data in the record (which must exist)
86
+ def apply_updates
87
+ @changes.each do |key|
88
+ data = @seen[key]
89
+ @catalog.catalog.update(
90
+ {
91
+ @catalog.domainkey => domain,
92
+ idfield => key
93
+ },
94
+ {"$set" => {
95
+ keyfield => data
96
+ }
97
+ }
98
+ )
99
+ end
100
+ end
101
+
102
+
103
+ end
@@ -1,3 +1,3 @@
1
1
  module Assimilate
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/assimilate.rb CHANGED
@@ -1,11 +1,52 @@
1
1
  require "mongo"
2
2
  require "active_support/core_ext" # needed for Hash#diff
3
+ require "csv"
3
4
 
4
5
  require_relative "assimilate/version"
5
6
 
6
7
  require_relative "assimilate/catalog"
7
8
  require_relative "assimilate/batch"
9
+ require_relative "assimilate/extender"
10
+
11
+ require_relative "assimilate/command"
8
12
 
9
13
  module Assimilate
10
- # Your code goes here...
14
+ def self.load(filename, opts = {})
15
+ begin
16
+ catalog = Catalog.new(:config => opts[:config])
17
+ batcher = catalog.start_batch(opts.merge(:filename => filename))
18
+
19
+ records = CSV.read(filename, :headers => true)
20
+ records.each do |rec|
21
+ batcher << rec
22
+ end
23
+ if opts[:commit]
24
+ batcher.commit
25
+ else
26
+ $stderr.puts "suppressing data commit"
27
+ end
28
+ batcher.stats
29
+ # TODO explicit handling for Assimilate exceptions - when code is stable
30
+ # rescue Assimilate::DuplicateImportError => e
31
+ # $stderr.puts e.message
32
+ # exit 1
33
+ end
34
+ end
35
+
36
+ def self.extend_data(filename, opts = {})
37
+ begin
38
+ catalog = Catalog.new(:config => opts[:config])
39
+ extender = catalog.extend_data(opts)
40
+ records = CSV.read(filename, :headers => true)
41
+ records.each do |rec|
42
+ extender << rec
43
+ end
44
+ if opts[:commit]
45
+ extender.commit
46
+ else
47
+ $stderr.puts "suppressing data commit"
48
+ end
49
+ extender.stats
50
+ end
51
+ end
11
52
  end
@@ -4,3 +4,4 @@ ID,name,title
4
4
  3,Benjamin Franklin,Sage
5
5
  4,Aaron Burr,Duelist
6
6
  5,Alexander Hamilton,Financier
7
+ 6,James Madison,Theorist
@@ -0,0 +1,5 @@
1
+ ID,date
2
+ 1,1789/04/30
3
+ 2,1797/03/04
4
+ 6,1809/03/04
5
+ 16,1861/03/04
@@ -0,0 +1,7 @@
1
+ ID,name,title
2
+ 1,George Washington,President
3
+ 2,John Adams,Vice President
4
+ 3,Benjamin Franklin,Sage
5
+ 4,Aaron Burr,Duelist
6
+ 5,Alexander Hamilton,Financier
7
+ 6,James Madison,Theorist
@@ -14,7 +14,7 @@ describe "importing file" do
14
14
  end
15
15
 
16
16
  def import_data(datestamp, filename = "batch_input.csv")
17
- @batcher = @catalog.start_batch(domain: 'testdata', datestamp: datestamp, idfield: 'ID')
17
+ @batcher = @catalog.start_batch(domain: 'testdata', datestamp: datestamp, filename: filename, idfield: 'ID')
18
18
 
19
19
  @records = CSV.read(File.dirname(__FILE__) + "/../data/#{filename}", :headers => true)
20
20
  @records.each do |rec|
@@ -32,8 +32,9 @@ describe "importing file" do
32
32
  it "should return correct import stats" do
33
33
  @batcher.stats.should == {
34
34
  :baseline_count => 0,
35
- :final_count => 5,
36
- :adds_count => 5,
35
+ :final_count => 6,
36
+ :adds_count => 6,
37
+ :new_ids => ["1", "2", "3", "4", "5", "6"],
37
38
  :deletes_count => 0,
38
39
  :updates_count => 0,
39
40
  :unchanged_count => 0,
@@ -51,15 +52,20 @@ describe "importing file" do
51
52
  lambda {import_data("123")}.should raise_error(Assimilate::DuplicateImportError)
52
53
  end
53
54
 
55
+ it "should refuse to re-import same file" do
56
+ lambda {import_data("234")}.should raise_error(Assimilate::DuplicateImportError)
57
+ end
58
+
54
59
  it "should do all no-ops when importing identical data" do
55
- lambda {import_data("234")}.should_not raise_error
60
+ lambda {import_data("234", "duplicate_input.csv")}.should_not raise_error
56
61
  @batcher.stats.should == {
57
- :baseline_count => 5,
58
- :final_count => 5,
62
+ :baseline_count => 6,
63
+ :final_count => 6,
59
64
  :adds_count => 0,
65
+ :new_ids => [],
60
66
  :deletes_count => 0,
61
67
  :updates_count => 0,
62
- :unchanged_count => 5,
68
+ :unchanged_count => 6,
63
69
  :updated_fields => {}
64
70
  }
65
71
  @catalog.catalog.count.should == @records.count
@@ -78,10 +84,11 @@ describe "importing file" do
78
84
 
79
85
  it "should recognize changes" do
80
86
  @batcher.stats.should == {
81
- :baseline_count => 5,
82
- :final_count => 6,
87
+ :baseline_count => 6,
88
+ :final_count => 7,
83
89
  :adds_count => 1,
84
- :deletes_count => 1,
90
+ :new_ids => ["7"],
91
+ :deletes_count => 2,
85
92
  :updates_count => 1,
86
93
  :unchanged_count => 3,
87
94
  :updated_fields => {'title' => 1}
@@ -0,0 +1,77 @@
1
+ # tests for extending the base records
2
+
3
+ require "spec_helper"
4
+
5
+ describe "loading extended data" do
6
+ before(:all) do
7
+ @catalog = Assimilate::Catalog.new(:config => File.dirname(__FILE__) + "/../data/test.yml")
8
+ reset_catalog
9
+ end
10
+
11
+ def reset_catalog
12
+ @catalog.catalog.remove
13
+ @catalog.batches.remove
14
+ end
15
+
16
+ def import_base_data(datestamp, filename = "batch_input.csv")
17
+ @batcher = @catalog.start_batch(domain: 'testdata', datestamp: datestamp, idfield: 'ID')
18
+
19
+ @records = CSV.read(File.dirname(__FILE__) + "/../data/#{filename}", :headers => true)
20
+ @records.each do |rec|
21
+ @batcher << rec
22
+ end
23
+ @batcher.commit
24
+ end
25
+
26
+ describe "into matching catalog entries" do
27
+ before(:all) do
28
+ reset_catalog
29
+ import_base_data("123")
30
+ end
31
+
32
+ def import_extended_data(datestamp, filename)
33
+ @extender = @catalog.extend_data(domain: 'testdata', datastamp: datestamp, idfield: 'ID', key: 'inauguration')
34
+ @records = CSV.read(File.dirname(__FILE__) + "/../data/#{filename}", :headers => true)
35
+ @records.each do |rec|
36
+ @extender << rec
37
+ end
38
+ @extender.commit
39
+ end
40
+
41
+ before(:each) do
42
+ import_extended_data("1001", "dates.csv")
43
+ end
44
+
45
+ it "should capture changes" do
46
+ @extender.stats.should == {
47
+ :baseline_count => 6,
48
+ :final_count => 7,
49
+ :distinct_ids => 4,
50
+ :adds_count => 1,
51
+ :new_ids => ['16'],
52
+ :updates_count => 3,
53
+ :updated_fields => {'date' => 4},
54
+ :unchanged_count => 0
55
+ }
56
+ end
57
+
58
+ it "should do no-ops on duplicate load" do
59
+ # import_extended_data("1002", "dates")
60
+ lambda {import_extended_data("1002", "dates.csv")}.should_not raise_error
61
+
62
+ @extender.stats.should == {
63
+ :baseline_count => 7,
64
+ :final_count => 7,
65
+ :distinct_ids => 4,
66
+ :adds_count => 0,
67
+ :new_ids => [],
68
+ :updates_count => 0,
69
+ :updated_fields => {},
70
+ :unchanged_count => 4
71
+ }
72
+ end
73
+ end
74
+
75
+ # test handling of multiple records for same ID in the extended-data file
76
+ # test importing data at top level (no keyfield for sub-attributes)
77
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: assimilate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-26 00:00:00.000000000 Z
12
+ date: 2012-05-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mongo
16
- requirement: &2157149420 !ruby/object:Gem::Requirement
16
+ requirement: &2151862180 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.6.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2157149420
24
+ version_requirements: *2151862180
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: bson_ext
27
- requirement: &2157148260 !ruby/object:Gem::Requirement
27
+ requirement: &2151861480 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.6.0
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2157148260
35
+ version_requirements: *2151861480
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activesupport
38
- requirement: &2157147300 !ruby/object:Gem::Requirement
38
+ requirement: &2151860560 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 3.2.0
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *2157147300
46
+ version_requirements: *2151860560
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rspec
49
- requirement: &2157146720 !ruby/object:Gem::Requirement
49
+ requirement: &2151859660 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 2.9.0
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2157146720
57
+ version_requirements: *2151859660
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: guard-rspec
60
- requirement: &2157146120 !ruby/object:Gem::Requirement
60
+ requirement: &2151856200 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 0.7.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2157146120
68
+ version_requirements: *2151856200
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: ruby_gntp
71
- requirement: &2156851680 !ruby/object:Gem::Requirement
71
+ requirement: &2151855040 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,11 +76,12 @@ dependencies:
76
76
  version: 0.3.4
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *2156851680
79
+ version_requirements: *2151855040
80
80
  description: Ingest updates from CSV and apply to set of hashes
81
81
  email:
82
82
  - jmay@pobox.com
83
- executables: []
83
+ executables:
84
+ - assimilate
84
85
  extensions: []
85
86
  extra_rdoc_files: []
86
87
  files:
@@ -91,14 +92,20 @@ files:
91
92
  - README.md
92
93
  - Rakefile
93
94
  - assimilate.gemspec
95
+ - bin/assimilate
94
96
  - lib/assimilate.rb
95
97
  - lib/assimilate/batch.rb
96
98
  - lib/assimilate/catalog.rb
99
+ - lib/assimilate/command.rb
100
+ - lib/assimilate/extender.rb
97
101
  - lib/assimilate/version.rb
98
102
  - spec/data/batch_input.csv
103
+ - spec/data/dates.csv
104
+ - spec/data/duplicate_input.csv
99
105
  - spec/data/test.yml
100
106
  - spec/data/updates.csv
101
107
  - spec/lib/batch_spec.rb
108
+ - spec/lib/extend_spec.rb
102
109
  - spec/spec_helper.rb
103
110
  homepage: ''
104
111
  licenses: []
@@ -126,8 +133,11 @@ specification_version: 3
126
133
  summary: Review & incorporate changes to a repository of persistent hashes in mongodb.
127
134
  test_files:
128
135
  - spec/data/batch_input.csv
136
+ - spec/data/dates.csv
137
+ - spec/data/duplicate_input.csv
129
138
  - spec/data/test.yml
130
139
  - spec/data/updates.csv
131
140
  - spec/lib/batch_spec.rb
141
+ - spec/lib/extend_spec.rb
132
142
  - spec/spec_helper.rb
133
143
  has_rdoc: