assimilate 0.0.1 → 0.0.2

data/.gitignore CHANGED
@@ -17,3 +17,4 @@ test/version_tmp
  tmp
  .rspec
  .irb_history
+ dev.yml
data/bin/assimilate ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env ruby
+
+ require_relative "../lib/assimilate"
+
+ assimilator = Assimilate::Command.new
+ command, options, filenames = assimilator.parse
+
+ assimilator.execute(command, options, filenames)
data/lib/assimilate/batch.rb CHANGED
@@ -14,6 +14,7 @@ class Assimilate::Batch
      @changes = []
      @adds = []
      @deletes = []
+     @resolved = false
    end

    def load_baseline
@@ -52,14 +53,18 @@ class Assimilate::Batch
    # compute anything needed before we can write updates to permanent store
    # * find records that have been deleted
    def resolve
-     @deleted_keys = @baseline.keys - @seen.keys
-
-     @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec,h|
-       key = rec[idfield]
-       diffs = rec.diff(stripped_record_for(key))
-       diffs.keys.each do |f|
-         h[f] += 1
+     if !@resolved
+       @deleted_keys = @baseline.keys - @seen.keys
+
+       @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec,h|
+         key = rec[idfield]
+         diffs = rec.diff(stripped_record_for(key))
+         diffs.keys.each do |f|
+           h[f] += 1
+         end
        end
+
+       @resolved = true
      end
    end

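The new `@resolved` flag makes `resolve` idempotent, so `stats` and `commit` can each trigger it without recomputing the deleted-key set and per-field change counts. A minimal sketch of the same compute-once guard, with hypothetical names (not from the gem):

    class SummaryReport
      def initialize(items)
        @items = items
        @resolved = false
      end

      def resolve
        if !@resolved                     # second and later calls are no-ops
          @total = @items.inject(0, :+)   # the expensive part runs once
          @resolved = true
        end
      end

      attr_reader :total
    end

This is plain latching, not thread-safe memoization; for a single-threaded CLI batch run that is enough.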
@@ -69,6 +74,7 @@ class Assimilate::Batch
      :baseline_count => @baseline.size,
      :final_count => @baseline.size + @adds.count,
      :adds_count => @adds.count,
+     :new_ids => @adds.map {|rec| rec[idfield]},
      :deletes_count => @deleted_keys.count,
      :updates_count => @changes.count,
      :unchanged_count => @noops.count,
@@ -86,7 +92,9 @@ class Assimilate::Batch
    end

    def record_batch
-     raise(Assimilate::DuplicateImportError, "duplicate batch") if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
+     raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
+     raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find('domain' => @domain, 'filename' => @filename).to_a.any?
+
      @catalog.batches.insert({
        'domain' => @domain,
        'datestamp' => @datestamp,
@@ -137,3 +145,6 @@ end

  class Assimilate::DuplicateImportError < StandardError
  end
+
+ class Assimilate::CorruptDataError < StandardError
+ end
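Together these changes tighten the guards in Assimilate::Batch: `record_batch` now rejects a batch whose datestamp or source filename has already been recorded for the domain, and the new CorruptDataError signals a baseline that already holds two records with the same id. A sketch of how a caller might trap them; the rescue wiring is illustrative, since the gem's own rescue block is still commented out in lib/assimilate.rb (shown later in this diff):

    begin
      results = Assimilate.load("people.csv", options)
    rescue Assimilate::DuplicateImportError => e
      $stderr.puts e.message   # e.g. "duplicate batch for file people.csv"
    rescue Assimilate::CorruptDataError => e
      $stderr.puts e.message   # baseline held two records with one id
    end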
data/lib/assimilate/catalog.rb CHANGED
@@ -29,6 +29,10 @@ class Assimilate::Catalog
      Assimilate::Batch.new(args.merge(:catalog => self))
    end

+   def extend_data(args)
+     Assimilate::Extender.new(args.merge(:catalog => self))
+   end
+
    def where(params)
      @catalog.find(params).first.select {|k,v| k !~ /^_/}
    end
data/lib/assimilate/command.rb ADDED
@@ -0,0 +1,117 @@
+ require "optparse"
+
+ class Assimilate::Command
+   attr_reader :command, :options
+
+   def initialize
+     @options = {}
+     @parser = OptionParser.new do |opts|
+       opts.banner = "Usage: assimilate [command] [options]"
+
+       opts.on("--config FILENAME", String, "Catalog database configuration file") do |f|
+         @options[:config] = f
+       end
+
+       opts.on("--id FIELDNAME", String, "Field name to be used for record identifier") do |f|
+         @options[:idfield] = f
+       end
+
+       opts.on("--commit", "Commit changes to database") do
+         @options[:commit] = true
+       end
+
+       opts.on("--key FIELDNAME", String, "(*extend* only) Hash key to store extended attributes under") do |f|
+         @options[:key] = f
+       end
+
+       opts.on("--datestamp DATESTRING", String, "(*load* only) Datestamp to record for file batch operation") do |s|
+         @options[:datestamp] = s
+       end
+
+       opts.on("--domain STRING", String, "Domain value to apply to each record") do |s|
+         @options[:domain] = s
+       end
+     end
+   end
+
+   def parse(argv = ARGV)
+     @command = argv.shift
+     filenames = @parser.parse(argv)
+
+     raise OptionParser::MissingArgument, "missing config" unless options[:config]
+     raise OptionParser::MissingArgument, "missing idfield" unless options[:idfield]
+     raise OptionParser::MissingArgument, "missing domain" unless options[:domain]
+     raise "missing filename" unless filenames.any?
+
+     # argv remnants are filenames
+     [@command, @options, filenames]
+   end
+
+   def execute(command, options, filenames = nil)
+     filename = filenames.first
+
+     case command
+     when 'load'
+       raise OptionParser::MissingArgument, "missing datestamp" unless options[:datestamp]
+
+       results = Assimilate.load(filename, options)
+       logmessage(command, options, results)
+
+     when 'extend'
+       raise OptionParser::MissingArgument, "missing keyfield" unless options[:key]
+
+       results = Assimilate.extend_data(filename, options)
+       logmessage(command, options, results)
+
+
+     else
+       raise "unknown command #{command}"
+     end
+   end
+
+   def logmessage(command, options, results)
+     $stderr.puts <<-EOT
+ * assimilate #{command} (#{options.keys.join(', ')})
+ EOT
+
+     case command
+     when 'load'
+       $stderr.puts <<-EOT
+ Original record count: #{results[:baseline_count]}
+ Final record count: #{results[:final_count]}
+ Unchanged records: #{results[:unchanged_count]}
+ New records: #{results[:adds_count]} (#{results[:new_ids].take(10).join(',')})
+ Deletes: #{results[:deletes_count]}
+ Updates: #{results[:updates_count]}
+ EOT
+       if results[:updated_fields].any?
+         $stderr.puts <<-EOT
+ Counts by field:
+ EOT
+         results[:updated_fields].each do |k,v|
+           $stderr.puts <<-EOT
+ #{k}: #{v}
+ EOT
+         end
+       end
+     when 'extend'
+       $stderr.puts <<-EOT
+ Original record count: #{results[:baseline_count]}
+ Final record count: #{results[:final_count]}
+ New identifiers: #{results[:adds_count]} #{options[:idfield]} (#{results[:new_ids].take(10).join(',')})
+ Distinct ids: #{results[:distinct_ids]}
+ Unchanged records: #{results[:unchanged_count]}
+ Updates: #{results[:updates_count]}
+ EOT
+       if results[:updated_fields].any?
+         results[:updated_fields].each do |k,v|
+           $stderr.puts <<-EOT
+ #{options[:key]}.#{k}: #{v}
+ EOT
+         end
+       end
+     else
+       $stderr.puts results.inspect
+     end
+   end
+ end
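Assimilate::Command splits parsing from execution, so the bin/assimilate script above stays a three-liner and the parsed pieces can be inspected or tested in isolation. A sketch of driving it programmatically; the option values and filename are made up for illustration:

    require "assimilate"

    argv = %w[load --config catalog.yml --id ID --domain testdata --datestamp 20120502 people.csv]
    assimilator = Assimilate::Command.new
    command, options, filenames = assimilator.parse(argv)
    # command   => "load"
    # options   => {:config=>"catalog.yml", :idfield=>"ID", :domain=>"testdata", :datestamp=>"20120502"}
    # filenames => ["people.csv"]
    assimilator.execute(command, options, filenames)

Without --commit this is a dry run: Assimilate.load prints "suppressing data commit" and only the stats are reported.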
data/lib/assimilate/extender.rb ADDED
@@ -0,0 +1,103 @@
+ class Assimilate::Extender
+   attr_reader :domain, :idfield, :keyfield
+
+   def initialize(args)
+     @catalog = args[:catalog]
+     @domain = args[:domain]
+     @idfield = args[:idfield]
+     @filename = args[:filename]
+     @keyfield = args[:key]
+
+     load_baseline
+
+     @noops = []
+     @changes = []
+     @adds = []
+     @deletes = []
+   end
+
+   def load_baseline
+     stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
+     @baseline = stored_records.each_with_object({}) do |rec, h|
+       key = rec[@idfield]
+       if h.include?(key)
+         raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
+       end
+       h[key] = rec
+     end
+   end
+
+   def <<(record)
+     @seen ||= Hash.new(0)
+
+     hash = record.to_hash
+     key = hash[@idfield]
+     data = hash.reject {|k,v| k == idfield}
+     # @seen[key] = data
+     current_record = @baseline[key]
+     if current_record
+       if current_record[@keyfield] == data
+         @noops << key
+         @seen[key] = {}
+       else
+         @changes << key
+         @seen[key] = data
+       end
+     else
+       @adds << key
+       @seen[key] = data
+     end
+   end
+
+   def stats
+     {
+       :baseline_count => @baseline.size,
+       :final_count => @baseline.size + @adds.count,
+       :distinct_ids => @seen.size,
+       :adds_count => @adds.count,
+       :new_ids => @adds,
+       :updates_count => @changes.count,
+       :updated_fields => @seen.each_with_object(Hash.new(0)) {|(k,hash),memo| hash.each {|k,v| memo[k] += 1}},
+       :unchanged_count => @noops.count
+     }
+   end
+
+   # write all the changes to the catalog
+   def commit
+     apply_inserts
+     apply_updates
+   end
+
+   # an "insert" here means a record for which we have extended data
+   # but does not appear in the current catalog, so we need to create
+   # a stub entry.
+   def apply_inserts
+     @adds.each do |key|
+       data = @seen[key]
+       @catalog.catalog.insert(
+         @catalog.domainkey => domain,
+         idfield => key,
+         keyfield => data
+       )
+     end
+   end
+
+   # "update" means store the extended data in the record (which must exist)
+   def apply_updates
+     @changes.each do |key|
+       data = @seen[key]
+       @catalog.catalog.update(
+         {
+           @catalog.domainkey => domain,
+           idfield => key
+         },
+         {"$set" => {
+           keyfield => data
+         }
+         }
+       )
+     end
+   end
+
+
+ end
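For each CSV row, the Extender compares the row's non-id columns against the sub-hash stored under the --key field of the matching catalog record, then buckets the id as a no-op, change, or add (adds become stub records via apply_inserts). A usage sketch; the config path and data mirror the specs below but are illustrative:

    catalog = Assimilate::Catalog.new(:config => "catalog.yml")
    extender = catalog.extend_data(:domain => "testdata", :idfield => "ID", :key => "inauguration")

    CSV.read("dates.csv", :headers => true).each {|row| extender << row}
    extender.commit   # e.g. record ID=1 gains "inauguration" => {"date" => "1789/04/30"}
    extender.stats[:updates_count]

Note the symmetry with Assimilate::Batch: both consume CSV::Row objects via << and report through the same stats interface.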
data/lib/assimilate/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Assimilate
-   VERSION = "0.0.1"
+   VERSION = "0.0.2"
  end
data/lib/assimilate.rb CHANGED
@@ -1,11 +1,52 @@
  require "mongo"
  require "active_support/core_ext" # needed for Hash#diff
+ require "csv"

  require_relative "assimilate/version"

  require_relative "assimilate/catalog"
  require_relative "assimilate/batch"
+ require_relative "assimilate/extender"
+
+ require_relative "assimilate/command"

  module Assimilate
-   # Your code goes here...
+   def self.load(filename, opts = {})
+     begin
+       catalog = Catalog.new(:config => opts[:config])
+       batcher = catalog.start_batch(opts.merge(:filename => filename))
+
+       records = CSV.read(filename, :headers => true)
+       records.each do |rec|
+         batcher << rec
+       end
+       if opts[:commit]
+         batcher.commit
+       else
+         $stderr.puts "suppressing data commit"
+       end
+       batcher.stats
+     # TODO explicit handling for Assimilate exceptions - when code is stable
+     # rescue Assimilate::DuplicateImportError => e
+     #   $stderr.puts e.message
+     #   exit 1
+     end
+   end
+
+   def self.extend_data(filename, opts = {})
+     begin
+       catalog = Catalog.new(:config => opts[:config])
+       extender = catalog.extend_data(opts)
+       records = CSV.read(filename, :headers => true)
+       records.each do |rec|
+         extender << rec
+       end
+       if opts[:commit]
+         extender.commit
+       else
+         $stderr.puts "suppressing data commit"
+       end
+       extender.stats
+     end
+   end
  end
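Assimilate.load and Assimilate.extend_data wrap the full cycle: open the catalog from a config file, stream the CSV through a Batch or Extender, commit only when :commit is set, and return the stats hash that Command#logmessage reports from. A dry-run sketch (file paths are illustrative):

    stats = Assimilate.load("people.csv",
      :config    => "catalog.yml",
      :idfield   => "ID",
      :domain    => "testdata",
      :datestamp => "20120502")    # no :commit, so nothing is persisted
    stats[:adds_count]             # how many new ids the file would add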
data/spec/data/batch_input.csv CHANGED
@@ -4,3 +4,4 @@ ID,name,title
  3,Benjamin Franklin,Sage
  4,Aaron Burr,Duelist
  5,Alexander Hamilton,Financier
+ 6,James Madison,Theorist
data/spec/data/dates.csv ADDED
@@ -0,0 +1,5 @@
+ ID,date
+ 1,1789/04/30
+ 2,1797/03/04
+ 6,1809/03/04
+ 16,1861/03/04
data/spec/data/duplicate_input.csv ADDED
@@ -0,0 +1,7 @@
+ ID,name,title
+ 1,George Washington,President
+ 2,John Adams,Vice President
+ 3,Benjamin Franklin,Sage
+ 4,Aaron Burr,Duelist
+ 5,Alexander Hamilton,Financier
+ 6,James Madison,Theorist
data/spec/lib/batch_spec.rb CHANGED
@@ -14,7 +14,7 @@ describe "importing file" do
    end

    def import_data(datestamp, filename = "batch_input.csv")
-     @batcher = @catalog.start_batch(domain: 'testdata', datestamp: datestamp, idfield: 'ID')
+     @batcher = @catalog.start_batch(domain: 'testdata', datestamp: datestamp, filename: filename, idfield: 'ID')

      @records = CSV.read(File.dirname(__FILE__) + "/../data/#{filename}", :headers => true)
      @records.each do |rec|
@@ -32,8 +32,9 @@ describe "importing file" do
    it "should return correct import stats" do
      @batcher.stats.should == {
        :baseline_count => 0,
-       :final_count => 5,
-       :adds_count => 5,
+       :final_count => 6,
+       :adds_count => 6,
+       :new_ids => ["1", "2", "3", "4", "5", "6"],
        :deletes_count => 0,
        :updates_count => 0,
        :unchanged_count => 0,
@@ -51,15 +52,20 @@ describe "importing file" do
      lambda {import_data("123")}.should raise_error(Assimilate::DuplicateImportError)
    end

+   it "should refuse to re-import same file" do
+     lambda {import_data("234")}.should raise_error(Assimilate::DuplicateImportError)
+   end
+
    it "should do all no-ops when importing identical data" do
-     lambda {import_data("234")}.should_not raise_error
+     lambda {import_data("234", "duplicate_input.csv")}.should_not raise_error
      @batcher.stats.should == {
-       :baseline_count => 5,
-       :final_count => 5,
+       :baseline_count => 6,
+       :final_count => 6,
        :adds_count => 0,
+       :new_ids => [],
        :deletes_count => 0,
        :updates_count => 0,
-       :unchanged_count => 5,
+       :unchanged_count => 6,
        :updated_fields => {}
      }
      @catalog.catalog.count.should == @records.count
@@ -78,10 +84,11 @@ describe "importing file" do

    it "should recognize changes" do
      @batcher.stats.should == {
-       :baseline_count => 5,
-       :final_count => 6,
+       :baseline_count => 6,
+       :final_count => 7,
        :adds_count => 1,
-       :deletes_count => 1,
+       :new_ids => ["7"],
+       :deletes_count => 2,
        :updates_count => 1,
        :unchanged_count => 3,
        :updated_fields => {'title' => 1}
data/spec/lib/extend_spec.rb ADDED
@@ -0,0 +1,77 @@
+ # tests for extending the base records
+
+ require "spec_helper"
+
+ describe "loading extended data" do
+   before(:all) do
+     @catalog = Assimilate::Catalog.new(:config => File.dirname(__FILE__) + "/../data/test.yml")
+     reset_catalog
+   end
+
+   def reset_catalog
+     @catalog.catalog.remove
+     @catalog.batches.remove
+   end
+
+   def import_base_data(datestamp, filename = "batch_input.csv")
+     @batcher = @catalog.start_batch(domain: 'testdata', datestamp: datestamp, idfield: 'ID')
+
+     @records = CSV.read(File.dirname(__FILE__) + "/../data/#{filename}", :headers => true)
+     @records.each do |rec|
+       @batcher << rec
+     end
+     @batcher.commit
+   end
+
+   describe "into matching catalog entries" do
+     before(:all) do
+       reset_catalog
+       import_base_data("123")
+     end
+
+     def import_extended_data(datestamp, filename)
+       @extender = @catalog.extend_data(domain: 'testdata', datastamp: datestamp, idfield: 'ID', key: 'inauguration')
+       @records = CSV.read(File.dirname(__FILE__) + "/../data/#{filename}", :headers => true)
+       @records.each do |rec|
+         @extender << rec
+       end
+       @extender.commit
+     end
+
+     before(:each) do
+       import_extended_data("1001", "dates.csv")
+     end
+
+     it "should capture changes" do
+       @extender.stats.should == {
+         :baseline_count => 6,
+         :final_count => 7,
+         :distinct_ids => 4,
+         :adds_count => 1,
+         :new_ids => ['16'],
+         :updates_count => 3,
+         :updated_fields => {'date' => 4},
+         :unchanged_count => 0
+       }
+     end
+
+     it "should do no-ops on duplicate load" do
+       # import_extended_data("1002", "dates")
+       lambda {import_extended_data("1002", "dates.csv")}.should_not raise_error
+
+       @extender.stats.should == {
+         :baseline_count => 7,
+         :final_count => 7,
+         :distinct_ids => 4,
+         :adds_count => 0,
+         :new_ids => [],
+         :updates_count => 0,
+         :updated_fields => {},
+         :unchanged_count => 4
+       }
+     end
+   end
+
+   # test handling of multiple records for same ID in the extended-data file
+   # test importing data at top level (no keyfield for sub-attributes)
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: assimilate
  version: !ruby/object:Gem::Version
-   version: 0.0.1
+   version: 0.0.2
  prerelease:
  platform: ruby
  authors:
@@ -9,11 +9,11 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-04-26 00:00:00.000000000 Z
+ date: 2012-05-02 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: mongo
-   requirement: &2157149420 !ruby/object:Gem::Requirement
+   requirement: &2151862180 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -21,10 +21,10 @@ dependencies:
          version: 1.6.0
    type: :runtime
    prerelease: false
-   version_requirements: *2157149420
+   version_requirements: *2151862180
  - !ruby/object:Gem::Dependency
    name: bson_ext
-   requirement: &2157148260 !ruby/object:Gem::Requirement
+   requirement: &2151861480 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -32,10 +32,10 @@ dependencies:
          version: 1.6.0
    type: :runtime
    prerelease: false
-   version_requirements: *2157148260
+   version_requirements: *2151861480
  - !ruby/object:Gem::Dependency
    name: activesupport
-   requirement: &2157147300 !ruby/object:Gem::Requirement
+   requirement: &2151860560 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -43,10 +43,10 @@ dependencies:
          version: 3.2.0
    type: :runtime
    prerelease: false
-   version_requirements: *2157147300
+   version_requirements: *2151860560
  - !ruby/object:Gem::Dependency
    name: rspec
-   requirement: &2157146720 !ruby/object:Gem::Requirement
+   requirement: &2151859660 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -54,10 +54,10 @@ dependencies:
          version: 2.9.0
    type: :development
    prerelease: false
-   version_requirements: *2157146720
+   version_requirements: *2151859660
  - !ruby/object:Gem::Dependency
    name: guard-rspec
-   requirement: &2157146120 !ruby/object:Gem::Requirement
+   requirement: &2151856200 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -65,10 +65,10 @@ dependencies:
          version: 0.7.0
    type: :development
    prerelease: false
-   version_requirements: *2157146120
+   version_requirements: *2151856200
  - !ruby/object:Gem::Dependency
    name: ruby_gntp
-   requirement: &2156851680 !ruby/object:Gem::Requirement
+   requirement: &2151855040 !ruby/object:Gem::Requirement
      none: false
      requirements:
      - - ~>
@@ -76,11 +76,12 @@ dependencies:
          version: 0.3.4
    type: :development
    prerelease: false
-   version_requirements: *2156851680
+   version_requirements: *2151855040
  description: Ingest updates from CSV and apply to set of hashes
  email:
  - jmay@pobox.com
- executables: []
+ executables:
+ - assimilate
  extensions: []
  extra_rdoc_files: []
  files:
@@ -91,14 +92,20 @@ files:
  - README.md
  - Rakefile
  - assimilate.gemspec
+ - bin/assimilate
  - lib/assimilate.rb
  - lib/assimilate/batch.rb
  - lib/assimilate/catalog.rb
+ - lib/assimilate/command.rb
+ - lib/assimilate/extender.rb
  - lib/assimilate/version.rb
  - spec/data/batch_input.csv
+ - spec/data/dates.csv
+ - spec/data/duplicate_input.csv
  - spec/data/test.yml
  - spec/data/updates.csv
  - spec/lib/batch_spec.rb
+ - spec/lib/extend_spec.rb
  - spec/spec_helper.rb
  homepage: ''
  licenses: []
@@ -126,8 +133,11 @@ specification_version: 3
  summary: Review & incorporate changes to a repository of persistent hashes in mongodb.
  test_files:
  - spec/data/batch_input.csv
+ - spec/data/dates.csv
+ - spec/data/duplicate_input.csv
  - spec/data/test.yml
  - spec/data/updates.csv
  - spec/lib/batch_spec.rb
+ - spec/lib/extend_spec.rb
  - spec/spec_helper.rb
  has_rdoc: