assimilate 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,6 +3,8 @@ class Assimilate::Batch
3
3
 
4
4
  def initialize(args)
5
5
  @catalog = args[:catalog]
6
+ @domainkey = @catalog.config[:domain]
7
+
6
8
  @domain = args[:domain]
7
9
  @datestamp = args[:datestamp]
8
10
  @idfield = args[:idfield]
@@ -18,11 +20,11 @@ class Assimilate::Batch
18
20
  end
19
21
 
20
22
  def load_baseline
21
- stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
23
+ stored_records = @catalog.catalog.find(@domainkey => @domain).to_a
22
24
  @baseline = stored_records.each_with_object({}) do |rec, h|
23
25
  key = rec[@idfield]
24
26
  if h.include?(key)
25
- raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
27
+ raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in #{@domainkey} [#{@domain}]"
26
28
  end
27
29
  h[key] = rec
28
30
  end
@@ -54,7 +56,7 @@ class Assimilate::Batch
54
56
  # * find records that have been deleted
55
57
  def resolve
56
58
  if !@resolved
57
- @deleted_keys = @baseline.keys - @seen.keys
59
+ @deleted_keys = (@baseline.keys - @seen.keys).reject {|k| @baseline[k][@catalog.config[:deletion_marker]]}
58
60
 
59
61
  @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec,h|
60
62
  key = rec[idfield]
@@ -76,6 +78,7 @@ class Assimilate::Batch
76
78
  :adds_count => @adds.count,
77
79
  :new_ids => @adds.map {|rec| rec[idfield]},
78
80
  :deletes_count => @deleted_keys.count,
81
+ :deleted_ids => @deleted_keys,
79
82
  :updates_count => @changes.count,
80
83
  :unchanged_count => @noops.count,
81
84
  :updated_fields => @updated_field_counts
@@ -92,11 +95,11 @@ class Assimilate::Batch
92
95
  end
93
96
 
94
97
  def record_batch
95
- raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
96
- raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find('domain' => @domain, 'filename' => @filename).to_a.any?
98
+ raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find(@domainkey => @domain, 'datestamp' => @datestamp).to_a.any?
99
+ raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find(@domainkey => @domain, 'filename' => @filename).to_a.any?
97
100
 
98
101
  @catalog.batches.insert({
99
- 'domain' => @domain,
102
+ @domainkey => @domain,
100
103
  'datestamp' => @datestamp,
101
104
  'filename' => @filename
102
105
  })
@@ -106,10 +109,10 @@ class Assimilate::Batch
106
109
  @deleted_keys.each do |key|
107
110
  @catalog.catalog.update(
108
111
  {
109
- @catalog.domainkey => domain,
112
+ @domainkey => domain,
110
113
  idfield => key
111
114
  },
112
- {"$set" => {:_dt_removed => datestamp}}
115
+ {"$set" => {@catalog.config[:deletion_marker] => datestamp}}
113
116
  )
114
117
  end
115
118
  end
@@ -127,7 +130,7 @@ class Assimilate::Batch
127
130
  @changes.each do |rec|
128
131
  @catalog.catalog.update(
129
132
  {
130
- @catalog.domainkey => domain,
133
+ @domainkey => domain,
131
134
  idfield => rec[idfield]
132
135
  },
133
136
  {"$set" => rec}
@@ -137,7 +140,7 @@ class Assimilate::Batch
137
140
 
138
141
  def decorate(records)
139
142
  records.map do |r|
140
- r[@catalog.domainkey] = @domain
143
+ r[@domainkey] = @domain
141
144
  r.to_hash
142
145
  end
143
146
  end
@@ -1,28 +1,45 @@
1
1
  require "yaml"
2
2
 
3
+ # Catalog configuration:
4
+ # db name of mongo database
5
+ # catalog name of the catalog collection
6
+ # batch name of the batches collection (e.g. "files")
7
+ # domain key to use for specifying record domains (will be prefixed with _)
8
+ # deletion_marker key to use to mark records that have disappeared from the source file
9
+ #
3
10
  # Records in each catalog acquire the following internal attributes:
4
- # _id Unique ID, assigned by mongo
5
- # _[domain] Domain key, specified with :domainkey attribute when initializing catalog
6
- # _dt_first_seen Batch datestamp reference for when this record was first captured
7
- # _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
8
- # _dt_last_update Batch datestamp reference for when this record was most recently altered
9
- # _dt_removed Batch datestamp reference for when this record was removed from input
11
+ # _id Unique ID, assigned by mongo
12
+ # [domain] Domain key, taken from the "domain" configuration setting (leading underscore enforced)
13
+ # _dt_first_seen Batch datestamp reference for when this record was first captured
14
+ # _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
15
+ # _dt_last_update Batch datestamp reference for when this record was most recently altered
16
+ # [deletion_marker] Batch datestamp reference for when this record was removed from input
10
17
  #
11
18
  # Inbound records must not have attributes named with leading underscores.
12
19
  #
13
20
  # A "domain" here is a namespace of identifiers.
14
21
 
15
22
  class Assimilate::Catalog
16
- attr_reader :catalog, :batches, :domainkey
23
+ attr_reader :catalog, :config, :batches
17
24
 
18
25
  def initialize(args)
19
26
  @config = YAML.load(File.open(args[:config]))
27
+ check_config
28
+
29
+ @db = Mongo::Connection.new.db(@config[:db])
30
+ @catalog = @db.collection(@config[:catalog])
31
+ @batches = @db.collection(@config[:batch])
32
+ end
20
33
 
21
- @db = Mongo::Connection.new.db(@config['db'])
22
- @catalog = @db.collection(@config['catalog'])
23
- @batches = @db.collection(@config['batch'])
24
- @domainkey = @config['domain']
25
- @domainkey = "_#{@domainkey}" unless @domainkey =~ /^_/ # enforce leading underscore on internal attributes
34
+ def check_config
35
+ config.symbolize_keys!
36
+ [:db, :catalog, :batch, :domain, :deletion_marker].each do |key|
37
+ raise Assimilate::InvalidConfiguration, "missing required parameter: #{key}" unless config[key]
38
+ end
39
+ [:domain, :deletion_marker].each do |key|
40
+ # enforce leading underscore on internal attributes
41
+ config[key] = "_#{config[key]}" unless config[key] =~ /^_/
42
+ end
26
43
  end
27
44
 
28
45
  def start_batch(args)
@@ -38,6 +55,9 @@ class Assimilate::Catalog
38
55
  end
39
56
 
40
57
  def active_count
41
- @catalog.find("_dt_removed" => nil).count
58
+ @catalog.find(config[:deletion_marker] => nil).count
42
59
  end
43
60
  end
61
+
62
+ class Assimilate::InvalidConfiguration < StandardError
63
+ end
@@ -81,7 +81,7 @@ EOT
81
81
  Final record count: #{results[:final_count]}
82
82
  Unchanged records: #{results[:unchanged_count]}
83
83
  New records: #{results[:adds_count]} (#{results[:new_ids].take(10).join(',')})
84
- Deletes: #{results[:deletes_count]}
84
+ Deletes: #{results[:deletes_count]} (#{results[:deleted_ids].take(10).join(',')})
85
85
  Updates: #{results[:updates_count]}
86
86
  EOT
87
87
  if results[:updated_fields].any?
@@ -3,6 +3,8 @@ class Assimilate::Extender
3
3
 
4
4
  def initialize(args)
5
5
  @catalog = args[:catalog]
6
+ @domainkey = @catalog.config[:domain]
7
+
6
8
  @domain = args[:domain]
7
9
  @idfield = args[:idfield]
8
10
  @filename = args[:filename]
@@ -17,11 +19,11 @@ class Assimilate::Extender
17
19
  end
18
20
 
19
21
  def load_baseline
20
- stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
22
+ stored_records = @catalog.catalog.find(@domainkey => @domain).to_a
21
23
  @baseline = stored_records.each_with_object({}) do |rec, h|
22
24
  key = rec[@idfield]
23
25
  if h.include?(key)
24
- raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
26
+ raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in #{@domainkey} [#{@domain}]"
25
27
  end
26
28
  h[key] = rec
27
29
  end
@@ -75,7 +77,7 @@ class Assimilate::Extender
75
77
  @adds.each do |key|
76
78
  data = @seen[key]
77
79
  @catalog.catalog.insert(
78
- @catalog.domainkey => domain,
80
+ @domainkey => domain,
79
81
  idfield => key,
80
82
  keyfield => data
81
83
  )
@@ -88,7 +90,7 @@ class Assimilate::Extender
88
90
  data = @seen[key]
89
91
  @catalog.catalog.update(
90
92
  {
91
- @catalog.domainkey => domain,
93
+ @domainkey => domain,
92
94
  idfield => key
93
95
  },
94
96
  {"$set" => {
@@ -1,3 +1,3 @@
1
1
  module Assimilate
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/spec/data/test.yml CHANGED
@@ -3,3 +3,4 @@ db: assimilate-test
3
3
  catalog: forefathers
4
4
  batch: files
5
5
  domain: resource
6
+ deletion_marker: dt_removed
@@ -36,6 +36,7 @@ describe "importing file" do
36
36
  :adds_count => 6,
37
37
  :new_ids => ["1", "2", "3", "4", "5", "6"],
38
38
  :deletes_count => 0,
39
+ :deleted_ids => [],
39
40
  :updates_count => 0,
40
41
  :unchanged_count => 0,
41
42
  :updated_fields => {}
@@ -64,6 +65,7 @@ describe "importing file" do
64
65
  :adds_count => 0,
65
66
  :new_ids => [],
66
67
  :deletes_count => 0,
68
+ :deleted_ids => [],
67
69
  :updates_count => 0,
68
70
  :unchanged_count => 6,
69
71
  :updated_fields => {}
@@ -89,6 +91,7 @@ describe "importing file" do
89
91
  :adds_count => 1,
90
92
  :new_ids => ["7"],
91
93
  :deletes_count => 2,
94
+ :deleted_ids => ['4', '6'],
92
95
  :updates_count => 1,
93
96
  :unchanged_count => 3,
94
97
  :updated_fields => {'title' => 1}
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: assimilate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mongo
16
- requirement: &2151862180 !ruby/object:Gem::Requirement
16
+ requirement: &2152913700 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.6.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2151862180
24
+ version_requirements: *2152913700
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: bson_ext
27
- requirement: &2151861480 !ruby/object:Gem::Requirement
27
+ requirement: &2152912480 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.6.0
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2151861480
35
+ version_requirements: *2152912480
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activesupport
38
- requirement: &2151860560 !ruby/object:Gem::Requirement
38
+ requirement: &2152911920 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 3.2.0
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *2151860560
46
+ version_requirements: *2152911920
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rspec
49
- requirement: &2151859660 !ruby/object:Gem::Requirement
49
+ requirement: &2152911460 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 2.9.0
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2151859660
57
+ version_requirements: *2152911460
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: guard-rspec
60
- requirement: &2151856200 !ruby/object:Gem::Requirement
60
+ requirement: &2152910880 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 0.7.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2151856200
68
+ version_requirements: *2152910880
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: ruby_gntp
71
- requirement: &2151855040 !ruby/object:Gem::Requirement
71
+ requirement: &2152910140 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: 0.3.4
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *2151855040
79
+ version_requirements: *2152910140
80
80
  description: Ingest updates from CSV and apply to set of hashes
81
81
  email:
82
82
  - jmay@pobox.com