assimilate 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,8 @@ class Assimilate::Batch
3
3
 
4
4
  def initialize(args)
5
5
  @catalog = args[:catalog]
6
+ @domainkey = @catalog.config[:domain]
7
+
6
8
  @domain = args[:domain]
7
9
  @datestamp = args[:datestamp]
8
10
  @idfield = args[:idfield]
@@ -18,11 +20,11 @@ class Assimilate::Batch
18
20
  end
19
21
 
20
22
  def load_baseline
21
- stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
23
+ stored_records = @catalog.catalog.find(@domainkey => @domain).to_a
22
24
  @baseline = stored_records.each_with_object({}) do |rec, h|
23
25
  key = rec[@idfield]
24
26
  if h.include?(key)
25
- raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
27
+ raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in #{@domainkey} [#{@domain}]"
26
28
  end
27
29
  h[key] = rec
28
30
  end
@@ -54,7 +56,7 @@ class Assimilate::Batch
54
56
  # * find records that have been deleted
55
57
  def resolve
56
58
  if !@resolved
57
- @deleted_keys = @baseline.keys - @seen.keys
59
+ @deleted_keys = (@baseline.keys - @seen.keys).reject {|k| @baseline[k][@catalog.config[:deletion_marker]]}
58
60
 
59
61
  @updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec,h|
60
62
  key = rec[idfield]
@@ -76,6 +78,7 @@ class Assimilate::Batch
76
78
  :adds_count => @adds.count,
77
79
  :new_ids => @adds.map {|rec| rec[idfield]},
78
80
  :deletes_count => @deleted_keys.count,
81
+ :deleted_ids => @deleted_keys,
79
82
  :updates_count => @changes.count,
80
83
  :unchanged_count => @noops.count,
81
84
  :updated_fields => @updated_field_counts
@@ -92,11 +95,11 @@ class Assimilate::Batch
92
95
  end
93
96
 
94
97
  def record_batch
95
- raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find('domain' => @domain, 'datestamp' => @datestamp).to_a.any?
96
- raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find('domain' => @domain, 'filename' => @filename).to_a.any?
98
+ raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find(@domainkey => @domain, 'datestamp' => @datestamp).to_a.any?
99
+ raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find(@domainkey => @domain, 'filename' => @filename).to_a.any?
97
100
 
98
101
  @catalog.batches.insert({
99
- 'domain' => @domain,
102
+ @domainkey => @domain,
100
103
  'datestamp' => @datestamp,
101
104
  'filename' => @filename
102
105
  })
@@ -106,10 +109,10 @@ class Assimilate::Batch
106
109
  @deleted_keys.each do |key|
107
110
  @catalog.catalog.update(
108
111
  {
109
- @catalog.domainkey => domain,
112
+ @domainkey => domain,
110
113
  idfield => key
111
114
  },
112
- {"$set" => {:_dt_removed => datestamp}}
115
+ {"$set" => {@catalog.config[:deletion_marker] => datestamp}}
113
116
  )
114
117
  end
115
118
  end
@@ -127,7 +130,7 @@ class Assimilate::Batch
127
130
  @changes.each do |rec|
128
131
  @catalog.catalog.update(
129
132
  {
130
- @catalog.domainkey => domain,
133
+ @domainkey => domain,
131
134
  idfield => rec[idfield]
132
135
  },
133
136
  {"$set" => rec}
@@ -137,7 +140,7 @@ class Assimilate::Batch
137
140
 
138
141
  def decorate(records)
139
142
  records.map do |r|
140
- r[@catalog.domainkey] = @domain
143
+ r[@domainkey] = @domain
141
144
  r.to_hash
142
145
  end
143
146
  end
@@ -1,28 +1,45 @@
1
1
  require "yaml"
2
2
 
3
+ # Catalog configuration:
4
+ # db name of mongo database
5
+ # catalog name of the catalog collection
6
+ # batch name of the batches collection (e.g. "files")
7
+ # domain key to use for specifying record domains (will be prefixed with _)
8
+ # deletion_marker key to use to mark records that have disappeared from the source file
9
+ #
3
10
  # Records in each catalog acquire the following internal attributes:
4
- # _id Unique ID, assigned by mongo
5
- # _[domain] Domain key, specified with :domainkey attribute when initializing catalog
6
- # _dt_first_seen Batch datestamp reference for when this record was first captured
7
- # _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
8
- # _dt_last_update Batch datestamp reference for when this record was most recently altered
9
- # _dt_removed Batch datestamp reference for when this record was removed from input
11
+ # _id Unique ID, assigned by mongo
12
+ # [domain] Domain key, taken from the domain configuration parameter (prefixed with _ if not already)
13
+ # _dt_first_seen Batch datestamp reference for when this record was first captured
14
+ # _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
15
+ # _dt_last_update Batch datestamp reference for when this record was most recently altered
16
+ # [deletion_marker] Batch datestamp reference for when this record was removed from input
10
17
  #
11
18
  # Inbound records must not have attributes named with leading underscores.
12
19
  #
13
20
  # A "domain" here is a namespace of identifiers.
14
21
 
15
22
  class Assimilate::Catalog
16
- attr_reader :catalog, :batches, :domainkey
23
+ attr_reader :catalog, :config, :batches
17
24
 
18
25
  def initialize(args)
19
26
  @config = YAML.load(File.open(args[:config]))
27
+ check_config
28
+
29
+ @db = Mongo::Connection.new.db(@config[:db])
30
+ @catalog = @db.collection(@config[:catalog])
31
+ @batches = @db.collection(@config[:batch])
32
+ end
20
33
 
21
- @db = Mongo::Connection.new.db(@config['db'])
22
- @catalog = @db.collection(@config['catalog'])
23
- @batches = @db.collection(@config['batch'])
24
- @domainkey = @config['domain']
25
- @domainkey = "_#{@domainkey}" unless @domainkey =~ /^_/ # enforce leading underscore on internal attributes
34
+ def check_config
35
+ config.symbolize_keys!
36
+ [:db, :catalog, :batch, :domain, :deletion_marker].each do |key|
37
+ raise Assimilate::InvalidConfiguration, "missing required parameter: #{key}" unless config[key]
38
+ end
39
+ [:domain, :deletion_marker].each do |key|
40
+ # enforce leading underscore on internal attributes
41
+ config[key] = "_#{config[key]}" unless config[key] =~ /^_/
42
+ end
26
43
  end
27
44
 
28
45
  def start_batch(args)
@@ -38,6 +55,9 @@ class Assimilate::Catalog
38
55
  end
39
56
 
40
57
  def active_count
41
- @catalog.find("_dt_removed" => nil).count
58
+ @catalog.find(config[:deletion_marker] => nil).count
42
59
  end
43
60
  end
61
+
62
+ class Assimilate::InvalidConfiguration < StandardError
63
+ end
@@ -81,7 +81,7 @@ EOT
81
81
  Final record count: #{results[:final_count]}
82
82
  Unchanged records: #{results[:unchanged_count]}
83
83
  New records: #{results[:adds_count]} (#{results[:new_ids].take(10).join(',')})
84
- Deletes: #{results[:deletes_count]}
84
+ Deletes: #{results[:deletes_count]} (#{results[:deleted_ids].take(10).join(',')})
85
85
  Updates: #{results[:updates_count]}
86
86
  EOT
87
87
  if results[:updated_fields].any?
@@ -3,6 +3,8 @@ class Assimilate::Extender
3
3
 
4
4
  def initialize(args)
5
5
  @catalog = args[:catalog]
6
+ @domainkey = @catalog.config[:domain]
7
+
6
8
  @domain = args[:domain]
7
9
  @idfield = args[:idfield]
8
10
  @filename = args[:filename]
@@ -17,11 +19,11 @@ class Assimilate::Extender
17
19
  end
18
20
 
19
21
  def load_baseline
20
- stored_records = @catalog.catalog.find(@catalog.domainkey => @domain).to_a
22
+ stored_records = @catalog.catalog.find(@domainkey => @domain).to_a
21
23
  @baseline = stored_records.each_with_object({}) do |rec, h|
22
24
  key = rec[@idfield]
23
25
  if h.include?(key)
24
- raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in domain [#{@domain}]"
26
+ raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in #{@domainkey} [#{@domain}]"
25
27
  end
26
28
  h[key] = rec
27
29
  end
@@ -75,7 +77,7 @@ class Assimilate::Extender
75
77
  @adds.each do |key|
76
78
  data = @seen[key]
77
79
  @catalog.catalog.insert(
78
- @catalog.domainkey => domain,
80
+ @domainkey => domain,
79
81
  idfield => key,
80
82
  keyfield => data
81
83
  )
@@ -88,7 +90,7 @@ class Assimilate::Extender
88
90
  data = @seen[key]
89
91
  @catalog.catalog.update(
90
92
  {
91
- @catalog.domainkey => domain,
93
+ @domainkey => domain,
92
94
  idfield => key
93
95
  },
94
96
  {"$set" => {
@@ -1,3 +1,3 @@
1
1
  module Assimilate
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/spec/data/test.yml CHANGED
@@ -3,3 +3,4 @@ db: assimilate-test
3
3
  catalog: forefathers
4
4
  batch: files
5
5
  domain: resource
6
+ deletion_marker: dt_removed
@@ -36,6 +36,7 @@ describe "importing file" do
36
36
  :adds_count => 6,
37
37
  :new_ids => ["1", "2", "3", "4", "5", "6"],
38
38
  :deletes_count => 0,
39
+ :deleted_ids => [],
39
40
  :updates_count => 0,
40
41
  :unchanged_count => 0,
41
42
  :updated_fields => {}
@@ -64,6 +65,7 @@ describe "importing file" do
64
65
  :adds_count => 0,
65
66
  :new_ids => [],
66
67
  :deletes_count => 0,
68
+ :deleted_ids => [],
67
69
  :updates_count => 0,
68
70
  :unchanged_count => 6,
69
71
  :updated_fields => {}
@@ -89,6 +91,7 @@ describe "importing file" do
89
91
  :adds_count => 1,
90
92
  :new_ids => ["7"],
91
93
  :deletes_count => 2,
94
+ :deleted_ids => ['4', '6'],
92
95
  :updates_count => 1,
93
96
  :unchanged_count => 3,
94
97
  :updated_fields => {'title' => 1}
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: assimilate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mongo
16
- requirement: &2151862180 !ruby/object:Gem::Requirement
16
+ requirement: &2152913700 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 1.6.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2151862180
24
+ version_requirements: *2152913700
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: bson_ext
27
- requirement: &2151861480 !ruby/object:Gem::Requirement
27
+ requirement: &2152912480 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 1.6.0
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2151861480
35
+ version_requirements: *2152912480
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activesupport
38
- requirement: &2151860560 !ruby/object:Gem::Requirement
38
+ requirement: &2152911920 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 3.2.0
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *2151860560
46
+ version_requirements: *2152911920
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: rspec
49
- requirement: &2151859660 !ruby/object:Gem::Requirement
49
+ requirement: &2152911460 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: 2.9.0
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2151859660
57
+ version_requirements: *2152911460
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: guard-rspec
60
- requirement: &2151856200 !ruby/object:Gem::Requirement
60
+ requirement: &2152910880 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 0.7.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2151856200
68
+ version_requirements: *2152910880
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: ruby_gntp
71
- requirement: &2151855040 !ruby/object:Gem::Requirement
71
+ requirement: &2152910140 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: 0.3.4
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *2151855040
79
+ version_requirements: *2152910140
80
80
  description: Ingest updates from CSV and apply to set of hashes
81
81
  email:
82
82
  - jmay@pobox.com