assimilate 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/assimilate/batch.rb +13 -10
- data/lib/assimilate/catalog.rb +33 -13
- data/lib/assimilate/command.rb +1 -1
- data/lib/assimilate/extender.rb +6 -4
- data/lib/assimilate/version.rb +1 -1
- data/spec/data/test.yml +1 -0
- data/spec/lib/batch_spec.rb +3 -0
- metadata +13 -13
data/lib/assimilate/batch.rb
CHANGED
@@ -3,6 +3,8 @@ class Assimilate::Batch
|
|
3
3
|
|
4
4
|
def initialize(args)
|
5
5
|
@catalog = args[:catalog]
|
6
|
+
@domainkey = @catalog.config[:domain]
|
7
|
+
|
6
8
|
@domain = args[:domain]
|
7
9
|
@datestamp = args[:datestamp]
|
8
10
|
@idfield = args[:idfield]
|
@@ -18,11 +20,11 @@ class Assimilate::Batch
|
|
18
20
|
end
|
19
21
|
|
20
22
|
def load_baseline
|
21
|
-
stored_records = @catalog.catalog.find(@
|
23
|
+
stored_records = @catalog.catalog.find(@domainkey => @domain).to_a
|
22
24
|
@baseline = stored_records.each_with_object({}) do |rec, h|
|
23
25
|
key = rec[@idfield]
|
24
26
|
if h.include?(key)
|
25
|
-
raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in
|
27
|
+
raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in #{@domainkey} [#{@domain}]"
|
26
28
|
end
|
27
29
|
h[key] = rec
|
28
30
|
end
|
@@ -54,7 +56,7 @@ class Assimilate::Batch
|
|
54
56
|
# * find records that have been deleted
|
55
57
|
def resolve
|
56
58
|
if !@resolved
|
57
|
-
@deleted_keys = @baseline.keys - @seen.keys
|
59
|
+
@deleted_keys = (@baseline.keys - @seen.keys).reject {|k| @baseline[k][@catalog.config[:deletion_marker]]}
|
58
60
|
|
59
61
|
@updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec,h|
|
60
62
|
key = rec[idfield]
|
@@ -76,6 +78,7 @@ class Assimilate::Batch
|
|
76
78
|
:adds_count => @adds.count,
|
77
79
|
:new_ids => @adds.map {|rec| rec[idfield]},
|
78
80
|
:deletes_count => @deleted_keys.count,
|
81
|
+
:deleted_ids => @deleted_keys,
|
79
82
|
:updates_count => @changes.count,
|
80
83
|
:unchanged_count => @noops.count,
|
81
84
|
:updated_fields => @updated_field_counts
|
@@ -92,11 +95,11 @@ class Assimilate::Batch
|
|
92
95
|
end
|
93
96
|
|
94
97
|
def record_batch
|
95
|
-
raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find(
|
96
|
-
raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find(
|
98
|
+
raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find(@domainkey => @domain, 'datestamp' => @datestamp).to_a.any?
|
99
|
+
raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find(@domainkey => @domain, 'filename' => @filename).to_a.any?
|
97
100
|
|
98
101
|
@catalog.batches.insert({
|
99
|
-
|
102
|
+
@domainkey => @domain,
|
100
103
|
'datestamp' => @datestamp,
|
101
104
|
'filename' => @filename
|
102
105
|
})
|
@@ -106,10 +109,10 @@ class Assimilate::Batch
|
|
106
109
|
@deleted_keys.each do |key|
|
107
110
|
@catalog.catalog.update(
|
108
111
|
{
|
109
|
-
@
|
112
|
+
@domainkey => domain,
|
110
113
|
idfield => key
|
111
114
|
},
|
112
|
-
{"$set" => {:
|
115
|
+
{"$set" => {@catalog.config[:deletion_marker] => datestamp}}
|
113
116
|
)
|
114
117
|
end
|
115
118
|
end
|
@@ -127,7 +130,7 @@ class Assimilate::Batch
|
|
127
130
|
@changes.each do |rec|
|
128
131
|
@catalog.catalog.update(
|
129
132
|
{
|
130
|
-
@
|
133
|
+
@domainkey => domain,
|
131
134
|
idfield => rec[idfield]
|
132
135
|
},
|
133
136
|
{"$set" => rec}
|
@@ -137,7 +140,7 @@ class Assimilate::Batch
|
|
137
140
|
|
138
141
|
def decorate(records)
|
139
142
|
records.map do |r|
|
140
|
-
r[@
|
143
|
+
r[@domainkey] = @domain
|
141
144
|
r.to_hash
|
142
145
|
end
|
143
146
|
end
|
data/lib/assimilate/catalog.rb
CHANGED
@@ -1,28 +1,45 @@
|
|
1
1
|
require "yaml"
|
2
2
|
|
3
|
+
# Catalog configuration:
|
4
|
+
# db name of mongo database
|
5
|
+
# catalog name of the catalog collection
|
6
|
+
# batch name of the batches collection (e.g. "files")
|
7
|
+
# domain key to use for specifying record domains (will be prefixed with _)
|
8
|
+
# deletion_marker key to use to mark records that have disappeared from the source file
|
9
|
+
#
|
3
10
|
# Records in each catalog acquire the following internal attributes:
|
4
|
-
# _id
|
5
|
-
#
|
6
|
-
# _dt_first_seen
|
7
|
-
# _dt_last_seen
|
8
|
-
# _dt_last_update
|
9
|
-
#
|
11
|
+
# _id Unique ID, assigned by mongo
|
12
|
+
# [domain] Domain key, named by the :domain parameter in the catalog configuration
|
13
|
+
# _dt_first_seen Batch datestamp reference for when this record was first captured
|
14
|
+
# _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
|
15
|
+
# _dt_last_update Batch datestamp reference for when this record was most recently altered
|
16
|
+
# [deletion_marker] Batch datestamp reference for when this record was removed from input
|
10
17
|
#
|
11
18
|
# Inbound records must not have attributes named with leading underscores.
|
12
19
|
#
|
13
20
|
# A "domain" here is a namespace of identifiers.
|
14
21
|
|
15
22
|
class Assimilate::Catalog
|
16
|
-
attr_reader :catalog, :
|
23
|
+
attr_reader :catalog, :config, :batches
|
17
24
|
|
18
25
|
def initialize(args)
|
19
26
|
@config = YAML.load(File.open(args[:config]))
|
27
|
+
check_config
|
28
|
+
|
29
|
+
@db = Mongo::Connection.new.db(@config[:db])
|
30
|
+
@catalog = @db.collection(@config[:catalog])
|
31
|
+
@batches = @db.collection(@config[:batch])
|
32
|
+
end
|
20
33
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
34
|
+
def check_config
|
35
|
+
config.symbolize_keys!
|
36
|
+
[:db, :catalog, :batch, :domain, :deletion_marker].each do |key|
|
37
|
+
raise Assimilate::InvalidConfiguration, "missing required parameter: #{key}" unless config[key]
|
38
|
+
end
|
39
|
+
[:domain, :deletion_marker].each do |key|
|
40
|
+
# enforce leading underscore on internal attributes
|
41
|
+
config[key] = "_#{config[key]}" unless config[key] =~ /^_/
|
42
|
+
end
|
26
43
|
end
|
27
44
|
|
28
45
|
def start_batch(args)
|
@@ -38,6 +55,9 @@ class Assimilate::Catalog
|
|
38
55
|
end
|
39
56
|
|
40
57
|
def active_count
|
41
|
-
@catalog.find(
|
58
|
+
@catalog.find(config[:deletion_marker] => nil).count
|
42
59
|
end
|
43
60
|
end
|
61
|
+
|
62
|
+
class Assimilate::InvalidConfiguration < StandardError
|
63
|
+
end
|
data/lib/assimilate/command.rb
CHANGED
@@ -81,7 +81,7 @@ EOT
|
|
81
81
|
Final record count: #{results[:final_count]}
|
82
82
|
Unchanged records: #{results[:unchanged_count]}
|
83
83
|
New records: #{results[:adds_count]} (#{results[:new_ids].take(10).join(',')})
|
84
|
-
Deletes: #{results[:deletes_count]}
|
84
|
+
Deletes: #{results[:deletes_count]} (#{results[:deleted_ids].take(10).join(',')})
|
85
85
|
Updates: #{results[:updates_count]}
|
86
86
|
EOT
|
87
87
|
if results[:updated_fields].any?
|
data/lib/assimilate/extender.rb
CHANGED
@@ -3,6 +3,8 @@ class Assimilate::Extender
|
|
3
3
|
|
4
4
|
def initialize(args)
|
5
5
|
@catalog = args[:catalog]
|
6
|
+
@domainkey = @catalog.config[:domain]
|
7
|
+
|
6
8
|
@domain = args[:domain]
|
7
9
|
@idfield = args[:idfield]
|
8
10
|
@filename = args[:filename]
|
@@ -17,11 +19,11 @@ class Assimilate::Extender
|
|
17
19
|
end
|
18
20
|
|
19
21
|
def load_baseline
|
20
|
-
stored_records = @catalog.catalog.find(@
|
22
|
+
stored_records = @catalog.catalog.find(@domainkey => @domain).to_a
|
21
23
|
@baseline = stored_records.each_with_object({}) do |rec, h|
|
22
24
|
key = rec[@idfield]
|
23
25
|
if h.include?(key)
|
24
|
-
raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in
|
26
|
+
raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in #{@domainkey} [#{@domain}]"
|
25
27
|
end
|
26
28
|
h[key] = rec
|
27
29
|
end
|
@@ -75,7 +77,7 @@ class Assimilate::Extender
|
|
75
77
|
@adds.each do |key|
|
76
78
|
data = @seen[key]
|
77
79
|
@catalog.catalog.insert(
|
78
|
-
@
|
80
|
+
@domainkey => domain,
|
79
81
|
idfield => key,
|
80
82
|
keyfield => data
|
81
83
|
)
|
@@ -88,7 +90,7 @@ class Assimilate::Extender
|
|
88
90
|
data = @seen[key]
|
89
91
|
@catalog.catalog.update(
|
90
92
|
{
|
91
|
-
@
|
93
|
+
@domainkey => domain,
|
92
94
|
idfield => key
|
93
95
|
},
|
94
96
|
{"$set" => {
|
data/lib/assimilate/version.rb
CHANGED
data/spec/data/test.yml
CHANGED
data/spec/lib/batch_spec.rb
CHANGED
@@ -36,6 +36,7 @@ describe "importing file" do
|
|
36
36
|
:adds_count => 6,
|
37
37
|
:new_ids => ["1", "2", "3", "4", "5", "6"],
|
38
38
|
:deletes_count => 0,
|
39
|
+
:deleted_ids => [],
|
39
40
|
:updates_count => 0,
|
40
41
|
:unchanged_count => 0,
|
41
42
|
:updated_fields => {}
|
@@ -64,6 +65,7 @@ describe "importing file" do
|
|
64
65
|
:adds_count => 0,
|
65
66
|
:new_ids => [],
|
66
67
|
:deletes_count => 0,
|
68
|
+
:deleted_ids => [],
|
67
69
|
:updates_count => 0,
|
68
70
|
:unchanged_count => 6,
|
69
71
|
:updated_fields => {}
|
@@ -89,6 +91,7 @@ describe "importing file" do
|
|
89
91
|
:adds_count => 1,
|
90
92
|
:new_ids => ["7"],
|
91
93
|
:deletes_count => 2,
|
94
|
+
:deleted_ids => ['4', '6'],
|
92
95
|
:updates_count => 1,
|
93
96
|
:unchanged_count => 3,
|
94
97
|
:updated_fields => {'title' => 1}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: assimilate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-05-02 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mongo
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152913700 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.6.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152913700
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: bson_ext
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152912480 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.6.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152912480
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activesupport
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152911920 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 3.2.0
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152911920
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rspec
|
49
|
-
requirement: &
|
49
|
+
requirement: &2152911460 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 2.9.0
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2152911460
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: guard-rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2152910880 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 0.7.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2152910880
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: ruby_gntp
|
71
|
-
requirement: &
|
71
|
+
requirement: &2152910140 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: 0.3.4
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2152910140
|
80
80
|
description: Ingest updates from CSV and apply to set of hashes
|
81
81
|
email:
|
82
82
|
- jmay@pobox.com
|