assimilate 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/assimilate/batch.rb +13 -10
- data/lib/assimilate/catalog.rb +33 -13
- data/lib/assimilate/command.rb +1 -1
- data/lib/assimilate/extender.rb +6 -4
- data/lib/assimilate/version.rb +1 -1
- data/spec/data/test.yml +1 -0
- data/spec/lib/batch_spec.rb +3 -0
- metadata +13 -13
data/lib/assimilate/batch.rb
CHANGED
@@ -3,6 +3,8 @@ class Assimilate::Batch
|
|
3
3
|
|
4
4
|
def initialize(args)
|
5
5
|
@catalog = args[:catalog]
|
6
|
+
@domainkey = @catalog.config[:domain]
|
7
|
+
|
6
8
|
@domain = args[:domain]
|
7
9
|
@datestamp = args[:datestamp]
|
8
10
|
@idfield = args[:idfield]
|
@@ -18,11 +20,11 @@ class Assimilate::Batch
|
|
18
20
|
end
|
19
21
|
|
20
22
|
def load_baseline
|
21
|
-
stored_records = @catalog.catalog.find(@
|
23
|
+
stored_records = @catalog.catalog.find(@domainkey => @domain).to_a
|
22
24
|
@baseline = stored_records.each_with_object({}) do |rec, h|
|
23
25
|
key = rec[@idfield]
|
24
26
|
if h.include?(key)
|
25
|
-
raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in
|
27
|
+
raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in #{@domainkey} [#{@domain}]"
|
26
28
|
end
|
27
29
|
h[key] = rec
|
28
30
|
end
|
@@ -54,7 +56,7 @@ class Assimilate::Batch
|
|
54
56
|
# * find records that have been deleted
|
55
57
|
def resolve
|
56
58
|
if !@resolved
|
57
|
-
@deleted_keys = @baseline.keys - @seen.keys
|
59
|
+
@deleted_keys = (@baseline.keys - @seen.keys).reject {|k| @baseline[k][@catalog.config[:deletion_marker]]}
|
58
60
|
|
59
61
|
@updated_field_counts = @changes.each_with_object(Hash.new(0)) do |rec,h|
|
60
62
|
key = rec[idfield]
|
@@ -76,6 +78,7 @@ class Assimilate::Batch
|
|
76
78
|
:adds_count => @adds.count,
|
77
79
|
:new_ids => @adds.map {|rec| rec[idfield]},
|
78
80
|
:deletes_count => @deleted_keys.count,
|
81
|
+
:deleted_ids => @deleted_keys,
|
79
82
|
:updates_count => @changes.count,
|
80
83
|
:unchanged_count => @noops.count,
|
81
84
|
:updated_fields => @updated_field_counts
|
@@ -92,11 +95,11 @@ class Assimilate::Batch
|
|
92
95
|
end
|
93
96
|
|
94
97
|
def record_batch
|
95
|
-
raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find(
|
96
|
-
raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find(
|
98
|
+
raise(Assimilate::DuplicateImportError, "duplicate batch for datestamp #{datestamp}") if @catalog.batches.find(@domainkey => @domain, 'datestamp' => @datestamp).to_a.any?
|
99
|
+
raise(Assimilate::DuplicateImportError, "duplicate batch for file #{@filename}") if @catalog.batches.find(@domainkey => @domain, 'filename' => @filename).to_a.any?
|
97
100
|
|
98
101
|
@catalog.batches.insert({
|
99
|
-
|
102
|
+
@domainkey => @domain,
|
100
103
|
'datestamp' => @datestamp,
|
101
104
|
'filename' => @filename
|
102
105
|
})
|
@@ -106,10 +109,10 @@ class Assimilate::Batch
|
|
106
109
|
@deleted_keys.each do |key|
|
107
110
|
@catalog.catalog.update(
|
108
111
|
{
|
109
|
-
@
|
112
|
+
@domainkey => domain,
|
110
113
|
idfield => key
|
111
114
|
},
|
112
|
-
{"$set" => {:
|
115
|
+
{"$set" => {@catalog.config[:deletion_marker] => datestamp}}
|
113
116
|
)
|
114
117
|
end
|
115
118
|
end
|
@@ -127,7 +130,7 @@ class Assimilate::Batch
|
|
127
130
|
@changes.each do |rec|
|
128
131
|
@catalog.catalog.update(
|
129
132
|
{
|
130
|
-
@
|
133
|
+
@domainkey => domain,
|
131
134
|
idfield => rec[idfield]
|
132
135
|
},
|
133
136
|
{"$set" => rec}
|
@@ -137,7 +140,7 @@ class Assimilate::Batch
|
|
137
140
|
|
138
141
|
def decorate(records)
|
139
142
|
records.map do |r|
|
140
|
-
r[@
|
143
|
+
r[@domainkey] = @domain
|
141
144
|
r.to_hash
|
142
145
|
end
|
143
146
|
end
|
data/lib/assimilate/catalog.rb
CHANGED
@@ -1,28 +1,45 @@
|
|
1
1
|
require "yaml"
|
2
2
|
|
3
|
+
# Catalog configuration:
|
4
|
+
# db name of mongo database
|
5
|
+
# catalog name of the catalog collection
|
6
|
+
# batch name of the batches collection (e.g. "files")
|
7
|
+
# domain key to use for specifying record domains (will be prefixed with _)
|
8
|
+
# deletion_marker key to use to mark records that have disappeared from the source file
|
9
|
+
#
|
3
10
|
# Records in each catalog acquire the following internal attributes:
|
4
|
-
# _id
|
5
|
-
#
|
6
|
-
# _dt_first_seen
|
7
|
-
# _dt_last_seen
|
8
|
-
# _dt_last_update
|
9
|
-
#
|
11
|
+
# _id Unique ID, assigned by mongo
|
12
|
+
# [domain] Domain key, named by the :domain configuration parameter (stored with a leading underscore)
|
13
|
+
# _dt_first_seen Batch datestamp reference for when this record was first captured
|
14
|
+
# _dt_last_seen Batch datestamp reference for when this record was most recently affirmed
|
15
|
+
# _dt_last_update Batch datestamp reference for when this record was most recently altered
|
16
|
+
# [deletion_marker] Batch datestamp reference for when this record was removed from input
|
10
17
|
#
|
11
18
|
# Inbound records must not have attributes named with leading underscores.
|
12
19
|
#
|
13
20
|
# A "domain" here is a namespace of identifiers.
|
14
21
|
|
15
22
|
class Assimilate::Catalog
|
16
|
-
attr_reader :catalog, :
|
23
|
+
attr_reader :catalog, :config, :batches
|
17
24
|
|
18
25
|
def initialize(args)
|
19
26
|
@config = YAML.load(File.open(args[:config]))
|
27
|
+
check_config
|
28
|
+
|
29
|
+
@db = Mongo::Connection.new.db(@config[:db])
|
30
|
+
@catalog = @db.collection(@config[:catalog])
|
31
|
+
@batches = @db.collection(@config[:batch])
|
32
|
+
end
|
20
33
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
34
|
+
def check_config
|
35
|
+
config.symbolize_keys!
|
36
|
+
[:db, :catalog, :batch, :domain, :deletion_marker].each do |key|
|
37
|
+
raise Assimilate::InvalidConfiguration, "missing required parameter: #{key}" unless config[key]
|
38
|
+
end
|
39
|
+
[:domain, :deletion_marker].each do |key|
|
40
|
+
# enforce leading underscore on internal attributes
|
41
|
+
config[key] = "_#{config[key]}" unless config[key] =~ /^_/
|
42
|
+
end
|
26
43
|
end
|
27
44
|
|
28
45
|
def start_batch(args)
|
@@ -38,6 +55,9 @@ class Assimilate::Catalog
|
|
38
55
|
end
|
39
56
|
|
40
57
|
def active_count
|
41
|
-
@catalog.find(
|
58
|
+
@catalog.find(config[:deletion_marker] => nil).count
|
42
59
|
end
|
43
60
|
end
|
61
|
+
|
62
|
+
class Assimilate::InvalidConfiguration < StandardError
|
63
|
+
end
|
data/lib/assimilate/command.rb
CHANGED
@@ -81,7 +81,7 @@ EOT
|
|
81
81
|
Final record count: #{results[:final_count]}
|
82
82
|
Unchanged records: #{results[:unchanged_count]}
|
83
83
|
New records: #{results[:adds_count]} (#{results[:new_ids].take(10).join(',')})
|
84
|
-
Deletes: #{results[:deletes_count]}
|
84
|
+
Deletes: #{results[:deletes_count]} (#{results[:deleted_ids].take(10).join(',')})
|
85
85
|
Updates: #{results[:updates_count]}
|
86
86
|
EOT
|
87
87
|
if results[:updated_fields].any?
|
data/lib/assimilate/extender.rb
CHANGED
@@ -3,6 +3,8 @@ class Assimilate::Extender
|
|
3
3
|
|
4
4
|
def initialize(args)
|
5
5
|
@catalog = args[:catalog]
|
6
|
+
@domainkey = @catalog.config[:domain]
|
7
|
+
|
6
8
|
@domain = args[:domain]
|
7
9
|
@idfield = args[:idfield]
|
8
10
|
@filename = args[:filename]
|
@@ -17,11 +19,11 @@ class Assimilate::Extender
|
|
17
19
|
end
|
18
20
|
|
19
21
|
def load_baseline
|
20
|
-
stored_records = @catalog.catalog.find(@
|
22
|
+
stored_records = @catalog.catalog.find(@domainkey => @domain).to_a
|
21
23
|
@baseline = stored_records.each_with_object({}) do |rec, h|
|
22
24
|
key = rec[@idfield]
|
23
25
|
if h.include?(key)
|
24
|
-
raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in
|
26
|
+
raise Assimilate::CorruptDataError, "Duplicate records for key [#{key}] in #{@domainkey} [#{@domain}]"
|
25
27
|
end
|
26
28
|
h[key] = rec
|
27
29
|
end
|
@@ -75,7 +77,7 @@ class Assimilate::Extender
|
|
75
77
|
@adds.each do |key|
|
76
78
|
data = @seen[key]
|
77
79
|
@catalog.catalog.insert(
|
78
|
-
@
|
80
|
+
@domainkey => domain,
|
79
81
|
idfield => key,
|
80
82
|
keyfield => data
|
81
83
|
)
|
@@ -88,7 +90,7 @@ class Assimilate::Extender
|
|
88
90
|
data = @seen[key]
|
89
91
|
@catalog.catalog.update(
|
90
92
|
{
|
91
|
-
@
|
93
|
+
@domainkey => domain,
|
92
94
|
idfield => key
|
93
95
|
},
|
94
96
|
{"$set" => {
|
data/lib/assimilate/version.rb
CHANGED
data/spec/data/test.yml
CHANGED
data/spec/lib/batch_spec.rb
CHANGED
@@ -36,6 +36,7 @@ describe "importing file" do
|
|
36
36
|
:adds_count => 6,
|
37
37
|
:new_ids => ["1", "2", "3", "4", "5", "6"],
|
38
38
|
:deletes_count => 0,
|
39
|
+
:deleted_ids => [],
|
39
40
|
:updates_count => 0,
|
40
41
|
:unchanged_count => 0,
|
41
42
|
:updated_fields => {}
|
@@ -64,6 +65,7 @@ describe "importing file" do
|
|
64
65
|
:adds_count => 0,
|
65
66
|
:new_ids => [],
|
66
67
|
:deletes_count => 0,
|
68
|
+
:deleted_ids => [],
|
67
69
|
:updates_count => 0,
|
68
70
|
:unchanged_count => 6,
|
69
71
|
:updated_fields => {}
|
@@ -89,6 +91,7 @@ describe "importing file" do
|
|
89
91
|
:adds_count => 1,
|
90
92
|
:new_ids => ["7"],
|
91
93
|
:deletes_count => 2,
|
94
|
+
:deleted_ids => ['4', '6'],
|
92
95
|
:updates_count => 1,
|
93
96
|
:unchanged_count => 3,
|
94
97
|
:updated_fields => {'title' => 1}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: assimilate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-05-02 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mongo
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152913700 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.6.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152913700
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: bson_ext
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152912480 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.6.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152912480
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activesupport
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152911920 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 3.2.0
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152911920
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rspec
|
49
|
-
requirement: &
|
49
|
+
requirement: &2152911460 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 2.9.0
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2152911460
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: guard-rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2152910880 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 0.7.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2152910880
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: ruby_gntp
|
71
|
-
requirement: &
|
71
|
+
requirement: &2152910140 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,7 @@ dependencies:
|
|
76
76
|
version: 0.3.4
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2152910140
|
80
80
|
description: Ingest updates from CSV and apply to set of hashes
|
81
81
|
email:
|
82
82
|
- jmay@pobox.com
|