content_data 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,210 +1,253 @@
1
+ require 'content_server/globals'
1
2
  require 'log'
2
3
  require 'params'
3
- require 'time'
4
4
 
5
5
  module ContentData
6
6
  Params.string('instance_check_level', 'shallow', 'Defines check level. Supported levels are: ' \
7
7
  'shallow - quick, tests instance for file existence and attributes. ' \
8
8
  'deep - can take more time, in addition to shallow recalculates hash sum.')
9
9
 
10
- class Content
11
- attr_reader :checksum, :size, :first_appearance_time
12
-
13
- def initialize(checksum, size, first_appearance_time, content_serializer = nil)
14
- if content_serializer != nil
15
- if (content_serializer.checksum == nil)
16
- raise ArgumentError.new("checksum have to be defined")
17
- else
18
- @checksum = content_serializer.checksum
19
- end
20
- if (content_serializer.size == nil)
21
- raise ArgumentError.new("size have to be defined")
22
- else
23
- @size = content_serializer.size
24
- end
25
- if (content_serializer.first_appearance_time == nil)
26
- raise ArgumentError.new("first_appearance_time have to be defined")
27
- else
28
- @first_appearance_time = ContentData.parse_time(content_serializer.first_appearance_time)
29
- end
10
+ # Content Data(CD) object holds files information as contents and instances
11
+ # Files info retrieved from hardware: checksum, size, time modification, server, device and path
12
+ # Those attributes are divided into content and instance attributes:
13
+ # unique checksum, size are content attributes
14
+ # time modification, server, device and path are instance attributes
15
+ # The relationship between content and instances is 1:many meaning that
16
+ # a content can have instances in many servers.
17
+ # content also has time attribute, which has the value of the time of the first instance.
18
+ # This can be changed by using unify_time method which sets all time attributes for a content and its
19
+ # instances to the min time of all.
20
+ # Different files(instances) with same content(checksum), are grouped together under that content.
21
+ # Interface methods include:
22
+ # iterate over contents and instances info,
23
+ # unify time, add/remove instance, queries, merge, remove directory and more.
24
+ # Content info data structure:
25
+ # @contents_info = { Checksum -> [size, *instances*, content_modification_time] }
26
+ # *instances* = {[server,path] -> instance_modification_time }
27
+ # Notes:
28
+ # 1. content_modification_time is the instance_modification_time of the first
29
+ # instance which was added to @contents_info
30
+ class ContentData
30
31
 
32
+ def initialize(other = nil)
33
+ ObjectSpace.define_finalizer(self,
34
+ self.class.method(:finalize).to_proc)
35
+ if Params['enable_monitoring']
36
+ ::ContentServer::Globals.process_vars.inc('obj add ContentData')
37
+ end
38
+ if other.nil?
39
+ @contents_info = {} # Checksum --> [size, paths-->time(instance), time(content)]
31
40
  else
32
- @checksum = checksum
33
- @size = size
34
- @first_appearance_time = first_appearance_time
41
+ @contents_info = other.clone_contents_info
35
42
  end
36
43
  end
37
44
 
38
- def to_s
39
- "%s,%d,%s" % [@checksum, @size, ContentData.format_time(@first_appearance_time)]
45
+ def self.finalize(id)
46
+ if Params['enable_monitoring']
47
+ ::ContentServer::Globals.process_vars.inc('obj rem ContentData')
48
+ end
40
49
  end
41
50
 
42
- def ==(other)
43
- return (self.checksum.eql? other.checksum and
44
- self.size.eql? other.size and
45
- self.first_appearance_time.to_i.eql? other.first_appearance_time.to_i)
51
+ # getting a cloned data base
52
+ def clone_contents_info
53
+ @contents_info.keys.inject({}) { |clone_contents_info, checksum|
54
+ instances = @contents_info[checksum]
55
+ size = instances[0]
56
+ content_time = instances[2]
57
+ instances_db = instances[1]
58
+ instances_db_cloned = {}
59
+ instances_db.keys.each { |location|
60
+ instance_mtime = instances_db[location]
61
+ instances_db_cloned[[location[0].clone,location[1].clone]]=instance_mtime
62
+ }
63
+ clone_contents_info[checksum] = [size,
64
+ instances_db_cloned,
65
+ content_time]
66
+ clone_contents_info
67
+ }
46
68
  end
47
- end
48
-
49
- class ContentInstance
50
- attr_reader :checksum, :size, :server_name, :device, :full_path, :modification_time
51
69
 
52
- def initialize(checksum, size, server_name, device, full_path, modification_time, content_instance_serializer = nil)
53
- if content_instance_serializer != nil
54
- if (content_instance_serializer.checksum == nil)
55
- raise ArgumentError.new("checksum have to be defined")
56
- else
57
- @checksum = content_instance_serializer.checksum
58
- end
59
- if (content_instance_serializer.size == nil)
60
- raise ArgumentError.new("size have to be defined")
61
- else
62
- @size = content_instance_serializer.size
63
- end
64
- if (content_instance_serializer.modification_time == nil)
65
- raise ArgumentError.new("modification_time have to be defined")
66
- else
67
- @modification_time = ContentData.parse_time(content_instance_serializer.modification_time)
68
- end
69
- if (content_instance_serializer.server_name == nil)
70
- raise ArgumentError.new("server_name have to be defined")
71
- else
72
- @server_name = content_instance_serializer.server_name
73
- end
74
- if (content_instance_serializer.device == nil)
75
- raise ArgumentError.new("device have to be defined")
76
- else
77
- @device = content_instance_serializer.device
78
- end
79
- if (content_instance_serializer.full_path == nil)
80
- raise ArgumentError.new("full_path have to be defined")
81
- else
82
- @full_path = content_instance_serializer.full_path
83
- end
84
- else
85
- @checksum = checksum
86
- @size = size
87
- @server_name = server_name
88
- @device = device
89
- @full_path = full_path
90
- @modification_time = modification_time
91
- end
70
+ # iterator over @contents_info data structure (not including instances)
71
+ # block is provided with: checksum, size and content modification time
72
+ def each_content(&block)
73
+ @contents_info.keys.each { |checksum|
74
+ content_val = @contents_info[checksum]
75
+ # provide checksum, size and content modification time to the block
76
+ block.call(checksum,content_val[0], content_val[2])
77
+ }
92
78
  end
93
79
 
94
- def global_path
95
- ContentInstance.instance_global_path(@server_name, @full_path)
80
+ # iterator over @contents_info data structure (including instances)
81
+ # block is provided with: checksum, size, content modification time,
82
+ # instance modification time, server and file path
83
+ def each_instance(&block)
84
+ @contents_info.keys.each { |checksum|
85
+ content_info = @contents_info[checksum]
86
+ content_info[1].keys.each {|location|
87
+ # provide the block with: checksum, size, content modification time,instance modification time,
88
+ # server and path.
89
+ instance_modification_time = content_info[1][location]
90
+ block.call(checksum,content_info[0], content_info[2], instance_modification_time,
91
+ location[0], location[1])
92
+ }
93
+ }
96
94
  end
97
95
 
98
- def ContentInstance.instance_global_path(server_name, full_path)
99
- "%s:%s" % [server_name, full_path]
96
+ # iterator of instances over specific content
97
+ # block is provided with: checksum, size, content modification time,
98
+ # instance modification time, server and file path
99
+ def content_each_instance(checksum, &block)
100
+ content_info = @contents_info[checksum]
101
+ content_info[1].keys.each {|location|
102
+ # provide the block with: checksum, size, content modification time,instance modification time,
103
+ # server and path.
104
+ instance_modification_time = content_info[1][location]
105
+ block.call(checksum,content_info[0], content_info[2], instance_modification_time,
106
+ location[0], location[1])
107
+ }
100
108
  end
101
109
 
102
- def to_s
103
- "%s,%d,%s,%s,%s,%s" % [@checksum, @size, @server_name,
104
- @device, @full_path, ContentData.format_time(@modification_time)]
110
+ def contents_size()
111
+ @contents_info.size
105
112
  end
106
113
 
107
- def ==(other)
108
- return (self.checksum.eql? other.checksum and
109
- self.size.eql? other.size and
110
- self.server_name.eql? other.server_name and
111
- self.device.eql? other.device and
112
- self.full_path.eql? other.full_path and
113
- self.modification_time.to_i.eql? other.modification_time.to_i)
114
+ def instances_size(checksum)
115
+ content_info = @contents_info[checksum]
116
+ return 0 if content_info.nil?
117
+ content_info[1].size
114
118
  end
115
- end
116
119
 
117
- # Unfortunately this class is used as mutable for now. So need to be carefull.
118
- # TODO(kolman): Make this class imutable, but add indexing structure to it.
119
- # TODO(kolman): Add wrapper to the class to enable dynamic content data
120
- # (with easy access indexes)
121
- class ContentData
122
- attr_reader :contents, :instances
120
+ def get_instance_mod_time(checksum, location)
121
+ content_info = @contents_info[checksum]
122
+ return nil if content_info.nil?
123
+ instances = content_info[1]
124
+ instance_time = instances[location]
125
+ end
123
126
 
124
- # @param content_data_serializer_str [String]
125
- def initialize(copy = nil)
126
- if copy.nil?
127
- @contents = Hash.new # key is a checksum , value is a refernce to the Content object
128
- @instances = Hash.new # key is an instance global path , value is a reference to the ContentInstance object
127
+ def add_instance(checksum, size, server, path, modification_time)
128
+ location = [server, path]
129
+ content_info = @contents_info[checksum]
130
+ if content_info.nil?
131
+ @contents_info[checksum] = [size,
132
+ {location => modification_time},
133
+ modification_time]
129
134
  else
130
- # Regenerate only the hashes, the values are immutable.
131
- @contents = copy.contents.clone
132
- @instances = copy.instances.clone
135
+ if size != content_info[0]
136
+ Log.warning 'File size different from content size while same checksum'
137
+ Log.warning("instance location:server:'#{location[0]}' path:'#{location[1]}'")
138
+ Log.warning("instance mod time:'#{modification_time}'")
139
+ end
140
+ #override file if needed
141
+ content_info[0] = size
142
+ instances = content_info[1]
143
+ instances[location] = modification_time
133
144
  end
134
145
  end
135
146
 
136
- def add_content(content)
137
- @contents[content.checksum] = content
147
+ def empty?
148
+ @contents_info.empty?
138
149
  end
139
150
 
140
- def add_instance(instance)
141
- if (not @contents.key?(instance.checksum))
142
- Log.warning sprintf("Adding instance while it's" +
143
- " checksum %s does not exists.\n", instance.checksum)
144
- Log.warning sprintf("%s\n", instance.to_s)
145
- return false
146
- elsif (@contents[instance.checksum].size != instance.size)
147
- Log.warning 'File size different from content size while same checksum'
148
- Log.warning instance.to_s
149
- return false
150
- end
151
+ def content_exists(checksum)
152
+ @contents_info.has_key?(checksum)
153
+ end
151
154
 
152
- key = instance.global_path
153
155
 
154
- #override file if needed
155
- @instances[key] = instance
156
+ # TODO (genadyp) consider about using hash for optional defining of parameters
157
+ def instance_exists(path, server, checksum=nil)
158
+ location = [server, path]
159
+ if checksum.nil?
160
+ @contents_info.values.any? { |content_db|
161
+ content_db[1].has_key?(location)
162
+ }
163
+ else
164
+ content_info = @contents_info[checksum]
165
+ return false if content_info.nil?
166
+ content_info[1].has_key?(location)
167
+ end
156
168
  end
157
169
 
158
- def empty?
159
- @contents.empty?
170
+ def stats_by_location(location)
171
+ @contents_info.each_value { |content_db|
172
+ if content_db[1].has_key?(location)
173
+ return [content_db[0], content_db[1][location]]
174
+ end
175
+ }
176
+ return nil
160
177
  end
161
178
 
162
- # TODO rename method with finishing '?', cause it returns a boolean
163
- def content_exists(checksum)
164
- @contents.key? checksum
179
+
180
+ # removes an instance from known content (faster than unknown content)
181
+ # remove also the content, if content becomes empty
182
+ def remove_instance(location, checksum=nil)
183
+ if checksum.nil?
184
+ @contents_info.keys.each { |checksum|
185
+ instances = @contents_info[checksum][1]
186
+ instances.delete(location)
187
+ @contents_info.delete(checksum) if instances.empty?
188
+ }
189
+ else
190
+ content_info = @contents_info[checksum]
191
+ unless content_info.nil?
192
+ instances = content_info[1]
193
+ instances.delete(location)
194
+ @contents_info.delete(checksum) if instances.empty?
195
+ end
196
+ end
165
197
  end
166
198
 
167
- # TODO(kolman): The semantics of thir merge is merge! change in all file.
168
- def merge(content_data)
169
- content_data.contents.values.each { |content|
170
- add_content(content)
171
- }
172
- content_data.instances.values.each { |instance|
173
- add_instance(instance)
199
+ def remove_directory(dir_to_remove, server)
200
+ @contents_info.keys.each { |checksum|
201
+ instances = @contents_info[checksum][1]
202
+ instances.delete_if { |location, _|
203
+ location[0] == server and location[1].scan(dir_to_remove).size > 0
204
+ }
205
+ @contents_info.delete(checksum) if instances.empty?
174
206
  }
175
207
  end
176
208
 
209
+
177
210
  def ==(other)
178
- return false if other == nil
179
- return false unless @contents.size == other.contents.size
180
- return false unless @instances.size == other.instances.size
181
-
182
- @contents.keys.each { |key|
183
- if (@contents[key] != other.contents[key])
184
- Log.debug1 @contents[key].first_appearance_time.to_i
185
- Log.debug1 other.contents[key].first_appearance_time.to_i
186
- return false
187
- end
188
- }
189
- @instances.keys.each { |key|
190
- if (@instances[key] != other.instances[key])
191
- return false
192
- end
211
+ return false if other.nil?
212
+ return false if @contents_info.size != other.contents_size
213
+ other.each_instance { |checksum, size, content_mod_time, instance_mod_time, server, path|
214
+ local_content_info = @contents_info[checksum]
215
+ return false if local_content_info.nil?
216
+ return false if local_content_info[0] != size
217
+ return false if local_content_info[2] != content_mod_time
218
+ #check instances
219
+ local_instances = local_content_info[1]
220
+ return false if other.instances_size(checksum) != local_instances.size
221
+ location = [server, path]
222
+ local_instance_mod_time = local_instances[location]
223
+ return false if local_instance_mod_time.nil?
224
+ return false if local_instance_mod_time != instance_mod_time
193
225
  }
194
- return true
226
+ true
227
+ end
228
+
229
+ def remove_content(checksum)
230
+ @contents_info.delete(checksum)
195
231
  end
196
232
 
197
233
  def to_s
198
- ret = ""
199
- ret << @contents.length.to_s << "\n"
200
- @contents.each_value { |content|
201
- ret << content.to_s << "\n"
234
+ return_str = ""
235
+ contents_str = ""
236
+ instances_str = ""
237
+ instances_counter = 0
238
+ each_content { |checksum, size, content_mod_time|
239
+ contents_str << "%s,%d,%d\n" % [checksum, size, content_mod_time]
202
240
  }
203
- ret << @instances.length.to_s << "\n"
204
- @instances.each_value { |instance|
205
- ret << instance.to_s << "\n"
241
+ instances_counter = 0
242
+ each_instance { |checksum, size, content_mod_time, instance_mod_time, server, path|
243
+ instances_counter += 1
244
+ instances_str << "%s,%d,%s,%s,%d\n" % [checksum, size, server, path, instance_mod_time]
206
245
  }
207
- return ret
246
+ return_str << "%d\n" % [@contents_info.size]
247
+ return_str << contents_str
248
+ return_str << "%d\n" % [instances_counter]
249
+ return_str << instances_str
250
+ return_str
208
251
  end
209
252
 
210
253
  def to_file(filename)
@@ -216,192 +259,61 @@ module ContentData
216
259
  # TODO validation that file indeed contains ContentData missing
217
260
  def from_file(filename)
218
261
  lines = IO.readlines(filename)
219
- i = 0
220
- number_of_contents = lines[i].to_i
221
- i += 1
222
- number_of_contents.times {
223
- parameters = lines[i].split(",")
224
- add_content(Content.new(parameters[0],
225
- parameters[1].to_i,
226
- ContentData.parse_time(parameters[2])))
227
- i += 1
228
- }
229
-
262
+ number_of_contents = lines[0].to_i
263
+ i = 1 + number_of_contents
230
264
  number_of_instances = lines[i].to_i
231
265
  i += 1
232
266
  number_of_instances.times {
233
267
  if lines[i].nil?
234
- Log.debug1 "lines[i] if nil !!!, Backing filename: #{filename} to #{filename}.bad"
268
+ Log.warning "line ##{i} is nil !!!, Backing filename: #{filename} to #{filename}.bad"
235
269
  FileUtils.cp(filename, "#{filename}.bad")
236
- Log.debug1 lines[i].join("\n")
237
- end
238
- parameters = lines[i].split(',')
239
- # bugfix: if file name consist a comma then parsing based on comma separating fails
240
- if (parameters.size > 6)
241
- (5..parameters.size-2).each do |i|
242
- parameters[4] = [parameters[4], parameters[i]].join(",")
243
- end
244
- (5..parameters.size-2).each do |i|
245
- parameters.delete_at(5)
270
+ Log.warning("Lines:\n#{lines[i].join("\n")}")
271
+ else
272
+ parameters = lines[i].split(',')
273
+ # bugfix: if a file name contains a comma then parsing based on comma separation fails
274
+ if (parameters.size > 5)
275
+ (4..parameters.size-2).each do |i|
276
+ parameters[3] = [parameters[3], parameters[i]].join(",")
277
+ end
278
+ (4..parameters.size-2).each do |i|
279
+ parameters.delete_at(4)
280
+ end
246
281
  end
247
- end
248
282
 
249
- add_instance(ContentInstance.new(parameters[0],
250
- parameters[1].to_i,
251
- parameters[2],
252
- parameters[3],
253
- parameters[4],
254
- ContentData.parse_time(parameters[5])))
283
+ add_instance(parameters[0],
284
+ parameters[1].to_i,
285
+ parameters[2],
286
+ parameters[3],
287
+ parameters[4].to_i)
288
+ end
255
289
  i += 1
256
290
  }
257
291
  end
258
292
 
259
- def self.parse_time time_str
260
- return nil unless time_str.instance_of? String
261
- seconds_from_epoch = Integer time_str # Not using to_i here because it does not check string is integer.
262
- time = Time.at seconds_from_epoch
263
- end
264
-
265
- def self.format_time(time)
266
- return nil unless time.instance_of?Time
267
- str = time.to_i.to_s
268
- return str
269
- end
270
-
271
- # merges content data a and content data b to a new content data and returns it.
272
- def self.merge(a, b)
273
- return b unless not a.nil?
274
- return a unless not b.nil?
275
-
276
- return nil unless a.instance_of?ContentData
277
- return nil unless b.instance_of?ContentData
278
-
279
- ret = ContentData.new
280
- ret.merge(a)
281
- ret.merge(b)
282
-
283
- return ret
284
- end
285
-
286
- # removed content data a from content data b and returns the new content data.
287
- def self.remove(a, b)
288
- return nil unless a.instance_of?ContentData
289
- return nil unless b.instance_of?ContentData
290
-
291
- ret = ContentData.new
292
-
293
- b.contents.values.each { |content|
294
- #print "%s - %s\n" % [content.checksum, a.content_exists(content.checksum).to_s]
295
- ret.add_content(content) unless a.content_exists(content.checksum)
296
- }
297
-
298
- #Log.debug1 "kaka"
299
-
300
- b.instances.values.each { |instance|
301
- #print "%s - %s\n" % [instance.checksum, a.content_exists(instance.checksum).to_s]
302
- ret.add_instance(instance) unless a.content_exists(instance.checksum)
303
- }
304
-
305
- #print "kuku %s" % ret.contents.size.to_s
306
- #print "kuku %s" % ret.instances.size.to_s
307
- return ret
308
- end
309
-
310
- def self.remove_instances(a, b)
311
- return nil unless a.instance_of?ContentData
312
- return nil unless b.instance_of?ContentData
313
-
314
- ret = ContentData.new
315
- b.instances.values.each do |instance|
316
- if !a.instances.key?(instance.global_path)
317
- ret.add_content(b.contents[instance.checksum])
318
- ret.add_instance(instance)
319
- end
320
- end
321
- return ret
322
- end
323
-
324
- def self.remove_directory(cd, global_dir_path)
325
- return nil unless cd.instance_of?ContentData
326
-
327
- ret = ContentData.new
328
- cd.instances.values.each do |instance|
329
- if instance.global_path.scan(global_dir_path).size == 0
330
- ret.add_content(cd.contents[instance.checksum])
331
- ret.add_instance(instance)
332
- end
333
- end
334
- return ret
335
- end
336
-
337
- # returns the common content in both a and b
338
- def self.intersect(a, b)
339
- b_minus_a = ContentData.remove(a, b)
340
- return ContentData.remove(b_minus_a, b)
341
- end
342
-
343
- # unify time for all entries with same content to minimal time
344
- def self.unify_time(db)
345
- mod_db = ContentData.new # resulting ContentData that will consists objects with unified time
346
- checksum2time = Hash.new # key=checksum value=min_time_for_this_checksum
347
- checksum2instances = Hash.new # key=checksum value=array_of_instances_with_this_checksum (Will be replaced with ContentData method)
348
-
349
- # populate tables with given ContentData entries
350
- db.instances.each_value do |instance|
351
- checksum = instance.checksum
352
- time = instance.modification_time
353
-
354
- unless (checksum2instances.has_key? checksum)
355
- checksum2instances[checksum] = []
356
- end
357
- checksum2instances[checksum] << instance
358
-
359
- if (not checksum2time.has_key? checksum)
360
- checksum2time[checksum] = time
361
- elsif ((checksum2time[checksum] <=> time) > 0)
362
- checksum2time[checksum] = time
363
- end
364
- end
365
-
366
- # update min time table with time information from contents
367
- db.contents.each do |checksum, content|
368
- time = content.first_appearance_time
369
- if (not checksum2time.has_key? checksum)
370
- checksum2time[checksum] = time
371
- elsif ((checksum2time[checksum] <=> time) > 0)
372
- checksum2time[checksum] = time
373
- end
374
- end
375
-
376
- # add content entries to the output table. in need of case update time field with found min time
377
- db.contents.each do |checksum, content|
378
- time = checksum2time[checksum]
379
- if ((content.first_appearance_time <=> time) == 0)
380
- mod_db.add_content(content)
381
- else
382
- mod_db.add_content(Content.new(checksum, content.size, time))
383
- end
384
- end
385
-
386
- # add instance entries to the output table. in need of case update time field with found min time
387
- checksum2instances.each do |checksum, instances|
388
- time = checksum2time[checksum]
389
- instances.each do |instance|
390
- if ((instance.modification_time <=> time) == 0)
391
- mod_db.add_instance(instance)
392
- else # must be bigger then found min time
393
- mod_instance = ContentInstance.new(instance.checksum, instance.size,
394
- instance.server_name, instance.device,
395
- instance.full_path, time)
396
- mod_db.add_instance(mod_instance)
293
+ # for each content, all time fields (content and instances) are replaced with the
294
+ # min time found, while going through all time fields.
295
+ def unify_time()
296
+ @contents_info.keys.each { |checksum|
297
+ content_info = @contents_info[checksum]
298
+ min_time_per_checksum = content_info[2]
299
+ instances = content_info[1]
300
+ instances.keys.each { |location|
301
+ instance_mod_time = instances[location]
302
+ if instance_mod_time < min_time_per_checksum
303
+ min_time_per_checksum = instance_mod_time
397
304
  end
398
- end
399
- end
400
- mod_db
305
+ }
306
+ # update all instances with min time
307
+ instances.keys.each { |location|
308
+ instances[location] = min_time_per_checksum
309
+ }
310
+ # update content time with min time
311
+ content_info[2] = min_time_per_checksum
312
+ }
401
313
  end
402
314
 
403
315
  # Validates index against file system that all instances hold a correct data regarding files
404
- # that they represrents.
316
+ # that they represent.
405
317
  #
406
318
  # There are two levels of validation, controlled by instance_check_level system parameter:
407
319
  # * shallow - quick, tests instance for file existence and attributes.
@@ -411,6 +323,7 @@ module ContentData
411
323
  # Supported key/value combinations:
412
324
  # * key is <tt>:failed</tt> value is <tt>ContentData</tt> used to return failed instances
413
325
  # @return [Boolean] true when index is correct, false otherwise
326
+ # @raise [ArgumentError] when instance_check_level is incorrect
414
327
  def validate(params = nil)
415
328
  # used to answer whether specific param was set
416
329
  param_exists = Proc.new do |param|
@@ -419,47 +332,65 @@ module ContentData
419
332
 
420
333
  # used to process method parameters centrally
421
334
  process_params = Proc.new do |values|
422
- # values is a Hash with keys: :content, :instance and value appropriate to key
423
- if param_exists.call :failed
424
- unless values[:content].nil?
425
- params[:failed].add_content values[:content]
426
- end
427
- unless values[:instance].nil?
428
- # appropriate content should be already added
429
- params[:failed].add_instance values[:instance]
335
+ if param_exists.call(:failed)
336
+ info = values[:details]
337
+ unless info.nil?
338
+ checksum = info[0]
339
+ content_mtime = info[1]
340
+ size = info[2]
341
+ inst_mtime = info[3]
342
+ server = info[4]
343
+ file_path = info[5]
344
+ params[:failed].add_instance(checksum, size, server, file_path, inst_mtime)
430
345
  end
431
346
  end
432
347
  end
433
348
 
434
349
  is_valid = true
435
- instances.each_value do |instance|
436
- unless check_instance instance
437
- is_valid = false
438
-
439
- unless params.nil? || params.empty?
440
- process_params.call :content => contents[instance.checksum], :instance => instance
350
+ @contents_info.keys.each { |checksum|
351
+ instances = @contents_info[checksum]
352
+ content_size = instances[0]
353
+ content_mtime = instances[2]
354
+ instances[1].keys.each { |unique_path|
355
+ instance_mtime = instances[1][unique_path]
356
+ instance_info = [checksum, content_mtime, content_size, instance_mtime]
357
+ instance_info.concat(unique_path)
358
+ unless check_instance(instance_info)
359
+ is_valid = false
360
+
361
+ unless params.nil? || params.empty?
362
+ process_params.call({:details => instance_info})
363
+ end
441
364
  end
442
- end
443
- end
444
-
365
+ }
366
+ }
445
367
  is_valid
446
368
  end
447
369
 
448
- def shallow_check(instance)
449
- path = instance.full_path
370
+ # instance_info is an array:
371
+ # [0] - checksum
372
+ # [1] - content time
373
+ # [2] - content size
374
+ # [3] - instance mtime
375
+ # [4] - server name
376
+ # [5] - file path
377
+ def shallow_check(instance_info)
378
+ path = instance_info[5]
379
+ size = instance_info[2]
380
+ instance_mtime = instance_info[3]
450
381
  is_valid = true
451
382
 
452
383
  if (File.exists?(path))
453
- if File.size(path) != instance.size
384
+ if File.size(path) != size
454
385
  is_valid = false
455
- err_msg = "#{path} size #{File.size(path)} differs from indexed size #{instance.size}"
386
+ err_msg = "#{path} size #{File.size(path)} differs from indexed size #{size}"
456
387
  Log.warning err_msg
457
388
  end
458
389
  #if ContentData.format_time(File.mtime(path)) != instance.modification_time
459
- if File.mtime(path).to_i != instance.modification_time.to_i
390
+ if File.mtime(path).to_i != instance_mtime
460
391
  is_valid = false
461
- err_msg = "#{path} modification time #{File.mtime(path)} differs from " \
462
- + "indexed #{instance.modification_time}"
392
+ err_msg = "#{path} modification time #{File.mtime(path).to_i} differs from " \
393
+ + "indexed #{instance_mtime}"
463
394
  Log.warning err_msg
464
395
  end
465
396
  else
@@ -470,14 +401,22 @@ module ContentData
470
401
  is_valid
471
402
  end
472
403
 
473
- def deep_check(instance)
474
- if shallow_check(instance)
475
- path = instance.full_path
404
+ # instance_info is an array:
405
+ # [0] - checksum
406
+ # [1] - content time
407
+ # [2] - content size
408
+ # [3] - instance mtime
409
+ # [4] - server name
410
+ # [5] - file path
411
+ def deep_check(instance_info)
412
+ if shallow_check(instance_info)
413
+ instance_checksum = instance_info[0]
414
+ path = instance_info[5]
476
415
  current_checksum = FileIndexing::IndexAgent.get_checksum(path)
477
- if instance.checksum == current_checksum
416
+ if instance_checksum == current_checksum
478
417
  true
479
418
  else
480
- err_msg = "#{path} checksum #{current_checksum} differs from indexed #{instance.checksum}"
419
+ err_msg = "#{path} checksum #{current_checksum} differs from indexed #{instance_checksum}"
481
420
  Log.warning err_msg
482
421
  false
483
422
  end
@@ -486,6 +425,7 @@ module ContentData
486
425
  end
487
426
  end
488
427
 
428
+ # @raise [ArgumentError] when instance_check_level is incorrect
489
429
  def check_instance(instance)
490
430
  case Params['instance_check_level']
491
431
  when 'deep'
@@ -558,163 +498,115 @@ module ContentData
558
498
  private :shallow_check, :deep_check, :check_instance
559
499
  end
560
500
 
561
- # Validates index against file system that all instances hold a correct data regarding files
562
- # that they represrents.
563
- #
564
- # There are two levels of validation, controlled by instance_check_level system parameter:
565
- # * shallow - quick, tests instance for file existence and attributes.
566
- # * deep - can take more time, in addition to shallow recalculates hash sum.
567
- # @param [Hash] params hash of parameters of validation, can be used to return additional data.
568
- #
569
- # Supported key/value combinations:
570
- # * key is <tt>:failed</tt> value is <tt>ContentData</tt> used to return failed instances
571
- # @return [Boolean] true when index is correct, false otherwise
572
- # @raise [ArgumentError] when instance_check_level is incorrect
573
- def validate(params = nil)
574
- # used to answer whether specific param was set
575
- param_exists = Proc.new do |param|
576
- !(params.nil? || params[param].nil?)
577
- end
578
-
579
- # used to process method parameters centrally
580
- process_params = Proc.new do |values|
581
- # values is a Hash with keys: :content, :instance and value appropriate to key
582
- if param_exists.call :failed
583
- unless values[:content].nil?
584
- params[:failed].add_content values[:content]
585
- end
586
- unless values[:instance].nil?
587
- # appropriate content should be already added
588
- params[:failed].add_instance values[:instance]
589
- end
590
- end
591
- end
592
-
593
- is_valid = true
594
- instances.each_value do |instance|
595
- unless check_instance instance
596
- is_valid = false
597
-
598
- unless params.nil? || params.empty?
599
- process_params.call :content => contents[instance.checksum], :instance => instance
600
- end
601
- end
602
- end
603
-
604
- is_valid
501
+ # Merges content data a and content data b into a new content data and returns it; if one argument is nil, a new ContentData built from the other is returned.
502
+ def self.merge(a, b)
503
+ return ContentData.new(a) if b.nil?
504
+ return ContentData.new(b) if a.nil?
505
+ c = ContentData.new(b)
506
+ # Add the instances of a to content data c, which was built from b
507
+ a.each_instance { |checksum, size, content_mod_time, instance_mod_time, server, path|
508
+ c.add_instance(checksum, size, server, path, instance_mod_time)
509
+ }
510
+ c
605
511
  end
606
512
 
607
- def shallow_check(instance)
608
- path = instance.full_path
609
- is_valid = true
610
-
611
- if (File.exists?(path))
612
- if File.size(path) != instance.size
613
- is_valid = false
614
- err_msg = "#{path} size #{File.size(path)} differs from indexed size #{instance.size}"
615
- Log.warning err_msg
616
- end
617
- #if ContentData.format_time(File.mtime(path)) != instance.modification_time
618
- if File.mtime(path).to_i != instance.modification_time.to_i
619
- is_valid = false
620
- err_msg = "#{path} modification time #{File.mtime(path)} differs from " \
621
- + "indexed #{instance.modification_time}"
622
- Log.warning err_msg
623
- end
624
- else
625
- is_valid = false
626
- err_msg = "Indexed file #{path} doesn't exist"
627
- Log.warning err_msg
628
- end
629
- is_valid
513
+ def self.merge_override_b(a, b)
514
+ return ContentData.new(a) if b.nil?
515
+ return ContentData.new(b) if a.nil?
516
+ # Add the instances of a directly to b itself (no new ContentData is built on this path); b is then returned
517
+ a.each_instance { |checksum, size, content_mod_time, instance_mod_time, server, path|
518
+ b.add_instance(checksum, size, server, path, instance_mod_time)
519
+ }
520
+ b
630
521
  end
631
522
 
632
- def deep_check(instance)
633
- if shallow_check(instance)
634
- path = instance.full_path
635
- current_checksum = FileIndexing::IndexAgent.get_checksum(path)
636
- if instance.checksum == current_checksum
637
- true
638
- else
639
- err_msg = "#{path} checksum #{current_checksum} differs from indexed #{instance.checksum}"
640
- Log.warning err_msg
641
- false
642
- end
643
- else
644
- false
645
- end
523
+ # B - A : Remove contents of A from B and return the new content data (nil if B is nil; a new ContentData built from B if A is nil).
524
+ # Instances are ignored: each content of A is removed from the result by its checksum.
525
+ # e.g.
526
+ # A db:
527
+ # Content_1 ->
528
+ # Instance_1
529
+ # Instance_2
530
+ #
531
+ # Content_2 ->
532
+ # Instance_3
533
+ #
534
+ # B db:
535
+ # Content_1 ->
536
+ # Instance_1
537
+ # Instance_2
538
+ #
539
+ # Content_2 ->
540
+ # Instance_3
541
+ # Instance_4
542
+ # Content_3 ->
543
+ # Instance_5
544
+ # B-A db:
545
+ # Content_3 ->
546
+ # Instance_5
547
+ def self.remove(a, b)
548
+ return nil if b.nil?
549
+ return ContentData.new(b) if a.nil?
550
+ c = ContentData.new(b) # create new cloned content C from B
551
+ # remove contents of A from the newly cloned content C
552
+ a.each_content { |checksum, size, content_mod_time|
553
+ c.remove_content(checksum)
554
+ }
555
+ c
646
556
  end
647
557
 
648
- # @raise [ArgumentError] when instance_check_level is incorrect
649
- def check_instance(instance)
650
- case Params['instance_check_level']
651
- when 'deep'
652
- deep_check instance
653
- when 'shallow'
654
- shallow_check instance
655
- else
656
- # TODO remove it when params will support set of values
657
- throw ArgumentError.new "Unsupported check level #{Params['instance_check_level']}"
658
- end
558
+ # B - A : Remove the instances of content data A from content data B and return the new content data (nil if B is nil).
559
+ # If all instances are removed then the content record itself will be removed
560
+ # e.g.
561
+ # A db:
562
+ # Content_1 ->
563
+ # Instance_1
564
+ # Instance_2
565
+ #
566
+ # Content_2 ->
567
+ # Instance_3
568
+ #
569
+ # B db:
570
+ # Content_1 ->
571
+ # Instance_1
572
+ # Instance_2
573
+ #
574
+ # Content_2 ->
575
+ # Instance_3
576
+ # Instance_4
577
+ # B-A db:
578
+ # Content_2 ->
579
+ # Instance_4
580
+ def self.remove_instances(a, b)
581
+ return nil if b.nil?
582
+ return ContentData.new(b) if a.nil?
583
+ c = ContentData.new(b) # create new cloned content C from B
584
+ # remove instances of A from the newly cloned content C; each instance is identified by its [server, path] location and checksum
585
+ a.each_instance { |checksum, size, content_mod_time, instance_mod_time, server, path|
586
+ location = [server, path]
587
+ c.remove_instance(location, checksum)
588
+ }
589
+ c
659
590
  end
660
591
 
661
-
662
- # TODO simplify conditions
663
- # This mehod is experimental and shouldn\'t be used
664
- # nil is used to define +/- infinity for to/from method arguments
665
- # from/to values are exlusive in condition'a calculations
666
- # Need to take care about '==' operation that is used for object's comparison.
667
- # In need of case user should define it's own '==' implemementation.
668
- def get_query(variable, params)
669
- raise RuntimeError.new 'This method is experimental and shouldn\'t be used'
670
-
671
- exact = params['exact'].nil? ? Array.new : params['exact']
672
- from = params['from']
673
- to = params ['to']
674
- is_inside = params['is_inside']
675
-
676
- unless ContentInstance.new.instance_variable_defined?("@#{attribute}")
677
- raise ArgumentError "#{variable} isn't a ContentInstance variable"
678
- end
679
-
680
- if (exact.nil? && from.nil? && to.nil?)
681
- raise ArgumentError 'At least one of the argiments {exact, from, to} must be defined'
682
- end
683
-
684
- if (!(from.nil? || to.nil?) && from.kind_of?(to.class))
685
- raise ArgumentError 'to and from arguments should be comparable one with another'
686
- end
687
-
688
- # FIXME add support for from/to for Strings
689
- if ((!from.nil? && !from.kind_of?(Numeric.new.class))\
690
- || (!to.nil? && to.kind_of?(Numeric.new.class)))
691
- raise ArgumentError 'from and to options supported only for numeric values'
692
- end
693
-
694
- if (!exact.empty? && (!from.nil? || !to.nil?))
695
- raise ArgumentError 'exact and from/to options are mutually exclusive'
696
- end
697
-
698
- result_index = ContentData.new
699
- instances.each_value do |instance|
700
- is_match = false
701
- var_value = instance.instance_variable_get("@#{variable}")
702
-
703
- if exact.include? var_value
704
- is_match = true
705
- elsif (from.nil? || var_value > from) && (to.nil? || var_value < to)
706
- is_match = true
707
- end
708
-
709
- if (is_match && is_inside) || (!is_match && !is_inside)
710
- checksum = instance.checksum
711
- result_index.add_content(contents[checksum]) unless result_index.content_exists(checksum)
712
- result_index.add_instance instance
592
+ def self.remove_directory(content_data, dir_to_remove, server_to_remove)
593
+ return nil if content_data.nil?
594
+ result_content_data = ContentData.new()
595
+ content_data.each_instance { |checksum, size, content_mod_time, instance_mod_time, server, path|
596
+ # Keep the instance unless it is on server_to_remove and dir_to_remove matches somewhere in its path (scan matches anywhere in the path, not only at its start)
597
+ if (server_to_remove!=server) or (path.scan(dir_to_remove).size == 0)
598
+ result_content_data.add_instance(checksum.clone, size, server, path.clone, instance_mod_time)
713
599
  end
714
- end
715
- result_index
600
+ }
601
+ result_content_data
716
602
  end
717
603
 
718
- private :shallow_check, :deep_check, :check_instance, :get_query
604
+ # Returns the content common to both a and b, computed as b - (b - a) using remove; returns nil if either argument is nil.
605
+ def self.intersect(a, b)
606
+ return nil if a.nil?
607
+ return nil if b.nil?
608
+ b_minus_a = remove(a, b)
609
+ b_minus_b_minus_a = remove(b_minus_a, b)
610
+ end
719
611
  end
720
612