content_data 0.0.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,397 +2,722 @@ require 'log'
2
2
  require 'params'
3
3
  require 'time'
4
4
 
5
- module BBFS
6
- module ContentData
7
-
8
- class Content
9
- attr_reader :checksum, :size, :first_appearance_time
10
-
11
- def initialize(checksum, size, first_appearance_time, content_serializer = nil)
12
- if content_serializer != nil
13
- if (content_serializer.checksum == nil)
14
- raise ArgumentError.new("checksum have to be defined")
15
- else
16
- @checksum = content_serializer.checksum
17
- end
18
- if (content_serializer.size == nil)
19
- raise ArgumentError.new("size have to be defined")
20
- else
21
- @size = content_serializer.size
22
- end
23
- if (content_serializer.first_appearance_time == nil)
24
- raise ArgumentError.new("first_appearance_time have to be defined")
25
- else
26
- @first_appearance_time = ContentData.parse_time(content_serializer.first_appearance_time)
27
- end
28
-
5
+ module ContentData
6
+ Params.string('instance_check_level', 'shallow', 'Defines check level. Supported levels are: ' \
7
+ 'shallow - quick, tests instance for file existence and attributes. ' \
8
+ 'deep - can take more time, in addition to shallow recalculates hash sum.')
9
+
10
+ class Content
11
+ attr_reader :checksum, :size, :first_appearance_time
12
+
13
+ def initialize(checksum, size, first_appearance_time, content_serializer = nil)
14
+ if content_serializer != nil
15
+ if (content_serializer.checksum == nil)
16
+ raise ArgumentError.new("checksum have to be defined")
29
17
  else
30
- @checksum = checksum
31
- @size = size
32
- @first_appearance_time = first_appearance_time
18
+ @checksum = content_serializer.checksum
19
+ end
20
+ if (content_serializer.size == nil)
21
+ raise ArgumentError.new("size have to be defined")
22
+ else
23
+ @size = content_serializer.size
24
+ end
25
+ if (content_serializer.first_appearance_time == nil)
26
+ raise ArgumentError.new("first_appearance_time have to be defined")
27
+ else
28
+ @first_appearance_time = ContentData.parse_time(content_serializer.first_appearance_time)
33
29
  end
34
- end
35
30
 
36
- def to_s
37
- "%s,%d,%s" % [@checksum, @size, ContentData.format_time(@first_appearance_time)]
31
+ else
32
+ @checksum = checksum
33
+ @size = size
34
+ @first_appearance_time = first_appearance_time
38
35
  end
36
+ end
39
37
 
40
- def ==(other)
41
- return (self.checksum.eql? other.checksum and
42
- self.size.eql? other.size and
43
- self.first_appearance_time.to_i.eql? other.first_appearance_time.to_i)
44
- end
38
+ def to_s
39
+ "%s,%d,%s" % [@checksum, @size, ContentData.format_time(@first_appearance_time)]
45
40
  end
46
41
 
47
- class ContentInstance
48
- attr_reader :checksum, :size, :server_name, :device, :full_path, :modification_time
42
+ def ==(other)
43
+ return (self.checksum.eql? other.checksum and
44
+ self.size.eql? other.size and
45
+ self.first_appearance_time.to_i.eql? other.first_appearance_time.to_i)
46
+ end
47
+ end
49
48
 
50
- def initialize(checksum, size, server_name, device, full_path, modification_time, content_instance_serializer = nil)
51
- if content_instance_serializer != nil
52
- if (content_instance_serializer.checksum == nil)
53
- raise ArgumentError.new("checksum have to be defined")
54
- else
55
- @checksum = content_instance_serializer.checksum
56
- end
57
- if (content_instance_serializer.size == nil)
58
- raise ArgumentError.new("size have to be defined")
59
- else
60
- @size = content_instance_serializer.size
61
- end
62
- if (content_instance_serializer.modification_time == nil)
63
- raise ArgumentError.new("modification_time have to be defined")
64
- else
65
- @modification_time = ContentData.parse_time(content_instance_serializer.modification_time)
66
- end
67
- if (content_instance_serializer.server_name == nil)
68
- raise ArgumentError.new("server_name have to be defined")
69
- else
70
- @server_name = content_instance_serializer.server_name
71
- end
72
- if (content_instance_serializer.device == nil)
73
- raise ArgumentError.new("device have to be defined")
74
- else
75
- @device = content_instance_serializer.device
76
- end
77
- if (content_instance_serializer.full_path == nil)
78
- raise ArgumentError.new("full_path have to be defined")
79
- else
80
- @full_path = content_instance_serializer.full_path
81
- end
49
+ class ContentInstance
50
+ attr_reader :checksum, :size, :server_name, :device, :full_path, :modification_time
51
+
52
+ def initialize(checksum, size, server_name, device, full_path, modification_time, content_instance_serializer = nil)
53
+ if content_instance_serializer != nil
54
+ if (content_instance_serializer.checksum == nil)
55
+ raise ArgumentError.new("checksum have to be defined")
56
+ else
57
+ @checksum = content_instance_serializer.checksum
58
+ end
59
+ if (content_instance_serializer.size == nil)
60
+ raise ArgumentError.new("size have to be defined")
82
61
  else
83
- @checksum = checksum
84
- @size = size
85
- @server_name = server_name
86
- @device = device
87
- @full_path = full_path
88
- @modification_time = modification_time
62
+ @size = content_instance_serializer.size
89
63
  end
64
+ if (content_instance_serializer.modification_time == nil)
65
+ raise ArgumentError.new("modification_time have to be defined")
66
+ else
67
+ @modification_time = ContentData.parse_time(content_instance_serializer.modification_time)
68
+ end
69
+ if (content_instance_serializer.server_name == nil)
70
+ raise ArgumentError.new("server_name have to be defined")
71
+ else
72
+ @server_name = content_instance_serializer.server_name
73
+ end
74
+ if (content_instance_serializer.device == nil)
75
+ raise ArgumentError.new("device have to be defined")
76
+ else
77
+ @device = content_instance_serializer.device
78
+ end
79
+ if (content_instance_serializer.full_path == nil)
80
+ raise ArgumentError.new("full_path have to be defined")
81
+ else
82
+ @full_path = content_instance_serializer.full_path
83
+ end
84
+ else
85
+ @checksum = checksum
86
+ @size = size
87
+ @server_name = server_name
88
+ @device = device
89
+ @full_path = full_path
90
+ @modification_time = modification_time
90
91
  end
92
+ end
91
93
 
92
- def global_path
93
- ContentInstance.instance_global_path(@server_name, @full_path)
94
- end
94
+ def global_path
95
+ ContentInstance.instance_global_path(@server_name, @full_path)
96
+ end
95
97
 
96
- def ContentInstance.instance_global_path(server_name, full_path)
97
- "%s:%s" % [server_name, full_path]
98
- end
98
+ def ContentInstance.instance_global_path(server_name, full_path)
99
+ "%s:%s" % [server_name, full_path]
100
+ end
99
101
 
100
- def to_s
101
- "%s,%d,%s,%s,%s,%s" % [@checksum, @size, @server_name,
102
- @device, @full_path, ContentData.format_time(@modification_time)]
103
- end
102
+ def to_s
103
+ "%s,%d,%s,%s,%s,%s" % [@checksum, @size, @server_name,
104
+ @device, @full_path, ContentData.format_time(@modification_time)]
105
+ end
104
106
 
105
- def ==(other)
106
- return (self.checksum.eql? other.checksum and
107
- self.size.eql? other.size and
108
- self.server_name.eql? other.server_name and
109
- self.device.eql? other.device and
110
- self.full_path.eql? other.full_path and
111
- self.modification_time.to_i.eql? other.modification_time.to_i)
107
+ def ==(other)
108
+ return (self.checksum.eql? other.checksum and
109
+ self.size.eql? other.size and
110
+ self.server_name.eql? other.server_name and
111
+ self.device.eql? other.device and
112
+ self.full_path.eql? other.full_path and
113
+ self.modification_time.to_i.eql? other.modification_time.to_i)
114
+ end
115
+ end
116
+
117
+ # Unfortunately this class is used as mutable for now. So need to be carefull.
118
+ # TODO(kolman): Make this class imutable, but add indexing structure to it.
119
+ # TODO(kolman): Add wrapper to the class to enable dynamic content data
120
+ # (with easy access indexes)
121
+ class ContentData
122
+ attr_reader :contents, :instances
123
+
124
+ # @param content_data_serializer_str [String]
125
+ def initialize(copy = nil)
126
+ if copy.nil?
127
+ @contents = Hash.new # key is a checksum , value is a refernce to the Content object
128
+ @instances = Hash.new # key is an instance global path , value is a reference to the ContentInstance object
129
+ else
130
+ # Regenerate only the hashes, the values are immutable.
131
+ @contents = copy.contents.clone
132
+ @instances = copy.instances.clone
112
133
  end
113
134
  end
114
135
 
115
- # Unfortunately this class is used as mutable for now. So need to be carefull.
116
- # TODO(kolman): Make this class imutable, but add indexing structure to it.
117
- # TODO(kolman): Add wrapper to the class to enable dynamic content data
118
- # (with easy access indexes)
119
- class ContentData
120
- attr_reader :contents, :instances
136
+ def add_content(content)
137
+ @contents[content.checksum] = content
138
+ end
121
139
 
122
- # @param content_data_serializer_str [String]
123
- def initialize(copy = nil)
124
- if copy.nil?
125
- @contents = Hash.new # key is a checksum , value is a refernce to the Content object
126
- @instances = Hash.new # key is an instance global path , value is a reference to the ContentInstance object
127
- else
128
- # Regenerate only the hashes, the values are immutable.
129
- @contents = copy.contents.clone
130
- @instances = copy.instances.clone
131
- end
140
+ def add_instance(instance)
141
+ if (not @contents.key?(instance.checksum))
142
+ Log.warning sprintf("Adding instance while it's" +
143
+ " checksum %s does not exists.\n", instance.checksum)
144
+ Log.warning sprintf("%s\n", instance.to_s)
145
+ return false
146
+ elsif (@contents[instance.checksum].size != instance.size)
147
+ Log.warning 'File size different from content size while same checksum'
148
+ Log.warning instance.to_s
149
+ return false
132
150
  end
133
151
 
134
- def add_content(content)
135
- @contents[content.checksum] = content
136
- end
152
+ key = instance.global_path
137
153
 
138
- def add_instance(instance)
139
- if (not @contents.key?(instance.checksum))
140
- Log.warning sprintf("Adding instance while it's" +
141
- " checksum %s does not exists.\n", instance.checksum)
142
- Log.warning sprintf("%s\n", instance.to_s)
154
+ #override file if needed
155
+ @instances[key] = instance
156
+ end
157
+
158
+ def empty?
159
+ @contents.empty?
160
+ end
161
+
162
+ # TODO rename method with finishing '?', cause it returns a boolean
163
+ def content_exists(checksum)
164
+ @contents.key? checksum
165
+ end
166
+
167
+ # TODO(kolman): The semantics of thir merge is merge! change in all file.
168
+ def merge(content_data)
169
+ content_data.contents.values.each { |content|
170
+ add_content(content)
171
+ }
172
+ content_data.instances.values.each { |instance|
173
+ add_instance(instance)
174
+ }
175
+ end
176
+
177
+ def ==(other)
178
+ return false if other == nil
179
+ return false unless @contents.size == other.contents.size
180
+ return false unless @instances.size == other.instances.size
181
+
182
+ @contents.keys.each { |key|
183
+ if (@contents[key] != other.contents[key])
184
+ Log.info @contents[key].first_appearance_time.to_i
185
+ Log.info other.contents[key].first_appearance_time.to_i
143
186
  return false
144
- elsif (@contents[instance.checksum].size != instance.size)
145
- Log.warning 'File size different from content size while same checksum'
146
- Log.warning instance.to_s
187
+ end
188
+ }
189
+ @instances.keys.each { |key|
190
+ if (@instances[key] != other.instances[key])
147
191
  return false
148
192
  end
193
+ }
194
+ return true
195
+ end
149
196
 
150
- key = instance.global_path
197
+ def to_s
198
+ ret = ""
199
+ ret << @contents.length.to_s << "\n"
200
+ @contents.each_value { |content|
201
+ ret << content.to_s << "\n"
202
+ }
203
+ ret << @instances.length.to_s << "\n"
204
+ @instances.each_value { |instance|
205
+ ret << instance.to_s << "\n"
206
+ }
207
+ return ret
208
+ end
209
+
210
+ def to_file(filename)
211
+ content_data_dir = File.dirname(filename)
212
+ FileUtils.makedirs(content_data_dir) unless File.directory?(content_data_dir)
213
+ File.open(filename, 'w') {|f| f.write(to_s) }
214
+ end
151
215
 
152
- #override file if needed
153
- @instances[key] = instance
216
+ # TODO validation that file indeed contains ContentData missing
217
+ def from_file(filename)
218
+ lines = IO.readlines(filename)
219
+ i = 0
220
+ number_of_contents = lines[i].to_i
221
+ i += 1
222
+ number_of_contents.times {
223
+ parameters = lines[i].split(",")
224
+ add_content(Content.new(parameters[0],
225
+ parameters[1].to_i,
226
+ ContentData.parse_time(parameters[2])))
227
+ i += 1
228
+ }
229
+
230
+ number_of_instances = lines[i].to_i
231
+ i += 1
232
+ number_of_instances.times {
233
+ if lines[i].nil?
234
+ Log.info "lines[i] if nil !!!, Backing filename: #{filename} to #{filename}.bad"
235
+ FileUtils.cp(filename, "#{filename}.bad")
236
+ Log.info lines[i].join("\n")
237
+ end
238
+ parameters = lines[i].split(',')
239
+ # bugfix: if file name consist a comma then parsing based on comma separating fails
240
+ if (parameters.size > 6)
241
+ (5..parameters.size-2).each do |i|
242
+ parameters[4] = [parameters[4], parameters[i]].join(",")
243
+ end
244
+ (5..parameters.size-2).each do |i|
245
+ parameters.delete_at(5)
246
+ end
247
+ end
248
+
249
+ add_instance(ContentInstance.new(parameters[0],
250
+ parameters[1].to_i,
251
+ parameters[2],
252
+ parameters[3],
253
+ parameters[4],
254
+ ContentData.parse_time(parameters[5])))
255
+ i += 1
256
+ }
257
+ end
258
+
259
+ def self.parse_time time_str
260
+ return nil unless time_str.instance_of? String
261
+ seconds_from_epoch = Integer time_str # Not using to_i here because it does not check string is integer.
262
+ time = Time.at seconds_from_epoch
263
+ end
264
+
265
+ def self.format_time(time)
266
+ return nil unless time.instance_of?Time
267
+ str = time.to_i.to_s
268
+ return str
269
+ end
270
+
271
+ # merges content data a and content data b to a new content data and returns it.
272
+ def self.merge(a, b)
273
+ return b unless not a.nil?
274
+ return a unless not b.nil?
275
+
276
+ return nil unless a.instance_of?ContentData
277
+ return nil unless b.instance_of?ContentData
278
+
279
+ ret = ContentData.new
280
+ ret.merge(a)
281
+ ret.merge(b)
282
+
283
+ return ret
284
+ end
285
+
286
+ # removed content data a from content data b and returns the new content data.
287
+ def self.remove(a, b)
288
+ return nil unless a.instance_of?ContentData
289
+ return nil unless b.instance_of?ContentData
290
+
291
+ ret = ContentData.new
292
+
293
+ b.contents.values.each { |content|
294
+ #print "%s - %s\n" % [content.checksum, a.content_exists(content.checksum).to_s]
295
+ ret.add_content(content) unless a.content_exists(content.checksum)
296
+ }
297
+
298
+ #Log.info "kaka"
299
+
300
+ b.instances.values.each { |instance|
301
+ #print "%s - %s\n" % [instance.checksum, a.content_exists(instance.checksum).to_s]
302
+ ret.add_instance(instance) unless a.content_exists(instance.checksum)
303
+ }
304
+
305
+ #print "kuku %s" % ret.contents.size.to_s
306
+ #print "kuku %s" % ret.instances.size.to_s
307
+ return ret
308
+ end
309
+
310
+ def self.remove_instances(a, b)
311
+ return nil unless a.instance_of?ContentData
312
+ return nil unless b.instance_of?ContentData
313
+
314
+ ret = ContentData.new
315
+ b.instances.values.each do |instance|
316
+ if !a.instances.key?(instance.global_path)
317
+ ret.add_content(b.contents[instance.checksum])
318
+ ret.add_instance(instance)
319
+ end
154
320
  end
321
+ return ret
322
+ end
155
323
 
156
- def empty?
157
- @contents.empty?
324
+ def self.remove_directory(cd, global_dir_path)
325
+ return nil unless cd.instance_of?ContentData
326
+
327
+ ret = ContentData.new
328
+ cd.instances.values.each do |instance|
329
+ Log.debug3("global path to check: #{global_dir_path}")
330
+ Log.debug3("instance global path: #{instance.global_path}")
331
+ if instance.global_path.scan(global_dir_path).size == 0
332
+ Log.debug3("Adding instance.")
333
+ ret.add_content(cd.contents[instance.checksum])
334
+ ret.add_instance(instance)
335
+ end
158
336
  end
337
+ return ret
338
+ end
339
+
340
+ # returns the common content in both a and b
341
+ def self.intersect(a, b)
342
+ b_minus_a = ContentData.remove(a, b)
343
+ return ContentData.remove(b_minus_a, b)
344
+ end
159
345
 
160
- def content_exists(checksum)
161
- @contents.key? checksum
346
+ # unify time for all entries with same content to minimal time
347
+ def self.unify_time(db)
348
+ mod_db = ContentData.new # resulting ContentData that will consists objects with unified time
349
+ checksum2time = Hash.new # key=checksum value=min_time_for_this_checksum
350
+ checksum2instances = Hash.new # key=checksum value=array_of_instances_with_this_checksum (Will be replaced with ContentData method)
351
+
352
+ # populate tables with given ContentData entries
353
+ db.instances.each_value do |instance|
354
+ checksum = instance.checksum
355
+ time = instance.modification_time
356
+
357
+ unless (checksum2instances.has_key? checksum)
358
+ checksum2instances[checksum] = []
359
+ end
360
+ checksum2instances[checksum] << instance
361
+
362
+ if (not checksum2time.has_key? checksum)
363
+ checksum2time[checksum] = time
364
+ elsif ((checksum2time[checksum] <=> time) > 0)
365
+ checksum2time[checksum] = time
366
+ end
162
367
  end
163
368
 
164
- # TODO(kolman): The semantics of thir merge is merge! change in all file.
165
- def merge(content_data)
166
- content_data.contents.values.each { |content|
167
- add_content(content)
168
- }
169
- content_data.instances.values.each { |instance|
170
- add_instance(instance)
171
- }
369
+ # update min time table with time information from contents
370
+ db.contents.each do |checksum, content|
371
+ time = content.first_appearance_time
372
+ if (not checksum2time.has_key? checksum)
373
+ checksum2time[checksum] = time
374
+ elsif ((checksum2time[checksum] <=> time) > 0)
375
+ checksum2time[checksum] = time
376
+ end
172
377
  end
173
378
 
174
- def ==(other)
175
- return false if other == nil
176
- return false unless @contents.size == other.contents.size
177
- return false unless @instances.size == other.instances.size
379
+ # add content entries to the output table. in need of case update time field with found min time
380
+ db.contents.each do |checksum, content|
381
+ time = checksum2time[checksum]
382
+ if ((content.first_appearance_time <=> time) == 0)
383
+ mod_db.add_content(content)
384
+ else
385
+ mod_db.add_content(Content.new(checksum, content.size, time))
386
+ end
387
+ end
178
388
 
179
- @contents.keys.each { |key|
180
- if (@contents[key] != other.contents[key])
181
- Log.info @contents[key].first_appearance_time.to_i
182
- Log.info other.contents[key].first_appearance_time.to_i
183
- return false
184
- end
185
- }
186
- @instances.keys.each { |key|
187
- if (@instances[key] != other.instances[key])
188
- return false
389
+ # add instance entries to the output table. in need of case update time field with found min time
390
+ checksum2instances.each do |checksum, instances|
391
+ time = checksum2time[checksum]
392
+ instances.each do |instance|
393
+ if ((instance.modification_time <=> time) == 0)
394
+ mod_db.add_instance(instance)
395
+ else # must be bigger then found min time
396
+ mod_instance = ContentInstance.new(instance.checksum, instance.size,
397
+ instance.server_name, instance.device,
398
+ instance.full_path, time)
399
+ mod_db.add_instance(mod_instance)
189
400
  end
190
- }
191
- return true
401
+ end
192
402
  end
403
+ mod_db
404
+ end
193
405
 
194
- def to_s
195
- ret = ""
196
- ret << @contents.length.to_s << "\n"
197
- @contents.each_value { |content|
198
- ret << content.to_s << "\n"
199
- }
200
- ret << @instances.length.to_s << "\n"
201
- @instances.each_value { |instance|
202
- ret << instance.to_s << "\n"
203
- }
204
- return ret
406
+ # Validates index against file system that all instances hold a correct data regarding files
407
+ # that they represrents.
408
+ #
409
+ # There are two levels of validation, controlled by instance_check_level system parameter:
410
+ # * shallow - quick, tests instance for file existence and attributes.
411
+ # * deep - can take more time, in addition to shallow recalculates hash sum.
412
+ # @param [Hash] params hash of parameters of validation, can be used to return additional data.
413
+ #
414
+ # Supported key/value combinations:
415
+ # * key is <tt>:failed</tt> value is <tt>ContentData</tt> used to return failed instances
416
+ # @return [Boolean] true when index is correct, false otherwise
417
+ def validate(params = nil)
418
+ # used to answer whether specific param was set
419
+ param_exists = Proc.new do |param|
420
+ !(params.nil? || params[param].nil?)
205
421
  end
206
422
 
207
- def to_file(filename)
208
- content_data_dir = File.dirname(filename)
209
- FileUtils.makedirs(content_data_dir) unless File.directory?(content_data_dir)
210
- File.open(filename, 'w') {|f| f.write(to_s) }
423
+ # used to process method parameters centrally
424
+ process_params = Proc.new do |values|
425
+ # values is a Hash with keys: :content, :instance and value appropriate to key
426
+ if param_exists.call :failed
427
+ unless values[:content].nil?
428
+ params[:failed].add_content values[:content]
429
+ end
430
+ unless values[:instance].nil?
431
+ # appropriate content should be already added
432
+ params[:failed].add_instance values[:instance]
433
+ end
434
+ end
211
435
  end
212
436
 
213
- def from_file(filename)
214
- lines = IO.readlines(filename)
215
- i = 0
216
- number_of_contents = lines[i].to_i
217
- i += 1
218
- number_of_contents.times {
219
- parameters = lines[i].split(",")
220
- add_content(Content.new(parameters[0],
221
- parameters[1].to_i,
222
- ContentData.parse_time(parameters[2])))
223
- i += 1
224
- }
225
-
226
- number_of_instances = lines[i].to_i
227
- i += 1
228
- number_of_instances.times {
229
- parameters = lines[i].split(',')
230
- # bugfix: if file name consist a comma then parsing based on comma separating fails
231
- if (parameters.size > 6)
232
- (5..parameters.size-2).each do |i|
233
- parameters[4] = [parameters[4], parameters[i]].join(",")
234
- end
235
- (5..parameters.size-2).each do |i|
236
- parameters.delete_at(5)
237
- end
437
+ is_valid = true
438
+ instances.each_value do |instance|
439
+ unless check_instance instance
440
+ is_valid = false
441
+
442
+ unless params.nil? || params.empty?
443
+ process_params.call :content => contents[instance.checksum], :instance => instance
238
444
  end
445
+ end
446
+ end
447
+
448
+ is_valid
449
+ end
239
450
 
240
- add_instance(ContentInstance.new(parameters[0],
241
- parameters[1].to_i,
242
- parameters[2],
243
- parameters[3],
244
- parameters[4],
245
- ContentData.parse_time(parameters[5])))
246
- i += 1
247
- }
451
+ def shallow_check(instance)
452
+ path = instance.full_path
453
+ is_valid = true
454
+
455
+ if (File.exists?(path))
456
+ if File.size(path) != instance.size
457
+ is_valid = false
458
+ err_msg = "#{path} size #{File.size(path)} differs from indexed size #{instance.size}"
459
+ Log.warning err_msg
460
+ end
461
+ #if ContentData.format_time(File.mtime(path)) != instance.modification_time
462
+ if File.mtime(path).to_i != instance.modification_time.to_i
463
+ is_valid = false
464
+ err_msg = "#{path} modification time #{File.mtime(path)} differs from " \
465
+ + "indexed #{instance.modification_time}"
466
+ Log.warning err_msg
467
+ end
468
+ else
469
+ is_valid = false
470
+ err_msg = "Indexed file #{path} doesn't exist"
471
+ Log.warning err_msg
248
472
  end
473
+ is_valid
474
+ end
249
475
 
250
- def self.parse_time time_str
251
- return nil unless time_str.instance_of? String
252
- seconds_from_epoch = Integer time_str # Not using to_i here because it does not check string is integer.
253
- time = Time.at seconds_from_epoch
476
+ def deep_check(instance)
477
+ if shallow_check(instance)
478
+ path = instance.full_path
479
+ current_checksum = FileIndexing::IndexAgent.get_checksum(path)
480
+ if instance.checksum == current_checksum
481
+ true
482
+ else
483
+ err_msg = "#{path} checksum #{current_checksum} differs from indexed #{instance.checksum}"
484
+ Log.warning err_msg
485
+ false
486
+ end
487
+ else
488
+ false
254
489
  end
490
+ end
255
491
 
256
- def self.format_time(time)
257
- return nil unless time.instance_of?Time
258
- str = time.to_i.to_s
259
- return str
492
+ def check_instance(instance)
493
+ case Params['instance_check_level']
494
+ when 'deep'
495
+ deep_check instance
496
+ when 'shallow'
497
+ shallow_check instance
498
+ else
499
+ # TODO remove it when params will support set of values
500
+ throw ArgumentError.new "Unsupported check level #{Params['instance_check_level']}"
260
501
  end
502
+ end
261
503
 
262
- # merges content data a and content data b to a new content data and returns it.
263
- def self.merge(a, b)
264
- return b unless not a.nil?
265
- return a unless not b.nil?
266
504
 
267
- return nil unless a.instance_of?ContentData
268
- return nil unless b.instance_of?ContentData
505
+ # TODO simplify conditions
506
+ # This mehod is experimental and shouldn\'t be used
507
+ # nil is used to define +/- infinity for to/from method arguments
508
+ # from/to values are exlusive in condition'a calculations
509
+ # Need to take care about '==' operation that is used for object's comparison.
510
+ # In need of case user should define it's own '==' implemementation.
511
+ def get_query(variable, params)
512
+ raise RuntimeError.new 'This method is experimental and shouldn\'t be used'
269
513
 
270
- ret = ContentData.new
271
- ret.merge(a)
272
- ret.merge(b)
514
+ exact = params['exact'].nil? ? Array.new : params['exact']
515
+ from = params['from']
516
+ to = params ['to']
517
+ is_inside = params['is_inside']
273
518
 
274
- return ret
519
+ unless ContentInstance.new.instance_variable_defined?("@#{attribute}")
520
+ raise ArgumentError "#{variable} isn't a ContentInstance variable"
275
521
  end
276
522
 
277
- # removed content data a from content data b and returns the new content data.
278
- def self.remove(a, b)
279
- return nil unless a.instance_of?ContentData
280
- return nil unless b.instance_of?ContentData
523
+ if (exact.nil? && from.nil? && to.nil?)
524
+ raise ArgumentError 'At least one of the argiments {exact, from, to} must be defined'
525
+ end
281
526
 
282
- ret = ContentData.new
527
+ if (!(from.nil? || to.nil?) && from.kind_of?(to.class))
528
+ raise ArgumentError 'to and from arguments should be comparable one with another'
529
+ end
283
530
 
284
- b.contents.values.each { |content|
285
- #print "%s - %s\n" % [content.checksum, a.content_exists(content.checksum).to_s]
286
- ret.add_content(content) unless a.content_exists(content.checksum)
287
- }
531
+ # FIXME add support for from/to for Strings
532
+ if ((!from.nil? && !from.kind_of?(Numeric.new.class))\
533
+ || (!to.nil? && to.kind_of?(Numeric.new.class)))
534
+ raise ArgumentError 'from and to options supported only for numeric values'
535
+ end
288
536
 
289
- #Log.info "kaka"
537
+ if (!exact.empty? && (!from.nil? || !to.nil?))
538
+ raise ArgumentError 'exact and from/to options are mutually exclusive'
539
+ end
290
540
 
291
- b.instances.values.each { |instance|
292
- #print "%s - %s\n" % [instance.checksum, a.content_exists(instance.checksum).to_s]
293
- ret.add_instance(instance) unless a.content_exists(instance.checksum)
294
- }
541
+ result_index = ContentData.new
542
+ instances.each_value do |instance|
543
+ is_match = false
544
+ var_value = instance.instance_variable_get("@#{variable}")
295
545
 
296
- #print "kuku %s" % ret.contents.size.to_s
297
- #print "kuku %s" % ret.instances.size.to_s
298
- return ret
546
+ if exact.include? var_value
547
+ is_match = true
548
+ elsif (from.nil? || var_value > from) && (to.nil? || var_value < to)
549
+ is_match = true
550
+ end
551
+
552
+ if (is_match && is_inside) || (!is_match && !is_inside)
553
+ checksum = instance.checksum
554
+ result_index.add_content(contents[checksum]) unless result_index.content_exists(checksum)
555
+ result_index.add_instance instance
556
+ end
299
557
  end
558
+ result_index
559
+ end
300
560
 
301
- def self.remove_instances(a, b)
302
- return nil unless a.instance_of?ContentData
303
- return nil unless b.instance_of?ContentData
561
+ private :shallow_check, :deep_check, :check_instance
562
+ end
304
563
 
305
- ret = ContentData.new
306
- b.instances.values.each do |instance|
307
- if !a.instances.key?(instance.global_path)
308
- ret.add_content(b.contents[instance.checksum])
309
- ret.add_instance(instance)
310
- end
564
+ # Validates index against file system that all instances hold a correct data regarding files
565
+ # that they represrents.
566
+ #
567
+ # There are two levels of validation, controlled by instance_check_level system parameter:
568
+ # * shallow - quick, tests instance for file existence and attributes.
569
+ # * deep - can take more time, in addition to shallow recalculates hash sum.
570
+ # @param [Hash] params hash of parameters of validation, can be used to return additional data.
571
+ #
572
+ # Supported key/value combinations:
573
+ # * key is <tt>:failed</tt> value is <tt>ContentData</tt> used to return failed instances
574
+ # @return [Boolean] true when index is correct, false otherwise
575
+ # @raise [ArgumentError] when instance_check_level is incorrect
576
+ def validate(params = nil)
577
+ # used to answer whether specific param was set
578
+ param_exists = Proc.new do |param|
579
+ !(params.nil? || params[param].nil?)
580
+ end
581
+
582
+ # used to process method parameters centrally
583
+ process_params = Proc.new do |values|
584
+ # values is a Hash with keys: :content, :instance and value appropriate to key
585
+ if param_exists.call :failed
586
+ unless values[:content].nil?
587
+ params[:failed].add_content values[:content]
588
+ end
589
+ unless values[:instance].nil?
590
+ # appropriate content should be already added
591
+ params[:failed].add_instance values[:instance]
311
592
  end
312
- return ret
313
593
  end
594
+ end
314
595
 
315
- def self.remove_directory(cd, global_dir_path)
316
- return nil unless cd.instance_of?ContentData
317
-
318
- ret = ContentData.new
319
- cd.instances.values.each do |instance|
320
- Log.debug3("global path to check: #{global_dir_path}")
321
- Log.debug3("instance global path: #{instance.global_path}")
322
- if instance.global_path.scan(global_dir_path).size == 0
323
- Log.debug3("Adding instance.")
324
- ret.add_content(cd.contents[instance.checksum])
325
- ret.add_instance(instance)
326
- end
596
+ is_valid = true
597
+ instances.each_value do |instance|
598
+ unless check_instance instance
599
+ is_valid = false
600
+
601
+ unless params.nil? || params.empty?
602
+ process_params.call :content => contents[instance.checksum], :instance => instance
327
603
  end
328
- return ret
329
604
  end
605
+ end
606
+
607
+ is_valid
608
+ end
609
+
610
# Quick validity check of an indexed instance against the file system.
#
# Verifies that the file at instance.full_path exists and that its size and
# modification time match the indexed values. Modification times are compared
# through #to_i, i.e. with whole-second resolution. Every mismatch is reported
# through Log.warning; size and modification time are both always checked so
# that all discrepancies end up in the log.
#
# @param instance [#full_path, #size, #modification_time] indexed instance
# @return [Boolean] true when the file exists and both attributes match
def shallow_check(instance)
  path = instance.full_path

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  unless File.exist?(path)
    Log.warning "Indexed file #{path} doesn't exist"
    return false
  end

  is_valid = true

  # Stat each attribute once and reuse the value in the log message.
  actual_size = File.size(path)
  if actual_size != instance.size
    is_valid = false
    Log.warning "#{path} size #{actual_size} differs from indexed size #{instance.size}"
  end

  actual_mtime = File.mtime(path)
  if actual_mtime.to_i != instance.modification_time.to_i
    is_valid = false
    Log.warning "#{path} modification time #{actual_mtime} differs from " \
                "indexed #{instance.modification_time}"
  end

  is_valid
end
336
634
 
337
- # unify time for all entries with same content to minimal time
338
- def self.unify_time(db)
339
- mod_db = ContentData.new # resulting ContentData that will consists objects with unified time
340
- checksum2time = Hash.new # key=checksum value=min_time_for_this_checksum
341
- checksum2instances = Hash.new # key=checksum value=array_of_instances_with_this_checksum (Will be replaced with ContentData method)
635
# Full validity check of an indexed instance.
#
# Runs shallow_check first; only when it passes is the checksum reported by
# FileIndexing::IndexAgent.get_checksum for the file compared with the indexed
# checksum. A checksum mismatch is reported through Log.warning.
#
# @param instance indexed instance providing full_path and checksum
# @return [Boolean] true only when the shallow check passes and checksums match
def deep_check(instance)
  return false unless shallow_check(instance)

  file_path = instance.full_path
  actual = FileIndexing::IndexAgent.get_checksum(file_path)
  return true if instance.checksum == actual

  Log.warning "#{file_path} checksum #{actual} differs from indexed #{instance.checksum}"
  false
end
342
650
 
343
- # populate tables with given ContentData entries
344
- db.instances.each_value do |instance|
345
- checksum = instance.checksum
346
- time = instance.modification_time
651
# Validates a single instance using the level configured in
# Params['instance_check_level'].
#
# @param instance the instance to validate; handed unchanged to the selected check
# @return the result of deep_check ('deep') or shallow_check ('shallow')
# @raise [ArgumentError] when instance_check_level is neither 'deep' nor 'shallow'
def check_instance(instance)
  level = Params['instance_check_level']
  case level
  when 'deep' then deep_check(instance)
  when 'shallow' then shallow_check(instance)
  else
    # TODO: remove it when params will support set of values
    # raise, not throw: throw belongs to catch/throw control flow and would
    # surface here as an "uncaught throw" error instead of this ArgumentError.
    raise ArgumentError, "Unsupported check level #{level}"
  end
end
347
663
 
348
- unless (checksum2instances.has_key? checksum)
349
- checksum2instances[checksum] = []
350
- end
351
- checksum2instances[checksum] << instance
352
664
 
353
- if (not checksum2time.has_key? checksum)
354
- checksum2time[checksum] = time
355
- elsif ((checksum2time[checksum] <=> time) > 0)
356
- checksum2time[checksum] = time
357
- end
358
- end
665
# TODO: simplify conditions
# This method is experimental and shouldn't be used: it raises unconditionally
# before doing any work, so everything below the guard is currently unreachable.
#
# Intended behaviour of the (disabled) query:
# * nil defines +/- infinity for the 'from'/'to' arguments;
# * 'from'/'to' bounds are exclusive in the condition's calculations;
# * '==' is used for object comparison (through Array#include? on 'exact'), so
#   callers needing other equality semantics should define their own '=='.
#
# @param variable [String] name (without '@') of the instance variable to query on
# @param params [Hash] string-keyed options: 'exact', 'from', 'to', 'is_inside'
# @return [ContentData] index holding the selected instances and their contents
# @raise [RuntimeError] always, while the method is experimental
# @raise [ArgumentError] (disabled path) on an invalid combination of options
def get_query(variable, params)
  raise RuntimeError, 'This method is experimental and shouldn\'t be used'

  exact = params['exact'] || []
  from = params['from']
  to = params['to']
  is_inside = params['is_inside'] ? true : false

  # NOTE(review): assumes ContentInstance can be built without arguments — confirm.
  unless ContentInstance.new.instance_variable_defined?("@#{variable}")
    raise ArgumentError, "#{variable} isn't a ContentInstance variable"
  end

  # exact defaults to an empty array, so emptiness (not nil) means "not given".
  if exact.empty? && from.nil? && to.nil?
    raise ArgumentError, 'At least one of the argiments {exact, from, to} must be defined'
  end

  # <=> returns nil when the two bounds cannot be ordered against each other.
  if !from.nil? && !to.nil? && (from <=> to).nil?
    raise ArgumentError, 'to and from arguments should be comparable one with another'
  end

  # FIXME: add support for from/to for Strings
  if [from, to].compact.any? { |bound| !bound.is_a?(Numeric) }
    raise ArgumentError, 'from and to options supported only for numeric values'
  end

  if !exact.empty? && !(from.nil? && to.nil?)
    raise ArgumentError, 'exact and from/to options are mutually exclusive'
  end

  result_index = ContentData.new
  instances.each_value do |instance|
    var_value = instance.instance_variable_get("@#{variable}")

    # The range test applies only when no exact values were given; otherwise
    # an unbounded range (from and to both nil) would match every instance.
    is_match =
      if exact.empty?
        (from.nil? || var_value > from) && (to.nil? || var_value < to)
      else
        exact.include?(var_value)
      end

    # is_inside selects matching instances; otherwise the complement is taken.
    next unless is_match == is_inside

    checksum = instance.checksum
    result_index.add_content(contents[checksum]) unless result_index.content_exists(checksum)
    result_index.add_instance instance
  end
  result_index
end
720
+
721
+ private :shallow_check, :deep_check, :check_instance, :get_query
398
722
  end
723
+