content_server 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +15 -0
  2. data/bin/file_utils +118 -0
  3. data/lib/content_data/content_data.rb +114 -48
  4. data/lib/content_server/version.rb +1 -1
  5. data/lib/file_monitoring/file_monitoring.rb +94 -50
  6. data/lib/file_monitoring/monitor_path.rb +196 -113
  7. data/lib/file_utils/file_utils.rb +10 -49
  8. data/lib/networking/tcp.rb +4 -4
  9. data/spec/content_data/content_data_spec.rb +331 -0
  10. data/spec/content_data/validations_spec.rb +5 -0
  11. data/spec/content_server/content_server_spec.rb +5 -0
  12. data/spec/content_server/file_streamer_spec.rb +5 -0
  13. data/spec/file_copy/copy_spec.rb +5 -0
  14. data/spec/file_indexing/index_agent_spec.rb +5 -0
  15. data/spec/networking/tcp_spec.rb +5 -0
  16. data/spec/validations/index_validations_spec.rb +5 -0
  17. metadata +9 -89
  18. data/test/content_data/content_data_test.rb +0 -291
  19. data/test/file_generator/file_generator_spec.rb +0 -85
  20. data/test/file_monitoring/monitor_path_test.rb +0 -189
  21. data/test/file_monitoring/monitor_path_test/dir1000/test_file.1000 +0 -1000
  22. data/test/file_monitoring/monitor_path_test/dir1000/test_file.1000.0 +0 -1000
  23. data/test/file_monitoring/monitor_path_test/dir1000/test_file.1000.1 +0 -1000
  24. data/test/file_monitoring/monitor_path_test/dir1500/test_file.1500 +0 -1500
  25. data/test/file_monitoring/monitor_path_test/dir1500/test_file.1500.0 +0 -1500
  26. data/test/file_monitoring/monitor_path_test/dir1500/test_file.1500.1 +0 -1500
  27. data/test/file_monitoring/monitor_path_test/test_file.500 +0 -500
  28. data/test/file_monitoring/monitor_path_test/test_file.500.0 +0 -500
  29. data/test/file_monitoring/monitor_path_test/test_file.500.1 +0 -500
  30. data/test/file_utils/fileutil_mksymlink_test.rb +0 -134
  31. data/test/file_utils/fileutil_mksymlink_test/dir1000/dir1500/test_file.1500 +0 -1500
  32. data/test/file_utils/fileutil_mksymlink_test/dir1000/dir1500/test_file.1500.0 +0 -1500
  33. data/test/file_utils/fileutil_mksymlink_test/dir1000/dir1500/test_file.1500.1 +0 -1500
  34. data/test/file_utils/fileutil_mksymlink_test/dir1000/test_file.1000 +0 -1000
  35. data/test/file_utils/fileutil_mksymlink_test/dir1000/test_file.1000.0 +0 -1000
  36. data/test/file_utils/fileutil_mksymlink_test/dir1000/test_file.1000.1 +0 -1000
  37. data/test/file_utils/fileutil_mksymlink_test/test_file.500 +0 -500
  38. data/test/file_utils/fileutil_mksymlink_test/test_file.500.0 +0 -500
  39. data/test/file_utils/fileutil_mksymlink_test/test_file.500.1 +0 -500
  40. data/test/file_utils/time_modification_test.rb +0 -136
  41. data/test/params/params_spec.rb +0 -280
  42. data/test/params/params_test.rb +0 -43
  43. data/test/run_in_background/run_in_background_test.rb +0 -122
  44. data/test/run_in_background/test_app +0 -59
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ODk2ZWZlZGIwODU5Y2I1YTg3YzIyZWZmNDQyOGRiOTJhMWMwODJiZg==
5
+ data.tar.gz: !binary |-
6
+ NWQxY2E1NGE2ZmQ3YzRlYzFjN2QwNTRlMWMxYzFjMmZhODlhNDVlNA==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ MmViNDAxZDU0NTk4ZDY0YzNiYTkxZWVkZjAxYzMyNjZhM2Y0ZDE2MmE3NDNl
10
+ YTE3MTJlZGU0NzBjZjJjYzNmM2Q5YWM1NzEyNzgxZDM4MmRmZDIyOWUxMGY3
11
+ MjU4YjMxMDFlOThkNDM1M2ZiZTJhN2RjNzFkZmZhYWZkZWE4MTM=
12
+ data.tar.gz: !binary |-
13
+ YmQxOWYzZGFhZDY5YjNmOTBkN2VhMDBiOTQwZTdjMmQxNzhmZDgxMzE3YzVl
14
+ OTg4NDRmYmE5ZDU3ZWJhMGU2YzNiYTQ2Y2M4OTEyMGQzYzNmOWJkYzc0YmMw
15
+ OTJhNmQ3YTY3MDIzZDI5NmYzOThhNjA0MjQ4MTI0ZGY5OGI1Nzc=
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # This utility executable is used to easily execute BBFS operations on files.
4
+
5
+ require 'content_data'
6
+ require 'log'
7
+ require 'params'
8
+
9
+ #############################################
10
+ # Definitions
11
+ #############################################
12
+ Params.string 'command', nil ,'supported commands are: merge, intersect, minus'
13
+ Params.string 'dest', nil ,'destination path'
14
+ Params.string 'cd_a', nil ,'a path'
15
+ Params.string 'cd_b', nil ,'b path'
16
+
17
+ HELP_MSG = <<EOF
18
+ This is an utility functionality for BBFS,
19
+ such as algebra of sets operations on ContentData files.
20
+ Usage:
21
+ To get merge of two ContentData files,
22
+ i.e. contents and appropreate instances that exist in at least one ContentData file:
23
+ file_utils --command=merge --cd_a=<content_data_file1> --cd_b=<content_data_file_2> [--dest=<result_content_data_file>]
24
+ To get intersection of two ContentData files,
25
+ i.e. contents and appropreate instances that exist in the both ContentData files:
26
+ file_utils --command=intersect --cd_a=<content_data_file1> --cd_b=<content_data_file_2> [--dest=<result_content_data_file>]
27
+ To get content_data_2 minus content_data_1,
28
+ i.e. contents and appropreate instances that exist in content_data_file2 and absent in content_data_file1:
29
+ file_utils --command=minus --cd_a=<content_data_file1> --cd_b=<content_data_file_2> [--dest=<result_content_data_file>]
30
+ EOF
31
+ #############################################
32
+ # Init
33
+ #############################################
34
+ Params.init ARGV
35
+ Log.init
36
+
37
+ #############################################
38
+ # Methods
39
+ #############################################
40
+
41
+ # Algebra of sets operations (merge, intersect, minus) on ContentData files
42
+ def content_data_command()
43
+
44
+ ['cd_a', 'cd_b'].each do |param|
45
+ if Params[param].nil?
46
+ err_msg = "--#{param} is not set"
47
+ puts err_msg
48
+ Log.error(err_msg)
49
+ return
50
+ end
51
+ end
52
+
53
+ cd_a = ContentData::ContentData.new()
54
+ begin
55
+ cd_a.from_file(Params['cd_a'])
56
+ rescue Exception => e
57
+ err_msg = "Error loading content data cd_a = %s: %s" % [Params['cd_a'], e.message]
58
+ puts err_msg
59
+ Log.error(err_msg)
60
+ Log.flush
61
+ return
62
+ end
63
+
64
+ cd_b = ContentData::ContentData.new()
65
+ begin
66
+ cd_b.from_file(Params['cd_b'])
67
+ rescue Exception => e
68
+ err_msg = "Error loading content data cd_b = %s: %s" % [Params['cd_b'], e.message]
69
+ puts err_msg
70
+ Log.error(err_msg)
71
+ Log.flush
72
+ return
73
+ end
74
+
75
+ if Params['command'] == "merge"
76
+ res = ContentData.merge(cd_a, cd_b)
77
+ preface = '# ' + "Merge of #{Params['cd_a']} and #{Params['cd_b']}"
78
+ elsif Params['command'] == "intersect"
79
+ res = ContentData.intersect(cd_a, cd_b)
80
+ preface = '# ' + "Intersection of #{Params['cd_a']} and #{Params['cd_b']}"
81
+ elsif Params['command'] == "minus"
82
+ res = ContentData.remove(cd_a, cd_b) # cd_b - cd_a
83
+ preface = '# ' + "Contents and appropreate instances that exist in #{Params['cd_b']}" +
84
+ " and absent in #{Params['cd_a']}"
85
+ end
86
+
87
+ unless res.nil?
88
+ puts preface
89
+ puts res.to_s
90
+ unless Params['dest'].nil?
91
+ res.to_file Params['dest']
92
+ end
93
+ end
94
+ end
95
+
96
+ # Main method
97
+ def run
98
+ if ARGV.empty?
99
+ puts HELP_MSG
100
+ exit
101
+ end
102
+
103
+ if (Params['command'] == "merge" ||
104
+ Params['command'] == "intersect" ||
105
+ Params['command'] == "minus")
106
+ content_data_command
107
+ else
108
+ err_msg = "Unsupported command: #{Params['command']}"
109
+ puts err_msg
110
+ puts HELP_MSG
111
+ Log.error(err_msg)
112
+ end
113
+ end
114
+
115
+ #############################################
116
+ # Main
117
+ #############################################
118
+ run
@@ -35,16 +35,18 @@ module ContentData
35
35
  if other.nil?
36
36
  @contents_info = {} # Checksum --> [size, paths-->time(instance), time(content)]
37
37
  @instances_info = {} # location --> checksum to optimize instances query
38
+ @symlinks_info = {} # [server,symlink path] -> target
38
39
  else
39
40
  @contents_info = other.clone_contents_info
40
41
  @instances_info = other.clone_instances_info # location --> checksum to optimize instances query
42
+ @symlinks_info = other.clone_symlinks_info
41
43
  end
42
44
  end
43
45
 
44
46
  # Content Data unique identification
45
47
  # @return [ID] hash identification
46
48
  def unique_id
47
- @instances_info.hash
49
+ [@contents_info.hash,@symlinks_info.hash]
48
50
  end
49
51
 
50
52
  def clone_instances_info
@@ -83,6 +85,16 @@ module ContentData
83
85
  clone_contents_info
84
86
  end
85
87
 
88
+ def clone_symlinks_info
89
+ symlinks_info_enum = @symlinks_info.each_key
90
+ cloned_symlinks = {}
91
+ loop {
92
+ symlink_key = symlinks_info_enum.next rescue break
93
+ cloned_symlinks[[symlink_key[0].clone, symlink_key[0].clone]] = @symlinks_info[symlink_key].clone
94
+ }
95
+ cloned_symlinks
96
+ end
97
+
86
98
  # iterator over @contents_info data structure (not including instances)
87
99
  # block is provided with: checksum, size and content modification time
88
100
  def each_content(&block)
@@ -131,6 +143,17 @@ module ContentData
131
143
  }
132
144
  end
133
145
 
146
+ # iterator over @symlinks_info data structure
147
+ # block is provided with: server, file path and target
148
+ def each_symlink(&block)
149
+ symlink_enum = @symlinks_info.each_key
150
+ loop {
151
+ symlink_key = symlink_enum.next rescue break
152
+ symlink_target = @symlinks_info[symlink_key]
153
+ block.call(symlink_key[0], symlink_key[1], symlink_target)
154
+ }
155
+ end
156
+
134
157
  def contents_size()
135
158
  @contents_info.length
136
159
  end
@@ -139,6 +162,10 @@ module ContentData
139
162
  @instances_info.length
140
163
  end
141
164
 
165
+ def symlinks_size()
166
+ @symlinks_info.length
167
+ end
168
+
142
169
  def checksum_instances_size(checksum)
143
170
  content_info = @contents_info[checksum]
144
171
  return 0 if content_info.nil?
@@ -181,8 +208,16 @@ module ContentData
181
208
  @instances_info[location] = checksum
182
209
  end
183
210
 
211
+ def add_symlink(server, path, target)
212
+ @symlinks_info[[server,path]] = target
213
+ end
214
+
215
+ def remove_symlink(server, path)
216
+ @symlinks_info.delete([server,path])
217
+ end
218
+
184
219
  def empty?
185
- @contents_info.empty?
220
+ @contents_info.empty? and @symlinks_info.empty?
186
221
  end
187
222
 
188
223
  def content_exists(checksum)
@@ -193,6 +228,11 @@ module ContentData
193
228
  @instances_info.has_key?([server, path])
194
229
  end
195
230
 
231
+ def symlink_exists(path, server)
232
+ @symlinks_info.has_key?([server, path])
233
+ end
234
+
235
+
196
236
  # removes an instance record both in @contents_info and @instances_info.
197
237
  # input params: server & path - are the instance unique key (called location)
198
238
  # removes also the content, if content becomes empty after removing the instance
@@ -208,7 +248,7 @@ module ContentData
208
248
  end
209
249
 
210
250
  # removes all instances records which are located under input param: dir_to_remove.
211
- # found records are removed from both @instances_info and @instances_info.
251
+ # found records are removed from @contents_info , @instances_info and @symlinks_info
212
252
  # input params: server & dir_to_remove - are used to check each instance unique key (called location)
213
253
  # removes also content\s, if a content\s become\s empty after removing instance\s
214
254
  def remove_directory(dir_to_remove, server)
@@ -216,7 +256,9 @@ module ContentData
216
256
  loop {
217
257
  checksum = contents_enum.next rescue break
218
258
  instances = @contents_info[checksum][1]
219
- instances.each_key { |location|
259
+ instances_enum = instances.each_key
260
+ loop {
261
+ location = instances_enum.next rescue break
220
262
  if location[0] == server and location[1].scan(dir_to_remove).size > 0
221
263
  instances.delete(location)
222
264
  @instances_info.delete(location)
@@ -224,27 +266,20 @@ module ContentData
224
266
  }
225
267
  @contents_info.delete(checksum) if instances.empty?
226
268
  }
227
- end
228
269
 
270
+ # handle symlinks
271
+ symlinks_enum = @symlinks_info.each_key
272
+ loop {
273
+ symlink_key = symlinks_enum.next rescue break
274
+ if symlink_key[0] == server and symlink_key[1].scan(dir_to_remove).size > 0
275
+ @symlinks_info.delete(symlink_key)
276
+ end
277
+ }
278
+ end
229
279
 
230
280
  def ==(other)
231
- return false if other.nil?
232
- return false if @contents_info.length != other.contents_size
233
- other.each_instance { |checksum, size, content_mod_time, instance_mod_time, server, path|
234
- return false if instance_exists(path, server) != other.instance_exists(path, server)
235
- local_content_info = @contents_info[checksum]
236
- return false if local_content_info.nil?
237
- return false if local_content_info[0] != size
238
- return false if local_content_info[2] != content_mod_time
239
- #check instances
240
- local_instances = local_content_info[1]
241
- return false if other.checksum_instances_size(checksum) != local_instances.length
242
- location = [server, path]
243
- local_instance_mod_time, _ = local_instances[location]
244
- return false if local_instance_mod_time.nil?
245
- return false if local_instance_mod_time != instance_mod_time
246
- }
247
- true
281
+ return nil if other.nil? # for this case: content_data == nil
282
+ unique_id == other.unique_id
248
283
  end
249
284
 
250
285
  def remove_content(checksum)
@@ -282,6 +317,7 @@ module ContentData
282
317
  content_data_dir = File.dirname(filename)
283
318
  FileUtils.makedirs(content_data_dir) unless File.directory?(content_data_dir)
284
319
  File.open(filename, 'w') { |file|
320
+ # Write contents
285
321
  file.write("#{@contents_info.length}\n")
286
322
  contents_enum = @contents_info.each_key
287
323
  content_chunks = @contents_info.length / CHUNK_SIZE + 1
@@ -291,6 +327,8 @@ module ContentData
291
327
  GC.start
292
328
  chunks_counter += 1
293
329
  end
330
+
331
+ # Write instances
294
332
  file.write("#{@instances_info.length}\n")
295
333
  contents_enum = @contents_info.each_key
296
334
  chunks_counter = 0
@@ -299,6 +337,14 @@ module ContentData
299
337
  GC.start
300
338
  chunks_counter += 1
301
339
  end
340
+
341
+ # Write symlinks
342
+ symlinks_info_enum = @symlinks_info.each_key
343
+ file.write("#{@symlinks_info.length}\n")
344
+ loop {
345
+ symlink_key = symlinks_info_enum.next rescue break
346
+ file.write("#{symlink_key[0]}<#{symlink_key[1]}<#{@symlinks_info[symlink_key]}\n")
347
+ }
302
348
  }
303
349
  end
304
350
 
@@ -307,7 +353,7 @@ module ContentData
307
353
  while chunk_counter < chunk_size
308
354
  checksum = contents_enum.next rescue return
309
355
  content_info = @contents_info[checksum]
310
- file.write("#{checksum},#{content_info[0]},#{content_info[2]}\n")
356
+ file.write("#{checksum}<#{content_info[0]}<#{content_info[2]}\n")
311
357
  chunk_counter += 1
312
358
  end
313
359
  end
@@ -323,8 +369,8 @@ module ContentData
323
369
  # provide the block with: checksum, size, content modification time,instance modification time,
324
370
  # server and path.
325
371
  instance_modification_time,instance_index_time = content_info[1][location]
326
- file.write("#{checksum},#{content_info[0]},#{location[0]},#{location[1]}," +
327
- "#{instance_modification_time},#{instance_index_time}\n")
372
+ file.write("#{checksum}<#{content_info[0]}<#{location[0]}<#{location[1]}<" +
373
+ "#{instance_modification_time}<#{instance_index_time}\n")
328
374
  }
329
375
  chunk_counter += 1
330
376
  break if chunk_counter == chunk_size
@@ -332,6 +378,7 @@ module ContentData
332
378
  end
333
379
 
334
380
  # TODO validation that file indeed contains ContentData missing
381
+ # TODO class level method?
335
382
  # Loading db from file using chunks for better memory performance
336
383
  def from_file(filename)
337
384
  # read first line (number of contents)
@@ -339,11 +386,16 @@ module ContentData
339
386
  # read number of instances.
340
387
  # loop over instances lines (using chunks) and add instances
341
388
 
389
+ unless File.exists? filename
390
+ raise ArgumentError.new "No such a file #{filename}"
391
+ end
392
+
342
393
  File.open(filename, 'r') { |file|
343
394
  # Get number of contents (at first line)
344
395
  number_of_contents = file.gets # this gets the next line or return nil at EOF
345
396
  unless (number_of_contents and number_of_contents.match(/^[\d]+$/)) # check that line is of Number format
346
- return reset_load_from_file(filename, file, "number of contents should be a number. We got:#{number_of_contents}")
397
+ raise("Parse error of content data file:#{filename} line ##{$.}\n" +
398
+ "number of contents should be a number. We got:#{number_of_contents}")
347
399
  end
348
400
  number_of_contents = number_of_contents.to_i
349
401
  # advance file lines over all contents. We need only the instances data to build the content data object
@@ -365,7 +417,8 @@ module ContentData
365
417
  # get number of instances
366
418
  number_of_instances = file.gets
367
419
  unless (number_of_instances and number_of_instances.match(/^[\d]+$/)) # check that line is of Number format
368
- return reset_load_from_file(filename, file, "number of instances should be a Number. We got:#{number_of_instances}")
420
+ raise("Parse error of content data file:#{filename} line ##{$.}\n" +
421
+ "number of instances should be a Number. We got:#{number_of_instances}")
369
422
  end
370
423
  number_of_instances = number_of_instances.to_i
371
424
  # read in instances chunks and GC
@@ -382,35 +435,56 @@ module ContentData
382
435
  GC.start
383
436
  chunk_index += 1
384
437
  end
438
+
439
+ # get number of symlinks
440
+ number_of_symlinks = file.gets
441
+ unless (number_of_symlinks and number_of_symlinks.match(/^[\d]+$/)) # check that line is of Number format
442
+ raise("Parse error of content data file:#{filename} line ##{$.}\n" +
443
+ "number of symlinks should be a Number. We got:#{number_of_symlinks}")
444
+ end
445
+ number_of_symlinks.to_i.times {
446
+ symlinks_line = file.gets
447
+ unless symlinks_line
448
+ raise("Parse error of content data file:#{filename} line ##{$.}\n" +
449
+ "Expected to read symlink line but reached EOF")
450
+ end
451
+ parameters = symlinks_line.split('<')
452
+ if (3 != parameters.length)
453
+ raise("Parse error of content data file:#{filename} line ##{$.}\n" +
454
+ "Expected to read 3 fields ('<' separated) but got #{parameters.length}.\nLine:#{symlinks_line}")
455
+ end
456
+
457
+ @symlinks_info[[parameters[0],parameters[1]]] = parameters[2]
458
+ }
385
459
  }
386
460
  end
387
461
 
388
462
  def read_contents_chunk(filename, file, chunk_size)
389
463
  chunk_index = 0
390
464
  while chunk_index < chunk_size
391
- return reset_load_from_file(filename, file, "Expecting content line but " +
392
- "reached end of file after line #{$.}") unless file.gets
465
+ unless file.gets
466
+ raise("Parse error of content data file:#{filename} line ##{$.}\n" +
467
+ "Expecting content line but reached end of file")
468
+ end
393
469
  chunk_index += 1
394
470
  end
395
471
  true
396
472
  end
397
473
 
398
- def read_instances_chunk(filename, file, chunk_size)
474
+ def read_instances_chunk(filename, file, chunk_size)
399
475
  chunk_index = 0
400
476
  while chunk_index < chunk_size
401
477
  instance_line = file.gets
402
- return reset_load_from_file(filename, file, "Expected to read Instance line but reached EOF") unless instance_line
403
- parameters = instance_line.split(',')
404
- # bugfix: if file name consist a comma then parsing based on comma separating fails
405
- if (parameters.size > 6)
406
- (4..parameters.size-3).each do |i|
407
- parameters[3] = [parameters[3], parameters[i]].join(",")
408
- end
409
- (4..parameters.size-3).each do |i|
410
- parameters.delete_at(4)
411
- end
478
+ unless instance_line
479
+ raise("Parse error of content data file:#{filename} line ##{$.}\n" +
480
+ "Expected to read Instance line but reached EOF")
412
481
  end
413
482
 
483
+ parameters = instance_line.split('<')
484
+ if (6 != parameters.length)
485
+ raise("Parse error of content data file:#{filename} line ##{$.}\n" +
486
+ "Expected to read 6 fields ('<' separated) but got #{parameters.length}.\nLine:#{instance_line}")
487
+ end
414
488
  add_instance(parameters[0], #checksum
415
489
  parameters[1].to_i, # size
416
490
  parameters[2], # server
@@ -422,14 +496,6 @@ module ContentData
422
496
  true
423
497
  end
424
498
 
425
- def reset_load_from_file(file_name, file_io, err_msg)
426
- Log.error("unexpected error reading file:#{file_name}\nError message:#{err_msg}")
427
- @contents_info = {} # Checksum --> [size, paths-->time(instance), time(content)]
428
- @instances_info = {} # location --> checksum to optimize instances query
429
- file_io.close
430
- nil
431
- end
432
-
433
499
  # for each content, all time fields (content and instances) are replaced with the
434
500
  # min time found, while going through all time fields.
435
501
  def unify_time()