file_indexing 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/file_indexing.rb CHANGED
@@ -1,3 +1,5 @@
+require 'content_data'
+
 require_relative 'file_indexing/index_agent'
 require_relative 'file_indexing/indexer_patterns'
 
data/lib/file_indexing/index_agent.rb CHANGED
@@ -2,139 +2,153 @@ require 'digest/sha1'
 require 'logger'
 require 'pp'
 require 'time'
-require './content_data'
-require './indexer_patterns'
+
+require 'content_data'
+
+require_relative 'indexer_patterns'
+
+module BBFS
+  module FileIndexing
 
 ####################
 # Index Agent
 ####################
 
-class IndexAgent
-  attr_reader :indexed_content
-
-  LOCALTZ = Time.now.zone
-  ENV['TZ'] = 'UTC'
+    class IndexAgent
+      attr_reader :indexed_content
 
-  def initialize
-    init_log()
-    init_db()
-  end
-
-  def init_db()
-    @indexed_content = ContentData.new
-  end
+      LOCALTZ = Time.now.zone
+      ENV['TZ'] = 'UTC'
 
-  def init_log()
-    @log = Logger.new(STDERR)
-    @log.level = Logger::WARN
-    @log.datetime_format = "%Y-%m-%d %H:%M:%S"
-  end
+      def initialize
+        init_log()
+        init_db()
+      end
 
-  def set_log(log_path, log_level)
-    @log = Logger.new(log_path) if log_path
-    @log.level = log_level
-  end
+      def init_db()
+        @indexed_content = ContentData::ContentData.new
+      end
 
-  # Calculate file checksum (SHA1)
-  def self.get_checksum(filename)
-    digest = Digest::SHA1.new
-    begin
-      file = File.new(filename)
-      while buffer = file.read(65536)
-        digest << buffer
+      def init_log()
+        @log = Logger.new(STDERR)
+        @log.level = Logger::WARN
+        @log.datetime_format = "%Y-%m-%d %H:%M:%S"
       end
-      #@log.info { digest.hexdigest.downcase + ' ' + filename }
-      digest.hexdigest.downcase
-    rescue Errno::EACCES, Errno::ETXTBSY => exp
-      @log.warn { "#{exp.message}" }
-      false
-    ensure
-      file.close if file != nil
-    end
-  end
 
-  # get all files
-  # satisfying the pattern
-  def collect(pattern)
-    Dir.glob(pattern.to_s)
-  end
+      def set_log(log_path, log_level)
+        @log = Logger.new(log_path) if log_path
+        @log.level = log_level
+      end
 
-  # index device according to the pattern
-  # store the result
-  # TODO device support
-  def index(patterns, otherDB = nil)
-    abort "#{self.class}: DB not empty. Current implementation permits only one running of index" unless db.contents.empty?
-
-    server_name = `hostname`
-    permit_patterns = Array.new
-    forbid_patterns = Array.new
-    otherDB_table = Hash.new # contains instances from given DB while full path name is a key and instance is a value
-    otherDB_contents = Hash.new # given DB contents
-
-    # if there is a given DB then populate table with files
-    # that was already indexed on this server/device
-    if (otherDB != nil)
-      otherDB_contents.update(otherDB.contents)
-      otherDB.instances.each_value do |i|
-        next unless i.server_name == server_name #and i.device == @device
-        otherDB_table[i.full_path] = i
+      # Calculate file checksum (SHA1)
+      def self.get_checksum(filename)
+        digest = Digest::SHA1.new
+        begin
+          file = File.new(filename)
+          while buffer = file.read(65536)
+            digest << buffer
+          end
+          #@log.info { digest.hexdigest.downcase + ' ' + filename }
+          digest.hexdigest.downcase
+        rescue Errno::EACCES, Errno::ETXTBSY => exp
+          @log.warn { "#{exp.message}" }
+          false
+        ensure
+          file.close if file != nil
+        end
       end
-    end
 
-    permit_patterns = patterns.positive_patterns
-    forbid_patterns = patterns.negative_patterns
+      # get all files
+      # satisfying the pattern
+      def collect(pattern)
+        Dir.glob(pattern.to_s)
+      end
 
-    # add files found by positive patterns
-    files = Array.new
-    permit_patterns.each_index do |i|
-      files = files | (collect(permit_patterns[i]));
-    end
+      # index device according to the pattern
+      # store the result
+      # does not adds automatically otherDB to stored result
+      # TODO device support
+      def index(patterns, otherDB = nil)
+        abort "#{self.class}: DB not empty. Current implementation permits only one running of index" \
+            unless @indexed_content.contents.empty?
+
+        server_name = `hostname`
+        permit_patterns = Array.new
+        forbid_patterns = Array.new
+        otherDB_table = Hash.new # contains instances from given DB while full path name is a key and instance is a value
+        otherDB_contents = Hash.new # given DB contents
+
+        # if there is a given DB then populate table with files
+        # that was already indexed on this server/device
+        if (otherDB != nil)
+          otherDB_contents.update(otherDB.contents)
+          otherDB.instances.each_value do |i|
+            next unless i.server_name == server_name #and i.device == @device
+            otherDB_table[i.full_path] = i
+          end
+        end
 
-    # expand to absolute pathes
-    files.map! {|f| File.expand_path(f)}
+        permit_patterns = patterns.positive_patterns
+        forbid_patterns = patterns.negative_patterns
 
-    # remove files found by negative patterns
-    forbid_patterns.each_index do |i|
-      forbid_files = Array.new(collect(forbid_patterns[i]));
-      forbid_files.each do |f|
-        files.delete(File.expand_path(f))
-      end
-    end
+        # add files found by positive patterns
+        files = Array.new
+        permit_patterns.each_index do |i|
+          files = files | (collect(permit_patterns[i]));
+        end
 
-    # create and add contents and instances
-    files.each do |file|
-      file_stats = File.lstat(file)
+        p "Files: #{files}."
 
-      # index only files
-      next if (file_stats.directory?)
+        # expand to absolute pathes
+        files.map! {|f| File.expand_path(f)}
 
-      # keep only files with names in UTF-8
-      unless file.force_encoding("UTF-8").valid_encoding?
-        @log.warn { "Non-UTF8 file name \"#{file}\"" }
-        next
-      end
+        # remove files found by negative patterns
+        forbid_patterns.each_index do |i|
+          forbid_files = Array.new(collect(forbid_patterns[i]));
+          forbid_files.each do |f|
+            files.delete(File.expand_path(f))
+          end
+        end
 
-      # add files present in the given DB to the DB and remove these files
-      # from further processing (save checksum calculation)
-      if otherDB_table.has_key?(file)
-        instance = otherDB_table[file]
-        if instance.size == file_stats.size and instance.modification_time == file_stats.mtime.utc
-          @indexed_content.add_content(otherDB_contents[instance.checksum])
+        # create and add contents and instances
+        files.each do |file|
+          file_stats = File.lstat(file)
+
+          # index only files
+          next if (file_stats.directory?)
+
+          # keep only files with names in UTF-8
+          unless file.force_encoding("UTF-8").valid_encoding?
+            @log.warn { "Non-UTF8 file name \"#{file}\"" }
+            next
+          end
+
+          # add files present in the given DB to the DB and remove these files
+          # from further processing (save checksum calculation)
+          if otherDB_table.has_key?(file)
+            instance = otherDB_table[file]
+            if instance.size == file_stats.size and instance.modification_time == file_stats.mtime.utc
+              @indexed_content.add_content(otherDB_contents[instance.checksum])
+              @indexed_content.add_instance(instance)
+              next
+            end
+          end
+
+          # calculate a checksum
+          unless (checksum = self.class.get_checksum(file))
+            @log.warn { "Cheksum failure: " + file }
+            next
+          end
+
+          @indexed_content.add_content(ContentData::Content.new(checksum, file_stats.size, Time.now.utc)) \
+              unless @indexed_content.content_exists(checksum)
+
+          instance = ContentData::ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
+                                                      File.expand_path(file), file_stats.mtime.utc)
           @indexed_content.add_instance(instance)
-          next
         end
       end
-
-      # calculate a checksum
-      unless (checksum = self.class.get_checksum(file))
-        @log.warn { "Cheksum failure: " + file }
-        next
-      end
-
-      @indexed_content.add_content(Content.new(checksum, file_stats.size, Time.now.utc)) unless (@indexed_content.content_exists(checksum))
-
-      instance = ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s, File.expand_path(file), file_stats.mtime.utc)
-      @indexed_content.add_instance(instance)
     end
+
   end
 end
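
The substance of this change is that IndexAgent now lives under the BBFS::FileIndexing namespace and uses the content_data gem (ContentData::ContentData, ContentData::Content, ContentData::ContentInstance) instead of a local require. A minimal usage sketch against the 0.0.2 layout; the paths and glob patterns below are illustrative, not taken from the gem:

  require 'file_indexing'

  # Hypothetical patterns: index everything under /home/user/data except tmp/.
  patterns = BBFS::FileIndexing::IndexerPatterns.new
  patterns.add_pattern('/home/user/data/**/*')             # positive pattern
  patterns.add_pattern('/home/user/data/tmp/**/*', false)  # negative pattern

  agent = BBFS::FileIndexing::IndexAgent.new
  agent.index(patterns)    # aborts if this agent has already indexed once (its DB must be empty)
  p agent.indexed_content  # a ContentData::ContentData instance
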
data/lib/file_indexing/indexer_patterns.rb CHANGED
@@ -1,61 +1,66 @@
-require './agent.pb'
+module BBFS
+  module FileIndexing
 
-class IndexerPatterns
-  attr_reader :positive_patterns, :negative_patterns
+    class IndexerPatterns
+      attr_reader :positive_patterns, :negative_patterns
 
-  # @param indexer_patterns_str [String]
-  def initialize (indexer_patterns = nil)
-    @positive_patterns = Array.new
-    @negative_patterns = Array.new
-    # TODO add a test (including empty collections)
-    unless indexer_patterns.nil?
-      indexer_patterns.positive_patterns.each do |pattern|
-        add_pattern(pattern)
+      # @param indexer_patterns_str [String]
+      def initialize (indexer_patterns = nil)
+        p "Initialize index patterns #{indexer_patterns}."
+        @positive_patterns = Array.new
+        @negative_patterns = Array.new
+        # TODO add a test (including empty collections)
+        if indexer_patterns
+          indexer_patterns.positive_patterns.each do |pattern|
+            add_pattern(pattern)
+          end
+          indexer_patterns.negative_patterns.each do |pattern|
+            add_pattern(pattern, false)
+          end
+        end
       end
-      indexer_patterns.negative_patterns.each do |pattern|
-        add_pattern(pattern, false)
-      end
-    end
-  end
 
-  def serialize
-    # TODO add a test (including empty collections)
-    indexer_patterns = IndexerPatternsMessage.new
-    positive_patterns.each do |pattern|
-      indexer_patterns.positive_patterns << pattern
-    end
-    negative_patterns.each do |pattern|
-      indexer_patterns.negative_patterns << pattern
-    end
-    indexer_patterns
-  end
+      def serialize
+        # TODO add a test (including empty collections)
+        indexer_patterns = IndexerPatternsMessage.new
+        positive_patterns.each do |pattern|
+          indexer_patterns.positive_patterns << pattern
+        end
+        negative_patterns.each do |pattern|
+          indexer_patterns.negative_patterns << pattern
+        end
+        indexer_patterns
+      end
 
-  # @param pattern [String]
-  # @param is_positive [true]
-  # @param is_positive [false]
-  def add_pattern(pattern, is_positive = true)
-    pattern.gsub!(/\\/,'/')
-    if (is_positive)
-      @positive_patterns << pattern
-    else
-      @negative_patterns << pattern
-    end
-  end
+      # @param pattern [String]
+      # @param is_positive [true]
+      # @param is_positive [false]
+      def add_pattern(pattern, is_positive = true)
+        pattern.gsub!(/\\/,'/')
+        if (is_positive)
+          @positive_patterns << pattern
+        else
+          @negative_patterns << pattern
+        end
+      end
 
-  def parse_from_file(file)
-    input_patterns = IO.readlines(file)
-    begin
-      puts "Error loading patterns=%s" % file
-      raise IOError("Error loading patterns=%s" % file)
-    end unless not input_patterns.nil?
+      def parse_from_file(file)
+        input_patterns = IO.readlines(file)
+        begin
+          puts "Error loading patterns=%s" % file
+          raise IOError("Error loading patterns=%s" % file)
+        end unless not input_patterns.nil?
 
-    input_patterns.each do |pattern|
-      if (m = /^\s*([+-]):(.*)/.match(pattern))
-        add_pattern(m[2], m[1].eql?('+') ? true : false)
-      elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
-        puts "pattern in incorrect format: #{pattern}"
-        raise RuntimeError("pattern in incorrect format: #{pattern}")
+        input_patterns.each do |pattern|
+          if (m = /^\s*([+-]):(.*)/.match(pattern))
+            add_pattern(m[2], m[1].eql?('+') ? true : false)
+          elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
+            puts "pattern in incorrect format: #{pattern}"
+            raise RuntimeError("pattern in incorrect format: #{pattern}")
+          end
+        end
       end
     end
+
   end
-end
+end
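
For reference, parse_from_file reads a plain-text patterns file in which each line is either '+:<glob>' (positive pattern), '-:<glob>' (negative pattern), or a comment starting with '#' or '//'; any other line, including a blank one, is reported as being in an incorrect format. A hypothetical patterns file and how it might be loaded (file name and globs are illustrative):

  # patterns.input
  # index the whole data directory ...
  +:/home/user/data/**/*
  // ... but skip the scratch area
  -:/home/user/data/tmp/**/*

  patterns = BBFS::FileIndexing::IndexerPatterns.new
  patterns.parse_from_file('patterns.input')
  p patterns.positive_patterns  # => ["/home/user/data/**/*"]
  p patterns.negative_patterns  # => ["/home/user/data/tmp/**/*"]
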
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: file_indexing
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 prerelease:
 platform: ruby
 authors:
@@ -9,8 +9,19 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-08 00:00:00.000000000Z
-dependencies: []
+date: 2012-04-11 00:00:00.000000000Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: content_data
+  requirement: &70262612434200 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *70262612434200
 description: Indexes files, treats files with same binary sequence as one content.
 email: kolmanv@gmail.com
 executables: []
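
The metadata diff amounts to a version bump and a new runtime dependency on content_data. A gemspec sketch that would produce roughly this metadata; this is an assumption for illustration, not the gem's actual spec file, and the summary line is invented:

  Gem::Specification.new do |s|
    s.name        = 'file_indexing'
    s.version     = '0.0.2'
    s.description = 'Indexes files, treats files with same binary sequence as one content.'
    s.email       = 'kolmanv@gmail.com'
    s.summary     = 'File indexing.'  # illustrative placeholder
    s.add_dependency 'content_data'   # new in 0.0.2; defaults to '>= 0'
  end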