file_indexing 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/file_indexing.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require 'content_data'
2
+
1
3
  require_relative 'file_indexing/index_agent'
2
4
  require_relative 'file_indexing/indexer_patterns'
3
5
 
@@ -2,139 +2,153 @@ require 'digest/sha1'
2
2
  require 'logger'
3
3
  require 'pp'
4
4
  require 'time'
5
- require './content_data'
6
- require './indexer_patterns'
5
+
6
+ require 'content_data'
7
+
8
+ require_relative 'indexer_patterns'
9
+
10
+ module BBFS
11
+ module FileIndexing
7
12
 
8
13
  ####################
9
14
  # Index Agent
10
15
  ####################
11
16
 
12
- class IndexAgent
13
- attr_reader :indexed_content
14
-
15
- LOCALTZ = Time.now.zone
16
- ENV['TZ'] = 'UTC'
17
+ class IndexAgent
18
+ attr_reader :indexed_content
17
19
 
18
- def initialize
19
- init_log()
20
- init_db()
21
- end
22
-
23
- def init_db()
24
- @indexed_content = ContentData.new
25
- end
20
+ LOCALTZ = Time.now.zone
21
+ ENV['TZ'] = 'UTC'
26
22
 
27
- def init_log()
28
- @log = Logger.new(STDERR)
29
- @log.level = Logger::WARN
30
- @log.datetime_format = "%Y-%m-%d %H:%M:%S"
31
- end
23
+ def initialize
24
+ init_log()
25
+ init_db()
26
+ end
32
27
 
33
- def set_log(log_path, log_level)
34
- @log = Logger.new(log_path) if log_path
35
- @log.level = log_level
36
- end
28
+ def init_db()
29
+ @indexed_content = ContentData::ContentData.new
30
+ end
37
31
 
38
- # Calculate file checksum (SHA1)
39
- def self.get_checksum(filename)
40
- digest = Digest::SHA1.new
41
- begin
42
- file = File.new(filename)
43
- while buffer = file.read(65536)
44
- digest << buffer
32
+ def init_log()
33
+ @log = Logger.new(STDERR)
34
+ @log.level = Logger::WARN
35
+ @log.datetime_format = "%Y-%m-%d %H:%M:%S"
45
36
  end
46
- #@log.info { digest.hexdigest.downcase + ' ' + filename }
47
- digest.hexdigest.downcase
48
- rescue Errno::EACCES, Errno::ETXTBSY => exp
49
- @log.warn { "#{exp.message}" }
50
- false
51
- ensure
52
- file.close if file != nil
53
- end
54
- end
55
37
 
56
- # get all files
57
- # satisfying the pattern
58
- def collect(pattern)
59
- Dir.glob(pattern.to_s)
60
- end
38
+ def set_log(log_path, log_level)
39
+ @log = Logger.new(log_path) if log_path
40
+ @log.level = log_level
41
+ end
61
42
 
62
- # index device according to the pattern
63
- # store the result
64
- # TODO device support
65
- def index(patterns, otherDB = nil)
66
- abort "#{self.class}: DB not empty. Current implementation permits only one running of index" unless db.contents.empty?
67
-
68
- server_name = `hostname`
69
- permit_patterns = Array.new
70
- forbid_patterns = Array.new
71
- otherDB_table = Hash.new # contains instances from given DB while full path name is a key and instance is a value
72
- otherDB_contents = Hash.new # given DB contents
73
-
74
- # if there is a given DB then populate table with files
75
- # that were already indexed on this server/device
76
- if (otherDB != nil)
77
- otherDB_contents.update(otherDB.contents)
78
- otherDB.instances.each_value do |i|
79
- next unless i.server_name == server_name #and i.device == @device
80
- otherDB_table[i.full_path] = i
43
+ # Calculate file checksum (SHA1)
44
+ def self.get_checksum(filename)
45
+ digest = Digest::SHA1.new
46
+ begin
47
+ file = File.new(filename)
48
+ while buffer = file.read(65536)
49
+ digest << buffer
50
+ end
51
+ #@log.info { digest.hexdigest.downcase + ' ' + filename }
52
+ digest.hexdigest.downcase
53
+ rescue Errno::EACCES, Errno::ETXTBSY => exp
54
+ @log.warn { "#{exp.message}" }
55
+ false
56
+ ensure
57
+ file.close if file != nil
58
+ end
81
59
  end
82
- end
83
60
 
84
- permit_patterns = patterns.positive_patterns
85
- forbid_patterns = patterns.negative_patterns
61
+ # get all files
62
+ # satisfying the pattern
63
+ def collect(pattern)
64
+ Dir.glob(pattern.to_s)
65
+ end
86
66
 
87
- # add files found by positive patterns
88
- files = Array.new
89
- permit_patterns.each_index do |i|
90
- files = files | (collect(permit_patterns[i]));
91
- end
67
+ # index device according to the pattern
68
+ # store the result
69
+ # does not automatically add otherDB to the stored result
70
+ # TODO device support
71
+ def index(patterns, otherDB = nil)
72
+ abort "#{self.class}: DB not empty. Current implementation permits only one running of index" \
73
+ unless @indexed_content.contents.empty?
74
+
75
+ server_name = `hostname`
76
+ permit_patterns = Array.new
77
+ forbid_patterns = Array.new
78
+ otherDB_table = Hash.new # contains instances from given DB while full path name is a key and instance is a value
79
+ otherDB_contents = Hash.new # given DB contents
80
+
81
+ # if there is a given DB then populate table with files
82
+ # that were already indexed on this server/device
83
+ if (otherDB != nil)
84
+ otherDB_contents.update(otherDB.contents)
85
+ otherDB.instances.each_value do |i|
86
+ next unless i.server_name == server_name #and i.device == @device
87
+ otherDB_table[i.full_path] = i
88
+ end
89
+ end
92
90
 
93
- # expand to absolute paths
94
- files.map! {|f| File.expand_path(f)}
91
+ permit_patterns = patterns.positive_patterns
92
+ forbid_patterns = patterns.negative_patterns
95
93
 
96
- # remove files found by negative patterns
97
- forbid_patterns.each_index do |i|
98
- forbid_files = Array.new(collect(forbid_patterns[i]));
99
- forbid_files.each do |f|
100
- files.delete(File.expand_path(f))
101
- end
102
- end
94
+ # add files found by positive patterns
95
+ files = Array.new
96
+ permit_patterns.each_index do |i|
97
+ files = files | (collect(permit_patterns[i]));
98
+ end
103
99
 
104
- # create and add contents and instances
105
- files.each do |file|
106
- file_stats = File.lstat(file)
100
+ p "Files: #{files}."
107
101
 
108
- # index only files
109
- next if (file_stats.directory?)
102
+ # expand to absolute paths
103
+ files.map! {|f| File.expand_path(f)}
110
104
 
111
- # keep only files with names in UTF-8
112
- unless file.force_encoding("UTF-8").valid_encoding?
113
- @log.warn { "Non-UTF8 file name \"#{file}\"" }
114
- next
115
- end
105
+ # remove files found by negative patterns
106
+ forbid_patterns.each_index do |i|
107
+ forbid_files = Array.new(collect(forbid_patterns[i]));
108
+ forbid_files.each do |f|
109
+ files.delete(File.expand_path(f))
110
+ end
111
+ end
116
112
 
117
- # add files present in the given DB to the DB and remove these files
118
- # from further processing (save checksum calculation)
119
- if otherDB_table.has_key?(file)
120
- instance = otherDB_table[file]
121
- if instance.size == file_stats.size and instance.modification_time == file_stats.mtime.utc
122
- @indexed_content.add_content(otherDB_contents[instance.checksum])
113
+ # create and add contents and instances
114
+ files.each do |file|
115
+ file_stats = File.lstat(file)
116
+
117
+ # index only files
118
+ next if (file_stats.directory?)
119
+
120
+ # keep only files with names in UTF-8
121
+ unless file.force_encoding("UTF-8").valid_encoding?
122
+ @log.warn { "Non-UTF8 file name \"#{file}\"" }
123
+ next
124
+ end
125
+
126
+ # add files present in the given DB to the DB and remove these files
127
+ # from further processing (save checksum calculation)
128
+ if otherDB_table.has_key?(file)
129
+ instance = otherDB_table[file]
130
+ if instance.size == file_stats.size and instance.modification_time == file_stats.mtime.utc
131
+ @indexed_content.add_content(otherDB_contents[instance.checksum])
132
+ @indexed_content.add_instance(instance)
133
+ next
134
+ end
135
+ end
136
+
137
+ # calculate a checksum
138
+ unless (checksum = self.class.get_checksum(file))
139
+ @log.warn { "Cheksum failure: " + file }
140
+ next
141
+ end
142
+
143
+ @indexed_content.add_content(ContentData::Content.new(checksum, file_stats.size, Time.now.utc)) \
144
+ unless @indexed_content.content_exists(checksum)
145
+
146
+ instance = ContentData::ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
147
+ File.expand_path(file), file_stats.mtime.utc)
123
148
  @indexed_content.add_instance(instance)
124
- next
125
149
  end
126
150
  end
127
-
128
- # calculate a checksum
129
- unless (checksum = self.class.get_checksum(file))
130
- @log.warn { "Cheksum failure: " + file }
131
- next
132
- end
133
-
134
- @indexed_content.add_content(Content.new(checksum, file_stats.size, Time.now.utc)) unless (@indexed_content.content_exists(checksum))
135
-
136
- instance = ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s, File.expand_path(file), file_stats.mtime.utc)
137
- @indexed_content.add_instance(instance)
138
151
  end
152
+
139
153
  end
140
154
  end
@@ -1,61 +1,66 @@
1
- require './agent.pb'
1
+ module BBFS
2
+ module FileIndexing
2
3
 
3
- class IndexerPatterns
4
- attr_reader :positive_patterns, :negative_patterns
4
+ class IndexerPatterns
5
+ attr_reader :positive_patterns, :negative_patterns
5
6
 
6
- # @param indexer_patterns [#positive_patterns, #negative_patterns, nil]
7
- def initialize (indexer_patterns = nil)
8
- @positive_patterns = Array.new
9
- @negative_patterns = Array.new
10
- # TODO add a test (including empty collections)
11
- unless indexer_patterns.nil?
12
- indexer_patterns.positive_patterns.each do |pattern|
13
- add_pattern(pattern)
7
+ # @param indexer_patterns [#positive_patterns, #negative_patterns, nil]
8
+ def initialize (indexer_patterns = nil)
9
+ p "Initialize index patterns #{indexer_patterns}."
10
+ @positive_patterns = Array.new
11
+ @negative_patterns = Array.new
12
+ # TODO add a test (including empty collections)
13
+ if indexer_patterns
14
+ indexer_patterns.positive_patterns.each do |pattern|
15
+ add_pattern(pattern)
16
+ end
17
+ indexer_patterns.negative_patterns.each do |pattern|
18
+ add_pattern(pattern, false)
19
+ end
20
+ end
14
21
  end
15
- indexer_patterns.negative_patterns.each do |pattern|
16
- add_pattern(pattern, false)
17
- end
18
- end
19
- end
20
22
 
21
- def serialize
22
- # TODO add a test (including empty collections)
23
- indexer_patterns = IndexerPatternsMessage.new
24
- positive_patterns.each do |pattern|
25
- indexer_patterns.positive_patterns << pattern
26
- end
27
- negative_patterns.each do |pattern|
28
- indexer_patterns.negative_patterns << pattern
29
- end
30
- indexer_patterns
31
- end
23
+ def serialize
24
+ # TODO add a test (including empty collections)
25
+ indexer_patterns = IndexerPatternsMessage.new
26
+ positive_patterns.each do |pattern|
27
+ indexer_patterns.positive_patterns << pattern
28
+ end
29
+ negative_patterns.each do |pattern|
30
+ indexer_patterns.negative_patterns << pattern
31
+ end
32
+ indexer_patterns
33
+ end
32
34
 
33
- # @param pattern [String]
34
- # @param is_positive [true]
35
- # @param is_positive [false]
36
- def add_pattern(pattern, is_positive = true)
37
- pattern.gsub!(/\\/,'/')
38
- if (is_positive)
39
- @positive_patterns << pattern
40
- else
41
- @negative_patterns << pattern
42
- end
43
- end
35
+ # @param pattern [String]
36
+ # @param is_positive [true]
37
+ # @param is_positive [false]
38
+ def add_pattern(pattern, is_positive = true)
39
+ pattern.gsub!(/\\/,'/')
40
+ if (is_positive)
41
+ @positive_patterns << pattern
42
+ else
43
+ @negative_patterns << pattern
44
+ end
45
+ end
44
46
 
45
- def parse_from_file(file)
46
- input_patterns = IO.readlines(file)
47
- begin
48
- puts "Error loading patterns=%s" % file
49
- raise IOError("Error loading patterns=%s" % file)
50
- end unless not input_patterns.nil?
47
+ def parse_from_file(file)
48
+ input_patterns = IO.readlines(file)
49
+ begin
50
+ puts "Error loading patterns=%s" % file
51
+ raise IOError("Error loading patterns=%s" % file)
52
+ end unless not input_patterns.nil?
51
53
 
52
- input_patterns.each do |pattern|
53
- if (m = /^\s*([+-]):(.*)/.match(pattern))
54
- add_pattern(m[2], m[1].eql?('+') ? true : false)
55
- elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
56
- puts "pattern in incorrect format: #{pattern}"
57
- raise RuntimeError("pattern in incorrect format: #{pattern}")
54
+ input_patterns.each do |pattern|
55
+ if (m = /^\s*([+-]):(.*)/.match(pattern))
56
+ add_pattern(m[2], m[1].eql?('+') ? true : false)
57
+ elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
58
+ puts "pattern in incorrect format: #{pattern}"
59
+ raise RuntimeError("pattern in incorrect format: #{pattern}")
60
+ end
61
+ end
58
62
  end
59
63
  end
64
+
60
65
  end
61
- end
66
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: file_indexing
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,8 +9,19 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-08 00:00:00.000000000Z
13
- dependencies: []
12
+ date: 2012-04-11 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: content_data
16
+ requirement: &70262612434200 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70262612434200
14
25
  description: Indexes files, treats files with same binary sequence as one content.
15
26
  email: kolmanv@gmail.com
16
27
  executables: []