file_indexing 0.0.1 → 0.0.2
- data/lib/file_indexing.rb +2 -0
- data/lib/file_indexing/index_agent.rb +123 -109
- data/lib/file_indexing/indexer_patterns.rb +56 -51
- metadata +14 -3
data/lib/file_indexing/index_agent.rb
CHANGED
@@ -2,139 +2,153 @@ require 'digest/sha1'
 require 'logger'
 require 'pp'
 require 'time'
-
-require '
+
+require 'content_data'
+
+require_relative 'indexer_patterns'
+
+module BBFS
+module FileIndexing

 ####################
 # Index Agent
 ####################

-class IndexAgent
-
-
-LOCALTZ = Time.now.zone
-ENV['TZ'] = 'UTC'
+class IndexAgent
+attr_reader :indexed_content

-
-
-init_db()
-end
-
-def init_db()
-@indexed_content = ContentData.new
-end
+LOCALTZ = Time.now.zone
+ENV['TZ'] = 'UTC'

-
-
-
-
-end
+def initialize
+init_log()
+init_db()
+end

-
-
-
-end
+def init_db()
+@indexed_content = ContentData::ContentData.new
+end

-
-
-
-
-file = File.new(filename)
-while buffer = file.read(65536)
-digest << buffer
+def init_log()
+@log = Logger.new(STDERR)
+@log.level = Logger::WARN
+@log.datetime_format = "%Y-%m-%d %H:%M:%S"
 end
-#@log.info { digest.hexdigest.downcase + ' ' + filename }
-digest.hexdigest.downcase
-rescue Errno::EACCES, Errno::ETXTBSY => exp
-@log.warn { "#{exp.message}" }
-false
-ensure
-file.close if file != nil
-end
-end

-
-
-
-
-end
+def set_log(log_path, log_level)
+@log = Logger.new(log_path) if log_path
+@log.level = log_level
+end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-otherDB.instances.each_value do |i|
-next unless i.server_name == server_name #and i.device == @device
-otherDB_table[i.full_path] = i
+# Calculate file checksum (SHA1)
+def self.get_checksum(filename)
+digest = Digest::SHA1.new
+begin
+file = File.new(filename)
+while buffer = file.read(65536)
+digest << buffer
+end
+#@log.info { digest.hexdigest.downcase + ' ' + filename }
+digest.hexdigest.downcase
+rescue Errno::EACCES, Errno::ETXTBSY => exp
+@log.warn { "#{exp.message}" }
+false
+ensure
+file.close if file != nil
+end
 end
-end

-
-
+# get all files
+# satisfying the pattern
+def collect(pattern)
+Dir.glob(pattern.to_s)
+end

-
-
-
-
-
+# index device according to the pattern
+# store the result
+# does not adds automatically otherDB to stored result
+# TODO device support
+def index(patterns, otherDB = nil)
+abort "#{self.class}: DB not empty. Current implementation permits only one running of index" \
+unless @indexed_content.contents.empty?
+
+server_name = `hostname`
+permit_patterns = Array.new
+forbid_patterns = Array.new
+otherDB_table = Hash.new # contains instances from given DB while full path name is a key and instance is a value
+otherDB_contents = Hash.new # given DB contents
+
+# if there is a given DB then populate table with files
+# that was already indexed on this server/device
+if (otherDB != nil)
+otherDB_contents.update(otherDB.contents)
+otherDB.instances.each_value do |i|
+next unless i.server_name == server_name #and i.device == @device
+otherDB_table[i.full_path] = i
+end
+end

-
-
+permit_patterns = patterns.positive_patterns
+forbid_patterns = patterns.negative_patterns

-
-
-
-
-
-end
-end
+# add files found by positive patterns
+files = Array.new
+permit_patterns.each_index do |i|
+files = files | (collect(permit_patterns[i]));
+end

-
-files.each do |file|
-file_stats = File.lstat(file)
+p "Files: #{files}."

-
-
+# expand to absolute pathes
+files.map! {|f| File.expand_path(f)}

-
-
-
-
-
+# remove files found by negative patterns
+forbid_patterns.each_index do |i|
+forbid_files = Array.new(collect(forbid_patterns[i]));
+forbid_files.each do |f|
+files.delete(File.expand_path(f))
+end
+end

-
-
-
-
-
-
+# create and add contents and instances
+files.each do |file|
+file_stats = File.lstat(file)
+
+# index only files
+next if (file_stats.directory?)
+
+# keep only files with names in UTF-8
+unless file.force_encoding("UTF-8").valid_encoding?
+@log.warn { "Non-UTF8 file name \"#{file}\"" }
+next
+end
+
+# add files present in the given DB to the DB and remove these files
+# from further processing (save checksum calculation)
+if otherDB_table.has_key?(file)
+instance = otherDB_table[file]
+if instance.size == file_stats.size and instance.modification_time == file_stats.mtime.utc
+@indexed_content.add_content(otherDB_contents[instance.checksum])
+@indexed_content.add_instance(instance)
+next
+end
+end
+
+# calculate a checksum
+unless (checksum = self.class.get_checksum(file))
+@log.warn { "Cheksum failure: " + file }
+next
+end
+
+@indexed_content.add_content(ContentData::Content.new(checksum, file_stats.size, Time.now.utc)) \
+unless @indexed_content.content_exists(checksum)
+
+instance = ContentData::ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
+File.expand_path(file), file_stats.mtime.utc)
 @indexed_content.add_instance(instance)
-next
 end
 end
-
-# calculate a checksum
-unless (checksum = self.class.get_checksum(file))
-@log.warn { "Cheksum failure: " + file }
-next
-end
-
-@indexed_content.add_content(Content.new(checksum, file_stats.size, Time.now.utc)) unless (@indexed_content.content_exists(checksum))
-
-instance = ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s, File.expand_path(file), file_stats.mtime.utc)
-@indexed_content.add_instance(instance)
 end
+
 end
 end
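The reworked IndexAgent above wraps the class in the BBFS::FileIndexing namespace, switches to the ContentData::* classes and adds logging. A minimal usage sketch, not part of the gem's own files: the entry point require 'file_indexing' and the glob paths are illustrative assumptions.

require 'logger'
require 'file_indexing'   # assumed entry point; loads IndexAgent and IndexerPatterns

# Illustrative patterns: index everything under ~/docs except the tmp subtree.
patterns = BBFS::FileIndexing::IndexerPatterns.new
patterns.add_pattern('/home/user/docs/**/*')
patterns.add_pattern('/home/user/docs/tmp/**/*', false)

agent = BBFS::FileIndexing::IndexAgent.new
agent.set_log('index.log', Logger::INFO)   # optional: log to a file instead of STDERR
agent.index(patterns)                      # no otherDB given, so every file is checksummed
content_data = agent.indexed_content       # ContentData::ContentData with contents and instances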
data/lib/file_indexing/indexer_patterns.rb
CHANGED
@@ -1,61 +1,66 @@
-
+module BBFS
+module FileIndexing

-class IndexerPatterns
-
+class IndexerPatterns
+attr_reader :positive_patterns, :negative_patterns

-
-
-
-
-
-
-
-
+# @param indexer_patterns_str [String]
+def initialize (indexer_patterns = nil)
+p "Initialize index patterns #{indexer_patterns}."
+@positive_patterns = Array.new
+@negative_patterns = Array.new
+# TODO add a test (including empty collections)
+if indexer_patterns
+indexer_patterns.positive_patterns.each do |pattern|
+add_pattern(pattern)
+end
+indexer_patterns.negative_patterns.each do |pattern|
+add_pattern(pattern, false)
+end
+end
 end
-indexer_patterns.negative_patterns.each do |pattern|
-add_pattern(pattern, false)
-end
-end
-end

-
-
-
-
-
-
-
-
-
-
-
+def serialize
+# TODO add a test (including empty collections)
+indexer_patterns = IndexerPatternsMessage.new
+positive_patterns.each do |pattern|
+indexer_patterns.positive_patterns << pattern
+end
+negative_patterns.each do |pattern|
+indexer_patterns.negative_patterns << pattern
+end
+indexer_patterns
+end

-
-
-
-
-
-
-
-
-
-
-
+# @param pattern [String]
+# @param is_positive [true]
+# @param is_positive [false]
+def add_pattern(pattern, is_positive = true)
+pattern.gsub!(/\\/,'/')
+if (is_positive)
+@positive_patterns << pattern
+else
+@negative_patterns << pattern
+end
+end

-
-
-
-
-
-
+def parse_from_file(file)
+input_patterns = IO.readlines(file)
+begin
+puts "Error loading patterns=%s" % file
+raise IOError("Error loading patterns=%s" % file)
+end unless not input_patterns.nil?

-
-
-
-
-
-
+input_patterns.each do |pattern|
+if (m = /^\s*([+-]):(.*)/.match(pattern))
+add_pattern(m[2], m[1].eql?('+') ? true : false)
+elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
+puts "pattern in incorrect format: #{pattern}"
+raise RuntimeError("pattern in incorrect format: #{pattern}")
+end
+end
 end
 end
+
 end
-end
+end
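parse_from_file above reads a plain-text pattern file in which each line is either +:<glob> or -:<glob>, and lines whose first non-blank character is # (or / or |) are skipped as comments. A small sketch; the file name and patterns are hypothetical.

# Hypothetical pattern file 'patterns.input':
#   +:/home/user/docs/**/*
#   -:/home/user/docs/**/*.tmp
#   # lines starting with '#' are ignored

patterns = BBFS::FileIndexing::IndexerPatterns.new
patterns.parse_from_file('patterns.input')
patterns.positive_patterns   # => ["/home/user/docs/**/*"]
patterns.negative_patterns   # => ["/home/user/docs/**/*.tmp"]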
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: file_indexing
 version: !ruby/object:Gem::Version
-version: 0.0.1
+version: 0.0.2
 prerelease:
 platform: ruby
 authors:
@@ -9,8 +9,19 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-
-dependencies:
+date: 2012-04-11 00:00:00.000000000Z
+dependencies:
+- !ruby/object:Gem::Dependency
+name: content_data
+requirement: &70262612434200 !ruby/object:Gem::Requirement
+none: false
+requirements:
+- - ! '>='
+- !ruby/object:Gem::Version
+version: '0'
+type: :runtime
+prerelease: false
+version_requirements: *70262612434200
 description: Indexes files, treats files with same binary sequence as one content.
 email: kolmanv@gmail.com
 executables: []
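The metadata change above records a new runtime dependency on content_data with no version constraint. The .gemspec that generates this metadata is not part of the diff; a rough sketch of the corresponding declaration, with the other required fields omitted:

Gem::Specification.new do |s|
  s.name    = 'file_indexing'
  s.version = '0.0.2'
  s.add_runtime_dependency 'content_data'   # serialized as the '>= 0' requirement above
end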