file_indexing 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/file_indexing.rb +2 -0
- data/lib/file_indexing/index_agent.rb +123 -109
- data/lib/file_indexing/indexer_patterns.rb +56 -51
- metadata +14 -3
data/lib/file_indexing.rb
CHANGED
(hunk not shown)
data/lib/file_indexing/index_agent.rb
CHANGED
@@ -2,139 +2,153 @@ require 'digest/sha1'
|
|
2
2
|
require 'logger'
|
3
3
|
require 'pp'
|
4
4
|
require 'time'
|
5
|
-
|
6
|
-
require '
|
5
|
+
|
6
|
+
require 'content_data'
|
7
|
+
|
8
|
+
require_relative 'indexer_patterns'
|
9
|
+
|
10
|
+
module BBFS
|
11
|
+
module FileIndexing
|
7
12
|
|
8
13
|
####################
|
9
14
|
# Index Agent
|
10
15
|
####################
|
11
16
|
|
12
|
-
class IndexAgent
|
13
|
-
|
14
|
-
|
15
|
-
LOCALTZ = Time.now.zone
|
16
|
-
ENV['TZ'] = 'UTC'
|
17
|
+
class IndexAgent
|
18
|
+
attr_reader :indexed_content
|
17
19
|
|
18
|
-
|
19
|
-
|
20
|
-
init_db()
|
21
|
-
end
|
22
|
-
|
23
|
-
def init_db()
|
24
|
-
@indexed_content = ContentData.new
|
25
|
-
end
|
20
|
+
LOCALTZ = Time.now.zone
|
21
|
+
ENV['TZ'] = 'UTC'
|
26
22
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
end
|
23
|
+
def initialize
|
24
|
+
init_log()
|
25
|
+
init_db()
|
26
|
+
end
|
32
27
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
end
|
28
|
+
def init_db()
|
29
|
+
@indexed_content = ContentData::ContentData.new
|
30
|
+
end
|
37
31
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
file = File.new(filename)
|
43
|
-
while buffer = file.read(65536)
|
44
|
-
digest << buffer
|
32
|
+
def init_log()
|
33
|
+
@log = Logger.new(STDERR)
|
34
|
+
@log.level = Logger::WARN
|
35
|
+
@log.datetime_format = "%Y-%m-%d %H:%M:%S"
|
45
36
|
end
|
46
|
-
#@log.info { digest.hexdigest.downcase + ' ' + filename }
|
47
|
-
digest.hexdigest.downcase
|
48
|
-
rescue Errno::EACCES, Errno::ETXTBSY => exp
|
49
|
-
@log.warn { "#{exp.message}" }
|
50
|
-
false
|
51
|
-
ensure
|
52
|
-
file.close if file != nil
|
53
|
-
end
|
54
|
-
end
|
55
37
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
end
|
38
|
+
def set_log(log_path, log_level)
|
39
|
+
@log = Logger.new(log_path) if log_path
|
40
|
+
@log.level = log_level
|
41
|
+
end
|
61
42
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
otherDB.instances.each_value do |i|
|
79
|
-
next unless i.server_name == server_name #and i.device == @device
|
80
|
-
otherDB_table[i.full_path] = i
|
43
|
+
# Calculate file checksum (SHA1)
|
44
|
+
def self.get_checksum(filename)
|
45
|
+
digest = Digest::SHA1.new
|
46
|
+
begin
|
47
|
+
file = File.new(filename)
|
48
|
+
while buffer = file.read(65536)
|
49
|
+
digest << buffer
|
50
|
+
end
|
51
|
+
#@log.info { digest.hexdigest.downcase + ' ' + filename }
|
52
|
+
digest.hexdigest.downcase
|
53
|
+
rescue Errno::EACCES, Errno::ETXTBSY => exp
|
54
|
+
@log.warn { "#{exp.message}" }
|
55
|
+
false
|
56
|
+
ensure
|
57
|
+
file.close if file != nil
|
58
|
+
end
|
81
59
|
end
|
82
|
-
end
|
83
60
|
|
84
|
-
|
85
|
-
|
61
|
+
# get all files
|
62
|
+
# satisfying the pattern
|
63
|
+
def collect(pattern)
|
64
|
+
Dir.glob(pattern.to_s)
|
65
|
+
end
|
86
66
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
67
|
+
# index device according to the pattern
|
68
|
+
# store the result
|
69
|
+
# does not adds automatically otherDB to stored result
|
70
|
+
# TODO device support
|
71
|
+
def index(patterns, otherDB = nil)
|
72
|
+
abort "#{self.class}: DB not empty. Current implementation permits only one running of index" \
|
73
|
+
unless @indexed_content.contents.empty?
|
74
|
+
|
75
|
+
server_name = `hostname`
|
76
|
+
permit_patterns = Array.new
|
77
|
+
forbid_patterns = Array.new
|
78
|
+
otherDB_table = Hash.new # contains instances from given DB while full path name is a key and instance is a value
|
79
|
+
otherDB_contents = Hash.new # given DB contents
|
80
|
+
|
81
|
+
# if there is a given DB then populate table with files
|
82
|
+
# that was already indexed on this server/device
|
83
|
+
if (otherDB != nil)
|
84
|
+
otherDB_contents.update(otherDB.contents)
|
85
|
+
otherDB.instances.each_value do |i|
|
86
|
+
next unless i.server_name == server_name #and i.device == @device
|
87
|
+
otherDB_table[i.full_path] = i
|
88
|
+
end
|
89
|
+
end
|
92
90
|
|
93
|
-
|
94
|
-
|
91
|
+
permit_patterns = patterns.positive_patterns
|
92
|
+
forbid_patterns = patterns.negative_patterns
|
95
93
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
end
|
102
|
-
end
|
94
|
+
# add files found by positive patterns
|
95
|
+
files = Array.new
|
96
|
+
permit_patterns.each_index do |i|
|
97
|
+
files = files | (collect(permit_patterns[i]));
|
98
|
+
end
|
103
99
|
|
104
|
-
|
105
|
-
files.each do |file|
|
106
|
-
file_stats = File.lstat(file)
|
100
|
+
p "Files: #{files}."
|
107
101
|
|
108
|
-
|
109
|
-
|
102
|
+
# expand to absolute pathes
|
103
|
+
files.map! {|f| File.expand_path(f)}
|
110
104
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
105
|
+
# remove files found by negative patterns
|
106
|
+
forbid_patterns.each_index do |i|
|
107
|
+
forbid_files = Array.new(collect(forbid_patterns[i]));
|
108
|
+
forbid_files.each do |f|
|
109
|
+
files.delete(File.expand_path(f))
|
110
|
+
end
|
111
|
+
end
|
116
112
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
113
|
+
# create and add contents and instances
|
114
|
+
files.each do |file|
|
115
|
+
file_stats = File.lstat(file)
|
116
|
+
|
117
|
+
# index only files
|
118
|
+
next if (file_stats.directory?)
|
119
|
+
|
120
|
+
# keep only files with names in UTF-8
|
121
|
+
unless file.force_encoding("UTF-8").valid_encoding?
|
122
|
+
@log.warn { "Non-UTF8 file name \"#{file}\"" }
|
123
|
+
next
|
124
|
+
end
|
125
|
+
|
126
|
+
# add files present in the given DB to the DB and remove these files
|
127
|
+
# from further processing (save checksum calculation)
|
128
|
+
if otherDB_table.has_key?(file)
|
129
|
+
instance = otherDB_table[file]
|
130
|
+
if instance.size == file_stats.size and instance.modification_time == file_stats.mtime.utc
|
131
|
+
@indexed_content.add_content(otherDB_contents[instance.checksum])
|
132
|
+
@indexed_content.add_instance(instance)
|
133
|
+
next
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
# calculate a checksum
|
138
|
+
unless (checksum = self.class.get_checksum(file))
|
139
|
+
@log.warn { "Cheksum failure: " + file }
|
140
|
+
next
|
141
|
+
end
|
142
|
+
|
143
|
+
@indexed_content.add_content(ContentData::Content.new(checksum, file_stats.size, Time.now.utc)) \
|
144
|
+
unless @indexed_content.content_exists(checksum)
|
145
|
+
|
146
|
+
instance = ContentData::ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
|
147
|
+
File.expand_path(file), file_stats.mtime.utc)
|
123
148
|
@indexed_content.add_instance(instance)
|
124
|
-
next
|
125
149
|
end
|
126
150
|
end
|
127
|
-
|
128
|
-
# calculate a checksum
|
129
|
-
unless (checksum = self.class.get_checksum(file))
|
130
|
-
@log.warn { "Cheksum failure: " + file }
|
131
|
-
next
|
132
|
-
end
|
133
|
-
|
134
|
-
@indexed_content.add_content(Content.new(checksum, file_stats.size, Time.now.utc)) unless (@indexed_content.content_exists(checksum))
|
135
|
-
|
136
|
-
instance = ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s, File.expand_path(file), file_stats.mtime.utc)
|
137
|
-
@indexed_content.add_instance(instance)
|
138
151
|
end
|
152
|
+
|
139
153
|
end
|
140
154
|
end
|
data/lib/file_indexing/indexer_patterns.rb
CHANGED
@@ -1,61 +1,66 @@
|
|
1
|
-
|
1
|
+
module BBFS
|
2
|
+
module FileIndexing
|
2
3
|
|
3
|
-
class IndexerPatterns
|
4
|
-
|
4
|
+
class IndexerPatterns
|
5
|
+
attr_reader :positive_patterns, :negative_patterns
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
7
|
+
# @param indexer_patterns_str [String]
|
8
|
+
def initialize (indexer_patterns = nil)
|
9
|
+
p "Initialize index patterns #{indexer_patterns}."
|
10
|
+
@positive_patterns = Array.new
|
11
|
+
@negative_patterns = Array.new
|
12
|
+
# TODO add a test (including empty collections)
|
13
|
+
if indexer_patterns
|
14
|
+
indexer_patterns.positive_patterns.each do |pattern|
|
15
|
+
add_pattern(pattern)
|
16
|
+
end
|
17
|
+
indexer_patterns.negative_patterns.each do |pattern|
|
18
|
+
add_pattern(pattern, false)
|
19
|
+
end
|
20
|
+
end
|
14
21
|
end
|
15
|
-
indexer_patterns.negative_patterns.each do |pattern|
|
16
|
-
add_pattern(pattern, false)
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
20
22
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
23
|
+
def serialize
|
24
|
+
# TODO add a test (including empty collections)
|
25
|
+
indexer_patterns = IndexerPatternsMessage.new
|
26
|
+
positive_patterns.each do |pattern|
|
27
|
+
indexer_patterns.positive_patterns << pattern
|
28
|
+
end
|
29
|
+
negative_patterns.each do |pattern|
|
30
|
+
indexer_patterns.negative_patterns << pattern
|
31
|
+
end
|
32
|
+
indexer_patterns
|
33
|
+
end
|
32
34
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
35
|
+
# @param pattern [String]
|
36
|
+
# @param is_positive [true]
|
37
|
+
# @param is_positive [false]
|
38
|
+
def add_pattern(pattern, is_positive = true)
|
39
|
+
pattern.gsub!(/\\/,'/')
|
40
|
+
if (is_positive)
|
41
|
+
@positive_patterns << pattern
|
42
|
+
else
|
43
|
+
@negative_patterns << pattern
|
44
|
+
end
|
45
|
+
end
|
44
46
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
47
|
+
def parse_from_file(file)
|
48
|
+
input_patterns = IO.readlines(file)
|
49
|
+
begin
|
50
|
+
puts "Error loading patterns=%s" % file
|
51
|
+
raise IOError("Error loading patterns=%s" % file)
|
52
|
+
end unless not input_patterns.nil?
|
51
53
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
54
|
+
input_patterns.each do |pattern|
|
55
|
+
if (m = /^\s*([+-]):(.*)/.match(pattern))
|
56
|
+
add_pattern(m[2], m[1].eql?('+') ? true : false)
|
57
|
+
elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
|
58
|
+
puts "pattern in incorrect format: #{pattern}"
|
59
|
+
raise RuntimeError("pattern in incorrect format: #{pattern}")
|
60
|
+
end
|
61
|
+
end
|
58
62
|
end
|
59
63
|
end
|
64
|
+
|
60
65
|
end
|
61
|
-
end
|
66
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: file_indexing
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,8 +9,19 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
13
|
-
dependencies:
|
12
|
+
date: 2012-04-11 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: content_data
|
16
|
+
requirement: &70262612434200 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70262612434200
|
14
25
|
description: Indexes files, treats files with same binary sequence as one content.
|
15
26
|
email: kolmanv@gmail.com
|
16
27
|
executables: []
|