file_indexing 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/file_indexing/index_agent.rb +140 -0
- data/lib/file_indexing/indexer_patterns.rb +61 -0
- data/lib/file_indexing.rb +10 -0
- metadata +47 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
require 'logger'
|
3
|
+
require 'pp'
|
4
|
+
require 'time'
|
5
|
+
require './content_data'
|
6
|
+
require './indexer_patterns'
|
7
|
+
|
8
|
+
####################
|
9
|
+
# Index Agent
|
10
|
+
####################
|
11
|
+
|
12
|
+
# Indexes files selected by glob patterns.
#
# For every matching non-directory entry the agent computes a SHA1 checksum and
# records a Content (one per distinct checksum) and a ContentInstance (one per
# file) in a ContentData container exposed through +indexed_content+.
class IndexAgent
  # The ContentData container populated by #index.
  attr_reader :indexed_content

  # Name of the local time zone, captured before TZ is forced to UTC below.
  LOCALTZ = Time.now.zone
  # NOTE: process-wide side effect executed when this class is loaded.
  ENV['TZ'] = 'UTC'

  # Number of bytes read per chunk while hashing a file.
  CHECKSUM_CHUNK_SIZE = 65_536

  # Sets up the default logger and an empty content container.
  def initialize
    init_log
    init_db
  end

  # Replaces the content container with a fresh, empty ContentData.
  def init_db
    @indexed_content = ContentData.new
  end

  # Installs the default logger: STDERR, WARN level, "YYYY-MM-DD HH:MM:SS" timestamps.
  def init_log
    @log = Logger.new(STDERR)
    @log.level = Logger::WARN
    @log.datetime_format = '%Y-%m-%d %H:%M:%S'
  end

  # Reconfigures logging.
  #
  # @param log_path [String, IO, nil] new log destination; when nil the current
  #   logger is kept and only its level is changed
  # @param log_level [Integer] a Logger severity constant (e.g. Logger::INFO)
  def set_log(log_path, log_level)
    @log = Logger.new(log_path) if log_path
    @log.level = log_level
  end

  # Calculates the SHA1 checksum of a file.
  #
  # The file is read in binary mode so the digest reflects the exact bytes on
  # disk on every platform, and the block form of File.open guarantees the
  # handle is closed even when reading fails.
  #
  # @param filename [String] path of the file to hash
  # @param log [Logger, nil] optional logger for the warning emitted on failure.
  #   This is a class method, so no instance logger is available here; when
  #   omitted the message goes to STDERR through Kernel#warn.
  # @return [String, false] lowercase hex digest, or false when the file cannot
  #   be read because of Errno::EACCES or Errno::ETXTBSY. Other errors propagate.
  def self.get_checksum(filename, log = nil)
    digest = Digest::SHA1.new
    File.open(filename, 'rb') do |file|
      while (buffer = file.read(CHECKSUM_CHUNK_SIZE))
        digest << buffer
      end
    end
    digest.hexdigest.downcase
  rescue Errno::EACCES, Errno::ETXTBSY => e
    if log
      log.warn { e.message }
    else
      warn(e.message)
    end
    false
  end

  # Returns all paths satisfying the glob pattern.
  #
  # @param pattern [#to_s] glob pattern
  # @return [Array<String>]
  def collect(pattern)
    Dir.glob(pattern.to_s)
  end

  # Indexes the files selected by +patterns+ and stores the result in
  # +indexed_content+. Aborts the process when the container already holds
  # contents: only one indexing run per agent is supported.
  #
  # TODO device support
  #
  # @param patterns [#positive_patterns, #negative_patterns] glob patterns of
  #   files to include and to exclude
  # @param otherDB [#contents, #instances, nil] optional earlier index. A file
  #   recorded there for this server with unchanged size and modification time
  #   is copied over without recomputing its checksum.
  # @return [Array<String>] the absolute paths that were considered
  def index(patterns, otherDB = nil)
    # Assumes ContentData exposes #contents like the otherDB argument does.
    unless @indexed_content.contents.empty?
      abort "#{self.class}: DB not empty. Current implementation permits only one running of index"
    end

    # NOTE(review): the raw command output keeps its trailing newline. It is
    # left untouched because previously stored instances are matched against it.
    server_name = `hostname`
    known_instances, known_contents = previously_indexed(otherDB, server_name)

    candidate_files(patterns).each do |file|
      file_stats = File.lstat(file)

      # index only files
      next if file_stats.directory?

      # keep only files with names in UTF-8
      unless file.force_encoding('UTF-8').valid_encoding?
        @log.warn { "Non-UTF8 file name \"#{file}\"" }
        next
      end

      # unchanged files known from the given DB need no checksum calculation
      next if reuse_instance(known_instances[file], known_contents, file_stats)

      unless (checksum = self.class.get_checksum(file, @log))
        @log.warn { "Cheksum failure: " + file }
        next
      end

      unless @indexed_content.content_exists(checksum)
        @indexed_content.add_content(Content.new(checksum, file_stats.size, Time.now.utc))
      end
      @indexed_content.add_instance(
        ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
                            File.expand_path(file), file_stats.mtime.utc)
      )
    end
  end

  private

  # Builds the lookup tables taken from a previously produced index.
  #
  # @return [Array(Hash, Hash)] instances of this server keyed by full path,
  #   and a copy of the given DB contents; both empty when no DB is given
  def previously_indexed(other_db, server_name)
    return [{}, {}] if other_db.nil?

    instances_by_path = {}
    other_db.instances.each_value do |instance|
      next unless instance.server_name == server_name # and instance.device == @device

      instances_by_path[instance.full_path] = instance
    end
    [instances_by_path, {}.update(other_db.contents)]
  end

  # Resolves the patterns to unique absolute paths: everything matched by a
  # positive pattern minus everything matched by a negative pattern.
  def candidate_files(patterns)
    permitted = patterns.positive_patterns.flat_map { |pattern| collect(pattern) }
    forbidden = patterns.negative_patterns.flat_map { |pattern| collect(pattern) }
    permitted.map { |path| File.expand_path(path) }.uniq -
      forbidden.map { |path| File.expand_path(path) }
  end

  # Copies a known instance and its content into the index when the file on
  # disk still has the recorded size and modification time.
  #
  # @return [Boolean] true when the instance was reused
  def reuse_instance(instance, known_contents, file_stats)
    return false unless instance
    return false unless instance.size == file_stats.size &&
                        instance.modification_time == file_stats.mtime.utc

    @indexed_content.add_content(known_contents[instance.checksum])
    @indexed_content.add_instance(instance)
    true
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require './agent.pb'
|
2
|
+
|
3
|
+
# Holds the glob patterns that drive indexing: positive patterns select files,
# negative patterns exclude them. Backslashes in patterns are normalized to
# forward slashes when a pattern is added.
class IndexerPatterns
  attr_reader :positive_patterns, :negative_patterns

  # A pattern line: optional leading whitespace, '+' or '-', ':', the pattern.
  PATTERN_LINE = /\A\s*([+-]):(.*)/
  # A line that carries no pattern: blank, or a comment starting with '//' or '#'.
  IGNORED_LINE = %r{\A\s*(?://|#|\z)}

  # @param indexer_patterns [#positive_patterns, #negative_patterns, nil]
  #   optional source object whose patterns are copied into the new instance
  #   (presumably an IndexerPatternsMessage as produced by #serialize)
  def initialize(indexer_patterns = nil)
    @positive_patterns = []
    @negative_patterns = []
    # TODO add a test (including empty collections)
    return if indexer_patterns.nil?

    indexer_patterns.positive_patterns.each { |pattern| add_pattern(pattern) }
    indexer_patterns.negative_patterns.each { |pattern| add_pattern(pattern, false) }
  end

  # Copies the patterns into a new IndexerPatternsMessage.
  #
  # @return [IndexerPatternsMessage]
  def serialize
    # TODO add a test (including empty collections)
    message = IndexerPatternsMessage.new
    positive_patterns.each { |pattern| message.positive_patterns << pattern }
    negative_patterns.each { |pattern| message.negative_patterns << pattern }
    message
  end

  # Registers a pattern after converting backslashes to forward slashes.
  # The caller's string is not modified, so frozen strings are accepted.
  #
  # @param pattern [String] glob pattern
  # @param is_positive [Boolean] true to select matching files, false to exclude them
  # @return [Array<String>] the pattern list the pattern was appended to
  def add_pattern(pattern, is_positive = true)
    normalized = pattern.gsub('\\', '/')
    (is_positive ? @positive_patterns : @negative_patterns) << normalized
  end

  # Loads patterns from a text file with one entry per line:
  # "+:<pattern>" adds a positive pattern, "-:<pattern>" a negative one.
  # Blank lines and lines starting with "//" or "#" are ignored.
  #
  # @param file [String] path of the patterns file
  # @return [Array<String>] the lines that were read
  # @raise [RuntimeError] when a line is neither a pattern, a comment nor blank
  # @raise [SystemCallError] (e.g. Errno::ENOENT) when the file cannot be read
  def parse_from_file(file)
    # File.readlines always treats the argument as a path and raises on
    # failure, so no nil check is needed.
    File.readlines(file).each do |line|
      if (m = PATTERN_LINE.match(line))
        add_pattern(m[2], m[1] == '+')
      elsif !IGNORED_LINE.match(line)
        puts "pattern in incorrect format: #{line}"
        raise RuntimeError, "pattern in incorrect format: #{line}"
      end
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require_relative 'file_indexing/index_agent'
|
2
|
+
require_relative 'file_indexing/indexer_patterns'
|
3
|
+
|
4
|
+
# Data structure for an abstract layer over files.
# Each binary sequence is a content; each file is a content instance.
module BBFS
  module FileIndexing
    # Version of the file_indexing gem; frozen so the constant cannot be
    # mutated in place.
    VERSION = '0.0.1'.freeze
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: file_indexing
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Gena Petelko, Kolman Vornovitsky
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-04-08 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: Indexes files, treats files with same binary sequence as one content.
|
15
|
+
email: kolmanv@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/file_indexing.rb
|
21
|
+
- lib/file_indexing/index_agent.rb
|
22
|
+
- lib/file_indexing/indexer_patterns.rb
|
23
|
+
homepage: http://github.com/kolmanv/bbfs
|
24
|
+
licenses: []
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ! '>='
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 1.8.15
|
44
|
+
signing_key:
|
45
|
+
specification_version: 3
|
46
|
+
summary: Indexes files.
|
47
|
+
test_files: []
|