file_indexing 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/file_indexing/index_agent.rb +140 -0
- data/lib/file_indexing/indexer_patterns.rb +61 -0
- data/lib/file_indexing.rb +10 -0
- metadata +47 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
require 'logger'
|
3
|
+
require 'pp'
|
4
|
+
require 'time'
|
5
|
+
require './content_data'
|
6
|
+
require './indexer_patterns'
|
7
|
+
|
8
|
+
####################
|
9
|
+
# Index Agent
|
10
|
+
####################
|
11
|
+
|
12
|
+
# Indexes files matched by glob patterns: every regular file is hashed with
# SHA1 and recorded in +indexed_content+ as a content plus a content instance.
#
# Depends on ContentData, Content and ContentInstance, which are defined
# outside this file (presumably in content_data.rb -- confirm).
class IndexAgent
  # The container populated by #index (a ContentData created in #init_db).
  attr_reader :indexed_content

  # Time zone name reported by Time#zone at load time, captured before TZ is
  # overridden on the next line.
  LOCALTZ = Time.now.zone
  # NOTE: process-wide side effect at class-load time; every later local-time
  # conversion in this process is done in UTC.
  ENV['TZ'] = 'UTC'

  def initialize
    init_log
    init_db
  end

  # Creates the empty container that #index fills.
  def init_db
    @indexed_content = ContentData.new
  end

  # Installs the default logger: STDERR, WARN level and above.
  def init_log
    @log = Logger.new(STDERR)
    @log.level = Logger::WARN
    @log.datetime_format = '%Y-%m-%d %H:%M:%S'
  end

  # Reconfigures logging.
  #
  # @param log_path [String, IO, nil] new log destination; when nil the
  #   current logger is kept and only its level changes
  # @param log_level [Integer] a Logger severity constant (e.g. Logger::INFO)
  def set_log(log_path, log_level)
    @log = Logger.new(log_path) if log_path
    @log.level = log_level
  end

  # Calculates the SHA1 checksum of a file, reading it in 64 KiB chunks.
  #
  # The original implementation logged through @log, which is nil inside a
  # class method, so the rescue branch itself crashed with NoMethodError.
  # A logger can now be passed in; without one the message goes to stderr.
  #
  # @param filename [String] path of the file to hash
  # @param log [Logger, nil] optional logger for the failure message
  # @return [String, false] lowercase hex digest, or false when the file
  #   cannot be read (Errno::EACCES / Errno::ETXTBSY)
  def self.get_checksum(filename, log = nil)
    digest = Digest::SHA1.new
    # Block form closes the handle on every path; 'rb' keeps the bytes
    # untouched on platforms that distinguish text and binary mode.
    File.open(filename, 'rb') do |file|
      while (buffer = file.read(65_536))
        digest << buffer
      end
    end
    digest.hexdigest.downcase
  rescue Errno::EACCES, Errno::ETXTBSY => e
    if log
      log.warn { e.message }
    else
      warn e.message
    end
    false
  end

  # Returns all paths satisfying the glob pattern.
  #
  # @param pattern [#to_s] glob pattern understood by Dir.glob
  # @return [Array<String>]
  def collect(pattern)
    Dir.glob(pattern.to_s)
  end

  # Indexes the files selected by +patterns+ and stores the result in
  # +indexed_content+.
  #
  # Files already described by +otherDB+ for this server with the same size
  # and modification time are copied over without recomputing the checksum.
  # Terminates the process via Kernel#abort when the index is not empty.
  #
  # TODO device support
  #
  # @param patterns [#positive_patterns, #negative_patterns] glob patterns
  #   to include and to exclude
  # @param otherDB [#contents, #instances, nil] previously built index used
  #   as a checksum cache (presumably a ContentData -- confirm)
  def index(patterns, otherDB = nil)
    # The original tested `db.contents`, but no `db` exists in this class
    # (NameError on every call); the container lives in @indexed_content.
    unless @indexed_content.contents.empty?
      abort "#{self.class}: DB not empty. Current implementation permits only one running of index"
    end

    # NOTE(review): the backtick output keeps its trailing newline. It is left
    # untouched because instances stored by earlier runs carry the same value
    # and are matched against it below.
    server_name = `hostname`

    known_instances = {} # full path => instance from otherDB on this server
    known_contents = {}  # contents of otherDB, indexed the way otherDB.contents is
    unless otherDB.nil?
      known_contents.update(otherDB.contents)
      otherDB.instances.each_value do |i|
        next unless i.server_name == server_name # and i.device == @device

        known_instances[i.full_path] = i
      end
    end

    collect_files(patterns).each do |file|
      file_stats = File.lstat(file)

      # index only files
      next if file_stats.directory?

      # keep only files with names in UTF-8
      unless file.force_encoding('UTF-8').valid_encoding?
        @log.warn { "Non-UTF8 file name \"#{file}\"" }
        next
      end

      # Reuse the entry from the given DB when size and mtime are unchanged
      # (saves the checksum calculation).
      known = known_instances[file]
      if known && known.size == file_stats.size && known.modification_time == file_stats.mtime.utc
        @indexed_content.add_content(known_contents[known.checksum])
        @indexed_content.add_instance(known)
        next
      end

      unless (checksum = self.class.get_checksum(file, @log))
        @log.warn { "Cheksum failure: " + file }
        next
      end

      unless @indexed_content.content_exists(checksum)
        @indexed_content.add_content(Content.new(checksum, file_stats.size, Time.now.utc))
      end

      instance = ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
                                     File.expand_path(file), file_stats.mtime.utc)
      @indexed_content.add_instance(instance)
    end
  end

  private

  # Absolute paths matched by the positive patterns minus those matched by
  # the negative patterns, without duplicates, in first-match order.
  def collect_files(patterns)
    matched = patterns.positive_patterns.flat_map { |pattern| collect(pattern) }
    # De-duplicate after expansion so a file reached through two different
    # spellings of its path is indexed only once.
    matched = matched.map { |path| File.expand_path(path) }.uniq
    excluded = patterns.negative_patterns.flat_map { |pattern| collect(pattern) }
    matched - excluded.map { |path| File.expand_path(path) }
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require './agent.pb'
|
2
|
+
|
3
|
+
# Holds two ordered lists of glob patterns: positive patterns select files to
# index, negative patterns exclude files from the selection.
class IndexerPatterns
  attr_reader :positive_patterns, :negative_patterns

  # @param indexer_patterns [#positive_patterns, #negative_patterns, nil]
  #   optional source to copy patterns from (e.g. the message built by
  #   #serialize, or another IndexerPatterns)
  def initialize(indexer_patterns = nil)
    @positive_patterns = []
    @negative_patterns = []
    # TODO add a test (including empty collections)
    return if indexer_patterns.nil?

    indexer_patterns.positive_patterns.each { |pattern| add_pattern(pattern) }
    indexer_patterns.negative_patterns.each { |pattern| add_pattern(pattern, false) }
  end

  # Copies both pattern lists into a new IndexerPatternsMessage (declared
  # outside this file, presumably in agent.pb -- confirm).
  #
  # @return [IndexerPatternsMessage]
  def serialize
    # TODO add a test (including empty collections)
    message = IndexerPatternsMessage.new
    positive_patterns.each { |pattern| message.positive_patterns << pattern }
    negative_patterns.each { |pattern| message.negative_patterns << pattern }
    message
  end

  # Appends a pattern, with backslashes normalised to forward slashes.
  #
  # The argument is not modified: the original used gsub!, which rewrote the
  # caller's string in place and raised on frozen strings.
  #
  # @param pattern [String]
  # @param is_positive [Boolean] true adds to the positive list, false to the
  #   negative list
  def add_pattern(pattern, is_positive = true)
    normalized = pattern.gsub('\\', '/')
    if is_positive
      @positive_patterns << normalized
    else
      @negative_patterns << normalized
    end
  end

  # Loads patterns from a text file, one entry per line:
  #   +:<glob>   positive pattern
  #   -:<glob>   negative pattern
  # Blank lines and lines starting with "//" or "#" are ignored.
  #
  # File.readlines is used instead of IO.readlines so that a path beginning
  # with "|" is never run as a command. A missing or unreadable file surfaces
  # as the Errno::* error raised by File.readlines (it never returns nil, so
  # the former nil check was unreachable).
  #
  # @param file [String] path of the patterns file
  # @raise [RuntimeError] when a line is neither a pattern, a comment nor blank
  def parse_from_file(file)
    File.readlines(file).each do |line|
      entry = line.chomp # drops "\n" as well as "\r\n"
      if (m = /\A\s*([+-]):(.*)/.match(entry))
        add_pattern(m[2], m[1] == '+')
      elsif entry.strip.empty? || entry =~ %r{\A\s*(?://|#)}
        # The original character class [\/\/|#] accepted any single '/', '|'
        # or '#'; the intended comment markers are "//" and "#".
        next
      else
        puts "pattern in incorrect format: #{entry}"
        # `raise RuntimeError("...")` called a non-existent method and ended
        # in NoMethodError; raise the exception class properly.
        raise RuntimeError, "pattern in incorrect format: #{entry}"
      end
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require_relative 'file_indexing/index_agent'
|
2
|
+
require_relative 'file_indexing/indexer_patterns'
|
3
|
+
|
4
|
+
# Data structure for an abstract layer over files.
# Each binary sequence is a content, each file is content instance.
module BBFS
  module FileIndexing
    # Gem version string; frozen so the shared constant cannot be mutated.
    VERSION = '0.0.1'.freeze
  end
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: file_indexing
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Gena Petelko, Kolman Vornovitsky
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-04-08 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: Indexes files, treats files with same binary sequence as one content.
|
15
|
+
email: kolmanv@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/file_indexing.rb
|
21
|
+
- lib/file_indexing/index_agent.rb
|
22
|
+
- lib/file_indexing/indexer_patterns.rb
|
23
|
+
homepage: http://github.com/kolmanv/bbfs
|
24
|
+
licenses: []
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ! '>='
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 1.8.15
|
44
|
+
signing_key:
|
45
|
+
specification_version: 3
|
46
|
+
summary: Indexes files.
|
47
|
+
test_files: []
|