file_indexing 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,140 @@
1
+ require 'digest/sha1'
2
+ require 'logger'
3
+ require 'pp'
4
+ require 'time'
5
+ require './content_data'
6
+ require './indexer_patterns'
7
+
8
+ ####################
9
+ # Index Agent
10
+ ####################
11
+
12
# Indexes files selected by glob patterns. For every regular file it records
# a content entry (SHA1 checksum, size) and an instance entry (server, device,
# absolute path, modification time) in a ContentData store.
class IndexAgent
  # ContentData store that #index fills.
  attr_reader :indexed_content

  # Local time zone name, captured before the process is switched to UTC below.
  LOCALTZ = Time.now.zone
  # Load-time side effect: the process time zone is set to UTC.
  ENV['TZ'] = 'UTC'

  def initialize
    init_log
    init_db
  end

  # Creates the empty store that #index will populate.
  def init_db
    @indexed_content = ContentData.new
  end

  # Creates the default logger: STDERR, WARN level and above.
  def init_log
    @log = Logger.new(STDERR)
    @log.level = Logger::WARN
    @log.datetime_format = "%Y-%m-%d %H:%M:%S"
  end

  # Redirects logging and/or changes the log level.
  #
  # @param log_path [String, IO, nil] new log destination; nil keeps the current logger
  # @param log_level [Integer] a Logger severity constant
  def set_log(log_path, log_level)
    @log = Logger.new(log_path) if log_path
    @log.level = log_level
  end

  # Calculate file checksum (SHA1).
  #
  # @param filename [String] path of the file to digest
  # @param log [Logger, nil] logger for access failures; a STDERR logger is
  #   used when omitted (a class method cannot see an instance's @log)
  # @return [String, false] lowercase hex digest, or false when the file
  #   cannot be read (Errno::EACCES, Errno::ETXTBSY)
  # @raise [SystemCallError] any other open/read error (e.g. Errno::ENOENT) propagates
  def self.get_checksum(filename, log = nil)
    digest = Digest::SHA1.new
    # Binary mode: the digest must cover the raw bytes on every platform.
    # The block form closes the file even when reading raises.
    File.open(filename, 'rb') do |file|
      while (buffer = file.read(65536))
        digest << buffer
      end
    end
    digest.hexdigest.downcase
  rescue Errno::EACCES, Errno::ETXTBSY => exp
    (log || Logger.new(STDERR)).warn { "#{exp.message}" }
    false
  end

  # Get all files satisfying the pattern.
  #
  # @param pattern [#to_s] glob pattern
  # @return [Array<String>] matching paths as returned by Dir.glob
  def collect(pattern)
    Dir.glob(pattern.to_s)
  end

  # Index files according to the patterns and store the result in
  # #indexed_content. Aborts the process if the store is not empty.
  # TODO device support
  #
  # @param patterns [#positive_patterns, #negative_patterns] globs to include / exclude
  # @param otherDB [#contents, #instances, nil] previously built DB; files whose
  #   size and modification time are unchanged reuse its checksum instead of
  #   being re-read
  # @return [Array<String>] the absolute paths that were considered
  def index(patterns, otherDB = nil)
    abort "#{self.class}: DB not empty. Current implementation permits only one running of index" unless @indexed_content.contents.empty?

    # NOTE(review): the raw command output is used verbatim (including the
    # trailing newline) so it keeps matching server names stored by earlier
    # runs -- confirm whether stored names should be stripped.
    server_name = `hostname`
    known_instances = {} # full path => instance already indexed on this server
    known_contents = {}  # contents of the given DB

    # if there is a given DB then populate table with files
    # that were already indexed on this server/device
    unless otherDB.nil?
      known_contents = otherDB.contents
      otherDB.instances.each_value do |instance|
        next unless instance.server_name == server_name # and instance.device == @device
        known_instances[instance.full_path] = instance
      end
    end

    # files found by positive patterns minus files found by negative patterns
    files = expand_all(patterns.positive_patterns) - expand_all(patterns.negative_patterns)

    # create and add contents and instances
    files.each do |file|
      file_stats = File.lstat(file)

      # index only files
      next if file_stats.directory?

      # keep only files with names in UTF-8
      unless file.force_encoding("UTF-8").valid_encoding?
        @log.warn { "Non-UTF8 file name \"#{file}\"" }
        next
      end

      # reuse the entry from the given DB when size and mtime are unchanged
      # (saves the checksum calculation)
      known = known_instances[file]
      if known && known.size == file_stats.size && known.modification_time == file_stats.mtime.utc
        @indexed_content.add_content(known_contents[known.checksum])
        @indexed_content.add_instance(known)
        next
      end

      # calculate a checksum
      unless (checksum = self.class.get_checksum(file, @log))
        @log.warn { "Cheksum failure: " + file }
        next
      end

      unless @indexed_content.content_exists(checksum)
        @indexed_content.add_content(Content.new(checksum, file_stats.size, Time.now.utc))
      end

      instance = ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s, file, file_stats.mtime.utc)
      @indexed_content.add_instance(instance)
    end
  end

  private

  # Globs every pattern and returns the distinct absolute paths found.
  def expand_all(pattern_list)
    pattern_list.flat_map { |pattern| collect(pattern) }
                .map { |path| File.expand_path(path) }
                .uniq
  end
end
@@ -0,0 +1,61 @@
1
+ require './agent.pb'
2
+
3
# Holds two ordered lists of glob patterns: positive patterns select files,
# negative patterns exclude them. Backslashes in patterns are normalised to
# forward slashes when a pattern is added.
class IndexerPatterns
  attr_reader :positive_patterns, :negative_patterns

  # @param indexer_patterns [#positive_patterns, #negative_patterns, nil]
  #   optional source object whose patterns are copied into this instance
  def initialize(indexer_patterns = nil)
    @positive_patterns = []
    @negative_patterns = []
    # TODO add a test (including empty collections)
    return if indexer_patterns.nil?

    indexer_patterns.positive_patterns.each { |pattern| add_pattern(pattern) }
    indexer_patterns.negative_patterns.each { |pattern| add_pattern(pattern, false) }
  end

  # Copies the patterns into a new IndexerPatternsMessage.
  #
  # @return [IndexerPatternsMessage]
  def serialize
    # TODO add a test (including empty collections)
    message = IndexerPatternsMessage.new
    positive_patterns.each { |pattern| message.positive_patterns << pattern }
    negative_patterns.each { |pattern| message.negative_patterns << pattern }
    message
  end

  # Adds one pattern to the positive or negative list.
  #
  # @param pattern [String] glob pattern; it is not modified, a normalised copy is stored
  # @param is_positive [Boolean] true for a positive pattern, false for a negative one
  # @return [Array<String>] the list the pattern was appended to
  def add_pattern(pattern, is_positive = true)
    # Non-destructive gsub: works for frozen strings and leaves the caller's string intact.
    normalized = pattern.gsub('\\', '/')
    if is_positive
      @positive_patterns << normalized
    else
      @negative_patterns << normalized
    end
  end

  # Loads patterns from a text file. Each line is one of:
  #   +:<glob>   positive pattern
  #   -:<glob>   negative pattern
  #   a comment starting with // or #
  #   a blank line (ignored)
  #
  # @param file [String] path of the patterns file
  # @return [Array<String>] the raw lines read from the file
  # @raise [SystemCallError] if the file cannot be read (IO.readlines raises, it never returns nil)
  # @raise [RuntimeError] if a line matches none of the formats above
  def parse_from_file(file)
    IO.readlines(file).each do |raw_line|
      line = raw_line.chomp # drops "\n" and "\r\n" so no "\r" leaks into a pattern
      next if line.strip.empty?

      if (m = /\A\s*([+-]):(.*)/.match(line))
        add_pattern(m[2], m[1] == '+')
      elsif line !~ %r{\A\s*(?://|#)} # not a comment
        puts "pattern in incorrect format: #{line}"
        raise RuntimeError, "pattern in incorrect format: #{line}"
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require_relative 'file_indexing/index_agent'
2
+ require_relative 'file_indexing/indexer_patterns'
3
+
4
# File indexing: indexes files and treats files with the same binary
# sequence as one content, each file being an instance of that content.
module BBFS
  module FileIndexing
    # Gem version; frozen so the shared constant cannot be mutated in place.
    VERSION = "0.0.1".freeze
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: file_indexing
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Gena Petelko, Kolman Vornovitsky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-04-08 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: Indexes files, treats files with same binary sequence as one content.
15
+ email: kolmanv@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/file_indexing.rb
21
+ - lib/file_indexing/index_agent.rb
22
+ - lib/file_indexing/indexer_patterns.rb
23
+ homepage: http://github.com/kolmanv/bbfs
24
+ licenses: []
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 1.8.15
44
+ signing_key:
45
+ specification_version: 3
46
+ summary: Indexes files.
47
+ test_files: []