file_indexing 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ require 'digest/sha1'
2
+ require 'logger'
3
+ require 'pp'
4
+ require 'time'
5
+ require './content_data'
6
+ require './indexer_patterns'
7
+
8
+ ####################
9
+ # Index Agent
10
+ ####################
11
+
12
# Indexes files selected by glob patterns into a ContentData store.
# Each file is recorded as a ContentInstance, and each distinct SHA1 checksum
# is recorded once as a Content.
class IndexAgent
  attr_reader :indexed_content

  # Zone name reported by Time.now at load time, captured before the process
  # time zone is switched to UTC on the next line.
  LOCALTZ = Time.now.zone
  ENV['TZ'] = 'UTC'

  # Number of bytes read per iteration while streaming a file into the digest.
  CHECKSUM_BUFFER_SIZE = 65536

  def initialize
    init_log
    init_db
  end

  # Creates the empty store that #index fills.
  def init_db
    @indexed_content = ContentData.new
  end

  # Sets up the default logger: STDERR, WARN level and above.
  def init_log
    @log = Logger.new(STDERR)
    @log.level = Logger::WARN
    @log.datetime_format = "%Y-%m-%d %H:%M:%S"
  end

  # Changes the log level and, when log_path is given, redirects logging.
  #
  # @param log_path [String, IO, nil] new log destination; nil keeps the current logger
  # @param log_level [Integer] a Logger severity constant
  def set_log(log_path, log_level)
    @log = Logger.new(log_path) if log_path
    @log.level = log_level
  end

  # Calculate file checksum (SHA1).
  #
  # The file is streamed in binary mode so no newline or encoding translation
  # can alter the bytes fed to the digest, and it is closed by the block form
  # of File.open even when reading fails.
  #
  # @param filename [String] path of the file to digest
  # @return [String, false] lowercase hex digest, or false when the file
  #   cannot be read because of Errno::EACCES or Errno::ETXTBSY
  def self.get_checksum(filename)
    digest = Digest::SHA1.new
    File.open(filename, 'rb') do |file|
      while (buffer = file.read(CHECKSUM_BUFFER_SIZE))
        digest << buffer
      end
    end
    digest.hexdigest.downcase
  rescue Errno::EACCES, Errno::ETXTBSY => e
    # The instance logger is not reachable from a class method, so a
    # class-level logger is used instead.
    class_logger.warn { e.message }
    false
  end

  # Lazily created STDERR logger used by class-level methods.
  def self.class_logger
    @class_logger ||= Logger.new(STDERR)
  end
  private_class_method :class_logger

  # Get all files satisfying the pattern.
  #
  # @param pattern [#to_s] glob pattern
  # @return [Array<String>] matching paths as returned by Dir.glob
  def collect(pattern)
    Dir.glob(pattern.to_s)
  end

  # Index files according to the patterns and store the result in
  # #indexed_content. Aborts the process if the store already has contents.
  # TODO device support
  #
  # @param patterns [#positive_patterns, #negative_patterns] globs of files
  #   to include and to exclude
  # @param otherDB [#contents, #instances, nil] previously built index; files
  #   found there with the same server name, size and modification time are
  #   reused without recalculating their checksum
  def index(patterns, otherDB = nil)
    abort "#{self.class}: DB not empty. Current implementation permits only one running of index" unless @indexed_content.contents.empty?

    # Kept verbatim (backtick output includes the trailing newline) because it
    # is compared with the server_name of instances coming from otherDB.
    # NOTE(review): confirm stored server names carry the same newline.
    server_name = `hostname`
    otherDB_table = {}    # full path => instance, for instances of this server in the given DB
    otherDB_contents = {} # given DB contents

    # if there is a given DB then populate table with files
    # that were already indexed on this server/device
    unless otherDB.nil?
      otherDB_contents.update(otherDB.contents)
      otherDB.instances.each_value do |known|
        next unless known.server_name == server_name # and known.device == @device
        otherDB_table[known.full_path] = known
      end
    end

    # files found by positive patterns, as unique absolute paths
    files = patterns.positive_patterns.flat_map { |pattern| collect(pattern) }
    files = files.map { |path| File.expand_path(path) }.uniq

    # remove files found by negative patterns
    forbidden = patterns.negative_patterns.flat_map { |pattern| collect(pattern) }
    files -= forbidden.map { |path| File.expand_path(path) }

    # create and add contents and instances
    files.each do |file|
      file_stats = File.lstat(file)

      # index only files
      next if file_stats.directory?

      # keep only files with names in UTF-8
      unless file.force_encoding("UTF-8").valid_encoding?
        @log.warn { "Non-UTF8 file name \"#{file}\"" }
        next
      end

      # add files present in the given DB to the DB and remove these files
      # from further processing (save checksum calculation)
      known = otherDB_table[file]
      if known && known.size == file_stats.size && known.modification_time == file_stats.mtime.utc
        @indexed_content.add_content(otherDB_contents[known.checksum])
        @indexed_content.add_instance(known)
        next
      end

      # calculate a checksum
      unless (checksum = self.class.get_checksum(file))
        @log.warn { "Cheksum failure: " + file }
        next
      end

      unless @indexed_content.content_exists(checksum)
        @indexed_content.add_content(Content.new(checksum, file_stats.size, Time.now.utc))
      end

      instance = ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
                                     File.expand_path(file), file_stats.mtime.utc)
      @indexed_content.add_instance(instance)
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require './agent.pb'
2
+
3
+ class IndexerPatterns
4
+ attr_reader :positive_patterns, :negative_patterns
5
+
6
+ # @param indexer_patterns_str [String]
7
+ def initialize (indexer_patterns = nil)
8
+ @positive_patterns = Array.new
9
+ @negative_patterns = Array.new
10
+ # TODO add a test (including empty collections)
11
+ unless indexer_patterns.nil?
12
+ indexer_patterns.positive_patterns.each do |pattern|
13
+ add_pattern(pattern)
14
+ end
15
+ indexer_patterns.negative_patterns.each do |pattern|
16
+ add_pattern(pattern, false)
17
+ end
18
+ end
19
+ end
20
+
21
+ def serialize
22
+ # TODO add a test (including empty collections)
23
+ indexer_patterns = IndexerPatternsMessage.new
24
+ positive_patterns.each do |pattern|
25
+ indexer_patterns.positive_patterns << pattern
26
+ end
27
+ negative_patterns.each do |pattern|
28
+ indexer_patterns.negative_patterns << pattern
29
+ end
30
+ indexer_patterns
31
+ end
32
+
33
+ # @param pattern [String]
34
+ # @param is_positive [true]
35
+ # @param is_positive [false]
36
+ def add_pattern(pattern, is_positive = true)
37
+ pattern.gsub!(/\\/,'/')
38
+ if (is_positive)
39
+ @positive_patterns << pattern
40
+ else
41
+ @negative_patterns << pattern
42
+ end
43
+ end
44
+
45
+ def parse_from_file(file)
46
+ input_patterns = IO.readlines(file)
47
+ begin
48
+ puts "Error loading patterns=%s" % file
49
+ raise IOError("Error loading patterns=%s" % file)
50
+ end unless not input_patterns.nil?
51
+
52
+ input_patterns.each do |pattern|
53
+ if (m = /^\s*([+-]):(.*)/.match(pattern))
54
+ add_pattern(m[2], m[1].eql?('+') ? true : false)
55
+ elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
56
+ puts "pattern in incorrect format: #{pattern}"
57
+ raise RuntimeError("pattern in incorrect format: #{pattern}")
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,10 @@
1
+ require_relative 'file_indexing/index_agent'
2
+ require_relative 'file_indexing/indexer_patterns'
3
+
4
# Data structure for an abstract layer over files.
# Each binary sequence is a content, each file is a content instance.
module BBFS
  module FileIndexing
    # Gem version string; frozen so the shared constant cannot be mutated in place.
    VERSION = "0.0.1".freeze
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: file_indexing
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Gena Petelko, Kolman Vornovitsky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-04-08 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: Indexes files, treats files with same binary sequence as one content.
15
+ email: kolmanv@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/file_indexing.rb
21
+ - lib/file_indexing/index_agent.rb
22
+ - lib/file_indexing/indexer_patterns.rb
23
+ homepage: http://github.com/kolmanv/bbfs
24
+ licenses: []
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 1.8.15
44
+ signing_key:
45
+ specification_version: 3
46
+ summary: Indexes files.
47
+ test_files: []