file_indexing 0.0.2 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/lib/file_indexing/index_agent.rb +61 -38
  2. data/lib/file_indexing/indexer_patterns.rb +10 -3
  3. data/lib/file_indexing/version.rb +5 -0
  4. data/lib/file_indexing.rb +2 -3
  5. data/spec/file_indexing/index_agent_spec.rb +54 -0
  6. data/test/file_indexing/index_agent_test/New.txt +0 -0
  7. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/bin/libexslt.dll +0 -0
  8. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/bin/libxslt.dll +0 -0
  9. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/bin/xsltproc.exe +0 -0
  10. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libexslt/exslt.h +102 -0
  11. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libexslt/exsltconfig.h +73 -0
  12. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libexslt/exsltexports.h +140 -0
  13. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libexslt/libexslt.h +29 -0
  14. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/attributes.h +38 -0
  15. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/documents.h +93 -0
  16. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/extensions.h +262 -0
  17. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/extra.h +80 -0
  18. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/functions.h +78 -0
  19. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/imports.h +75 -0
  20. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/keys.h +53 -0
  21. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/libxslt.h +30 -0
  22. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/namespaces.h +68 -0
  23. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/numbersInternals.h +69 -0
  24. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/pattern.h +81 -0
  25. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/preproc.h +43 -0
  26. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/security.h +104 -0
  27. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/templates.h +77 -0
  28. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/transform.h +207 -0
  29. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/trio.h +216 -0
  30. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/triodef.h +220 -0
  31. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/variables.h +91 -0
  32. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/win32config.h +101 -0
  33. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xslt.h +103 -0
  34. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltInternals.h +1967 -0
  35. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltconfig.h +172 -0
  36. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltexports.h +142 -0
  37. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltlocale.h +57 -0
  38. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltutils.h +309 -0
  39. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltwin32config.h +105 -0
  40. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/lib/libexslt.lib +0 -0
  41. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/lib/libexslt_a.lib +0 -0
  42. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/lib/libxslt.lib +0 -0
  43. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/lib/libxslt_a.lib +0 -0
  44. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/readme.txt +22 -0
  45. data/test/file_indexing/index_agent_test/patterns.input +3 -0
  46. data/test/file_indexing/index_agent_test.rb +53 -0
  47. metadata +129 -7
@@ -1,11 +1,11 @@
1
1
  require 'digest/sha1'
2
- require 'logger'
3
2
  require 'pp'
3
+ require 'set'
4
4
  require 'time'
5
5
 
6
6
  require 'content_data'
7
-
8
- require_relative 'indexer_patterns'
7
+ require 'file_indexing/indexer_patterns'
8
+ require 'log'
9
9
 
10
10
  module BBFS
11
11
  module FileIndexing
@@ -15,55 +15,58 @@ module BBFS
15
15
  ####################
16
16
 
17
17
  class IndexAgent
18
- attr_reader :indexed_content
18
+ attr_reader :indexed_content, :failed_files
19
19
 
20
+ # Why are those lines needed?
20
21
  LOCALTZ = Time.now.zone
21
22
  ENV['TZ'] = 'UTC'
22
23
 
23
24
  def initialize
24
- init_log()
25
- init_db()
26
- end
27
-
28
- def init_db()
29
25
  @indexed_content = ContentData::ContentData.new
30
- end
31
-
32
- def init_log()
33
- @log = Logger.new(STDERR)
34
- @log.level = Logger::WARN
35
- @log.datetime_format = "%Y-%m-%d %H:%M:%S"
36
- end
37
-
38
- def set_log(log_path, log_level)
39
- @log = Logger.new(log_path) if log_path
40
- @log.level = log_level
26
+ @failed_files = Set.new
41
27
  end
42
28
 
43
29
  # Calculate file checksum (SHA1)
44
30
  def self.get_checksum(filename)
45
31
  digest = Digest::SHA1.new
46
32
  begin
47
- file = File.new(filename)
48
- while buffer = file.read(65536)
49
- digest << buffer
50
- end
51
- #@log.info { digest.hexdigest.downcase + ' ' + filename }
33
+ File.open(filename, 'rb') { |f|
34
+ while buffer = f.read(65536) do
35
+ digest << buffer
36
+ end
37
+ }
38
+ Log.debug1("#{filename} sha1 #{digest.hexdigest.downcase}")
52
39
  digest.hexdigest.downcase
53
40
  rescue Errno::EACCES, Errno::ETXTBSY => exp
54
- @log.warn { "#{exp.message}" }
41
+ Log.warning("#{exp.message}")
55
42
  false
56
- ensure
57
- file.close if file != nil
58
43
  end
59
44
  end
60
45
 
46
+ def IndexAgent.get_content_checksum(content)
47
+ # Calculate checksum.
48
+ digest = Digest::SHA1.new
49
+ digest << content
50
+ digest.hexdigest.downcase
51
+ end
52
+
61
53
  # get all files
62
54
  # satisfying the pattern
63
55
  def collect(pattern)
64
56
  Dir.glob(pattern.to_s)
65
57
  end
66
58
 
59
+ # TODO(kolman): Replace this with File.lstat(file).mtime when new version of Ruby comes out.
60
+ # http://bugs.ruby-lang.org/issues/6385
61
+ def IndexAgent.get_correct_mtime(file)
62
+ begin
63
+ File.open(file, 'r') { |f| f.mtime }
64
+ rescue Errno::EACCES => e
65
+ Log.warning("Could not open file #{file} to get mtime. #{e}")
66
+ return 0
67
+ end
68
+ end
69
+
67
70
  # index device according to the pattern
68
71
  # store the result
69
72
  # does not automatically add otherDB to the stored result
@@ -72,7 +75,7 @@ module BBFS
72
75
  abort "#{self.class}: DB not empty. Current implementation permits only one running of index" \
73
76
  unless @indexed_content.contents.empty?
74
77
 
75
- server_name = `hostname`
78
+ server_name = `hostname`.strip
76
79
  permit_patterns = Array.new
77
80
  forbid_patterns = Array.new
78
81
  otherDB_table = Hash.new # contains instances from given DB while full path name is a key and instance is a value
@@ -97,7 +100,7 @@ module BBFS
97
100
  files = files | (collect(permit_patterns[i]));
98
101
  end
99
102
 
100
- p "Files: #{files}."
103
+ Log.info "Files: #{files}."
101
104
 
102
105
  # expand to absolute paths
103
106
  files.map! {|f| File.expand_path(f)}
@@ -113,13 +116,14 @@ module BBFS
113
116
  # create and add contents and instances
114
117
  files.each do |file|
115
118
  file_stats = File.lstat(file)
119
+ file_mtime = IndexAgent.get_correct_mtime(file)
116
120
 
117
121
  # index only files
118
- next if (file_stats.directory?)
122
+ next if file_stats.directory?
119
123
 
120
124
  # keep only files with names in UTF-8
121
125
  unless file.force_encoding("UTF-8").valid_encoding?
122
- @log.warn { "Non-UTF8 file name \"#{file}\"" }
126
+ Log.warning("Non UTF-8 file name \"#{file}\", skipping.")
123
127
  next
124
128
  end
125
129
 
@@ -127,27 +131,46 @@ module BBFS
127
131
  # from further processing (save checksum calculation)
128
132
  if otherDB_table.has_key?(file)
129
133
  instance = otherDB_table[file]
130
- if instance.size == file_stats.size and instance.modification_time == file_stats.mtime.utc
134
+ if instance.size == file_stats.size and instance.modification_time == file_mtime
131
135
  @indexed_content.add_content(otherDB_contents[instance.checksum])
132
136
  @indexed_content.add_instance(instance)
133
137
  next
138
+ else
139
+ Log.warning("File (#{file}) size or modification file is different.")
134
140
  end
135
141
  end
136
142
 
137
143
  # calculate a checksum
138
144
  unless (checksum = self.class.get_checksum(file))
139
- @log.warn { "Cheksum failure: " + file }
145
+ Log.warning("Cheksum failure: " + file)
146
+ @failed_files.add(file)
140
147
  next
141
148
  end
142
149
 
143
- @indexed_content.add_content(ContentData::Content.new(checksum, file_stats.size, Time.now.utc)) \
144
- unless @indexed_content.content_exists(checksum)
150
+ if !@indexed_content.content_exists(checksum)
151
+ @indexed_content.add_content ContentData::Content.new(checksum, file_stats.size,
152
+ Time.now.utc)
153
+ end
145
154
 
146
- instance = ContentData::ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
147
- File.expand_path(file), file_stats.mtime.utc)
155
+ instance = ContentData::ContentInstance.new(
156
+ checksum, file_stats.size, server_name, file_stats.dev.to_s,
157
+ File.expand_path(file), file_mtime)
148
158
  @indexed_content.add_instance(instance)
149
159
  end
150
160
  end
161
+
162
+ def IndexAgent.create_shallow_instance(filename)
163
+ return nil unless File.exists?(filename)
164
+ file_stats = File.lstat(filename)
165
+ file_mtime = IndexAgent.get_correct_mtime(filename)
166
+ ContentData::ContentInstance.new(nil, file_stats.size, nil, file_stats.dev.to_s,
167
+ File.expand_path(filename), file_mtime)
168
+ end
169
+
170
+ def IndexAgent.global_path(filename)
171
+ server_name = `hostname`.strip
172
+ return ContentData::ContentInstance.instance_global_path(server_name, filename)
173
+ end
151
174
  end
152
175
 
153
176
  end
@@ -1,3 +1,6 @@
1
+ require 'log'
2
+ require 'params'
3
+
1
4
  module BBFS
2
5
  module FileIndexing
3
6
 
@@ -6,7 +9,7 @@ module BBFS
6
9
 
7
10
  # @param indexer_patterns_str [String]
8
11
  def initialize (indexer_patterns = nil)
9
- p "Initialize index patterns #{indexer_patterns}."
12
+ Log.info "Initialize index patterns #{indexer_patterns}."
10
13
  @positive_patterns = Array.new
11
14
  @negative_patterns = Array.new
12
15
  # TODO add a test (including empty collections)
@@ -47,7 +50,7 @@ module BBFS
47
50
  def parse_from_file(file)
48
51
  input_patterns = IO.readlines(file)
49
52
  begin
50
- puts "Error loading patterns=%s" % file
53
+ Log.info "Error loading patterns=%s" % file
51
54
  raise IOError("Error loading patterns=%s" % file)
52
55
  end unless not input_patterns.nil?
53
56
 
@@ -55,11 +58,15 @@ module BBFS
55
58
  if (m = /^\s*([+-]):(.*)/.match(pattern))
56
59
  add_pattern(m[2], m[1].eql?('+') ? true : false)
57
60
  elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
58
- puts "pattern in incorrect format: #{pattern}"
61
+ Log.info "pattern in incorrect format: #{pattern}"
59
62
  raise RuntimeError("pattern in incorrect format: #{pattern}")
60
63
  end
61
64
  end
62
65
  end
66
+
67
+ def size
68
+ return @positive_patterns.size
69
+ end
63
70
  end
64
71
 
65
72
  end
@@ -0,0 +1,5 @@
1
+ module BBFS
2
+ module FileIndexing
3
+ VERSION = "0.0.8"
4
+ end
5
+ end
data/lib/file_indexing.rb CHANGED
@@ -1,12 +1,11 @@
1
1
  require 'content_data'
2
2
 
3
- require_relative 'file_indexing/index_agent'
4
- require_relative 'file_indexing/indexer_patterns'
3
+ require 'file_indexing/index_agent'
4
+ require 'file_indexing/indexer_patterns'
5
5
 
6
6
  # Data structure for an abstract layer over files.
7
7
  # Each binary sequence is a content, each file is content instance.
8
8
  module BBFS
9
9
  module FileIndexing
10
- VERSION = "0.0.1"
11
10
  end
12
11
  end
@@ -0,0 +1,54 @@
1
+ require 'rspec'
2
+ require 'tempfile'
3
+
4
+ require_relative '../../lib/file_indexing/index_agent.rb'
5
+
6
+ module BBFS
7
+ module FileCopy
8
+ module Spec
9
+
10
+ describe 'checksum' do
11
+ it 'should generate correct checksum' do
12
+ # The test does not checks the problem the problem is when reading from File
13
+ # class which handles read(num) different from read()
14
+ content = ''
15
+ 100000.times { content << 'abagadavazahatikalamansapazkareshet' }
16
+ content_checksum = FileIndexing::IndexAgent.get_content_checksum(content)
17
+
18
+ stream = StringIO.new(content)
19
+ File.stub(:open).and_yield(stream)
20
+ file_checksum = FileIndexing::IndexAgent.get_checksum('kuku')
21
+
22
+ content_checksum.should == file_checksum
23
+ content_checksum.should == '381e99eb0e2dfcaf45c9a367a04a4197ef3039a6'
24
+ end
25
+
26
+ it 'should generate correct checksum for temp file' do
27
+ # A hack to get tmp file name
28
+ tmp_file = Tempfile.new('foo')
29
+ path = tmp_file .path
30
+ tmp_file .close()
31
+
32
+ # Open file in binary mode.
33
+ file = File.open(path, 'wb')
34
+ 100000.times { file.write('abagadavazahatikalamansapazkareshet') }
35
+ file.close()
36
+
37
+ file_checksum = FileIndexing::IndexAgent.get_checksum(path)
38
+ file_checksum.should == '381e99eb0e2dfcaf45c9a367a04a4197ef3039a6'
39
+
40
+ File.open(path, 'rb') { |f|
41
+ content = f.read()
42
+ content_checksum = FileIndexing::IndexAgent.get_content_checksum(content)
43
+ content_checksum.should == '381e99eb0e2dfcaf45c9a367a04a4197ef3039a6'
44
+ file_checksum.should == content_checksum
45
+ }
46
+
47
+ # Delete tmp file.
48
+ tmp_file.unlink
49
+ end
50
+
51
+ end
52
+ end
53
+ end
54
+ end
File without changes
@@ -0,0 +1,102 @@
1
+
2
+ #ifndef __EXSLT_H__
3
+ #define __EXSLT_H__
4
+
5
+ #include <libxml/tree.h>
6
+ #include <libxml/xpath.h>
7
+ #include "exsltexports.h"
8
+ #include <libexslt/exsltconfig.h>
9
+
10
+ #ifdef __cplusplus
11
+ extern "C" {
12
+ #endif
13
+
14
+ EXSLTPUBVAR const char *exsltLibraryVersion;
15
+ EXSLTPUBVAR const int exsltLibexsltVersion;
16
+ EXSLTPUBVAR const int exsltLibxsltVersion;
17
+ EXSLTPUBVAR const int exsltLibxmlVersion;
18
+
19
+ /**
20
+ * EXSLT_COMMON_NAMESPACE:
21
+ *
22
+ * Namespace for EXSLT common functions
23
+ */
24
+ #define EXSLT_COMMON_NAMESPACE ((const xmlChar *) "http://exslt.org/common")
25
+ /**
26
+ * EXSLT_CRYPTO_NAMESPACE:
27
+ *
28
+ * Namespace for EXSLT crypto functions
29
+ */
30
+ #define EXSLT_CRYPTO_NAMESPACE ((const xmlChar *) "http://exslt.org/crypto")
31
+ /**
32
+ * EXSLT_MATH_NAMESPACE:
33
+ *
34
+ * Namespace for EXSLT math functions
35
+ */
36
+ #define EXSLT_MATH_NAMESPACE ((const xmlChar *) "http://exslt.org/math")
37
+ /**
38
+ * EXSLT_SETS_NAMESPACE:
39
+ *
40
+ * Namespace for EXSLT set functions
41
+ */
42
+ #define EXSLT_SETS_NAMESPACE ((const xmlChar *) "http://exslt.org/sets")
43
+ /**
44
+ * EXSLT_FUNCTIONS_NAMESPACE:
45
+ *
46
+ * Namespace for EXSLT functions extension functions
47
+ */
48
+ #define EXSLT_FUNCTIONS_NAMESPACE ((const xmlChar *) "http://exslt.org/functions")
49
+ /**
50
+ * EXSLT_STRINGS_NAMESPACE:
51
+ *
52
+ * Namespace for EXSLT strings functions
53
+ */
54
+ #define EXSLT_STRINGS_NAMESPACE ((const xmlChar *) "http://exslt.org/strings")
55
+ /**
56
+ * EXSLT_DATE_NAMESPACE:
57
+ *
58
+ * Namespace for EXSLT date functions
59
+ */
60
+ #define EXSLT_DATE_NAMESPACE ((const xmlChar *) "http://exslt.org/dates-and-times")
61
+ /**
62
+ * EXSLT_DYNAMIC_NAMESPACE:
63
+ *
64
+ * Namespace for EXSLT dynamic functions
65
+ */
66
+ #define EXSLT_DYNAMIC_NAMESPACE ((const xmlChar *) "http://exslt.org/dynamic")
67
+
68
+ /**
69
+ * SAXON_NAMESPACE:
70
+ *
71
+ * Namespace for SAXON extensions functions
72
+ */
73
+ #define SAXON_NAMESPACE ((const xmlChar *) "http://icl.com/saxon")
74
+
75
+ EXSLTPUBFUN void EXSLTCALL exsltCommonRegister (void);
76
+ #ifdef EXSLT_CRYPTO_ENABLED
77
+ EXSLTPUBFUN void EXSLTCALL exsltCryptoRegister (void);
78
+ #endif
79
+ EXSLTPUBFUN void EXSLTCALL exsltMathRegister (void);
80
+ EXSLTPUBFUN void EXSLTCALL exsltSetsRegister (void);
81
+ EXSLTPUBFUN void EXSLTCALL exsltFuncRegister (void);
82
+ EXSLTPUBFUN void EXSLTCALL exsltStrRegister (void);
83
+ EXSLTPUBFUN void EXSLTCALL exsltDateRegister (void);
84
+ EXSLTPUBFUN void EXSLTCALL exsltSaxonRegister (void);
85
+ EXSLTPUBFUN void EXSLTCALL exsltDynRegister(void);
86
+
87
+ EXSLTPUBFUN void EXSLTCALL exsltRegisterAll (void);
88
+
89
+ EXSLTPUBFUN int EXSLTCALL exsltDateXpathCtxtRegister (xmlXPathContextPtr ctxt,
90
+ const xmlChar *prefix);
91
+ EXSLTPUBFUN int EXSLTCALL exsltMathXpathCtxtRegister (xmlXPathContextPtr ctxt,
92
+ const xmlChar *prefix);
93
+ EXSLTPUBFUN int EXSLTCALL exsltSetsXpathCtxtRegister (xmlXPathContextPtr ctxt,
94
+ const xmlChar *prefix);
95
+ EXSLTPUBFUN int EXSLTCALL exsltStrXpathCtxtRegister (xmlXPathContextPtr ctxt,
96
+ const xmlChar *prefix);
97
+
98
+ #ifdef __cplusplus
99
+ }
100
+ #endif
101
+ #endif /* __EXSLT_H__ */
102
+
@@ -0,0 +1,73 @@
1
+ /*
2
+ * exsltconfig.h: compile-time version informations for the EXSLT library
3
+ *
4
+ * See Copyright for the status of this software.
5
+ *
6
+ * daniel@veillard.com
7
+ */
8
+
9
+ #ifndef __XML_EXSLTCONFIG_H__
10
+ #define __XML_EXSLTCONFIG_H__
11
+
12
+ #ifdef __cplusplus
13
+ extern "C" {
14
+ #endif
15
+
16
+ /**
17
+ * LIBEXSLT_DOTTED_VERSION:
18
+ *
19
+ * the version string like "1.2.3"
20
+ */
21
+ #define LIBEXSLT_DOTTED_VERSION "0.8.15"
22
+
23
+ /**
24
+ * LIBEXSLT_VERSION:
25
+ *
26
+ * the version number: 1.2.3 value is 10203
27
+ */
28
+ #define LIBEXSLT_VERSION 815
29
+
30
+ /**
31
+ * LIBEXSLT_VERSION_STRING:
32
+ *
33
+ * the version number string, 1.2.3 value is "10203"
34
+ */
35
+ #define LIBEXSLT_VERSION_STRING "815"
36
+
37
+ /**
38
+ * LIBEXSLT_VERSION_EXTRA:
39
+ *
40
+ * extra version information, used to show a CVS compilation
41
+ */
42
+ #define LIBEXSLT_VERSION_EXTRA ""
43
+
44
+ /**
45
+ * WITH_CRYPTO:
46
+ *
47
+ * Whether crypto support is configured into exslt
48
+ */
49
+ #if 1
50
+ #define EXSLT_CRYPTO_ENABLED
51
+ #endif
52
+
53
+ /**
54
+ * ATTRIBUTE_UNUSED:
55
+ *
56
+ * This macro is used to flag unused function parameters to GCC
57
+ */
58
+ #ifdef __GNUC__
59
+ #ifdef HAVE_ANSIDECL_H
60
+ #include <ansidecl.h>
61
+ #endif
62
+ #ifndef ATTRIBUTE_UNUSED
63
+ #define ATTRIBUTE_UNUSED __attribute__((unused))
64
+ #endif
65
+ #else
66
+ #define ATTRIBUTE_UNUSED
67
+ #endif
68
+
69
+ #ifdef __cplusplus
70
+ }
71
+ #endif
72
+
73
+ #endif /* __XML_EXSLTCONFIG_H__ */