file_indexing 0.0.2 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. data/lib/file_indexing/index_agent.rb +61 -38
  2. data/lib/file_indexing/indexer_patterns.rb +10 -3
  3. data/lib/file_indexing/version.rb +5 -0
  4. data/lib/file_indexing.rb +2 -3
  5. data/spec/file_indexing/index_agent_spec.rb +54 -0
  6. data/test/file_indexing/index_agent_test/New.txt +0 -0
  7. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/bin/libexslt.dll +0 -0
  8. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/bin/libxslt.dll +0 -0
  9. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/bin/xsltproc.exe +0 -0
  10. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libexslt/exslt.h +102 -0
  11. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libexslt/exsltconfig.h +73 -0
  12. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libexslt/exsltexports.h +140 -0
  13. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libexslt/libexslt.h +29 -0
  14. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/attributes.h +38 -0
  15. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/documents.h +93 -0
  16. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/extensions.h +262 -0
  17. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/extra.h +80 -0
  18. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/functions.h +78 -0
  19. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/imports.h +75 -0
  20. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/keys.h +53 -0
  21. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/libxslt.h +30 -0
  22. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/namespaces.h +68 -0
  23. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/numbersInternals.h +69 -0
  24. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/pattern.h +81 -0
  25. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/preproc.h +43 -0
  26. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/security.h +104 -0
  27. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/templates.h +77 -0
  28. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/transform.h +207 -0
  29. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/trio.h +216 -0
  30. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/triodef.h +220 -0
  31. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/variables.h +91 -0
  32. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/win32config.h +101 -0
  33. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xslt.h +103 -0
  34. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltInternals.h +1967 -0
  35. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltconfig.h +172 -0
  36. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltexports.h +142 -0
  37. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltlocale.h +57 -0
  38. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltutils.h +309 -0
  39. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/include/libxslt/xsltwin32config.h +105 -0
  40. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/lib/libexslt.lib +0 -0
  41. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/lib/libexslt_a.lib +0 -0
  42. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/lib/libxslt.lib +0 -0
  43. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/lib/libxslt_a.lib +0 -0
  44. data/test/file_indexing/index_agent_test/libxslt-1.1.26.win32/readme.txt +22 -0
  45. data/test/file_indexing/index_agent_test/patterns.input +3 -0
  46. data/test/file_indexing/index_agent_test.rb +53 -0
  47. metadata +129 -7
@@ -1,11 +1,11 @@
1
1
  require 'digest/sha1'
2
- require 'logger'
3
2
  require 'pp'
3
+ require 'set'
4
4
  require 'time'
5
5
 
6
6
  require 'content_data'
7
-
8
- require_relative 'indexer_patterns'
7
+ require 'file_indexing/indexer_patterns'
8
+ require 'log'
9
9
 
10
10
  module BBFS
11
11
  module FileIndexing
@@ -15,55 +15,58 @@ module BBFS
15
15
  ####################
16
16
 
17
17
  class IndexAgent
18
- attr_reader :indexed_content
18
+ attr_reader :indexed_content, :failed_files
19
19
 
20
+ # Why are those lines needed?
20
21
  LOCALTZ = Time.now.zone
21
22
  ENV['TZ'] = 'UTC'
22
23
 
23
24
  def initialize
24
- init_log()
25
- init_db()
26
- end
27
-
28
- def init_db()
29
25
  @indexed_content = ContentData::ContentData.new
30
- end
31
-
32
- def init_log()
33
- @log = Logger.new(STDERR)
34
- @log.level = Logger::WARN
35
- @log.datetime_format = "%Y-%m-%d %H:%M:%S"
36
- end
37
-
38
- def set_log(log_path, log_level)
39
- @log = Logger.new(log_path) if log_path
40
- @log.level = log_level
26
+ @failed_files = Set.new
41
27
  end
42
28
 
43
29
  # Calculate file checksum (SHA1)
44
30
  def self.get_checksum(filename)
45
31
  digest = Digest::SHA1.new
46
32
  begin
47
- file = File.new(filename)
48
- while buffer = file.read(65536)
49
- digest << buffer
50
- end
51
- #@log.info { digest.hexdigest.downcase + ' ' + filename }
33
+ File.open(filename, 'rb') { |f|
34
+ while buffer = f.read(65536) do
35
+ digest << buffer
36
+ end
37
+ }
38
+ Log.debug1("#{filename} sha1 #{digest.hexdigest.downcase}")
52
39
  digest.hexdigest.downcase
53
40
  rescue Errno::EACCES, Errno::ETXTBSY => exp
54
- @log.warn { "#{exp.message}" }
41
+ Log.warning("#{exp.message}")
55
42
  false
56
- ensure
57
- file.close if file != nil
58
43
  end
59
44
  end
60
45
 
46
+ def IndexAgent.get_content_checksum(content)
47
+ # Calculate checksum.
48
+ digest = Digest::SHA1.new
49
+ digest << content
50
+ digest.hexdigest.downcase
51
+ end
52
+
61
53
  # get all files
62
54
  # satisfying the pattern
63
55
  def collect(pattern)
64
56
  Dir.glob(pattern.to_s)
65
57
  end
66
58
 
59
+ # TODO(kolman): Replace this with File.lstat(file).mtime when new version of Ruby comes out.
60
+ # http://bugs.ruby-lang.org/issues/6385
61
+ def IndexAgent.get_correct_mtime(file)
62
+ begin
63
+ File.open(file, 'r') { |f| f.mtime }
64
+ rescue Errno::EACCES => e
65
+ Log.warning("Could not open file #{file} to get mtime. #{e}")
66
+ return 0
67
+ end
68
+ end
69
+
67
70
  # index device according to the pattern
68
71
  # store the result
69
72
  # does not automatically add otherDB to the stored result
@@ -72,7 +75,7 @@ module BBFS
72
75
  abort "#{self.class}: DB not empty. Current implementation permits only one running of index" \
73
76
  unless @indexed_content.contents.empty?
74
77
 
75
- server_name = `hostname`
78
+ server_name = `hostname`.strip
76
79
  permit_patterns = Array.new
77
80
  forbid_patterns = Array.new
78
81
  otherDB_table = Hash.new # contains instances from given DB while full path name is a key and instance is a value
@@ -97,7 +100,7 @@ module BBFS
97
100
  files = files | (collect(permit_patterns[i]));
98
101
  end
99
102
 
100
- p "Files: #{files}."
103
+ Log.info "Files: #{files}."
101
104
 
102
105
  # expand to absolute paths
103
106
  files.map! {|f| File.expand_path(f)}
@@ -113,13 +116,14 @@ module BBFS
113
116
  # create and add contents and instances
114
117
  files.each do |file|
115
118
  file_stats = File.lstat(file)
119
+ file_mtime = IndexAgent.get_correct_mtime(file)
116
120
 
117
121
  # index only files
118
- next if (file_stats.directory?)
122
+ next if file_stats.directory?
119
123
 
120
124
  # keep only files with names in UTF-8
121
125
  unless file.force_encoding("UTF-8").valid_encoding?
122
- @log.warn { "Non-UTF8 file name \"#{file}\"" }
126
+ Log.warning("Non UTF-8 file name \"#{file}\", skipping.")
123
127
  next
124
128
  end
125
129
 
@@ -127,27 +131,46 @@ module BBFS
127
131
  # from further processing (save checksum calculation)
128
132
  if otherDB_table.has_key?(file)
129
133
  instance = otherDB_table[file]
130
- if instance.size == file_stats.size and instance.modification_time == file_stats.mtime.utc
134
+ if instance.size == file_stats.size and instance.modification_time == file_mtime
131
135
  @indexed_content.add_content(otherDB_contents[instance.checksum])
132
136
  @indexed_content.add_instance(instance)
133
137
  next
138
+ else
139
+ Log.warning("File (#{file}) size or modification file is different.")
134
140
  end
135
141
  end
136
142
 
137
143
  # calculate a checksum
138
144
  unless (checksum = self.class.get_checksum(file))
139
- @log.warn { "Cheksum failure: " + file }
145
+ Log.warning("Cheksum failure: " + file)
146
+ @failed_files.add(file)
140
147
  next
141
148
  end
142
149
 
143
- @indexed_content.add_content(ContentData::Content.new(checksum, file_stats.size, Time.now.utc)) \
144
- unless @indexed_content.content_exists(checksum)
150
+ if !@indexed_content.content_exists(checksum)
151
+ @indexed_content.add_content ContentData::Content.new(checksum, file_stats.size,
152
+ Time.now.utc)
153
+ end
145
154
 
146
- instance = ContentData::ContentInstance.new(checksum, file_stats.size, server_name, file_stats.dev.to_s,
147
- File.expand_path(file), file_stats.mtime.utc)
155
+ instance = ContentData::ContentInstance.new(
156
+ checksum, file_stats.size, server_name, file_stats.dev.to_s,
157
+ File.expand_path(file), file_mtime)
148
158
  @indexed_content.add_instance(instance)
149
159
  end
150
160
  end
161
+
162
+ def IndexAgent.create_shallow_instance(filename)
163
+ return nil unless File.exists?(filename)
164
+ file_stats = File.lstat(filename)
165
+ file_mtime = IndexAgent.get_correct_mtime(filename)
166
+ ContentData::ContentInstance.new(nil, file_stats.size, nil, file_stats.dev.to_s,
167
+ File.expand_path(filename), file_mtime)
168
+ end
169
+
170
+ def IndexAgent.global_path(filename)
171
+ server_name = `hostname`.strip
172
+ return ContentData::ContentInstance.instance_global_path(server_name, filename)
173
+ end
151
174
  end
152
175
 
153
176
  end
@@ -1,3 +1,6 @@
1
+ require 'log'
2
+ require 'params'
3
+
1
4
  module BBFS
2
5
  module FileIndexing
3
6
 
@@ -6,7 +9,7 @@ module BBFS
6
9
 
7
10
  # @param indexer_patterns [String]
8
11
  def initialize (indexer_patterns = nil)
9
- p "Initialize index patterns #{indexer_patterns}."
12
+ Log.info "Initialize index patterns #{indexer_patterns}."
10
13
  @positive_patterns = Array.new
11
14
  @negative_patterns = Array.new
12
15
  # TODO add a test (including empty collections)
@@ -47,7 +50,7 @@ module BBFS
47
50
  def parse_from_file(file)
48
51
  input_patterns = IO.readlines(file)
49
52
  begin
50
- puts "Error loading patterns=%s" % file
53
+ Log.info "Error loading patterns=%s" % file
51
54
  raise IOError("Error loading patterns=%s" % file)
52
55
  end unless not input_patterns.nil?
53
56
 
@@ -55,11 +58,15 @@ module BBFS
55
58
  if (m = /^\s*([+-]):(.*)/.match(pattern))
56
59
  add_pattern(m[2], m[1].eql?('+') ? true : false)
57
60
  elsif (not /^\s*[\/\/|#]/.match(pattern)) # not a comment
58
- puts "pattern in incorrect format: #{pattern}"
61
+ Log.info "pattern in incorrect format: #{pattern}"
59
62
  raise RuntimeError("pattern in incorrect format: #{pattern}")
60
63
  end
61
64
  end
62
65
  end
66
+
67
+ def size
68
+ return @positive_patterns.size
69
+ end
63
70
  end
64
71
 
65
72
  end
@@ -0,0 +1,5 @@
1
+ module BBFS
2
+ module FileIndexing
3
+ VERSION = "0.0.8"
4
+ end
5
+ end
data/lib/file_indexing.rb CHANGED
@@ -1,12 +1,11 @@
1
1
  require 'content_data'
2
2
 
3
- require_relative 'file_indexing/index_agent'
4
- require_relative 'file_indexing/indexer_patterns'
3
+ require 'file_indexing/index_agent'
4
+ require 'file_indexing/indexer_patterns'
5
5
 
6
6
  # Data structure for an abstract layer over files.
7
7
  # Each binary sequence is a content, each file is content instance.
8
8
  module BBFS
9
9
  module FileIndexing
10
- VERSION = "0.0.1"
11
10
  end
12
11
  end
@@ -0,0 +1,54 @@
1
+ require 'rspec'
2
+ require 'tempfile'
3
+
4
+ require_relative '../../lib/file_indexing/index_agent.rb'
5
+
6
+ module BBFS
7
+ module FileCopy
8
+ module Spec
9
+
10
+ describe 'checksum' do
11
+ it 'should generate correct checksum' do
12
+ # The test does not check the problem; the problem is when reading from File
13
+ # class which handles read(num) differently from read()
14
+ content = ''
15
+ 100000.times { content << 'abagadavazahatikalamansapazkareshet' }
16
+ content_checksum = FileIndexing::IndexAgent.get_content_checksum(content)
17
+
18
+ stream = StringIO.new(content)
19
+ File.stub(:open).and_yield(stream)
20
+ file_checksum = FileIndexing::IndexAgent.get_checksum('kuku')
21
+
22
+ content_checksum.should == file_checksum
23
+ content_checksum.should == '381e99eb0e2dfcaf45c9a367a04a4197ef3039a6'
24
+ end
25
+
26
+ it 'should generate correct checksum for temp file' do
27
+ # A hack to get tmp file name
28
+ tmp_file = Tempfile.new('foo')
29
+ path = tmp_file .path
30
+ tmp_file .close()
31
+
32
+ # Open file in binary mode.
33
+ file = File.open(path, 'wb')
34
+ 100000.times { file.write('abagadavazahatikalamansapazkareshet') }
35
+ file.close()
36
+
37
+ file_checksum = FileIndexing::IndexAgent.get_checksum(path)
38
+ file_checksum.should == '381e99eb0e2dfcaf45c9a367a04a4197ef3039a6'
39
+
40
+ File.open(path, 'rb') { |f|
41
+ content = f.read()
42
+ content_checksum = FileIndexing::IndexAgent.get_content_checksum(content)
43
+ content_checksum.should == '381e99eb0e2dfcaf45c9a367a04a4197ef3039a6'
44
+ file_checksum.should == content_checksum
45
+ }
46
+
47
+ # Delete tmp file.
48
+ tmp_file.unlink
49
+ end
50
+
51
+ end
52
+ end
53
+ end
54
+ end
File without changes
@@ -0,0 +1,102 @@
1
+
2
+ #ifndef __EXSLT_H__
3
+ #define __EXSLT_H__
4
+
5
+ #include <libxml/tree.h>
6
+ #include <libxml/xpath.h>
7
+ #include "exsltexports.h"
8
+ #include <libexslt/exsltconfig.h>
9
+
10
+ #ifdef __cplusplus
11
+ extern "C" {
12
+ #endif
13
+
14
+ EXSLTPUBVAR const char *exsltLibraryVersion;
15
+ EXSLTPUBVAR const int exsltLibexsltVersion;
16
+ EXSLTPUBVAR const int exsltLibxsltVersion;
17
+ EXSLTPUBVAR const int exsltLibxmlVersion;
18
+
19
+ /**
20
+ * EXSLT_COMMON_NAMESPACE:
21
+ *
22
+ * Namespace for EXSLT common functions
23
+ */
24
+ #define EXSLT_COMMON_NAMESPACE ((const xmlChar *) "http://exslt.org/common")
25
+ /**
26
+ * EXSLT_CRYPTO_NAMESPACE:
27
+ *
28
+ * Namespace for EXSLT crypto functions
29
+ */
30
+ #define EXSLT_CRYPTO_NAMESPACE ((const xmlChar *) "http://exslt.org/crypto")
31
+ /**
32
+ * EXSLT_MATH_NAMESPACE:
33
+ *
34
+ * Namespace for EXSLT math functions
35
+ */
36
+ #define EXSLT_MATH_NAMESPACE ((const xmlChar *) "http://exslt.org/math")
37
+ /**
38
+ * EXSLT_SETS_NAMESPACE:
39
+ *
40
+ * Namespace for EXSLT set functions
41
+ */
42
+ #define EXSLT_SETS_NAMESPACE ((const xmlChar *) "http://exslt.org/sets")
43
+ /**
44
+ * EXSLT_FUNCTIONS_NAMESPACE:
45
+ *
46
+ * Namespace for EXSLT functions extension functions
47
+ */
48
+ #define EXSLT_FUNCTIONS_NAMESPACE ((const xmlChar *) "http://exslt.org/functions")
49
+ /**
50
+ * EXSLT_STRINGS_NAMESPACE:
51
+ *
52
+ * Namespace for EXSLT strings functions
53
+ */
54
+ #define EXSLT_STRINGS_NAMESPACE ((const xmlChar *) "http://exslt.org/strings")
55
+ /**
56
+ * EXSLT_DATE_NAMESPACE:
57
+ *
58
+ * Namespace for EXSLT date functions
59
+ */
60
+ #define EXSLT_DATE_NAMESPACE ((const xmlChar *) "http://exslt.org/dates-and-times")
61
+ /**
62
+ * EXSLT_DYNAMIC_NAMESPACE:
63
+ *
64
+ * Namespace for EXSLT dynamic functions
65
+ */
66
+ #define EXSLT_DYNAMIC_NAMESPACE ((const xmlChar *) "http://exslt.org/dynamic")
67
+
68
+ /**
69
+ * SAXON_NAMESPACE:
70
+ *
71
+ * Namespace for SAXON extensions functions
72
+ */
73
+ #define SAXON_NAMESPACE ((const xmlChar *) "http://icl.com/saxon")
74
+
75
+ EXSLTPUBFUN void EXSLTCALL exsltCommonRegister (void);
76
+ #ifdef EXSLT_CRYPTO_ENABLED
77
+ EXSLTPUBFUN void EXSLTCALL exsltCryptoRegister (void);
78
+ #endif
79
+ EXSLTPUBFUN void EXSLTCALL exsltMathRegister (void);
80
+ EXSLTPUBFUN void EXSLTCALL exsltSetsRegister (void);
81
+ EXSLTPUBFUN void EXSLTCALL exsltFuncRegister (void);
82
+ EXSLTPUBFUN void EXSLTCALL exsltStrRegister (void);
83
+ EXSLTPUBFUN void EXSLTCALL exsltDateRegister (void);
84
+ EXSLTPUBFUN void EXSLTCALL exsltSaxonRegister (void);
85
+ EXSLTPUBFUN void EXSLTCALL exsltDynRegister(void);
86
+
87
+ EXSLTPUBFUN void EXSLTCALL exsltRegisterAll (void);
88
+
89
+ EXSLTPUBFUN int EXSLTCALL exsltDateXpathCtxtRegister (xmlXPathContextPtr ctxt,
90
+ const xmlChar *prefix);
91
+ EXSLTPUBFUN int EXSLTCALL exsltMathXpathCtxtRegister (xmlXPathContextPtr ctxt,
92
+ const xmlChar *prefix);
93
+ EXSLTPUBFUN int EXSLTCALL exsltSetsXpathCtxtRegister (xmlXPathContextPtr ctxt,
94
+ const xmlChar *prefix);
95
+ EXSLTPUBFUN int EXSLTCALL exsltStrXpathCtxtRegister (xmlXPathContextPtr ctxt,
96
+ const xmlChar *prefix);
97
+
98
+ #ifdef __cplusplus
99
+ }
100
+ #endif
101
+ #endif /* __EXSLT_H__ */
102
+
@@ -0,0 +1,73 @@
1
+ /*
2
+ * exsltconfig.h: compile-time version informations for the EXSLT library
3
+ *
4
+ * See Copyright for the status of this software.
5
+ *
6
+ * daniel@veillard.com
7
+ */
8
+
9
+ #ifndef __XML_EXSLTCONFIG_H__
10
+ #define __XML_EXSLTCONFIG_H__
11
+
12
+ #ifdef __cplusplus
13
+ extern "C" {
14
+ #endif
15
+
16
+ /**
17
+ * LIBEXSLT_DOTTED_VERSION:
18
+ *
19
+ * the version string like "1.2.3"
20
+ */
21
+ #define LIBEXSLT_DOTTED_VERSION "0.8.15"
22
+
23
+ /**
24
+ * LIBEXSLT_VERSION:
25
+ *
26
+ * the version number: 1.2.3 value is 10203
27
+ */
28
+ #define LIBEXSLT_VERSION 815
29
+
30
+ /**
31
+ * LIBEXSLT_VERSION_STRING:
32
+ *
33
+ * the version number string, 1.2.3 value is "10203"
34
+ */
35
+ #define LIBEXSLT_VERSION_STRING "815"
36
+
37
+ /**
38
+ * LIBEXSLT_VERSION_EXTRA:
39
+ *
40
+ * extra version information, used to show a CVS compilation
41
+ */
42
+ #define LIBEXSLT_VERSION_EXTRA ""
43
+
44
+ /**
45
+ * WITH_CRYPTO:
46
+ *
47
+ * Whether crypto support is configured into exslt
48
+ */
49
+ #if 1
50
+ #define EXSLT_CRYPTO_ENABLED
51
+ #endif
52
+
53
+ /**
54
+ * ATTRIBUTE_UNUSED:
55
+ *
56
+ * This macro is used to flag unused function parameters to GCC
57
+ */
58
+ #ifdef __GNUC__
59
+ #ifdef HAVE_ANSIDECL_H
60
+ #include <ansidecl.h>
61
+ #endif
62
+ #ifndef ATTRIBUTE_UNUSED
63
+ #define ATTRIBUTE_UNUSED __attribute__((unused))
64
+ #endif
65
+ #else
66
+ #define ATTRIBUTE_UNUSED
67
+ #endif
68
+
69
+ #ifdef __cplusplus
70
+ }
71
+ #endif
72
+
73
+ #endif /* __XML_EXSLTCONFIG_H__ */