findler 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .idea/
3
+ .yardoc/
4
+ .bundle/
5
+ Gemfile.lock
6
+ pkg/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Fetch gems over HTTPS rather than plain HTTP.
source "https://rubygems.org"

# Runtime dependencies are taken from the gemspec.
gemspec

# Tooling used by the Rakefile: rake itself, yard for docs, rspec for specs.
gem "rake"
gem "yard"
gem "rspec", '~> 2.7.0'
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2012 Matthew McEachen
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -0,0 +1,50 @@
1
+ # Findler: Filesystem Iteration with Persistable State
2
+
3
+ Findler is a Ruby library for iterating over a filtered set of files from a given
4
+ path, written to be suitable for use with concurrent workers and very large
5
+ filesystem hierarchies.
6
+
7
+ ## Usage
8
+
9
+ ```ruby
10
+ f = Findler.new "/Users/mrm"
11
+ f.append_extension ".jpg", ".jpeg"
12
+ iterator = f.iterator
13
+ iterator.next
14
+ # => "/Users/mrm/Photos/img_1000.jpg"
15
+ ```
16
+
17
+ ## Cross-process continuations
18
+
19
+ This should smell an awful lot like [hike](https://github.com/sstephenson/hike),
20
+ except for that last bit.
21
+
22
+ ```Findler::Iterator``` instances can be "paused" and "resumed" with ```Marshal```.
23
+ The entire state of the iteration for the filesystem is returned, which can then
24
+ be pushed onto any durable storage, like ActiveRecord or Redis, or just a local file:
25
+
26
+ ```ruby
27
+ File.open('iterator.state', 'wb') { |f| Marshal.dump(iterator, f) }
28
+ ```
29
+
30
+ To resume iteration:
31
+
32
+ ```ruby
33
+ iterator = File.open('iterator.state', 'rb') { |f| Marshal.load(f) }
34
+ iterator.next
35
+ # => "/Users/mrm/Photos/img_1001.jpg"
36
+ ```
37
+
38
+ To re-check a directory hierarchy for files that you haven't visited yet:
39
+
40
+ ```ruby
41
+ iterator.rescan!
42
+ iterator.next
43
+ # => "/Users/mrm/Photos/img_1002.jpg"
44
+ ```
45
+
46
+
47
+ ## Changelog
48
+
49
+ * 0.0.1 First `find`
50
+ * 0.0.2 Added scalable Bloom filter so ```Iterator#rescan!``` is possible
@@ -0,0 +1,10 @@
1
# Gem tasks provided by Bundler.
require "bundler/gem_tasks"

# Documentation task: runs YARD over the library sources and the README.
require "yard"
YARD::Rake::YardocTask.new { |yardoc| yardoc.files = ['lib/**/*.rb', 'README.md'] }

# Spec task, registered under the name :spec.
require "rspec/core/rake_task"
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` runs the specs.
task :default => :spec
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "findler"
4
+
5
# Gem specification for findler. The version comes from Findler::VERSION and
# the packaged file lists come from `git ls-files`, so this must be evaluated
# inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "findler"
  s.version     = Findler::VERSION
  s.authors     = ["Matthew McEachen"]
  s.email       = ["matthew+github@mceachen.org"]
  s.homepage    = "https://github.com/mceachen/findler/"
  s.summary     = %q{Findler is a stateful filesystem iterator}
  s.description = %q{Findler is designed for very large filesystem hierarchies,
where simple block processing, or returning an array of matches, just isn't feasible.
Usage instructions are available in the README.}
  # The license text ships with the gem as MIT-LICENSE; declare it so the
  # packaged metadata no longer reports an empty license list.
  s.licenses    = ["MIT"]

  s.rubyforge_project = "findler"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Bloom filter implementation used by Findler::Iterator.
  s.add_dependency "bloomer"
end
@@ -0,0 +1,50 @@
1
# Collects a root path, a set of File.fnmatch patterns and option flags, and
# builds a Findler::Iterator from them.
class Findler

  VERSION = "0.0.2"

  # Bit flags accumulated in @flags and handed to the iterator.
  IGNORE_CASE = 1
  INCLUDE_HIDDEN = 2

  autoload :Iterator, "findler/iterator"

  # path is passed through untouched to the iterator as its :path attribute.
  def initialize(path)
    @path = path
    @flags = 0
  end

  # These are File.fnmatch patterns. If any pattern matches, it will be returned by Iterator#next.
  # (see File.fnmatch?)
  #
  # @patterns is left nil until at least one pattern is added: the iterator
  # treats nil as "no filtering", whereas an empty array would reject every file.
  # Returns the array of patterns that was passed in.
  def add_pattern(*patterns)
    (@patterns ||= []).concat(patterns) unless patterns.empty?
    patterns
  end

  # Adds a "*<ext>" pattern for each extension. A leading "." is optional and
  # extensions may be given as Strings or Symbols.
  # Note: a nil or empty extension produces the pattern "*".
  # Returns the array of extensions that was passed in.
  def append_extension(*extensions)
    extensions.each { |ea| add_pattern "*#{normalize_extension(ea)}" }
  end

  # Should patterns be interpreted in a case-insensitive manner? (default is case sensitive)
  def case_insensitive!
    @flags |= IGNORE_CASE
  end

  # Should we traverse hidden directories and files? (default is to skip files that start
  # with a '.')
  def include_hidden!
    @flags |= INCLUDE_HIDDEN
  end

  # Builds a new iterator from the current path, patterns and flags.
  # The patterns are copied so that calling add_pattern on this Findler later
  # does not alter the filter of an iterator that was already handed out.
  def iterator
    Iterator.new(:path => @path, :patterns => @patterns && @patterns.dup, :flags => @flags)
  end

  private

  # Returns the extension with a leading ".". nil and "" are returned unchanged.
  def normalize_extension(extension)
    ext = extension.to_s
    return extension if ext.empty?
    ext.start_with?(".") ? ext : ".#{ext}"
  end

end
@@ -0,0 +1,119 @@
1
+ require 'bloomer'
2
+
3
require 'pathname'

class Findler
  # Depth-first iterator over the files beneath a directory. Each call to
  # #next returns one not-yet-returned file that passes the hidden-file and
  # pattern filters, or nil when nothing is left.
  #
  # Visited files and fully-traversed directories are remembered in
  # Bloomer::Scalable filters (presumably probabilistic, so a false positive
  # could cause an entry to be skipped -- confirm against the bloomer docs).
  # Sub-directories are walked by child iterators that share the parent's
  # filters, patterns and flags.
  class Iterator

    attr_reader :path, :parent, :patterns, :flags, :visited_dirs, :visited_files

    # attrs is a Hash that may contain :path, :visited_dirs, :visited_files,
    # :patterns, :flags and :sub_iter (a nested attrs Hash for a child iterator).
    # parent is the iterator that created this one, or nil for the root.
    def initialize(attrs, parent = nil)
      @path = attrs[:path]
      @path = Pathname.new(@path) unless @path.is_a? Pathname
      @parent = parent

      set_ivar(:visited_dirs, attrs) { new_filter }
      set_ivar(:visited_files, attrs) { new_filter }
      set_ivar(:patterns, attrs) { nil }
      set_ivar(:flags, attrs) { 0 }

      @sub_iter = self.class.new(attrs[:sub_iter], self) if attrs[:sub_iter]
    end

    # Visit this directory and all sub directories, and check for unseen files. Only call on the root iterator.
    # The visited-files filter is kept, so files already returned are not returned again.
    def rescan!
      raise "Only invoke on root" unless @parent.nil?
      @visited_dirs = new_filter
      @children = nil
      @sub_iter = nil
    end

    # True when the IGNORE_CASE bit is set in flags.
    def case_insensitive?
      (flags & Findler::IGNORE_CASE) != 0
    end

    # True unless the INCLUDE_HIDDEN bit is set in flags.
    def skip_hidden?
      (flags & Findler::INCLUDE_HIDDEN) == 0
    end

    # File.fnmatch flag bits derived from flags. Child iterators reuse the
    # value computed by their parent; the result is memoized.
    def fnmatch_flags
      @_fnflags ||= (@parent && @parent.fnmatch_flags) || begin
        f = 0
        f |= File::FNM_CASEFOLD if case_insensitive?
        f |= File::FNM_DOTMATCH if !skip_hidden?
        f
      end
    end

    # Returns the next matching file as a Pathname, or nil when this directory
    # (and everything beneath it) is exhausted or the directory no longer exists.
    #
    # Written as a loop so that a long run of sibling directories that yield
    # no files does not add one stack frame per directory; recursion depth is
    # bounded by the depth of the directory tree.
    def next
      loop do
        return nil unless @path.exist?

        if @sub_iter
          nxt = @sub_iter.next
          return nxt unless nxt.nil?
          # The child is exhausted: remember the directory and move on.
          @visited_dirs.add @sub_iter.path.to_s
          @sub_iter = nil
        end

        refresh_children
        nxt = @children.shift
        return nil if nxt.nil?

        unless nxt.directory?
          @visited_files.add nxt.to_s
          return nxt
        end

        # Descend into the directory on the next pass through the loop.
        @sub_iter = Iterator.new({:path => nxt}, self)
      end
    end

    private

    # Builds an empty visited-set filter.
    # NOTE(review): the arguments look like an initial capacity and a target
    # false-positive rate -- confirm against Bloomer::Scalable.
    def new_filter
      Bloomer::Scalable.new(256, 1.0/1_000_000)
    end

    # Assigns @<field> from, in order of preference: attrs[field], the parent's
    # @<field>, or the value of the given block.
    def set_ivar(field, attrs)
      sym = "@#{field}".to_sym
      v = attrs[field]
      v ||= @parent && @parent.instance_variable_get(sym)
      v ||= yield
      instance_variable_set(sym, v)
    end

    # (Re)loads the list of pending children of @path.
    def refresh_children
      # If someone touches the directory while we iterate, redo the @children.
      @children = nil if @path.ctime != @ctime || @path.mtime != @mtime
      @children ||= begin
        @mtime = @path.mtime
        @ctime = @path.ctime
        @path.children.delete_if { |ea| skip?(ea) }
      end
    end

    def hidden?(pathname)
      pathname.basename.to_s.start_with?(".")
    end

    # Should this child be left out of the pending list?
    # Directories are skipped only when hidden (and hidden entries are being
    # skipped) or already fully visited; patterns apply to files only.
    def skip?(pathname)
      s = pathname.to_s
      return true if hidden?(pathname) && skip_hidden?
      return @visited_dirs.include?(s) if pathname.directory?
      return true if @visited_files.include?(s)
      return false if patterns.nil?
      patterns.none? { |pattern| pathname.fnmatch(pattern, fnmatch_flags) }
    end
  end
end
@@ -0,0 +1,117 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Findler and its iterator. Each example builds a throw-away
# directory tree via with_tree (see spec_helper) and compares the iterator's
# output with what the `find` command reports from inside that tree.
describe Findler do

  before :each do
    # Options handed to mk_tree by with_tree.
    @opts = {
      :depth => 3,
      :files_per_dir => 3,
      :subdirs_per_dir => 3,
      :prefix => "tmp",
      :suffix => "",
      :dir_prefix => "dir",
      :dir_suffix => ""
    }
  end

  it "should find all files by default" do
    with_tree([".jpg", ".txt"]) do |dir|
      f = Findler.new(dir)
      iter = f.iterator
      collect_files(iter).should =~ `find * -type f`.split
    end
  end

  it "should find only .jpg files when constrained" do
    with_tree([".jpg", ".txt", ".JPG"]) do |dir|
      f = Findler.new(dir)
      f.append_extension ".jpg"
      iter = f.iterator
      collect_files(iter).should =~ `find * -type f -name \\*.jpg`.split
    end
  end

  it "should find .jpg or .JPG files when constrained" do
    with_tree([".jpg", ".txt", ".JPG"]) do |dir|
      f = Findler.new(dir)
      f.append_extension ".jpg"
      f.case_insensitive!
      iter = f.iterator
      collect_files(iter).should =~ `find * -type f -iname \\*.jpg`.split
    end
  end

  it "should find files added after iteration started" do
    with_tree([".txt"]) do |dir|
      f = Findler.new(dir)
      iter = f.iterator
      iter.next.should_not be_nil

      # cheating with mtime on the touch doesn't properly update the parent directory ctime,
      # so we have to deal with the second-granularity resolution of the filesystem.
      sleep(1.1)

      FileUtils.touch(dir + "new.txt")
      collect_files(iter).should include("new.txt")
    end
  end

  it "should find new files after a rescan" do
    with_tree([".txt", ".no"]) do |dir|
      f = Findler.new(dir)
      f.append_extension ".txt"
      iter = f.iterator
      collect_files(iter).should =~ `find * -type f -iname \\*.txt`.split
      FileUtils.touch(dir + "dir-0" + "dir-1" + "new-0.txt")
      FileUtils.touch(dir + "dir-1" + "dir-0" + "new-1.txt")
      FileUtils.touch(dir + "dir-2" + "dir-2" + "new-2.txt")
      collect_files(iter).should be_empty
      iter.rescan!
      collect_files(iter).should =~ ["dir-0/dir-1/new-0.txt", "dir-1/dir-0/new-1.txt", "dir-2/dir-2/new-2.txt"]
    end
  end

  it "should not return files removed after iteration started" do
    with_tree([".txt"]) do |dir|
      f = Findler.new(dir)
      iter = f.iterator
      iter.next.should_not be_nil
      sleep(1.1) # see above for hand-wringing-defense of this atrocity

      (dir + "tmp-1.txt").unlink
      collect_files(iter).should_not include("tmp-1.txt")
    end
  end

  it "should dump/load in the middle of iterating" do
    with_tree([".jpg", ".txt", ".JPG"]) do |dir|
      all_files = `find * -type f -iname \\*.jpg`.split
      # Interrupt the iteration after every possible number of results.
      all_files.size.times do |i|
        f = Findler.new(dir)
        f.append_extension ".jpg"
        f.case_insensitive!
        iter_a = f.iterator
        files_a = i.times.collect { relative_path(iter_a, iter_a.next) }
        iter_b = Marshal.load(Marshal.dump(iter_a))
        files_b = collect_files(iter_b)

        # A copy of an exhausted iterator must stay exhausted.
        iter_c = Marshal.load(Marshal.dump(iter_b))
        collect_files(iter_c)
        iter_c.next.should be_nil

        (files_a + files_b).should =~ all_files
      end
    end
  end

  it "should create an iterator even for a non-existent directory" do
    tmpdir = nil
    # Dir.mktmpdir removes the directory when its block returns.
    Dir.mktmpdir do |dir|
      tmpdir = Pathname.new dir
    end
    tmpdir.should_not exist
    f = Findler.new(tmpdir)
    collect_files(f.iterator).should be_empty
  end
end
@@ -0,0 +1,65 @@
1
+ require 'rspec'
2
+ require 'tmpdir'
3
+ require 'fileutils'
4
+
5
# Spec output settings: documentation formatter with colour enabled.
RSpec.configure do |rspec|
  rspec.formatter = 'documentation'
  rspec.color_enabled = true
end
9
+
10
# Creates a temporary directory, makes it the working directory, and yields it
# as a Pathname.
#
# The block form of Dir.chdir restores the previous working directory when the
# block exits, including when it raises, and does so before Dir.mktmpdir
# removes the temporary directory.
def with_tmp_dir(&block)
  Dir.mktmpdir do |dir|
    Dir.chdir(dir) { yield(Pathname.new(dir)) }
  end
end
18
+
19
# Builds one file tree per suffix inside a fresh temporary directory (all
# trees share the same root) and yields that root. Tree options come from
# @opts, with :suffix overridden for each entry of sufficies.
def with_tree(sufficies, &block)
  with_tmp_dir do |root|
    sufficies.each do |suffix|
      tree_opts = @opts.merge(:suffix => suffix)
      mk_tree(root, tree_opts)
    end
    yield(root)
  end
end
25
+
26
# Recursively populates target_dir (String or Pathname; created if missing)
# with empty files and sub-directories.
#
# Options (defaults in parentheses):
#   :depth (3)            levels of directories that receive files
#   :files_per_dir (3)    files named "<prefix>-<i><suffix>" in every directory
#   :subdirs_per_dir (3)  directories named "<dir_prefix>-<i><dir_suffix>"
#   :prefix ("tmp"), :suffix (""), :dir_prefix ("dir"), :dir_suffix ("")
def mk_tree(target_dir, options = {})
  opts = {
    :depth => 3,
    :files_per_dir => 3,
    :subdirs_per_dir => 3,
    :prefix => "tmp",
    :suffix => "",
    :dir_prefix => "dir",
    :dir_suffix => ""
  }.merge options
  root = target_dir.is_a?(Pathname) ? target_dir : Pathname.new(target_dir)
  root.mkdir unless root.exist?

  opts[:files_per_dir].times do |i|
    fname = "#{opts[:prefix]}-#{i}#{opts[:suffix]}"
    # Convert the path (not touch's return value) to a String.
    FileUtils.touch((root + fname).to_s)
  end
  # opts is this call's own merged copy, so decrementing it does not affect the caller.
  return if (opts[:depth] -= 1) <= 0
  opts[:subdirs_per_dir].times do |i|
    dir = "#{opts[:dir_prefix]}-#{i}#{opts[:dir_suffix]}"
    mk_tree(root + dir, opts)
  end
end
49
+
50
# Number of files in a tree with the given depth, where every directory holds
# files_per_dir files and subdirs_per_dir sub-directories.
# A depth of zero or less yields 0, so a negative depth cannot recurse forever.
def expected_files(depth, files_per_dir, subdirs_per_dir)
  return 0 if depth <= 0
  files_per_dir + (subdirs_per_dir * expected_files(depth - 1, files_per_dir, subdirs_per_dir))
end
54
+
55
# Expresses pathname relative to parent.path and returns it as a String.
def relative_path(parent, pathname)
  base = parent.path
  relative = pathname.relative_path_from(base)
  relative.to_s
end
58
+
59
# Drains iter by calling #next until it returns nil (or false) and returns
# every result as a path relative to the iterator's root.
def collect_files(iter)
  found = []
  entry = iter.next
  while entry
    found << relative_path(iter, entry)
    entry = iter.next
  end
  found
end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: findler
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 2
10
+ version: 0.0.2
11
+ platform: ruby
12
+ authors:
13
+ - Matthew McEachen
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-01-21 00:00:00 -08:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 3
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :runtime
33
+ name: bloomer
34
+ version_requirements: *id001
35
+ description: |-
36
+ Findler is designed for very large filesystem hierarchies,
37
+ where simple block processing, or returning an array of matches, just isn't feasible.
38
+ Usage instructions are available in the README.
39
+ email:
40
+ - matthew+github@mceachen.org
41
+ executables: []
42
+
43
+ extensions: []
44
+
45
+ extra_rdoc_files: []
46
+
47
+ files:
48
+ - .gitignore
49
+ - Gemfile
50
+ - MIT-LICENSE
51
+ - README.md
52
+ - Rakefile
53
+ - findler.gemspec
54
+ - lib/findler.rb
55
+ - lib/findler/iterator.rb
56
+ - spec/findler_spec.rb
57
+ - spec/spec_helper.rb
58
+ has_rdoc: true
59
+ homepage: https://github.com/mceachen/findler/
60
+ licenses: []
61
+
62
+ post_install_message:
63
+ rdoc_options: []
64
+
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ hash: 3
73
+ segments:
74
+ - 0
75
+ version: "0"
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ none: false
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ hash: 3
82
+ segments:
83
+ - 0
84
+ version: "0"
85
+ requirements: []
86
+
87
+ rubyforge_project: findler
88
+ rubygems_version: 1.6.2
89
+ signing_key:
90
+ specification_version: 3
91
+ summary: Findler is a stateful filesystem iterator
92
+ test_files:
93
+ - spec/findler_spec.rb
94
+ - spec/spec_helper.rb