findler 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .idea/
3
+ .yardoc/
4
+ .bundle/
5
+ Gemfile.lock
6
+ pkg/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Fetch gems over TLS; a plain-http source lets anyone on the network path
# tamper with downloaded packages.
source "https://rubygems.org"

# Runtime dependencies are declared in findler.gemspec.
gemspec

# Development-only tooling.
gem "rake"
gem "yard"
gem "rspec", '~> 2.7.0'
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2012 Matthew McEachen
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -0,0 +1,50 @@
1
+ # Findler: Filesystem Iteration with Persistable State
2
+
3
+ Findler is a Ruby library for iterating over a filtered set of files from a given
4
+ path, written to be suitable for concurrent workers and very large
5
+ filesystem hierarchies.
6
+
7
+ ## Usage
8
+
9
+ ```ruby
10
+ f = Findler.new "/Users/mrm"
11
+ f.append_extension ".jpg", ".jpeg"
12
+ iterator = f.iterator
13
+ iterator.next
14
+ # => "/Users/mrm/Photos/img_1000.jpg"
15
+ ```
16
+
17
+ ## Cross-process continuations
18
+
19
+ This should smell an awful lot like [hike](https://github.com/sstephenson/hike),
20
+ except for that last bit.
21
+
22
+ ```Findler::Iterator``` instances can be "paused" and "resumed" with ```Marshal```.
23
+ The entire state of the iteration for the filesystem is returned, which can then
24
+ be pushed onto any durable storage, like ActiveRecord or Redis, or just a local file:
25
+
26
+ ```ruby
27
+ File.open('iterator.state', 'wb') { |f| Marshal.dump(iterator, f) }
28
+ ```
29
+
30
+ To resume iteration:
31
+
32
+ ```ruby
33
+ iterator = File.open('iterator.state', 'rb') { |f| Marshal.load(f) }
34
+ iterator.next
35
+ # => "/Users/mrm/Photos/img_1001.jpg"
36
+ ```
37
+
38
+ To re-check a directory hierarchy for files that you haven't visited yet:
39
+
40
+ ```ruby
41
+ iterator.rescan!
42
+ iterator.next
43
+ # => "/Users/mrm/Photos/img_1002.jpg"
44
+ ```
45
+
46
+
47
+ ## Changelog
48
+
49
+ * 0.0.1 First `find`
50
+ * 0.0.2 Added scalable Bloom filter so ```Iterator#rescan!``` is possible
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "yard"
3
+ YARD::Rake::YardocTask.new do |t|
4
+ t.files = ['lib/**/*.rb', 'README.md']
5
+ end
6
+
7
+ require "rspec/core/rake_task"
8
+ RSpec::Core::RakeTask.new(:spec)
9
+
10
+ task :default => :spec
@@ -0,0 +1,23 @@
1
# -*- encoding: utf-8 -*-
# Put lib/ on the load path so the version constant can be read from the library itself.
$:.push File.expand_path("../lib", __FILE__)
require "findler"

Gem::Specification.new do |s|
  s.name = "findler"
  s.version = Findler::VERSION
  s.authors = ["Matthew McEachen"]
  s.email = ["matthew+github@mceachen.org"]
  s.homepage = "https://github.com/mceachen/findler/"
  # The gem ships an MIT-LICENSE file; declare it so the packaged metadata is not empty.
  s.license = "MIT"
  s.summary = %q{Findler is a stateful filesystem iterator}
  s.description = %q{Findler is designed for very large filesystem hierarchies,
where simple block processing, or returning an array of matches, just isn't feasible.
Usage instructions are available in the README.}

  s.rubyforge_project = "findler"

  # The packaged file lists come from git, so the gem must be built from a git checkout.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
  s.add_dependency "bloomer"
end
@@ -0,0 +1,50 @@
1
# Collects a root path, optional File.fnmatch patterns and option flags, and
# hands them to a Findler::Iterator that walks the filesystem.
class Findler

  VERSION = "0.0.2"

  # Bit flags, combined into @flags and passed to the iterator as :flags.
  IGNORE_CASE = 1
  INCLUDE_HIDDEN = 2

  autoload :Iterator, "findler/iterator"

  # @param path the root directory to iterate over (String or Pathname)
  def initialize(path)
    @path = path
    @flags = 0
  end

  # These are File.fnmatch patterns. If any pattern matches, it will be returned by Iterator#next.
  # (see File.fnmatch?)
  #
  # @patterns is deliberately left nil until the first pattern is added: the
  # iterator treats nil as "no filtering", whereas an empty list would match nothing.
  def add_pattern(*patterns)
    patterns.each { |pattern| (@patterns ||= []) << pattern }
  end

  # Adds a "*<extension>" pattern for every extension given. A leading "." is
  # added when missing, so "jpg" and ".jpg" are equivalent.
  def append_extension(*extensions)
    extensions.each { |extension| add_pattern "*#{normalize_extension(extension)}" }
  end

  # Should patterns be interpreted in a case-insensitive manner? (default is case sensitive)
  def case_insensitive!
    @flags |= IGNORE_CASE
  end

  # Should we traverse hidden directories and files? (default is to skip files that start
  # with a '.')
  def include_hidden!
    @flags |= INCLUDE_HIDDEN
  end

  # Builds a new iterator from the current path, patterns and flags.
  def iterator
    Iterator.new(:path => @path, :patterns => @patterns, :flags => @flags)
  end

  private

  # Returns the extension with a leading "."; nil and empty values pass through
  # unchanged. The value is coerced with to_s so symbols are accepted too.
  def normalize_extension(extension)
    return extension if extension.nil?
    ext = extension.to_s
    if ext.empty? || ext.start_with?(".")
      ext
    else
      ".#{ext}"
    end
  end

end
@@ -0,0 +1,119 @@
1
+ require 'bloomer'
2
+
3
require 'pathname'

class Findler
  # Depth-first iterator over the files beneath a directory. Each directory
  # being walked gets its own Iterator; a child iterator inherits the visited
  # sets, patterns and flags of its parent. All state lives in plain instance
  # variables, so an instance can be round-tripped through Marshal.
  class Iterator

    attr_reader :path, :parent, :patterns, :flags, :visited_dirs, :visited_files

    # @param attrs [Hash] :path is required. :visited_dirs, :visited_files,
    #   :patterns and :flags are optional and fall back to the parent's value,
    #   then to a default. :sub_iter, when given, is an attrs hash for a nested iterator.
    # @param parent [Iterator, nil] the iterator for the enclosing directory
    def initialize(attrs, parent = nil)
      @path = attrs[:path]
      @path = Pathname.new(@path) unless @path.is_a? Pathname
      @parent = parent

      set_ivar(:visited_dirs, attrs) { Bloomer::Scalable.new(256, 1.0/1_000_000) }
      set_ivar(:visited_files, attrs) { Bloomer::Scalable.new(256, 1.0/1_000_000) }
      set_ivar(:patterns, attrs) { nil }
      set_ivar(:flags, attrs) { 0 }

      @sub_iter = self.class.new(attrs[:sub_iter], self) if attrs[:sub_iter]
    end

    # Visit this directory and all sub directories, and check for unseen files. Only call on the root iterator.
    # The visited-files set is kept, so files already returned are not returned again.
    def rescan!
      raise "Only invoke on root" unless @parent.nil?
      @visited_dirs = Bloomer::Scalable.new(256, 1.0/1_000_000)
      @children = nil
      @sub_iter = nil
    end

    # Disabled custom marshalling, kept verbatim for reference:
    #def to_hash
    # {:path => @path, :visited_dirs:patterns => @patterns, :flags => @flags, :sub_iter => @sub_iter && @sub_iter.to_hash}
    #end
    #
    #def _dump(depth)
    # Marshal.dump(to_hash)
    #end
    #
    #def self._load(data)
    # new(Marshal.load(data))
    #end

    # True only when the IGNORE_CASE bit is set in flags.
    def case_insensitive?
      (flags & Findler::IGNORE_CASE) != 0
    end

    # True unless the INCLUDE_HIDDEN bit is set in flags.
    def skip_hidden?
      (flags & Findler::INCLUDE_HIDDEN) == 0
    end

    # File.fnmatch flags derived from #flags. Computed once and memoized;
    # child iterators reuse the value computed by their parent.
    def fnmatch_flags
      @_fnflags ||= (@parent && @parent.fnmatch_flags) || begin
        f = 0
        f |= File::FNM_CASEFOLD if case_insensitive?
        f |= File::FNM_DOTMATCH if !skip_hidden?
        f
      end
    end

    # Returns the next matching file as a Pathname, or nil when this directory
    # (including its sub directories) has nothing more to offer or does not exist.
    def next
      # Iterative rather than recursive: a run of directories that yield no
      # file would otherwise add one stack frame per directory.
      loop do
        return nil unless @path.exist?

        if @sub_iter
          found = @sub_iter.next
          return found unless found.nil?
          # The sub directory is exhausted; remember it so it is skipped from now on.
          @visited_dirs.add @sub_iter.path.to_s
          @sub_iter = nil
        end

        # If someone touches the directory while we iterate, redo the @children.
        @children = nil if @path.ctime != @ctime || @path.mtime != @mtime
        @children ||= begin
          @mtime = @path.mtime
          @ctime = @path.ctime
          @path.children.delete_if { |ea| skip?(ea) }
        end

        candidate = @children.shift
        return nil if candidate.nil?

        unless candidate.directory?
          @visited_files.add candidate.to_s
          return candidate
        end

        # Descend into the directory; the next pass of the loop drains it.
        @sub_iter = Iterator.new({:path => candidate}, self)
      end
    end

    private

    # Sets @<field> from attrs[field], else from the parent's ivar of the same
    # name, else from the block.
    def set_ivar(field, attrs, &block)
      sym = "@#{field}".to_sym
      v = attrs[field]
      v ||= begin
        (p = instance_variable_get(:@parent)) && p.instance_variable_get(sym)
      end
      v ||= yield
      instance_variable_set(sym, v)
    end

    # True when the entry's basename starts with a ".".
    def hidden?(pathname)
      pathname.basename.to_s.start_with?(".")
    end

    # Decides whether a directory entry is dropped from @children: hidden
    # entries (unless included), directories already fully visited, files
    # already returned, and files matching none of the patterns.
    # Patterns are not applied to directories.
    def skip?(pathname)
      s = pathname.to_s
      return true if hidden?(pathname) && skip_hidden?
      return @visited_dirs.include?(s) if pathname.directory?
      return true if @visited_files.include?(s)
      unless patterns.nil?
        return true if patterns.none? { |p| pathname.fnmatch(p, fnmatch_flags) }
      end
      return false
    end
  end
end
@@ -0,0 +1,117 @@
1
+ require 'spec_helper'
2
+
3
+ describe Findler do
4
+
5
+ before :each do
6
+ @opts = {
7
+ :depth => 3,
8
+ :files_per_dir => 3,
9
+ :subdirs_per_dir => 3,
10
+ :prefix => "tmp",
11
+ :suffix => "",
12
+ :dir_prefix => "dir",
13
+ :dir_suffix => ""
14
+ }
15
+ end
16
+
17
+ it "should find all files by default" do
18
+ with_tree([".jpg", ".txt"]) do |dir|
19
+ f = Findler.new(dir)
20
+ iter = f.iterator
21
+ collect_files(iter).should =~ `find * -type f`.split
22
+ end
23
+ end
24
+
25
+ it "should find only .jpg files when constrained" do
26
+ with_tree([".jpg", ".txt", ".JPG"]) do |dir|
27
+ f = Findler.new(dir)
28
+ f.append_extension ".jpg"
29
+ iter = f.iterator
30
+ collect_files(iter).should =~ `find * -type f -name \\*.jpg`.split
31
+ end
32
+ end
33
+
34
+ it "should find .jpg or .JPG files when constrained" do
35
+ with_tree([".jpg", ".txt", ".JPG"]) do |dir|
36
+ f = Findler.new(dir)
37
+ f.append_extension ".jpg"
38
+ f.case_insensitive!
39
+ iter = f.iterator
40
+ collect_files(iter).should =~ `find * -type f -iname \\*.jpg`.split
41
+ end
42
+ end
43
+
44
+ it "should find files added after iteration started" do
45
+ with_tree([".txt"]) do |dir|
46
+ f = Findler.new(dir)
47
+ iter = f.iterator
48
+ iter.next.should_not be_nil
49
+
50
+ # cheating with mtime on the touch doesn't properly update the parent directory ctime,
51
+ # so we have to deal with the second-granularity resolution of the filesystem.
52
+ sleep(1.1)
53
+
54
+ FileUtils.touch(dir + "new.txt")
55
+ collect_files(iter).should include("new.txt")
56
+ end
57
+ end
58
+
59
+ it "should find new files after a rescan" do
60
+ with_tree([".txt", ".no"]) do |dir|
61
+ f = Findler.new(dir)
62
+ f.append_extension ".txt"
63
+ iter = f.iterator
64
+ collect_files(iter).should =~ `find * -type f -iname \\*.txt`.split
65
+ FileUtils.touch(dir + "dir-0" + "dir-1" + "new-0.txt")
66
+ FileUtils.touch(dir + "dir-1" + "dir-0" + "new-1.txt")
67
+ FileUtils.touch(dir + "dir-2" + "dir-2" + "new-2.txt")
68
+ collect_files(iter).should be_empty
69
+ iter.rescan!
70
+ collect_files(iter).should =~ ["dir-0/dir-1/new-0.txt", "dir-1/dir-0/new-1.txt", "dir-2/dir-2/new-2.txt"]
71
+ end
72
+ end
73
+
74
+ it "should not return files removed after iteration started" do
75
+ with_tree([".txt"]) do |dir|
76
+ f = Findler.new(dir)
77
+ iter = f.iterator
78
+ iter.next.should_not be_nil
79
+ sleep(1.1) # see above for hand-wringing-defense of this atrocity
80
+
81
+ (dir + "tmp-1.txt").unlink
82
+ collect_files(iter).should_not include("tmp-1.txt")
83
+ end
84
+ end
85
+
86
+ it "should dump/load in the middle of iterating" do
87
+ with_tree([".jpg", ".txt", ".JPG"]) do |dir|
88
+ all_files = `find * -type f -iname \\*.jpg`.split
89
+ all_files.size.times do |i|
90
+ f = Findler.new(dir)
91
+ f.append_extension ".jpg"
92
+ f.case_insensitive!
93
+ iter_a = f.iterator
94
+ files_a = i.times.collect { relative_path(iter_a, iter_a.next) }
95
+ iter_b = Marshal.load(Marshal.dump(iter_a))
96
+ files_b = collect_files(iter_b)
97
+
98
+ iter_c = Marshal.load(Marshal.dump(iter_b))
99
+ collect_files(iter_c)
100
+ iter_c.next.should be_nil
101
+
102
+ (files_a + files_b).should =~ all_files
103
+ end
104
+ end
105
+ end
106
+
107
it "should create an iterator even for a non-existent directory" do
  # Dir.mktmpdir removes the directory when its block returns, so the path it
  # hands back no longer exists.
  tmpdir = Dir.mktmpdir { |dir| Pathname.new dir }
  tmpdir.should_not exist
  f = Findler.new(tmpdir)
  collect_files(f.iterator).should be_empty
end
117
+ end
@@ -0,0 +1,65 @@
1
+ require 'rspec'
2
+ require 'tmpdir'
3
+ require 'fileutils'
4
+
5
+ RSpec.configure do |config|
6
+ config.color_enabled = true
7
+ config.formatter = 'documentation'
8
+ end
9
+
10
# Creates a temporary directory, makes it the working directory, and yields it
# as a Pathname. The block form of Dir.chdir restores the previous working
# directory when the block exits, even if it raises, and does so before
# Dir.mktmpdir deletes the directory.
def with_tmp_dir(&block)
  Dir.mktmpdir do |dir|
    Dir.chdir(dir) { yield(Pathname.new dir) }
  end
end
18
+
19
# Builds one file tree per suffix inside a fresh temporary directory (all
# trees share the same root) and yields that directory to the block.
# @opts, when set by the caller, overrides the defaults used by mk_tree; it is
# treated as empty when unset.
def with_tree(suffixes, &block)
  with_tmp_dir do |dir|
    suffixes.each { |suffix| mk_tree dir, (@opts || {}).merge(:suffix => suffix) }
    yield(dir)
  end
end
25
+
26
# Recursively populates target_dir with empty files and sub directories.
#
# Each directory receives :files_per_dir files named "<prefix>-<i><suffix>".
# While :depth allows it, each directory also receives :subdirs_per_dir
# sub directories named "<dir_prefix>-<i><dir_suffix>", built the same way with
# the depth reduced by one.
#
# @param target_dir [String, Pathname] directory to fill (created if missing)
# @param options [Hash] overrides for the defaults listed below
def mk_tree(target_dir, options = {})
  opts = {
    :depth => 3,
    :files_per_dir => 3,
    :subdirs_per_dir => 3,
    :prefix => "tmp",
    :suffix => "",
    :dir_prefix => "dir",
    :dir_suffix => ""
  }.merge options
  root = target_dir.is_a?(Pathname) ? target_dir : Pathname.new(target_dir)
  root.mkdir unless root.exist?

  opts[:files_per_dir].times do |i|
    fname = "#{opts[:prefix]}-#{i}#{opts[:suffix]}"
    # Convert the path (not the return value of touch) to a String.
    FileUtils.touch((root + fname).to_s)
  end
  # opts is a fresh hash from merge, so decrementing it does not affect the caller's hash.
  return if (opts[:depth] -= 1) <= 0
  opts[:subdirs_per_dir].times do |i|
    dir = "#{opts[:dir_prefix]}-#{i}#{opts[:dir_suffix]}"
    mk_tree(root + dir, opts)
  end
end
49
+
50
# Number of files in a tree that is `depth` levels deep, where every directory
# holds files_per_dir files and subdirs_per_dir sub directories.
# A depth of zero or less yields 0; the <= guard stops a negative depth from
# recursing forever.
def expected_files(depth, files_per_dir, subdirs_per_dir)
  return 0 if depth <= 0
  files_per_dir + (subdirs_per_dir * expected_files(depth - 1, files_per_dir, subdirs_per_dir))
end
54
+
55
# Expresses pathname relative to parent.path and returns it as a String.
# parent is any object that responds to #path with a Pathname.
def relative_path(parent, pathname)
  base = parent.path
  relative = pathname.relative_path_from(base)
  relative.to_s
end
58
+
59
# Drains iter by calling #next until it returns a falsy value, and returns the
# results as an array of paths relative to the iterator's own path.
def collect_files(iter)
  collected = []
  entry = iter.next
  while entry
    collected << relative_path(iter, entry)
    entry = iter.next
  end
  collected
end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: findler
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 2
10
+ version: 0.0.2
11
+ platform: ruby
12
+ authors:
13
+ - Matthew McEachen
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-01-21 00:00:00 -08:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 3
29
+ segments:
30
+ - 0
31
+ version: "0"
32
+ type: :runtime
33
+ name: bloomer
34
+ version_requirements: *id001
35
+ description: |-
36
+ Findler is designed for very large filesystem hierarchies,
37
+ where simple block processing, or returning an array of matches, just isn't feasible.
38
+ Usage instructions are available in the README.
39
+ email:
40
+ - matthew+github@mceachen.org
41
+ executables: []
42
+
43
+ extensions: []
44
+
45
+ extra_rdoc_files: []
46
+
47
+ files:
48
+ - .gitignore
49
+ - Gemfile
50
+ - MIT-LICENSE
51
+ - README.md
52
+ - Rakefile
53
+ - findler.gemspec
54
+ - lib/findler.rb
55
+ - lib/findler/iterator.rb
56
+ - spec/findler_spec.rb
57
+ - spec/spec_helper.rb
58
+ has_rdoc: true
59
+ homepage: https://github.com/mceachen/findler/
60
+ licenses: []
61
+
62
+ post_install_message:
63
+ rdoc_options: []
64
+
65
+ require_paths:
66
+ - lib
67
+ required_ruby_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ hash: 3
73
+ segments:
74
+ - 0
75
+ version: "0"
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ none: false
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ hash: 3
82
+ segments:
83
+ - 0
84
+ version: "0"
85
+ requirements: []
86
+
87
+ rubyforge_project: findler
88
+ rubygems_version: 1.6.2
89
+ signing_key:
90
+ specification_version: 3
91
+ summary: Findler is a stateful filesystem iterator
92
+ test_files:
93
+ - spec/findler_spec.rb
94
+ - spec/spec_helper.rb