folder_data_sampler 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/folder_data_sampler.rb +142 -0
  2. metadata +65 -0
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'find'
4
+
5
+ # This library samples leaf folders within a given folder tree.
6
+ #
7
+ # It takes as input a root path in the filesystem where to look for data.
8
+ # All the leaves in the folder tree under that root are candidates for selection.
9
+ #
10
+ # The candidates are then filtered by categories. A category is nothing more
11
+ # than a folder name. Candidates whose path does not contain a folder named
12
+ # after the filter are discarded.
13
+ #
14
+ # For example candidate "/A/B/C/file.x" will be discarded if filter = "AB" but
15
+ # will be kept if filter = "A" or "B" or "C".
16
+ # Note: filtering is case and white space insensitive.
17
+ #
18
+ # From the final pool of candidates that passed all filters a random candidate
19
+ # is picked.
20
+ #
21
+ # For example given this folder tree:
22
+ #
23
+ # data/photos/cars/renault/megane
24
+ # /volkswagen/golf
25
+ # /drivers/f1/senna
26
+ # /rally/sainz
27
+ #
28
+ # When calling DataSampler.sample with "data" as root folder and "photos" as
29
+ # filter one of these leaves will be returned:
30
+ #
31
+ # data/photos/cars/renault/megane
32
+ # data/photos/cars/volkswagen/golf
33
+ # data/photos/drivers/f1/senna
34
+ # data/photos/drivers/rally/sainz
35
+ #
36
+ # If the filter would be "drivers" then one of these leaves will be returned:
37
+ #
38
+ # data/photos/drivers/f1/senna
39
+ # data/photos/drivers/rally/sainz
40
+ #
41
+ # Options:
42
+ #
43
+ # pre_filters specify an ordered list of categories (folders) where to search
44
+ # within the root folder. This basically makes the search faster
45
+ # when dealing with large folder trees.
46
+ #
47
+ # Example:
48
+ # DataSampler.sample "data", {}, :pre_filters => ["photos", "cars"]
49
+ #
50
+ # returns one of:
51
+ #
52
+ # data/photos/cars/renault/megane
53
+ # data/photos/cars/volkswagen/golf
54
+ #
55
+ # It's equivalent to:
56
+ #
57
+ # DataSampler.sample "data/photos/cars", {}
58
+ #
59
+ # Author:: Manuel Pais (mailto:manuelpais@gmail.com)
60
+ # License:: Distributed under the same terms as Ruby
61
+
62
# Picks a random data item (a file, or its containing folder) from a folder
# tree. Candidates can be narrowed down with category filters: a candidate is
# kept only when one of the components of its path matches every filter value.
class DataSampler
  # Matches a path component whose name starts with ".hg" (the Mercurial
  # repository folder and control files such as .hgignore). Anchoring on the
  # component start keeps unrelated names like "photos.hg_backup" searchable.
  MERCURIAL_ENTRY = %r{(\A|/)\.hg}.freeze

  # Returns a randomly chosen candidate, prefixed with +data_source+, or an
  # empty string when no candidate passes the filters.
  #
  # data_source:: root folder of the tree to sample from
  # filters::     hash whose values are the folder names a candidate's path
  #               must contain (the keys are not used for matching)
  # options::     see find_data
  def self.sample(data_source, filters, options = {})
    chosen = pick_sample(filter_data(data_source, filters, options).keys)
    return "" unless chosen

    "#{data_source}/#{chosen}"
  end

  # Collects every candidate under +data_source+ and drops the ones that do
  # not satisfy +filters+. Returns a hash of candidate path => categories.
  def self.filter_data(data_source, filters, options = {})
    filter_candidates(filters, find_data(data_source, options))
  end

  # Walks the tree under +data_source+ and returns a hash that maps each
  # candidate path (relative to +data_source+) to its list of categories.
  #
  # Accepted options:
  # :pre_filters::   ordered folder name(s) joined with "/" to form the
  #                  sub-folder where the search starts; nil or an empty
  #                  list means the whole tree is searched
  # :return_folder:: when truthy, candidates are the folders containing the
  #                  files rather than the files themselves
  #
  # The working directory is changed to +data_source+ for the duration of
  # the search (Dir.chdir with a block restores it afterwards).
  def self.find_data(data_source, options = {})
    pre_filters   = Array(options[:pre_filters])
    return_folder = options[:return_folder]

    search_within = pre_filters.empty? ? "." : pre_filters.join("/")
    found_data = {}

    Dir.chdir(data_source) do
      Find.find(search_within) do |path|
        Find.prune if path =~ MERCURIAL_ENTRY # skip mercurial control files
        next unless File.file?(path)

        # Only the leading "./" added by Find.find(".") is removed; a "./"
        # further down the path belongs to a folder name ending in a dot.
        data_item = path.sub(%r{\A\./}, "").rstrip
        data_item = File.dirname(data_item) if return_folder
        found_data[data_item] = categorize(data_item)
      end
    end

    found_data
  end

  # Splits +path+ into its "/"-separated components. Returns nil for nil.
  def self.categorize(path)
    path.split("/") if path
  end

  # Removes from +candidates+ (in place) every entry whose categories do not
  # match all the filter values, and returns +candidates+. Nil or empty
  # candidates, as well as nil filters, are returned untouched.
  def self.filter_candidates(filters, candidates)
    return candidates if candidates.nil? || candidates.empty? || filters.nil?

    filters.each do |_filter_name, filter_value|
      candidates.delete_if { |_data_item, categories| !matches(filter_value, categories) }
    end
    candidates
  end

  # True when any of +categories+ equals +filter+, ignoring case and spaces.
  def self.matches(filter, categories)
    wanted = spaceless(filter)
    categories.any? { |category| spaceless(category) == wanted }
  end

  # Normalizes +sentence+ for comparison: spaces removed, lower-cased.
  def self.spaceless(sentence)
    sentence.delete(" ").downcase
  end

  # Returns a random element of +candidate_values+, or nil when the list is
  # nil or empty.
  def self.pick_sample(candidate_values)
    return nil if candidate_values.nil? || candidate_values.empty?
    # Feature detection instead of comparing RUBY_VERSION strings.
    return candidate_values.sample if candidate_values.respond_to?(:sample)

    candidate_values[rand(candidate_values.length)]
  end
end
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: folder_data_sampler
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
11
+ platform: ruby
12
+ authors:
13
+ - Manuel Pais
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-09-18 00:00:00 Z
19
+ dependencies: []
20
+
21
+ description: Samples data in a folder structure given a hash with criteria for filtering
22
+ email: manuel.pais@gmail.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/folder_data_sampler.rb
31
+ homepage: http://rubygems.org/gems/folder_data_sampler
32
+ licenses: []
33
+
34
+ post_install_message:
35
+ rdoc_options: []
36
+
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 3
45
+ segments:
46
+ - 0
47
+ version: "0"
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ hash: 3
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ requirements: []
58
+
59
+ rubyforge_project:
60
+ rubygems_version: 1.7.2
61
+ signing_key:
62
+ specification_version: 3
63
+ summary: Folder Data Sampler
64
+ test_files: []
65
+