folder_data_sampler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/folder_data_sampler.rb +142 -0
  2. metadata +65 -0
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'find'
4
+
5
+ # This library samples leaf folders within a given folder tree.
6
+ #
7
+ # It takes as input a root path in the filesystem where to look for data.
8
+ # All the leaves in the folder tree under that root are candidates for selection.
9
+ #
10
+ # The candidates are then filtered by categories. A category is nothing more
11
+ # than a folder name. Candidates whose path does not contain a folder named
12
+ # after the filter are discarded.
13
+ #
14
+ # For example candidate "/A/B/C/file.x" will be discarded if filter = "AB" but
15
+ # will be kept if filter = "A" or "B" or "C".
16
+ # Note: filtering is case and white space insensitive.
17
+ #
18
+ # From the final pool of candidates that passed all filters a random candidate
19
+ # is picked.
20
+ #
21
+ # For example given this folder tree:
22
+ #
23
+ # data/photos/cars/renault/megane
24
+ # /volkswagen/golf
25
+ # /drivers/f1/senna
26
+ # /rally/sainz
27
+ #
28
+ # When calling DataSampler.sample with "data" as root folder and "photos" as
29
+ # filter one of these leaves will be returned:
30
+ #
31
+ # data/photos/cars/renault/megane
32
+ # data/photos/cars/volkswagen/golf
33
+ # data/photos/drivers/f1/senna
34
+ # data/photos/drivers/rally/sainz
35
+ #
36
+ # If the filter would be "drivers" then one of these leaves will be returned:
37
+ #
38
+ # data/photos/drivers/f1/senna
39
+ # data/photos/drivers/rally/sainz
40
+ #
41
+ # Options:
42
+ #
43
+ # pre_filters specify an ordered list of categories (folders) where to search
44
+ # within the root folder. This basically makes the search faster
45
+ # when dealing with large folder trees.
46
+ #
47
+ # Example:
48
+ # DataSampler.sample "data", {}, :pre_filters => ["photos", "cars"]
49
+ #
50
+ # returns one of:
51
+ #
52
+ # data/photos/cars/renault/megane
53
+ # data/photos/cars/volkswagen/golf
54
+ #
55
+ # It's equivalent to:
56
+ #
57
+ # DataSampler.sample "data/photos/cars", {}
58
+ #
59
+ # Author:: Manuel Pais (mailto:manuelpais@gmail.com)
60
+ # License:: Distributed under the same terms as Ruby
61
+
62
# Picks a random entry from a folder tree after filtering the entries by
# "categories" (the folder/file names that make up each entry's path).
#
# All methods are class methods; the class holds no state.
class DataSampler

  # Matches a path that has a component starting with ".hg" (the Mercurial
  # control directory and files such as ".hgignore"). Anchored to component
  # boundaries so that names merely containing ".hg" (e.g. "notes.hgx") are
  # not skipped.
  HG_CONTROL_PATH = %r{(\A|/)\.hg}

  # Returns the path of one randomly chosen candidate under +data_source+
  # that passes every filter, or "" when nothing matches.
  #
  # data_source:: root folder to search in
  # filters::     Hash whose values are category names, or an Array of
  #               category names, or a single category name, or nil
  # options::     see find_data
  def self.sample(data_source, filters, options = {})
    chosen = pick_sample(filter_data(data_source, filters, options).keys)
    return "" unless chosen

    # File.join avoids a doubled separator when data_source ends with "/".
    File.join(data_source, chosen)
  end

  # Finds all candidates under +data_source+ and discards those that do not
  # match every filter. Returns a Hash of candidate path => categories.
  def self.filter_data(data_source, filters, options = {})
    filter_candidates(filters, find_data(data_source, options))
  end

  # Walks the tree below +data_source+ and returns a Hash mapping each
  # candidate (path relative to +data_source+) to its list of categories.
  #
  # Recognised options:
  # :pre_filters::   ordered list of folder names; the search starts at
  #                  data_source/<pre_filters joined by "/"> instead of at
  #                  data_source itself. nil or empty means "search all".
  # :return_folder:: when truthy, the candidate is the folder containing
  #                  each file rather than the file itself.
  #
  # (The previous implementation also read :discard_folder but never used
  # it; the option still has no effect.)
  def self.find_data(data_source, options = {})
    options ||= {}
    pre_filters   = options[:pre_filters]
    return_folder = options[:return_folder]

    # An empty list would join to "" and make Find.find raise.
    search_within = "."
    search_within = pre_filters.join("/") if pre_filters && !pre_filters.empty?

    found_data = {}

    # The block form of Dir.chdir restores the previous working directory
    # once the block finishes.
    Dir.chdir(data_source) do
      Find.find(search_within) do |path|
        Find.prune if path =~ HG_CONTROL_PATH # skip mercurial control files
        next unless File.file?(path)

        # Only the leading "./" added by Find.find(".") is removed; a global
        # replacement would corrupt folder names that end with ".".
        data_item = path.sub(%r{\A\./}, "")
        data_item = File.dirname(data_item) if return_folder
        found_data[data_item] = categorize(data_item)
      end
    end

    found_data
  end

  # Splits a path into its components. Returns nil when +path+ is nil.
  def self.categorize(path)
    path.split("/") if path
  end

  # Removes from +candidates+ (in place) every entry whose categories do not
  # match all of the given filters, and returns +candidates+.
  #
  # +filters+ may be a Hash (only its values are used), an Array, a single
  # value or nil; nil filter values are ignored.
  def self.filter_candidates(filters, candidates)
    return candidates if candidates.nil? || candidates.empty?

    normalize_filters(filters).each do |filter_value|
      candidates.delete_if { |_data_item, categories| !matches(filter_value, categories) }
    end
    candidates
  end

  # True when any category equals +filter+, ignoring case and spaces.
  def self.matches(filter, categories)
    wanted = spaceless(filter) # computed once, not once per category
    categories.any? { |category| spaceless(category) == wanted }
  end

  # Lower-cases +sentence+ and removes its spaces. Non-string values
  # (e.g. symbols) are converted with to_s first.
  def self.spaceless(sentence)
    sentence.to_s.delete(" ").downcase
  end

  # Returns a random element of +candidate_values+, or nil when the list is
  # nil or empty.
  def self.pick_sample(candidate_values)
    return nil if candidate_values.nil? || candidate_values.empty?

    # Feature detection instead of comparing RUBY_VERSION strings:
    # Array#sample exists from Ruby 1.9 on.
    if candidate_values.respond_to?(:sample)
      candidate_values.sample
    else
      candidate_values[rand(candidate_values.length)]
    end
  end

  # Turns the accepted filter shapes into a flat list of category names.
  def self.normalize_filters(filters)
    values = filters.respond_to?(:values) ? filters.values : Array(filters)
    values.compact
  end
  private_class_method :normalize_filters
end
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: folder_data_sampler
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
11
+ platform: ruby
12
+ authors:
13
+ - Manuel Pais
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-09-18 00:00:00 Z
19
+ dependencies: []
20
+
21
+ description: Samples data in a folder structure given a hash with criteria for filtering
22
+ email: manuel.pais@gmail.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/folder_data_sampler.rb
31
+ homepage: http://rubygems.org/gems/folder_data_sampler
32
+ licenses: []
33
+
34
+ post_install_message:
35
+ rdoc_options: []
36
+
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ hash: 3
45
+ segments:
46
+ - 0
47
+ version: "0"
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ hash: 3
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ requirements: []
58
+
59
+ rubyforge_project:
60
+ rubygems_version: 1.7.2
61
+ signing_key:
62
+ specification_version: 3
63
+ summary: Folder Data Sampler
64
+ test_files: []
65
+