folder_data_sampler 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/folder_data_sampler.rb +142 -0
- metadata +65 -0
@@ -0,0 +1,142 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'find'
|
4
|
+
|
5
|
+
# This library samples leaves within a given folder tree.
|
6
|
+
#
|
7
|
+
# It takes as input a root path in the filesystem where to look for data.
|
8
|
+
# All the leaves in the folder tree under that root are candidates for selection.
|
9
|
+
#
|
10
|
+
# The candidates are then filtered by categories. A category is nothing more
|
11
|
+
# than a folder name. Candidates whose path does not contain a folder named
|
12
|
+
# after the filter are discarded.
|
13
|
+
#
|
14
|
+
# For example candidate "/A/B/C/file.x" will be discarded if filter = "AB" but
|
15
|
+
# will be kept if filter = "A" or "B" or "C".
|
16
|
+
# Note: filtering is case and white space insensitive.
|
17
|
+
#
|
18
|
+
# From the final pool of candidates that passed all filters a random candidate
|
19
|
+
# is picked.
|
20
|
+
#
|
21
|
+
# For example given this folder tree:
|
22
|
+
#
|
23
|
+
# data/photos/cars/renault/megane
|
24
|
+
# /volkswagen/golf
|
25
|
+
# /drivers/f1/senna
|
26
|
+
# /rally/sainz
|
27
|
+
#
|
28
|
+
# When calling DataSampler.sample with "data" as root folder and "photos" as
|
29
|
+
# filter one of these leaves will be returned:
|
30
|
+
#
|
31
|
+
# data/photos/cars/renault/megane
|
32
|
+
# data/photos/cars/volkswagen/golf
|
33
|
+
# data/photos/drivers/f1/senna
|
34
|
+
# data/photos/drivers/rally/sainz
|
35
|
+
#
|
36
|
+
# If the filter would be "drivers" then one of these leaves will be returned:
|
37
|
+
#
|
38
|
+
# data/photos/drivers/f1/senna
|
39
|
+
# data/photos/drivers/rally/sainz
|
40
|
+
#
|
41
|
+
# Options:
|
42
|
+
#
|
43
|
+
# pre_filters specify an ordered list of categories (folders) where to search
|
44
|
+
# within the root folder. This basically makes the search faster
|
45
|
+
# when dealing with large folder trees.
|
46
|
+
#
|
47
|
+
# Example:
|
48
|
+
# DataSampler.sample "data", {}, :pre_filters => ["photos", "cars"]
|
49
|
+
#
|
50
|
+
# returns one of:
|
51
|
+
#
|
52
|
+
# data/photos/cars/renault/megane
|
53
|
+
# data/photos/cars/volkswagen/golf
|
54
|
+
#
|
55
|
+
# It's equivalent to:
|
56
|
+
#
|
57
|
+
# DataSampler.sample "data/photos/cars", {}
|
58
|
+
#
|
59
|
+
# Author:: Manuel Pais (mailto:manuelpais@gmail.com)
|
60
|
+
# License:: Distributed under the same terms as Ruby
|
61
|
+
|
62
|
+
class DataSampler

  # Matches the basename of Mercurial control entries (".hg", ".hgignore",
  # ".hgtags", ...). Anchored so that unrelated names which merely contain
  # ".hg" (for example "terrain.hgt") are not skipped.
  HG_CONTROL_ENTRY = /\A\.hg/

  # Picks one random candidate under +data_source+ that passes all +filters+.
  #
  # data_source:: root folder of the tree to sample from.
  # filters::     categories (folder names) every candidate path must contain.
  #               May be a Hash (only its values are used), an Array of names,
  #               a single name, or nil for "no filtering".
  # options::     see DataSampler.find_data.
  #
  # Returns the candidate path prefixed with +data_source+, or "" when no
  # candidate is left after filtering.
  def self.sample(data_source, filters, options = {})
    picked = pick_sample(filter_data(data_source, filters, options).keys)
    # File.join avoids a doubled separator when data_source ends with "/".
    picked ? File.join(data_source, picked) : ""
  end

  # Collects the candidates under +data_source+ and discards those that do not
  # match every filter.
  #
  # Returns a Hash mapping each surviving candidate path (relative to
  # +data_source+) to the Array of its path components.
  def self.filter_data(data_source, filters, options = {})
    filter_candidates(filters, find_data(data_source, options))
  end

  # Walks the folder tree and records every regular file found.
  #
  # Accepted options:
  # :pre_filters::   ordered folder name(s) below +data_source+ at which the
  #                  walk starts; nil or an empty list means the whole tree.
  # :return_folder:: when truthy, the folder containing each file is recorded
  #                  instead of the file itself.
  #
  # Returns a Hash mapping each recorded path (relative to +data_source+) to
  # the Array of its path components. Errors raised by Dir.chdir or Find.find
  # (for example for a missing folder) are not rescued here.
  def self.find_data(data_source, options = {})
    pre_filters   = Array(options[:pre_filters])
    return_folder = options[:return_folder]

    search_within = pre_filters.empty? ? "." : pre_filters.join("/")
    found_data = {}

    Dir.chdir(data_source) do
      Find.find(search_within) do |path|
        # Skip mercurial control files and everything below a ".hg" folder.
        Find.prune if File.basename(path) =~ HG_CONTROL_ENTRY
        next unless File.file?(path)

        # Only the leading "./" added by the walk is removed; a "./" further
        # down belongs to a folder whose name ends with a dot.
        data_item = path.sub(%r{\A\./}, "").rstrip
        data_item = File.dirname(data_item) if return_folder
        found_data[data_item] = categorize(data_item)
      end
    end

    found_data
  end

  # Splits +path+ on "/" into its components (the categories).
  # Returns nil when +path+ is nil.
  def self.categorize(path)
    path.split("/") if path
  end

  # Removes from +candidates+ (in place) every entry whose categories do not
  # match all of the given +filters+.
  #
  # filters::    Hash (values are used), Array, single value or nil.
  #              nil filter values are ignored.
  # candidates:: Hash of path => categories, as built by find_data.
  #
  # Returns +candidates+.
  def self.filter_candidates(filters, candidates)
    return candidates if candidates.nil? || candidates.empty?

    filter_values(filters).each do |filter_value|
      candidates.delete_if { |_data_item, categories| !matches(filter_value, categories) }
    end
    candidates
  end

  # Tells whether any entry of +categories+ equals +filter+, ignoring case and
  # space characters.
  def self.matches(filter, categories)
    wanted = spaceless(filter)
    categories.any? { |category| spaceless(category) == wanted }
  end

  # Returns +sentence+ (converted with to_s) lower-cased and with all space
  # characters removed.
  def self.spaceless(sentence)
    sentence.to_s.delete(" ").downcase
  end

  # Returns a random element of +candidate_values+, or nil when it is nil or
  # empty.
  def self.pick_sample(candidate_values)
    return nil if candidate_values.nil? || candidate_values.empty?

    if candidate_values.respond_to?(:sample)
      candidate_values.sample
    else
      # Fallback for arrays without #sample (ruby < 1.9).
      candidate_values[rand(candidate_values.length)]
    end
  end

  # Normalizes the accepted shapes of the filters argument into a flat list of
  # category names, dropping nil entries.
  def self.filter_values(filters)
    values = case filters
             when nil   then []
             when Hash  then filters.values
             when Array then filters
             else            [filters]
             end
    values.compact
  end
  private_class_method :filter_values

end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: folder_data_sampler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Manuel Pais
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-09-18 00:00:00 Z
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Samples data in a folder structure given a hash with criteria for filtering
|
22
|
+
email: manuel.pais@gmail.com
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- lib/folder_data_sampler.rb
|
31
|
+
homepage: http://rubygems.org/gems/folder_data_sampler
|
32
|
+
licenses: []
|
33
|
+
|
34
|
+
post_install_message:
|
35
|
+
rdoc_options: []
|
36
|
+
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 3
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
version: "0"
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
hash: 3
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 1.7.2
|
61
|
+
signing_key:
|
62
|
+
specification_version: 3
|
63
|
+
summary: Folder Data Sampler
|
64
|
+
test_files: []
|
65
|
+
|