folder_data_sampler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/folder_data_sampler.rb +142 -0
- metadata +65 -0
@@ -0,0 +1,142 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'find'
|
4
|
+
|
5
|
+
# This library samples leaf folders within a given folder tree.
|
6
|
+
#
|
7
|
+
# It takes as input a root path in the filesystem where to look for data.
|
8
|
+
# All the leaves in the folder tree under that root are candidates for selection.
|
9
|
+
#
|
10
|
+
# The candidates are then filtered by categories. A category is nothing more
|
11
|
+
# than a folder name. Candidates whose path does not contain a folder named
|
12
|
+
# after the filter are discarded.
|
13
|
+
#
|
14
|
+
# For example candidate "/A/B/C/file.x" will be discarded if filter = "AB" but
|
15
|
+
# will be kept if filter = "A" or "B" or "C".
|
16
|
+
# Note: filtering is case and white space insensitive.
|
17
|
+
#
|
18
|
+
# From the final pool of candidates that passed all filters a random candidate
|
19
|
+
# is picked.
|
20
|
+
#
|
21
|
+
# For example given this folder tree:
|
22
|
+
#
|
23
|
+
# data/photos/cars/renault/megane
|
24
|
+
# /volkswagen/golf
|
25
|
+
# /drivers/f1/senna
|
26
|
+
# /rally/sainz
|
27
|
+
#
|
28
|
+
# When calling DataSampler.sample with "data" as root folder and "photos" as
|
29
|
+
# filter one of these leaves will be returned:
|
30
|
+
#
|
31
|
+
# data/photos/cars/renault/megane
|
32
|
+
# data/photos/cars/volkswagen/golf
|
33
|
+
# data/photos/drivers/f1/senna
|
34
|
+
# data/photos/drivers/rally/sainz
|
35
|
+
#
|
36
|
+
# If the filter would be "drivers" then one of these leaves will be returned:
|
37
|
+
#
|
38
|
+
# data/photos/drivers/f1/senna
|
39
|
+
# data/photos/drivers/rally/sainz
|
40
|
+
#
|
41
|
+
# Options:
|
42
|
+
#
|
43
|
+
# pre_filters specify an ordered list of categories (folders) where to search
|
44
|
+
# within the root folder. This basically makes the search faster
|
45
|
+
# when dealing with large folder trees.
|
46
|
+
#
|
47
|
+
# Example:
|
48
|
+
# DataSampler.sample "data", {}, :pre_filters => ["photos", "cars"]
|
49
|
+
#
|
50
|
+
# returns one of:
|
51
|
+
#
|
52
|
+
# data/photos/cars/renault/megane
|
53
|
+
# data/photos/cars/volkswagen/golf
|
54
|
+
#
|
55
|
+
# It's equivalent to:
|
56
|
+
#
|
57
|
+
# DataSampler.sample "data/photos/cars", {}
|
58
|
+
#
|
59
|
+
# Author:: Manuel Pais (mailto:manuelpais@gmail.com)
|
60
|
+
# License:: Distributed under the same terms as Ruby
|
61
|
+
|
62
|
+
# Picks a random file (or containing folder) from a directory tree, optionally
# keeping only the paths that contain given folder names ("categories").
class DataSampler

  # Returns one randomly chosen candidate found under +data_source+ that
  # passes every filter, prefixed with "+data_source+/". Returns "" when no
  # candidate is left.
  #
  # data_source:: root folder to search; Dir.chdir raises if it cannot be entered
  # filters::     a Hash whose values are category names (keys are ignored),
  #               an Array of category names, a single name, or nil for
  #               "no filtering"
  # options::     :pre_filters   => ordered folder names, joined into the
  #                                 sub-path of data_source to search
  #               :return_folder => when truthy, candidates are the folders
  #                                 containing the files rather than the files
  def self.sample(data_source, filters, options = {})
    chosen = pick_sample(filter_data(data_source, filters, options).keys)
    chosen ? "#{data_source}/#{chosen}" : ""
  end

  # Collects the candidates under +data_source+ and discards those that do
  # not match +filters+. Returns a Hash of relative path => categories.
  def self.filter_data(data_source, filters, options = {})
    filter_candidates(filters, find_data(data_source, options))
  end

  # Walks the tree below +data_source+ and returns a Hash that maps each
  # candidate (path relative to +data_source+) to its list of categories.
  #
  # Accepted options:
  # :pre_filters::   folder name(s) joined with "/" to form the sub-path that
  #                  is searched; nil or an empty list searches everything
  # :return_folder:: when truthy, the folder of each file is recorded
  #                  instead of the file itself
  def self.find_data(data_source, options = {})
    options ||= {}
    return_folder = options[:return_folder]

    # An empty list must not become "", which Find.find cannot open.
    sub_folders = Array(options[:pre_filters])
    search_within = sub_folders.empty? ? "." : sub_folders.join("/")

    found_data = {}

    Dir.chdir(data_source) do
      Find.find(search_within) do |path|
        Find.prune if path =~ /\.hg/ # skip mercurial control files

        next unless File.file?(path)

        # Only the leading "./" added by Find is removed; a "./" sequence
        # further down (folder name ending in a dot) is part of the path.
        data_item = path.sub(%r{\A\./}, "").rstrip
        data_item = File.dirname(data_item) if return_folder
        found_data[data_item] = categorize(data_item)
      end
    end

    found_data
  end

  # Splits +path+ on "/" into its categories; returns nil for a nil path.
  def self.categorize(path)
    path.split("/") if path
  end

  # Removes from +candidates+ (in place) every entry whose categories do not
  # match all of the given filters, and returns +candidates+.
  def self.filter_candidates(filters, candidates)
    return candidates if candidates.nil? || candidates.empty?

    filter_values(filters).each do |wanted|
      candidates.delete_if { |_data_item, categories| !matches(wanted, categories) }
    end
    candidates
  end

  # Normalizes the accepted filter shapes (Hash, Array, single value, nil)
  # into a flat list of category names. nil entries are ignored.
  def self.filter_values(filters)
    values = filters.is_a?(Hash) ? filters.values : Array(filters)
    values.compact
  end

  # True when any entry of +categories+ equals +filter+, ignoring case and
  # space characters.
  def self.matches(filter, categories)
    wanted = spaceless(filter)
    categories.any? { |category| spaceless(category) == wanted }
  end

  # Returns +sentence+ as a lower-case string with all space characters removed.
  def self.spaceless(sentence)
    sentence.to_s.delete(" ").downcase
  end

  # Returns a random element of +candidate_values+, or nil when the list is
  # nil or empty.
  def self.pick_sample(candidate_values)
    return nil if candidate_values.nil? || candidate_values.empty?

    # Feature detection instead of comparing RUBY_VERSION strings; the
    # fallback covers rubies without Array#sample.
    if candidate_values.respond_to?(:sample)
      candidate_values.sample
    else
      candidate_values[rand(candidate_values.length)]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: folder_data_sampler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 23
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 0
|
10
|
+
version: 0.2.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Manuel Pais
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-09-18 00:00:00 Z
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Samples data in a folder structure given a hash with criteria for filtering
|
22
|
+
email: manuel.pais@gmail.com
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- lib/folder_data_sampler.rb
|
31
|
+
homepage: http://rubygems.org/gems/folder_data_sampler
|
32
|
+
licenses: []
|
33
|
+
|
34
|
+
post_install_message:
|
35
|
+
rdoc_options: []
|
36
|
+
|
37
|
+
require_paths:
|
38
|
+
- lib
|
39
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 3
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
version: "0"
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
hash: 3
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 1.7.2
|
61
|
+
signing_key:
|
62
|
+
specification_version: 3
|
63
|
+
summary: Folder Data Sampler
|
64
|
+
test_files: []
|
65
|
+
|