imw 0.2.12 → 0.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/imw/schemes/ftp.rb +142 -0
- data/lib/imw/tools/extension_analyzer.rb +13 -8
- data/lib/imw/utils/has_uri.rb +7 -0
- data/spec/imw/tools/extension_analyzer_spec.rb +153 -0
- metadata +7 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.12
|
1
|
+
0.2.13
|
@@ -0,0 +1,142 @@
|
|
1
|
+
module IMW
|
2
|
+
module Schemes
|
3
|
+
|
4
|
+
# Defines methods for reading and writing data from an FTP server.
|
5
|
+
#
|
6
|
+
# IMW.open('ftp://user:pass@ftp.example.com/path/to/some/file.csv')
|
7
|
+
#
|
8
|
+
# Learn more about {FTP (RFC 959)}[http://tools.ietf.org/html/rfc959].
|
9
|
+
module FTP
|
10
|
+
|
11
|
+
module Base
|
12
|
+
|
13
|
+
# Is this resource an FTP resource?
|
14
|
+
#
|
15
|
+
# @return [true, false]
|
16
|
+
def on_ftp?
|
17
|
+
true
|
18
|
+
end
|
19
|
+
alias_method :is_ftp?, :on_ftp?
|
20
|
+
|
21
|
+
# Copy this resource to the +new_uri+.
|
22
|
+
#
|
23
|
+
# @param [String, IMW::Resource] new_uri
|
24
|
+
# @return [IMW::Resource] the new resource
|
25
|
+
def cp new_uri
|
26
|
+
local_obj = IMW.open(new_uri)
|
27
|
+
raise IMW::PathError.new("FTP resources (#{self}) can only be downloaded to a local path") unless local_obj.is_local?
|
28
|
+
local_obj.dir.should_exist!
|
29
|
+
FTP.open(host, user, password) do |ftp|
|
30
|
+
ftp.get(path, local_obj.path)
|
31
|
+
end
|
32
|
+
local_obj
|
33
|
+
end
|
34
|
+
|
35
|
+
# Does this resource exist on S3?
|
36
|
+
#
|
37
|
+
# @return [true, false]
|
38
|
+
def exist?
|
39
|
+
s3_object.exists?
|
40
|
+
end
|
41
|
+
alias_method :exists?, :exist?
|
42
|
+
|
43
|
+
# Remove this resource from S3.
|
44
|
+
#
|
45
|
+
# @return [IMW::Resource] the deleted object
|
46
|
+
def rm
|
47
|
+
s3_object.delete
|
48
|
+
end
|
49
|
+
alias_method :rm!, :rm
|
50
|
+
|
51
|
+
# Return the S3N URL for this S3 object
|
52
|
+
#
|
53
|
+
# resource = IMW.open('s3://my_bucket/path/to/some/obj')
|
54
|
+
# resource.s3n_url
|
55
|
+
# => 's3n://my_bucket/path/to/some/obj'
|
56
|
+
#
|
57
|
+
# @return [String]
|
58
|
+
def s3n_url
|
59
|
+
uri.to_s.gsub(/^s3:/, 's3n:')
|
60
|
+
end
|
61
|
+
|
62
|
+
# Return the contents of this S3 object.
|
63
|
+
#
|
64
|
+
# @return [String]
|
65
|
+
def read
|
66
|
+
s3_object.value
|
67
|
+
end
|
68
|
+
|
69
|
+
# Store +source+ into +destination+.
|
70
|
+
#
|
71
|
+
# @param [String, IMW::Resource, #io] source
|
72
|
+
# @param [String, IMW::Resource, #path, #bucket] destination
|
73
|
+
# @return [IMW::Resource] the new S3 object
|
74
|
+
def self.put source, destination
|
75
|
+
source = IMW.open(source)
|
76
|
+
destintation = IMW.open(destination)
|
77
|
+
raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
|
78
|
+
make_connection!
|
79
|
+
AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
|
80
|
+
destination
|
81
|
+
end
|
82
|
+
|
83
|
+
# Download +source+ from S3 into +destination+.
|
84
|
+
#
|
85
|
+
# @param [String, IMW::Resource, #path, #bucket] source
|
86
|
+
# @param [String, IMW::Resource, #write] destination
|
87
|
+
# @return [IMW::Resource] the new resource
|
88
|
+
def self.get source, destination
|
89
|
+
source = IMW.open(source)
|
90
|
+
destination = IMW.open!(destination)
|
91
|
+
raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
|
92
|
+
make_connection!
|
93
|
+
AWS::S3::S3Object.stream(source.path, source.bucket) do |chunk|
|
94
|
+
destination.write(chunk)
|
95
|
+
end
|
96
|
+
destination.close
|
97
|
+
destination.reopen
|
98
|
+
end
|
99
|
+
|
100
|
+
# Copy S3 resource +source+ to +destination+.
|
101
|
+
#
|
102
|
+
# @param [String, IMW::Resource, #path, #bucket] source
|
103
|
+
# @param [String, IMW::Resource, #path, #bucket] destination
|
104
|
+
# @return [IMW::Resource] the new resource
|
105
|
+
def self.copy source, destination
|
106
|
+
source = IMW.open(source)
|
107
|
+
destination = IMW.open(destination)
|
108
|
+
raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
|
109
|
+
make_connection!
|
110
|
+
AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
|
111
|
+
destination
|
112
|
+
end
|
113
|
+
|
114
|
+
# Return the resource at the base path of this resource joined
|
115
|
+
# to +path+.
|
116
|
+
#
|
117
|
+
# IMW.open('s3://bucket/path/to/dir').join('subdir')
|
118
|
+
# #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
|
119
|
+
#
|
120
|
+
# @param [Array<String>] paths
|
121
|
+
# @return [IMW::Resource]
|
122
|
+
def join *paths
|
123
|
+
IMW.open(File.join(stripped_uri.to_s, *paths))
|
124
|
+
end
|
125
|
+
|
126
|
+
protected
|
127
|
+
# Make an S3 connection.
|
128
|
+
#
|
129
|
+
# Uses settings defined in IMW::AWS_CREDENTIALS.
|
130
|
+
#
|
131
|
+
# @return [AWS::S3::Connection]
|
132
|
+
def self.make_connection!
|
133
|
+
return @connection if @connection
|
134
|
+
raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
|
135
|
+
require 'aws/s3'
|
136
|
+
@connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
|
137
|
+
end
|
138
|
+
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
@@ -4,8 +4,10 @@ module IMW
|
|
4
4
|
# Mixin with some heuristic methods for identifying common
|
5
5
|
# extensions and likely data formats for a collection of files.
|
6
6
|
#
|
7
|
-
# Requires the including class to define a method +resources+
|
8
|
-
# returns an array of IMW::Resource objects
|
7
|
+
# Requires the including class to define a method +resources+
|
8
|
+
# which returns an array of IMW::Resource objects as well as a
|
9
|
+
# method +total_size+ which gives the total size of the resources
|
10
|
+
# (for weighting extensions by size).
|
9
11
|
module ExtensionAnalyzer
|
10
12
|
|
11
13
|
# Return the file counts of each extension.
|
@@ -24,7 +26,7 @@ module IMW
|
|
24
26
|
# Return the most common extension by count of files.
|
25
27
|
def most_common_extension_by_count
|
26
28
|
return @most_common_extension_by_count if @most_common_extension_by_count
|
27
|
-
current_count, current_extension = 0,
|
29
|
+
current_count, current_extension = 0, ''
|
28
30
|
extension_counts.each_pair do |extension, count|
|
29
31
|
current_extension = extension if count > current_count
|
30
32
|
end
|
@@ -63,11 +65,14 @@ module IMW
|
|
63
65
|
# @return [String]
|
64
66
|
def most_common_extension_by_size
|
65
67
|
return @most_common_extension_by_size if @most_common_extension_by_size
|
66
|
-
current_size, current_extension = 0,
|
68
|
+
current_size, current_extension = 0, ''
|
67
69
|
extension_sizes.each_pair do |extension, size|
|
68
|
-
|
70
|
+
if size > current_size
|
71
|
+
current_extension = extension
|
72
|
+
current_size = size
|
73
|
+
end
|
69
74
|
end
|
70
|
-
if current_extension.strip.blank?
|
75
|
+
current_extension = 'flat' if current_extension.strip.blank?
|
71
76
|
@most_common_extension_by_size = current_extension
|
72
77
|
end
|
73
78
|
|
@@ -90,8 +95,8 @@ module IMW
|
|
90
95
|
return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
|
91
96
|
count_fraction = normalized_extension_counts[most_common_extension_by_count]
|
92
97
|
size_fraction = normalized_extension_sizes[most_common_extension_by_size]
|
93
|
-
return most_common_extension_by_count if count_fraction
|
94
|
-
return most_common_extension_by_size if count_fraction < 0.5 and size_fraction
|
98
|
+
return most_common_extension_by_count if count_fraction >= 0.5 and size_fraction < 0.5 # FIXME arbitrary
|
99
|
+
return most_common_extension_by_size if count_fraction < 0.5 and size_fraction >= 0.5
|
95
100
|
most_common_extension_by_size # default to size
|
96
101
|
end
|
97
102
|
|
data/lib/imw/utils/has_uri.rb
CHANGED
@@ -79,6 +79,13 @@ module IMW
|
|
79
79
|
@user ||= uri.user
|
80
80
|
end
|
81
81
|
|
82
|
+
# Returns the password associated with access to this URI.
|
83
|
+
#
|
84
|
+
# @return [String]
|
85
|
+
def password
|
86
|
+
@password ||= uri.password
|
87
|
+
end
|
88
|
+
|
82
89
|
# Return the fragment part of this resource's URI.
|
83
90
|
#
|
84
91
|
# Will likely be +nil+ for local resources.
|
@@ -0,0 +1,153 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe IMW::Tools::ExtensionAnalyzer do
|
4
|
+
|
5
|
+
before do
|
6
|
+
class Analyzer
|
7
|
+
attr_accessor :dir, :resources
|
8
|
+
include IMW::Tools::ExtensionAnalyzer
|
9
|
+
def initialize dir
|
10
|
+
self.dir = File.expand_path(dir)
|
11
|
+
@resources = IMW.open(self.dir).all_resources
|
12
|
+
end
|
13
|
+
def total_size
|
14
|
+
@total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
describe 'working with an empty directory' do
|
20
|
+
before do
|
21
|
+
@analyzer = Analyzer.new(IMWTest::TMP_DIR)
|
22
|
+
end
|
23
|
+
|
24
|
+
%w[most_common_extension_by_count most_common_extension_by_size most_common_extension].each do |method|
|
25
|
+
it "should return 'flat' when asked for its '#{method}'" do
|
26
|
+
@analyzer.send(method).should == 'flat'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
%w[extension_counts normalized_extension_counts extension_sizes normalized_extension_sizes].each do |method|
|
31
|
+
it "should return an empty hash when asked for its '#{method}'" do
|
32
|
+
@analyzer.send(method).should == {}
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe 'working with files that lack extensions' do
|
38
|
+
|
39
|
+
before do
|
40
|
+
@dir = File.join(IMWTest::TMP_DIR, 'ext_dir')
|
41
|
+
FileUtils.mkdir_p(@dir)
|
42
|
+
|
43
|
+
@f1 = "foobar1"
|
44
|
+
@f2 = "foobar2"
|
45
|
+
@f3 = "foobar1"
|
46
|
+
@files = [@f1, @f2, @f3]
|
47
|
+
|
48
|
+
@files.each do |basename|
|
49
|
+
IMWTest::Random.file File.join(@dir, basename)
|
50
|
+
end
|
51
|
+
|
52
|
+
@analyzer = Analyzer.new(IMWTest::TMP_DIR)
|
53
|
+
end
|
54
|
+
|
55
|
+
%w[most_common_extension_by_count most_common_extension_by_size most_common_extension].each do |method|
|
56
|
+
it "should return 'flat' when asked for its '#{method}'" do
|
57
|
+
@analyzer.send(method).should == 'flat'
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
describe 'working with a directory of files' do
|
63
|
+
before do
|
64
|
+
@dir = File.join(IMWTest::TMP_DIR, 'ext_dir')
|
65
|
+
FileUtils.mkdir_p(@dir)
|
66
|
+
|
67
|
+
@csv1 = "foobar1.csv"
|
68
|
+
@csv2 = "foobar2.csv"
|
69
|
+
@xml = "foobar1.xml"
|
70
|
+
@txt = "foobar1.txt"
|
71
|
+
@files = [@csv1, @csv2, @xml, @txt]
|
72
|
+
|
73
|
+
@files.each do |basename|
|
74
|
+
IMWTest::Random.file File.join(@dir, basename)
|
75
|
+
end
|
76
|
+
|
77
|
+
def bloat basename
|
78
|
+
File.open(File.join(@dir, basename), 'a') do |f|
|
79
|
+
1000.times do
|
80
|
+
f.write( 'hello ' * 100)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
@analyzer = Analyzer.new @dir
|
86
|
+
end
|
87
|
+
|
88
|
+
describe "working with extension counts" do
|
89
|
+
it "should be able to return counts by extension" do
|
90
|
+
@analyzer.extension_counts.should == {'xml' => 1, 'txt' => 1, 'csv' => 2 }
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should be able to return the most common extension by count" do
|
94
|
+
@analyzer.most_common_extension_by_count.should == 'csv'
|
95
|
+
end
|
96
|
+
|
97
|
+
it "should be able to calculate extension weighted by number of files" do
|
98
|
+
@analyzer.normalized_extension_counts.should == { 'csv' => 0.5, 'xml' => 0.25, 'txt' => 0.25 }
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
describe "working with extension sizes" do
|
103
|
+
it "should be able to calculate extension sizes" do
|
104
|
+
csv_size = File.size(File.join(@dir, @csv1)) + File.size(File.join(@dir, @csv2))
|
105
|
+
xml_size = File.size(File.join(@dir, @xml))
|
106
|
+
txt_size = File.size(File.join(@dir, @txt))
|
107
|
+
@analyzer.extension_sizes.should == { 'csv' => csv_size, 'xml' => xml_size, 'txt' => txt_size }
|
108
|
+
end
|
109
|
+
|
110
|
+
it "should be able to return the most common extension by size" do
|
111
|
+
bloat @txt
|
112
|
+
@analyzer.most_common_extension_by_size.should == 'txt'
|
113
|
+
end
|
114
|
+
|
115
|
+
it "should be able to calculate extension sizes" do
|
116
|
+
csv_size = File.size(File.join(@dir, @csv1)) + File.size(File.join(@dir, @csv2))
|
117
|
+
xml_size = File.size(File.join(@dir, @xml))
|
118
|
+
txt_size = File.size(File.join(@dir, @txt))
|
119
|
+
total_size = csv_size + xml_size + txt_size
|
120
|
+
@analyzer.normalized_extension_sizes.should == { 'csv' => csv_size.to_f / total_size.to_f, 'xml' => xml_size.to_f / total_size.to_f, 'txt' => txt_size.to_f / total_size.to_f }
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
describe "determining the most common extension" do
|
125
|
+
|
126
|
+
it "should obviously return an extension if it is the most common by count as well as the most common by size" do
|
127
|
+
bloat @csv1
|
128
|
+
@analyzer.most_common_extension.should == 'csv'
|
129
|
+
end
|
130
|
+
|
131
|
+
it "should return the most common extension by count if the count fraction is half or greater and the size fraction is less than half" do
|
132
|
+
bloat @txt
|
133
|
+
bloat @xml
|
134
|
+
@analyzer.most_common_extension.should == 'csv'
|
135
|
+
end
|
136
|
+
|
137
|
+
it "should return the most common extension by size if the size fraction is half or greater and the count fraction is less than half" do
|
138
|
+
# need to add an xml file
|
139
|
+
@new_xml = File.join(@dir, 'xml2.xml')
|
140
|
+
IMWTest::Random.file(@new_xml)
|
141
|
+
bloat @txt
|
142
|
+
@analyzer = Analyzer.new @dir
|
143
|
+
@analyzer.most_common_extension.should == 'txt'
|
144
|
+
end
|
145
|
+
|
146
|
+
it "should return the most common extension by size if no other conditions are met" do
|
147
|
+
bloat @txt
|
148
|
+
@analyzer.most_common_extension.should == 'txt'
|
149
|
+
end
|
150
|
+
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: imw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 13
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 12
|
10
|
-
version: 0.2.12
|
9
|
+
- 13
|
10
|
+
version: 0.2.13
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dhruv Bansal
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2010-11-
|
19
|
+
date: 2010-11-22 00:00:00 -06:00
|
20
20
|
default_executable:
|
21
21
|
dependencies: []
|
22
22
|
|
@@ -81,6 +81,7 @@ files:
|
|
81
81
|
- lib/imw/resource.rb
|
82
82
|
- lib/imw/runner.rb
|
83
83
|
- lib/imw/schemes.rb
|
84
|
+
- lib/imw/schemes/ftp.rb
|
84
85
|
- lib/imw/schemes/hdfs.rb
|
85
86
|
- lib/imw/schemes/http.rb
|
86
87
|
- lib/imw/schemes/local.rb
|
@@ -170,6 +171,7 @@ files:
|
|
170
171
|
- spec/imw/schemes/sql_spec.rb
|
171
172
|
- spec/imw/tools/aggregator_spec.rb
|
172
173
|
- spec/imw/tools/archiver_spec.rb
|
174
|
+
- spec/imw/tools/extension_analyzer_spec.rb
|
173
175
|
- spec/imw/tools/summarizer_spec.rb
|
174
176
|
- spec/imw/tools/transferer_spec.rb
|
175
177
|
- spec/imw/utils/dynamically_extendable_spec.rb
|
@@ -225,6 +227,7 @@ test_files:
|
|
225
227
|
- spec/imw/archives/tarbz2_spec.rb
|
226
228
|
- spec/imw/archives/rar_spec.rb
|
227
229
|
- spec/imw/tools/archiver_spec.rb
|
230
|
+
- spec/imw/tools/extension_analyzer_spec.rb
|
228
231
|
- spec/imw/tools/summarizer_spec.rb
|
229
232
|
- spec/imw/tools/transferer_spec.rb
|
230
233
|
- spec/imw/tools/aggregator_spec.rb
|