imw 0.2.12 → 0.2.13
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/imw/schemes/ftp.rb +142 -0
- data/lib/imw/tools/extension_analyzer.rb +13 -8
- data/lib/imw/utils/has_uri.rb +7 -0
- data/spec/imw/tools/extension_analyzer_spec.rb +153 -0
- metadata +7 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.12
|
1
|
+
0.2.13
|
@@ -0,0 +1,142 @@
|
|
1
|
+
module IMW
|
2
|
+
module Schemes
|
3
|
+
|
4
|
+
# Defines methods for reading and writing data from an FTP server.
|
5
|
+
#
|
6
|
+
# IMW.open('ftp://user:pass@ftp.example.com/path/to/some/file.csv')
|
7
|
+
#
|
8
|
+
# Learn more about {the File Transfer Protocol}[http://tools.ietf.org/html/rfc959].
|
9
|
+
module FTP
|
10
|
+
|
11
|
+
module Base
|
12
|
+
|
13
|
+
# Is this resource an FTP resource?
|
14
|
+
#
|
15
|
+
# @return [true, false]
|
16
|
+
def on_ftp?
|
17
|
+
true
|
18
|
+
end
|
19
|
+
alias_method :is_ftp?, :on_ftp?
|
20
|
+
|
21
|
+
# Copy this resource to the +new_uri+.
|
22
|
+
#
|
23
|
+
# @param [String, IMW::Resource] new_uri
|
24
|
+
# @return [IMW::Resource] the new resource
|
25
|
+
def cp new_uri
|
26
|
+
local_obj = IMW.open(new_uri)
|
27
|
+
raise IMW::PathError.new("FTP resources (#{self}) can only be downloaded to a local path") unless local_obj.is_local?
|
28
|
+
local_obj.dir.should_exist!
|
29
|
+
FTP.open(host, user, password) do |ftp|
|
30
|
+
ftp.get(path, local_obj.path)
|
31
|
+
end
|
32
|
+
local_obj
|
33
|
+
end
|
34
|
+
|
35
|
+
# Does this resource exist on S3?
|
36
|
+
#
|
37
|
+
# @return [true, false]
|
38
|
+
def exist?
|
39
|
+
s3_object.exists?
|
40
|
+
end
|
41
|
+
alias_method :exists?, :exist?
|
42
|
+
|
43
|
+
# Remove this resource from S3.
|
44
|
+
#
|
45
|
+
# @return [IMW::Resource] the deleted object
|
46
|
+
def rm
|
47
|
+
s3_object.delete
|
48
|
+
end
|
49
|
+
alias_method :rm!, :rm
|
50
|
+
|
51
|
+
# Return the S3N URL for this S3 object
|
52
|
+
#
|
53
|
+
# resource = IMW.open('s3://my_bucket/path/to/some/obj')
|
54
|
+
# resource.s3n_url
|
55
|
+
# => 's3n://my_bucket/path/to/some/obj'
|
56
|
+
#
|
57
|
+
# @return [String]
|
58
|
+
def s3n_url
|
59
|
+
uri.to_s.gsub(/^s3:/, 's3n:')
|
60
|
+
end
|
61
|
+
|
62
|
+
# Return the contents of this S3 object.
|
63
|
+
#
|
64
|
+
# @return [String]
|
65
|
+
def read
|
66
|
+
s3_object.value
|
67
|
+
end
|
68
|
+
|
69
|
+
# Store +source+ into +destination+.
|
70
|
+
#
|
71
|
+
# @param [String, IMW::Resource, #io] source
|
72
|
+
# @param [String, IMW::Resource, #path, #bucket] destination
|
73
|
+
# @return [IMW::Resource] the new S3 object
|
74
|
+
def self.put source, destination
|
75
|
+
source = IMW.open(source)
|
76
|
+
destintation = IMW.open(destination)
|
77
|
+
raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
|
78
|
+
make_connection!
|
79
|
+
AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
|
80
|
+
destination
|
81
|
+
end
|
82
|
+
|
83
|
+
# Download +source+ from S3 into +destination+.
|
84
|
+
#
|
85
|
+
# @param [String, IMW::Resource, #path, #bucket] source
|
86
|
+
# @param [String, IMW::Resource, #write] destination
|
87
|
+
# @return [IMW::Resource] the new resource
|
88
|
+
def self.get source, destination
|
89
|
+
source = IMW.open(source)
|
90
|
+
destination = IMW.open!(destination)
|
91
|
+
raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
|
92
|
+
make_connection!
|
93
|
+
AWS::S3::S3Object.stream(source.path, source.bucket) do |chunk|
|
94
|
+
destination.write(chunk)
|
95
|
+
end
|
96
|
+
destination.close
|
97
|
+
destination.reopen
|
98
|
+
end
|
99
|
+
|
100
|
+
# Copy S3 resource +source+ to +destination+.
|
101
|
+
#
|
102
|
+
# @param [String, IMW::Resource, #path, #bucket] source
|
103
|
+
# @param [String, IMW::Resource, #path, #bucket] destination
|
104
|
+
# @return [IMW::Resource] the new resource
|
105
|
+
def self.copy source, destination
|
106
|
+
source = IMW.open(source)
|
107
|
+
destination = IMW.open(destination)
|
108
|
+
raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
|
109
|
+
make_connection!
|
110
|
+
AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
|
111
|
+
destination
|
112
|
+
end
|
113
|
+
|
114
|
+
# Return the resource at the base path of this resource joined
|
115
|
+
# to +path+.
|
116
|
+
#
|
117
|
+
# IMW.open('s3://bucket/path/to/dir').join('subdir')
|
118
|
+
# #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
|
119
|
+
#
|
120
|
+
# @param [Array<String>] paths
|
121
|
+
# @return [IMW::Resource]
|
122
|
+
def join *paths
|
123
|
+
IMW.open(File.join(stripped_uri.to_s, *paths))
|
124
|
+
end
|
125
|
+
|
126
|
+
protected
|
127
|
+
# Make an S3 connection.
|
128
|
+
#
|
129
|
+
# Uses settings defined in IMW::AWS_CREDENTIALS.
|
130
|
+
#
|
131
|
+
# @return [Object] the connection returned by AWS::S3::Base.establish_connection!
|
132
|
+
def self.make_connection!
|
133
|
+
return @connection if @connection
|
134
|
+
raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
|
135
|
+
require 'aws/s3'
|
136
|
+
@connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
|
137
|
+
end
|
138
|
+
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
@@ -4,8 +4,10 @@ module IMW
|
|
4
4
|
# Mixin with some heuristic methods for identifying common
|
5
5
|
# extensions and likely data formats for a collection of files.
|
6
6
|
#
|
7
|
-
# Requires the including class to define a method +resources+
|
8
|
-
# returns an array of IMW::Resource objects
|
7
|
+
# Requires the including class to define a method +resources+
|
8
|
+
# which returns an array of IMW::Resource objects as well as a
|
9
|
+
# method +total_size+ which gives the total size of the resources
|
10
|
+
# (for weighting extensions by size).
|
9
11
|
module ExtensionAnalyzer
|
10
12
|
|
11
13
|
# Return the file counts of each extension.
|
@@ -24,7 +26,7 @@ module IMW
|
|
24
26
|
# Return the most common extension by count of files.
|
25
27
|
def most_common_extension_by_count
|
26
28
|
return @most_common_extension_by_count if @most_common_extension_by_count
|
27
|
-
current_count, current_extension = 0,
|
29
|
+
current_count, current_extension = 0, ''
|
28
30
|
extension_counts.each_pair do |extension, count|
|
29
31
|
current_extension = extension if count > current_count
|
30
32
|
end
|
@@ -63,11 +65,14 @@ module IMW
|
|
63
65
|
# @return [String]
|
64
66
|
def most_common_extension_by_size
|
65
67
|
return @most_common_extension_by_size if @most_common_extension_by_size
|
66
|
-
current_size, current_extension = 0,
|
68
|
+
current_size, current_extension = 0, ''
|
67
69
|
extension_sizes.each_pair do |extension, size|
|
68
|
-
|
70
|
+
if size > current_size
|
71
|
+
current_extension = extension
|
72
|
+
current_size = size
|
73
|
+
end
|
69
74
|
end
|
70
|
-
if current_extension.strip.blank?
|
75
|
+
current_extension = 'flat' if current_extension.strip.blank?
|
71
76
|
@most_common_extension_by_size = current_extension
|
72
77
|
end
|
73
78
|
|
@@ -90,8 +95,8 @@ module IMW
|
|
90
95
|
return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
|
91
96
|
count_fraction = normalized_extension_counts[most_common_extension_by_count]
|
92
97
|
size_fraction = normalized_extension_sizes[most_common_extension_by_size]
|
93
|
-
return most_common_extension_by_count if count_fraction
|
94
|
-
return most_common_extension_by_size if count_fraction < 0.5 and size_fraction
|
98
|
+
return most_common_extension_by_count if count_fraction >= 0.5 and size_fraction < 0.5 # FIXME arbitrary
|
99
|
+
return most_common_extension_by_size if count_fraction < 0.5 and size_fraction >= 0.5
|
95
100
|
most_common_extension_by_size # default to size
|
96
101
|
end
|
97
102
|
|
data/lib/imw/utils/has_uri.rb
CHANGED
@@ -79,6 +79,13 @@ module IMW
|
|
79
79
|
@user ||= uri.user
|
80
80
|
end
|
81
81
|
|
82
|
+
# Returns the password associated with access to this URI.
|
83
|
+
#
|
84
|
+
# @return [String]
|
85
|
+
def password
|
86
|
+
@password ||= uri.password
|
87
|
+
end
|
88
|
+
|
82
89
|
# Return the fragment part of this resource's URI.
|
83
90
|
#
|
84
91
|
# Will likely be +nil+ for local resources.
|
@@ -0,0 +1,153 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe IMW::Tools::ExtensionAnalyzer do
|
4
|
+
|
5
|
+
before do
|
6
|
+
class Analyzer
|
7
|
+
attr_accessor :dir, :resources
|
8
|
+
include IMW::Tools::ExtensionAnalyzer
|
9
|
+
def initialize dir
|
10
|
+
self.dir = File.expand_path(dir)
|
11
|
+
@resources = IMW.open(self.dir).all_resources
|
12
|
+
end
|
13
|
+
def total_size
|
14
|
+
@total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
describe 'working with an empty directory' do
|
20
|
+
before do
|
21
|
+
@analyzer = Analyzer.new(IMWTest::TMP_DIR)
|
22
|
+
end
|
23
|
+
|
24
|
+
%w[most_common_extension_by_count most_common_extension_by_size most_common_extension].each do |method|
|
25
|
+
it "should return 'flat' when asked for its '#{method}'" do
|
26
|
+
@analyzer.send(method).should == 'flat'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
%w[extension_counts normalized_extension_counts extension_sizes normalized_extension_sizes].each do |method|
|
31
|
+
it "should return an empty hash when asked for its '#{method}'" do
|
32
|
+
@analyzer.send(method).should == {}
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe 'working with files that lack extensions' do
|
38
|
+
|
39
|
+
before do
|
40
|
+
@dir = File.join(IMWTest::TMP_DIR, 'ext_dir')
|
41
|
+
FileUtils.mkdir_p(@dir)
|
42
|
+
|
43
|
+
@f1 = "foobar1"
|
44
|
+
@f2 = "foobar2"
|
45
|
+
@f3 = "foobar1"
|
46
|
+
@files = [@f1, @f2, @f3]
|
47
|
+
|
48
|
+
@files.each do |basename|
|
49
|
+
IMWTest::Random.file File.join(@dir, basename)
|
50
|
+
end
|
51
|
+
|
52
|
+
@analyzer = Analyzer.new(IMWTest::TMP_DIR)
|
53
|
+
end
|
54
|
+
|
55
|
+
%w[most_common_extension_by_count most_common_extension_by_size most_common_extension].each do |method|
|
56
|
+
it "should return 'flat' when asked for its '#{method}'" do
|
57
|
+
@analyzer.send(method).should == 'flat'
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
describe 'working with a directory of files' do
|
63
|
+
before do
|
64
|
+
@dir = File.join(IMWTest::TMP_DIR, 'ext_dir')
|
65
|
+
FileUtils.mkdir_p(@dir)
|
66
|
+
|
67
|
+
@csv1 = "foobar1.csv"
|
68
|
+
@csv2 = "foobar2.csv"
|
69
|
+
@xml = "foobar1.xml"
|
70
|
+
@txt = "foobar1.txt"
|
71
|
+
@files = [@csv1, @csv2, @xml, @txt]
|
72
|
+
|
73
|
+
@files.each do |basename|
|
74
|
+
IMWTest::Random.file File.join(@dir, basename)
|
75
|
+
end
|
76
|
+
|
77
|
+
def bloat basename
|
78
|
+
File.open(File.join(@dir, basename), 'a') do |f|
|
79
|
+
1000.times do
|
80
|
+
f.write( 'hello ' * 100)
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
@analyzer = Analyzer.new @dir
|
86
|
+
end
|
87
|
+
|
88
|
+
describe "working with extension counts" do
|
89
|
+
it "should be able to return counts by extension" do
|
90
|
+
@analyzer.extension_counts.should == {'xml' => 1, 'txt' => 1, 'csv' => 2 }
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should be able to return the most common extension by count" do
|
94
|
+
@analyzer.most_common_extension_by_count.should == 'csv'
|
95
|
+
end
|
96
|
+
|
97
|
+
it "should be able to calculate extension weighted by number of files" do
|
98
|
+
@analyzer.normalized_extension_counts.should == { 'csv' => 0.5, 'xml' => 0.25, 'txt' => 0.25 }
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
describe "working with extension sizes" do
|
103
|
+
it "should be able to calculate extension sizes" do
|
104
|
+
csv_size = File.size(File.join(@dir, @csv1)) + File.size(File.join(@dir, @csv2))
|
105
|
+
xml_size = File.size(File.join(@dir, @xml))
|
106
|
+
txt_size = File.size(File.join(@dir, @txt))
|
107
|
+
@analyzer.extension_sizes.should == { 'csv' => csv_size, 'xml' => xml_size, 'txt' => txt_size }
|
108
|
+
end
|
109
|
+
|
110
|
+
it "should be able to return the most common extension by size" do
|
111
|
+
bloat @txt
|
112
|
+
@analyzer.most_common_extension_by_size.should == 'txt'
|
113
|
+
end
|
114
|
+
|
115
|
+
it "should be able to calculate extension sizes" do
|
116
|
+
csv_size = File.size(File.join(@dir, @csv1)) + File.size(File.join(@dir, @csv2))
|
117
|
+
xml_size = File.size(File.join(@dir, @xml))
|
118
|
+
txt_size = File.size(File.join(@dir, @txt))
|
119
|
+
total_size = csv_size + xml_size + txt_size
|
120
|
+
@analyzer.normalized_extension_sizes.should == { 'csv' => csv_size.to_f / total_size.to_f, 'xml' => xml_size.to_f / total_size.to_f, 'txt' => txt_size.to_f / total_size.to_f }
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
describe "determining the most common extension" do
|
125
|
+
|
126
|
+
it "should obviously return an extension if it is the most common by count as well as the most common by size" do
|
127
|
+
bloat @csv1
|
128
|
+
@analyzer.most_common_extension.should == 'csv'
|
129
|
+
end
|
130
|
+
|
131
|
+
it "should return the most common extension by count if the count fraction is half or greater and the size fraction is less than half" do
|
132
|
+
bloat @txt
|
133
|
+
bloat @xml
|
134
|
+
@analyzer.most_common_extension.should == 'csv'
|
135
|
+
end
|
136
|
+
|
137
|
+
it "should return the most common extension by size if the size fraction is half or greater and the count fraction is less than half" do
|
138
|
+
# need to add an xml file
|
139
|
+
@new_xml = File.join(@dir, 'xml2.xml')
|
140
|
+
IMWTest::Random.file(@new_xml)
|
141
|
+
bloat @txt
|
142
|
+
@analyzer = Analyzer.new @dir
|
143
|
+
@analyzer.most_common_extension.should == 'txt'
|
144
|
+
end
|
145
|
+
|
146
|
+
it "should return the most common extension by size if no other conditions are met" do
|
147
|
+
bloat @txt
|
148
|
+
@analyzer.most_common_extension.should == 'txt'
|
149
|
+
end
|
150
|
+
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: imw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 13
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 12
|
10
|
-
version: 0.2.12
|
9
|
+
- 13
|
10
|
+
version: 0.2.13
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dhruv Bansal
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2010-11-
|
19
|
+
date: 2010-11-22 00:00:00 -06:00
|
20
20
|
default_executable:
|
21
21
|
dependencies: []
|
22
22
|
|
@@ -81,6 +81,7 @@ files:
|
|
81
81
|
- lib/imw/resource.rb
|
82
82
|
- lib/imw/runner.rb
|
83
83
|
- lib/imw/schemes.rb
|
84
|
+
- lib/imw/schemes/ftp.rb
|
84
85
|
- lib/imw/schemes/hdfs.rb
|
85
86
|
- lib/imw/schemes/http.rb
|
86
87
|
- lib/imw/schemes/local.rb
|
@@ -170,6 +171,7 @@ files:
|
|
170
171
|
- spec/imw/schemes/sql_spec.rb
|
171
172
|
- spec/imw/tools/aggregator_spec.rb
|
172
173
|
- spec/imw/tools/archiver_spec.rb
|
174
|
+
- spec/imw/tools/extension_analyzer_spec.rb
|
173
175
|
- spec/imw/tools/summarizer_spec.rb
|
174
176
|
- spec/imw/tools/transferer_spec.rb
|
175
177
|
- spec/imw/utils/dynamically_extendable_spec.rb
|
@@ -225,6 +227,7 @@ test_files:
|
|
225
227
|
- spec/imw/archives/tarbz2_spec.rb
|
226
228
|
- spec/imw/archives/rar_spec.rb
|
227
229
|
- spec/imw/tools/archiver_spec.rb
|
230
|
+
- spec/imw/tools/extension_analyzer_spec.rb
|
228
231
|
- spec/imw/tools/summarizer_spec.rb
|
229
232
|
- spec/imw/tools/transferer_spec.rb
|
230
233
|
- spec/imw/tools/aggregator_spec.rb
|