imw 0.2.12 → 0.2.13

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.12
1
+ 0.2.13
@@ -0,0 +1,142 @@
1
+ module IMW
2
+ module Schemes
3
+
4
+ # Defines methods for reading and writing data from an FTP server.
5
+ #
6
+ # IMW.open('ftp://user:pass@my_bucket/path/to/some/file.csv')
7
+ #
8
+ # Learn more about {Amazon Web Services}[http://aws.amazon.com].
9
+ module FTP
10
+
11
+ module Base
12
+
13
+ # Is this resource an FTP resource?
14
+ #
15
+ # @return [true, false]
16
+ def on_ftp?
17
+ true
18
+ end
19
+ alias_method :is_ftp?, :on_ftp?
20
+
21
+ # Copy this resource to the +new_uri+.
22
+ #
23
+ # @param [String, IMW::Resource] new_uri
24
+ # @return [IMW::Resource] the new resource
25
+ def cp new_uri
26
+ local_obj = IMW.open(new_uri)
27
+ raise IMW::PathError.new("FTP resources (#{self}) can only be downloaded to a local path") unless local_obj.is_local?
28
+ local_obj.dir.should_exist!
29
+ FTP.open(host, user, password) do |ftp|
30
+ ftp.get(path, local_obj.path)
31
+ end
32
+ local_obj
33
+ end
34
+
35
+ # Does this resource exist on S3?
36
+ #
37
+ # @return [true, false]
38
+ def exist?
39
+ s3_object.exists?
40
+ end
41
+ alias_method :exists?, :exist?
42
+
43
+ # Remove this resource from S3.
44
+ #
45
+ # @return [IMW::Resource] the deleted object
46
+ def rm
47
+ s3_object.delete
48
+ end
49
+ alias_method :rm!, :rm
50
+
51
+ # Return the S3N URL for this S3 object
52
+ #
53
+ # resource = IMW.open('s3://my_bucket/path/to/some/obj')
54
+ # resource.s3n_url
55
+ # => 's3n://my_bucket/path/to/some/obj'
56
+ #
57
+ # @return [String]
58
+ def s3n_url
59
+ uri.to_s.gsub(/^s3:/, 's3n:')
60
+ end
61
+
62
+ # Return the contents of this S3 object.
63
+ #
64
+ # @return [String]
65
+ def read
66
+ s3_object.value
67
+ end
68
+
69
+ # Store +source+ into +destination+.
70
+ #
71
+ # @param [String, IMW::Resource, #io] source
72
+ # @param [String, IMW::Resource, #path, #bucket] destination
73
+ # @return [IMW::Resource] the new S3 object
74
+ def self.put source, destination
75
+ source = IMW.open(source)
76
+ destintation = IMW.open(destination)
77
+ raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
78
+ make_connection!
79
+ AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
80
+ destination
81
+ end
82
+
83
+ # Download +source+ from S3 into +destination+.
84
+ #
85
+ # @param [String, IMW::Resource, #path, #bucket] source
86
+ # @param [String, IMW::Resource, #write] destination
87
+ # @return [IMW::Resource] the new resource
88
+ def self.get source, destination
89
+ source = IMW.open(source)
90
+ destination = IMW.open!(destination)
91
+ raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
92
+ make_connection!
93
+ AWS::S3::S3Object.stream(source.path, source.bucket) do |chunk|
94
+ destination.write(chunk)
95
+ end
96
+ destination.close
97
+ destination.reopen
98
+ end
99
+
100
+ # Copy S3 resource +source+ to +destination+.
101
+ #
102
+ # @param [String, IMW::Resource, #path, #bucket] source
103
+ # @param [String, IMW::Resource, #path, #bucket] destination
104
+ # @return [IMW::Resource] the new resource
105
+ def self.copy source, destination
106
+ source = IMW.open(source)
107
+ destination = IMW.open(destination)
108
+ raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
109
+ make_connection!
110
+ AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
111
+ destination
112
+ end
113
+
114
+ # Return the resource at the base path of this resource joined
115
+ # to +paths+.
116
+ #
117
+ # IMW.open('s3://bucket/path/to/dir').join('subdir')
118
+ # #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
119
+ #
120
+ # @param [Array<String>] paths
121
+ # @return [IMW::Resource]
122
+ def join *paths
123
+ IMW.open(File.join(stripped_uri.to_s, *paths))
124
+ end
125
+
126
+ protected
127
+ # Make an S3 connection.
128
+ #
129
+ # Uses settings defined in IMW::AWS_CREDENTIALS.
130
+ #
131
+ # @return [Object] the connection returned by AWS::S3::Base.establish_connection!
132
+ def self.make_connection!
133
+ return @connection if @connection
134
+ raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
135
+ require 'aws/s3'
136
+ @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
137
+ end
138
+
139
+ end
140
+ end
141
+ end
142
+
@@ -4,8 +4,10 @@ module IMW
4
4
  # Mixin with some heuristic methods for identifying common
5
5
  # extensions and likely data formats for a collection of files.
6
6
  #
7
- # Requires the including class to define a method +resources+ which
8
- # returns an array of IMW::Resource objects.
7
+ # Requires the including class to define a method +resources+
8
+ # which returns an array of IMW::Resource objects as well as a
9
+ # method +total_size+ which gives the total size of the resources
10
+ # (for weighting extensions by size).
9
11
  module ExtensionAnalyzer
10
12
 
11
13
  # Return the file counts of each extension.
@@ -24,7 +26,7 @@ module IMW
24
26
  # Return the most common extension by count of files.
25
27
  def most_common_extension_by_count
26
28
  return @most_common_extension_by_count if @most_common_extension_by_count
27
- current_count, current_extension = 0, nil
29
+ current_count, current_extension = 0, ''
28
30
  extension_counts.each_pair do |extension, count|
29
31
  current_extension = extension if count > current_count
30
32
  end
@@ -63,11 +65,14 @@ module IMW
63
65
  # @return [String]
64
66
  def most_common_extension_by_size
65
67
  return @most_common_extension_by_size if @most_common_extension_by_size
66
- current_size, current_extension = 0, nil
68
+ current_size, current_extension = 0, ''
67
69
  extension_sizes.each_pair do |extension, size|
68
- current_extension = extension if size > current_size
70
+ if size > current_size
71
+ current_extension = extension
72
+ current_size = size
73
+ end
69
74
  end
70
- if current_extension.strip.blank? then current_extension = 'flat' end
75
+ current_extension = 'flat' if current_extension.strip.blank?
71
76
  @most_common_extension_by_size = current_extension
72
77
  end
73
78
 
@@ -90,8 +95,8 @@ module IMW
90
95
  return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
91
96
  count_fraction = normalized_extension_counts[most_common_extension_by_count]
92
97
  size_fraction = normalized_extension_sizes[most_common_extension_by_size]
93
- return most_common_extension_by_count if count_fraction > 0.5 and size_fraction < 0.5 # FIXME arbitrary
94
- return most_common_extension_by_size if count_fraction < 0.5 and size_fraction > 0.5
98
+ return most_common_extension_by_count if count_fraction >= 0.5 and size_fraction < 0.5 # FIXME arbitrary
99
+ return most_common_extension_by_size if count_fraction < 0.5 and size_fraction >= 0.5
95
100
  most_common_extension_by_size # default to size
96
101
  end
97
102
 
@@ -79,6 +79,13 @@ module IMW
79
79
  @user ||= uri.user
80
80
  end
81
81
 
82
+ # Returns the password associated with access to this URI.
83
+ #
84
+ # @return [String]
85
+ def password
86
+ @password ||= uri.password
87
+ end
88
+
82
89
  # Return the fragment part of this resource's URI.
83
90
  #
84
91
  # Will likely be +nil+ for local resources.
@@ -0,0 +1,153 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+
3
+ describe IMW::Tools::ExtensionAnalyzer do
4
+
5
+ before do
6
+ class Analyzer
7
+ attr_accessor :dir, :resources
8
+ include IMW::Tools::ExtensionAnalyzer
9
+ def initialize dir
10
+ self.dir = File.expand_path(dir)
11
+ @resources = IMW.open(self.dir).all_resources
12
+ end
13
+ def total_size
14
+ @total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
15
+ end
16
+ end
17
+ end
18
+
19
+ describe 'working with an empty directory' do
20
+ before do
21
+ @analyzer = Analyzer.new(IMWTest::TMP_DIR)
22
+ end
23
+
24
+ %w[most_common_extension_by_count most_common_extension_by_size most_common_extension].each do |method|
25
+ it "should return 'flat' when asked for its '#{method}'" do
26
+ @analyzer.send(method).should == 'flat'
27
+ end
28
+ end
29
+
30
+ %w[extension_counts normalized_extension_counts extension_sizes normalized_extension_sizes].each do |method|
31
+ it "should return an empty hash when asked for its '#{method}'" do
32
+ @analyzer.send(method).should == {}
33
+ end
34
+ end
35
+ end
36
+
37
+ describe 'working with files that lack extensions' do
38
+
39
+ before do
40
+ @dir = File.join(IMWTest::TMP_DIR, 'ext_dir')
41
+ FileUtils.mkdir_p(@dir)
42
+
43
+ @f1 = "foobar1"
44
+ @f2 = "foobar2"
45
+ @f3 = "foobar1"
46
+ @files = [@f1, @f2, @f3]
47
+
48
+ @files.each do |basename|
49
+ IMWTest::Random.file File.join(@dir, basename)
50
+ end
51
+
52
+ @analyzer = Analyzer.new(IMWTest::TMP_DIR)
53
+ end
54
+
55
+ %w[most_common_extension_by_count most_common_extension_by_size most_common_extension].each do |method|
56
+ it "should return 'flat' when asked for its '#{method}'" do
57
+ @analyzer.send(method).should == 'flat'
58
+ end
59
+ end
60
+ end
61
+
62
+ describe 'working with a directory of files' do
63
+ before do
64
+ @dir = File.join(IMWTest::TMP_DIR, 'ext_dir')
65
+ FileUtils.mkdir_p(@dir)
66
+
67
+ @csv1 = "foobar1.csv"
68
+ @csv2 = "foobar2.csv"
69
+ @xml = "foobar1.xml"
70
+ @txt = "foobar1.txt"
71
+ @files = [@csv1, @csv2, @xml, @txt]
72
+
73
+ @files.each do |basename|
74
+ IMWTest::Random.file File.join(@dir, basename)
75
+ end
76
+
77
+ def bloat basename
78
+ File.open(File.join(@dir, basename), 'a') do |f|
79
+ 1000.times do
80
+ f.write( 'hello ' * 100)
81
+ end
82
+ end
83
+ end
84
+
85
+ @analyzer = Analyzer.new @dir
86
+ end
87
+
88
+ describe "working with extension counts" do
89
+ it "should be able to return counts by extension" do
90
+ @analyzer.extension_counts.should == {'xml' => 1, 'txt' => 1, 'csv' => 2 }
91
+ end
92
+
93
+ it "should be able to return the most common extension by count" do
94
+ @analyzer.most_common_extension_by_count.should == 'csv'
95
+ end
96
+
97
+ it "should be able to calculate extension weighted by number of files" do
98
+ @analyzer.normalized_extension_counts.should == { 'csv' => 0.5, 'xml' => 0.25, 'txt' => 0.25 }
99
+ end
100
+ end
101
+
102
+ describe "working with extension sizes" do
103
+ it "should be able to calculate extension sizes" do
104
+ csv_size = File.size(File.join(@dir, @csv1)) + File.size(File.join(@dir, @csv2))
105
+ xml_size = File.size(File.join(@dir, @xml))
106
+ txt_size = File.size(File.join(@dir, @txt))
107
+ @analyzer.extension_sizes.should == { 'csv' => csv_size, 'xml' => xml_size, 'txt' => txt_size }
108
+ end
109
+
110
+ it "should be able to return the most common extension by size" do
111
+ bloat @txt
112
+ @analyzer.most_common_extension_by_size.should == 'txt'
113
+ end
114
+
115
+ it "should be able to calculate extension sizes" do
116
+ csv_size = File.size(File.join(@dir, @csv1)) + File.size(File.join(@dir, @csv2))
117
+ xml_size = File.size(File.join(@dir, @xml))
118
+ txt_size = File.size(File.join(@dir, @txt))
119
+ total_size = csv_size + xml_size + txt_size
120
+ @analyzer.normalized_extension_sizes.should == { 'csv' => csv_size.to_f / total_size.to_f, 'xml' => xml_size.to_f / total_size.to_f, 'txt' => txt_size.to_f / total_size.to_f }
121
+ end
122
+ end
123
+
124
+ describe "determining the most common extension" do
125
+
126
+ it "should obviously return an extension if it is the most common by count as well as the most common by size" do
127
+ bloat @csv1
128
+ @analyzer.most_common_extension.should == 'csv'
129
+ end
130
+
131
+ it "should return the most common extension by count if the count fraction is half or greater and the size fraction is less than half" do
132
+ bloat @txt
133
+ bloat @xml
134
+ @analyzer.most_common_extension.should == 'csv'
135
+ end
136
+
137
+ it "should return the most common extension by size if the size fraction is half or greater and the count fraction is less than half" do
138
+ # need to add an xml file
139
+ @new_xml = File.join(@dir, 'xml2.xml')
140
+ IMWTest::Random.file(@new_xml)
141
+ bloat @txt
142
+ @analyzer = Analyzer.new @dir
143
+ @analyzer.most_common_extension.should == 'txt'
144
+ end
145
+
146
+ it "should return the most common extension by size if no other conditions are met" do
147
+ bloat @txt
148
+ @analyzer.most_common_extension.should == 'txt'
149
+ end
150
+
151
+ end
152
+ end
153
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: imw
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 13
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 12
10
- version: 0.2.12
9
+ - 13
10
+ version: 0.2.13
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dhruv Bansal
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-11-17 00:00:00 -06:00
19
+ date: 2010-11-22 00:00:00 -06:00
20
20
  default_executable:
21
21
  dependencies: []
22
22
 
@@ -81,6 +81,7 @@ files:
81
81
  - lib/imw/resource.rb
82
82
  - lib/imw/runner.rb
83
83
  - lib/imw/schemes.rb
84
+ - lib/imw/schemes/ftp.rb
84
85
  - lib/imw/schemes/hdfs.rb
85
86
  - lib/imw/schemes/http.rb
86
87
  - lib/imw/schemes/local.rb
@@ -170,6 +171,7 @@ files:
170
171
  - spec/imw/schemes/sql_spec.rb
171
172
  - spec/imw/tools/aggregator_spec.rb
172
173
  - spec/imw/tools/archiver_spec.rb
174
+ - spec/imw/tools/extension_analyzer_spec.rb
173
175
  - spec/imw/tools/summarizer_spec.rb
174
176
  - spec/imw/tools/transferer_spec.rb
175
177
  - spec/imw/utils/dynamically_extendable_spec.rb
@@ -225,6 +227,7 @@ test_files:
225
227
  - spec/imw/archives/tarbz2_spec.rb
226
228
  - spec/imw/archives/rar_spec.rb
227
229
  - spec/imw/tools/archiver_spec.rb
230
+ - spec/imw/tools/extension_analyzer_spec.rb
228
231
  - spec/imw/tools/summarizer_spec.rb
229
232
  - spec/imw/tools/transferer_spec.rb
230
233
  - spec/imw/tools/aggregator_spec.rb