imw 0.2.12 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.12
1
+ 0.2.13
@@ -0,0 +1,142 @@
1
+ module IMW
2
+ module Schemes
3
+
4
+ # Defines methods for reading and writing data from an FTP server.
5
+ #
6
+ # IMW.open('ftp://user:pass@my_bucket/path/to/some/file.csv')
7
+ #
8
+ # Learn more about {Amazon Web Services}[http://aws.amazon.com].
9
+ module FTP
10
+
11
+ module Base
12
+
13
+ # Is this resource an FTP resource?
14
+ #
15
+ # @return [true, false]
16
+ def on_ftp?
17
+ true
18
+ end
19
+ alias_method :is_ftp?, :on_ftp?
20
+
21
+ # Copy this resource to the +new_uri+.
22
+ #
23
+ # @param [String, IMW::Resource] new_uri
24
+ # @return [IMW::Resource] the new resource
25
+ def cp new_uri
26
+ local_obj = IMW.open(new_uri)
27
+ raise IMW::PathError.new("FTP resources (#{self}) can only be downloaded to a local path") unless local_obj.is_local?
28
+ local_obj.dir.should_exist!
29
+ FTP.open(host, user, password) do |ftp|
30
+ ftp.get(path, local_obj.path)
31
+ end
32
+ local_obj
33
+ end
34
+
35
+ # Does this resource exist on S3?
36
+ #
37
+ # @return [true, false]
38
+ def exist?
39
+ s3_object.exists?
40
+ end
41
+ alias_method :exists?, :exist?
42
+
43
+ # Remove this resource from S3.
44
+ #
45
+ # @return [IMW::Resource] the deleted object
46
+ def rm
47
+ s3_object.delete
48
+ end
49
+ alias_method :rm!, :rm
50
+
51
+ # Return the S3N URL for this S3 object
52
+ #
53
+ # resource = IMW.open('s3://my_bucket/path/to/some/obj')
54
+ # resource.s3n_url
55
+ # => 's3n://my_bucket/path/to/some/obj'
56
+ #
57
+ # @return [String]
58
+ def s3n_url
59
+ uri.to_s.gsub(/^s3:/, 's3n:')
60
+ end
61
+
62
+ # Return the contents of this S3 object.
63
+ #
64
+ # @return [String]
65
+ def read
66
+ s3_object.value
67
+ end
68
+
69
+ # Store +source+ into +destination+.
70
+ #
71
+ # @param [String, IMW::Resource, #io] source
72
+ # @param [String, IMW::Resource, #path, #bucket] destination
73
+ # @return [IMW::Resource] the new S3 object
74
+ def self.put source, destination
75
+ source = IMW.open(source)
76
+ destination = IMW.open(destination)
77
+ raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
78
+ make_connection!
79
+ AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
80
+ destination
81
+ end
82
+
83
+ # Download +source+ from S3 into +destination+.
84
+ #
85
+ # @param [String, IMW::Resource, #path, #bucket] source
86
+ # @param [String, IMW::Resource, #write] destination
87
+ # @return [IMW::Resource] the new resource
88
+ def self.get source, destination
89
+ source = IMW.open(source)
90
+ destination = IMW.open!(destination)
91
+ raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
92
+ make_connection!
93
+ AWS::S3::S3Object.stream(source.path, source.bucket) do |chunk|
94
+ destination.write(chunk)
95
+ end
96
+ destination.close
97
+ destination.reopen
98
+ end
99
+
100
+ # Copy S3 resource +source+ to +destination+.
101
+ #
102
+ # @param [String, IMW::Resource, #path, #bucket] source
103
+ # @param [String, IMW::Resource, #path, #bucket] destination
104
+ # @return [IMW::Resource] the new resource
105
+ def self.copy source, destination
106
+ source = IMW.open(source)
107
+ destination = IMW.open(destination)
108
+ raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
109
+ make_connection!
110
+ AWS::S3::S3Object.copy(source.path, destination.path, destination.bucket)
111
+ destination
112
+ end
113
+
114
+ # Return the resource at the base path of this resource joined
115
+ # to +path+.
116
+ #
117
+ # IMW.open('s3://bucket/path/to/dir').join('subdir')
118
+ # #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
119
+ #
120
+ # @param [Array<String>] paths
121
+ # @return [IMW::Resource]
122
+ def join *paths
123
+ IMW.open(File.join(stripped_uri.to_s, *paths))
124
+ end
125
+
126
+ protected
127
+ # Make an S3 connection.
128
+ #
129
+ # Uses settings defined in IMW::AWS_CREDENTIALS.
130
+ #
131
+ # @return [AWS::S3::Connection]
132
+ def self.make_connection!
133
+ return @connection if @connection
134
+ raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
135
+ require 'aws/s3'
136
+ @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
137
+ end
138
+
139
+ end
140
+ end
141
+ end
142
+
@@ -4,8 +4,10 @@ module IMW
4
4
  # Mixin with some heuristic methods for identifying common
5
5
  # extensions and likely data formats for a collection of files.
6
6
  #
7
- # Requires the including class to define a method +resources+ which
8
- # returns an array of IMW::Resource objects.
7
+ # Requires the including class to define a method +resources+
8
+ # which returns an array of IMW::Resource objects as well as a
9
+ # method +total_size+ which gives the total size of the resources
10
+ # (for weighting extensions by size).
9
11
  module ExtensionAnalyzer
10
12
 
11
13
  # Return the file counts of each extension.
@@ -24,7 +26,7 @@ module IMW
24
26
  # Return the most common extension by count of files.
25
27
  def most_common_extension_by_count
26
28
  return @most_common_extension_by_count if @most_common_extension_by_count
27
- current_count, current_extension = 0, nil
29
+ current_count, current_extension = 0, ''
28
30
  extension_counts.each_pair do |extension, count|
29
31
  current_extension = extension if count > current_count
30
32
  end
@@ -63,11 +65,14 @@ module IMW
63
65
  # @return [String]
64
66
  def most_common_extension_by_size
65
67
  return @most_common_extension_by_size if @most_common_extension_by_size
66
- current_size, current_extension = 0, nil
68
+ current_size, current_extension = 0, ''
67
69
  extension_sizes.each_pair do |extension, size|
68
- current_extension = extension if size > current_size
70
+ if size > current_size
71
+ current_extension = extension
72
+ current_size = size
73
+ end
69
74
  end
70
- if current_extension.strip.blank? then current_extension = 'flat' end
75
+ current_extension = 'flat' if current_extension.strip.blank?
71
76
  @most_common_extension_by_size = current_extension
72
77
  end
73
78
 
@@ -90,8 +95,8 @@ module IMW
90
95
  return most_common_extension_by_size if most_common_extension_by_size == most_common_extension_by_count # no contest
91
96
  count_fraction = normalized_extension_counts[most_common_extension_by_count]
92
97
  size_fraction = normalized_extension_sizes[most_common_extension_by_size]
93
- return most_common_extension_by_count if count_fraction > 0.5 and size_fraction < 0.5 # FIXME arbitrary
94
- return most_common_extension_by_size if count_fraction < 0.5 and size_fraction > 0.5
98
+ return most_common_extension_by_count if count_fraction >= 0.5 and size_fraction < 0.5 # FIXME arbitrary
99
+ return most_common_extension_by_size if count_fraction < 0.5 and size_fraction >= 0.5
95
100
  most_common_extension_by_size # default to size
96
101
  end
97
102
 
@@ -79,6 +79,13 @@ module IMW
79
79
  @user ||= uri.user
80
80
  end
81
81
 
82
+ # Returns the password associated with access to this URI.
83
+ #
84
+ # @return [String]
85
+ def password
86
+ @password ||= uri.password
87
+ end
88
+
82
89
  # Return the fragment part of this resource's URI.
83
90
  #
84
91
  # Will likely be +nil+ for local resources.
@@ -0,0 +1,153 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+
3
+ describe IMW::Tools::ExtensionAnalyzer do
4
+
5
+ before do
6
+ class Analyzer
7
+ attr_accessor :dir, :resources
8
+ include IMW::Tools::ExtensionAnalyzer
9
+ def initialize dir
10
+ self.dir = File.expand_path(dir)
11
+ @resources = IMW.open(self.dir).all_resources
12
+ end
13
+ def total_size
14
+ @total_size ||= resources.map(&:size).inject(0) { |e, sum| sum += e }
15
+ end
16
+ end
17
+ end
18
+
19
+ describe 'working with an empty directory' do
20
+ before do
21
+ @analyzer = Analyzer.new(IMWTest::TMP_DIR)
22
+ end
23
+
24
+ %w[most_common_extension_by_count most_common_extension_by_size most_common_extension].each do |method|
25
+ it "should return 'flat' when asked for its '#{method}'" do
26
+ @analyzer.send(method).should == 'flat'
27
+ end
28
+ end
29
+
30
+ %w[extension_counts normalized_extension_counts extension_sizes normalized_extension_sizes].each do |method|
31
+ it "should return an empty hash when asked for its '#{method}'" do
32
+ @analyzer.send(method).should == {}
33
+ end
34
+ end
35
+ end
36
+
37
+ describe 'working with files that lack extensions' do
38
+
39
+ before do
40
+ @dir = File.join(IMWTest::TMP_DIR, 'ext_dir')
41
+ FileUtils.mkdir_p(@dir)
42
+
43
+ @f1 = "foobar1"
44
+ @f2 = "foobar2"
45
+ @f3 = "foobar1"
46
+ @files = [@f1, @f2, @f3]
47
+
48
+ @files.each do |basename|
49
+ IMWTest::Random.file File.join(@dir, basename)
50
+ end
51
+
52
+ @analyzer = Analyzer.new(IMWTest::TMP_DIR)
53
+ end
54
+
55
+ %w[most_common_extension_by_count most_common_extension_by_size most_common_extension].each do |method|
56
+ it "should return 'flat' when asked for its '#{method}'" do
57
+ @analyzer.send(method).should == 'flat'
58
+ end
59
+ end
60
+ end
61
+
62
+ describe 'working with a directory of files' do
63
+ before do
64
+ @dir = File.join(IMWTest::TMP_DIR, 'ext_dir')
65
+ FileUtils.mkdir_p(@dir)
66
+
67
+ @csv1 = "foobar1.csv"
68
+ @csv2 = "foobar2.csv"
69
+ @xml = "foobar1.xml"
70
+ @txt = "foobar1.txt"
71
+ @files = [@csv1, @csv2, @xml, @txt]
72
+
73
+ @files.each do |basename|
74
+ IMWTest::Random.file File.join(@dir, basename)
75
+ end
76
+
77
+ def bloat basename
78
+ File.open(File.join(@dir, basename), 'a') do |f|
79
+ 1000.times do
80
+ f.write( 'hello ' * 100)
81
+ end
82
+ end
83
+ end
84
+
85
+ @analyzer = Analyzer.new @dir
86
+ end
87
+
88
+ describe "working with extension counts" do
89
+ it "should be able to return counts by extension" do
90
+ @analyzer.extension_counts.should == {'xml' => 1, 'txt' => 1, 'csv' => 2 }
91
+ end
92
+
93
+ it "should be able to return the most common extension by count" do
94
+ @analyzer.most_common_extension_by_count.should == 'csv'
95
+ end
96
+
97
+ it "should be able to calculate extension weighted by number of files" do
98
+ @analyzer.normalized_extension_counts.should == { 'csv' => 0.5, 'xml' => 0.25, 'txt' => 0.25 }
99
+ end
100
+ end
101
+
102
+ describe "working with extension sizes" do
103
+ it "should be able to calculate extension sizes" do
104
+ csv_size = File.size(File.join(@dir, @csv1)) + File.size(File.join(@dir, @csv2))
105
+ xml_size = File.size(File.join(@dir, @xml))
106
+ txt_size = File.size(File.join(@dir, @txt))
107
+ @analyzer.extension_sizes.should == { 'csv' => csv_size, 'xml' => xml_size, 'txt' => txt_size }
108
+ end
109
+
110
+ it "should be able to return the most common extension by size" do
111
+ bloat @txt
112
+ @analyzer.most_common_extension_by_size.should == 'txt'
113
+ end
114
+
115
+ it "should be able to calculate extension sizes" do
116
+ csv_size = File.size(File.join(@dir, @csv1)) + File.size(File.join(@dir, @csv2))
117
+ xml_size = File.size(File.join(@dir, @xml))
118
+ txt_size = File.size(File.join(@dir, @txt))
119
+ total_size = csv_size + xml_size + txt_size
120
+ @analyzer.normalized_extension_sizes.should == { 'csv' => csv_size.to_f / total_size.to_f, 'xml' => xml_size.to_f / total_size.to_f, 'txt' => txt_size.to_f / total_size.to_f }
121
+ end
122
+ end
123
+
124
+ describe "determining the most common extension" do
125
+
126
+ it "should obviously return an extension if it is the most common by count as well as the most common by size" do
127
+ bloat @csv1
128
+ @analyzer.most_common_extension.should == 'csv'
129
+ end
130
+
131
+ it "should return the most common extension by count if the count fraction is half or greater and the size fraction is less than half" do
132
+ bloat @txt
133
+ bloat @xml
134
+ @analyzer.most_common_extension.should == 'csv'
135
+ end
136
+
137
+ it "should return the most common extension by size if the size fraction is half or greater and the count fraction is less than half" do
138
+ # need to add an xml file
139
+ @new_xml = File.join(@dir, 'xml2.xml')
140
+ IMWTest::Random.file(@new_xml)
141
+ bloat @txt
142
+ @analyzer = Analyzer.new @dir
143
+ @analyzer.most_common_extension.should == 'txt'
144
+ end
145
+
146
+ it "should return the most common extension by size if no other conditions are met" do
147
+ bloat @txt
148
+ @analyzer.most_common_extension.should == 'txt'
149
+ end
150
+
151
+ end
152
+ end
153
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: imw
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 13
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 12
10
- version: 0.2.12
9
+ - 13
10
+ version: 0.2.13
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dhruv Bansal
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-11-17 00:00:00 -06:00
19
+ date: 2010-11-22 00:00:00 -06:00
20
20
  default_executable:
21
21
  dependencies: []
22
22
 
@@ -81,6 +81,7 @@ files:
81
81
  - lib/imw/resource.rb
82
82
  - lib/imw/runner.rb
83
83
  - lib/imw/schemes.rb
84
+ - lib/imw/schemes/ftp.rb
84
85
  - lib/imw/schemes/hdfs.rb
85
86
  - lib/imw/schemes/http.rb
86
87
  - lib/imw/schemes/local.rb
@@ -170,6 +171,7 @@ files:
170
171
  - spec/imw/schemes/sql_spec.rb
171
172
  - spec/imw/tools/aggregator_spec.rb
172
173
  - spec/imw/tools/archiver_spec.rb
174
+ - spec/imw/tools/extension_analyzer_spec.rb
173
175
  - spec/imw/tools/summarizer_spec.rb
174
176
  - spec/imw/tools/transferer_spec.rb
175
177
  - spec/imw/utils/dynamically_extendable_spec.rb
@@ -225,6 +227,7 @@ test_files:
225
227
  - spec/imw/archives/tarbz2_spec.rb
226
228
  - spec/imw/archives/rar_spec.rb
227
229
  - spec/imw/tools/archiver_spec.rb
230
+ - spec/imw/tools/extension_analyzer_spec.rb
228
231
  - spec/imw/tools/summarizer_spec.rb
229
232
  - spec/imw/tools/transferer_spec.rb
230
233
  - spec/imw/tools/aggregator_spec.rb