imw 0.2.9 → 0.2.10

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.9
1
+ 0.2.10
@@ -0,0 +1,10 @@
1
+ ---
2
+
3
+ metadata:
4
+ /path/to/my/data.tsv:
5
+ - name: foobar
6
+ datatype: foobar
7
+ doc: foobar
8
+
9
+
10
+
@@ -0,0 +1,148 @@
1
+ require 'imw/resource'
2
+
3
+ module IMW
4
+ module Tools
5
+
6
+ # Aggregates resources into a single local directory.
7
+ #
8
+ # The directory should already exist.
9
+ #
10
+ # Any local resources will be copied into the directory.
11
+ #
12
+ # Any remote resources will be downloaded into the directory.
13
+ #
14
+ # If any of the resources are (local and) archives, they will first be
15
+ # extracted, with only their contents winding up in the final
16
+ # directory (the file hierarchy of the archive will be preserved).
17
+ #
18
+ # If any of the resources are (local and) compressed, they will
19
+ # first be uncompressed before being added to the directory.
20
+ #
21
+ # As an example:
22
+ #
23
+ # aggregator = IMW::Tools::Aggregator.new '/path/to/agg_dir'
24
+ # aggregator.aggregate '/path/to/my/regular_file.tsv', '/path/to/an/archive.tar.bz2', '/path/to/my_compressed_file.gz', 'http://mywebsite.com/index.html'
25
+ #
26
+ # This will create a directory at <tt>/path/to/agg_dir</tt> which
27
+ # looks like
28
+ #
29
+ # path_to_agg_dir
30
+ # |-- regular_file.tsv
31
+ # |-- archive
32
+ # | |-- internal_archive_file_1
33
+ # | |-- internal_archive_file_2
34
+ # | ...
35
+ # | `-- internal_archive_file_N
36
+ # |-- my_compressed_file
37
+ # `-- index.html
38
+ #
39
+ # Notice that
40
+ #
41
+ # - the local file was copied over
42
+ #
43
+ # - the remote file was downloaded and copied over
44
+ #
45
+ # - the tar archive was first extracted
46
+ #
47
+ # - the compressed file was first decompressed
48
+ #
49
+ # This process can take a while when the constituent files are
50
+ # large.
51
class Aggregator

  # The resource for the local directory that inputs are aggregated into.
  attr_reader :dir

  # Create an Aggregator that collects inputs into +dir+.
  #
  # Validation is delegated to #dir=, which opens the resource itself,
  # so +dir+ is passed through untouched.
  #
  # @param [String, IMW::Resource] dir an existing, local directory
  def initialize dir
    self.dir = dir
  end

  # Set the directory for this Aggregator.
  #
  # Will raise unless +new_dir+ is an existing, local directory.  The
  # current directory is only replaced once +new_dir+ has passed every
  # check, so a failed assignment leaves the Aggregator unchanged.
  #
  # @param [String, IMW::Resource] new_dir
  # @return [IMW::Resource]
  # @raise [IMW::SchemeError] if +new_dir+ is not local
  # @raise [IMW::PathError] if +new_dir+ is not a directory
  def dir= new_dir
    candidate = IMW.open(new_dir)
    raise IMW::SchemeError, "Aggregator requires a local directory, not #{candidate}" unless candidate.is_local?
    candidate.should_exist! "Aggregator requires the aggregation directory to already exist"
    raise IMW::PathError, "Aggregator requires a directory, not #{candidate}" unless candidate.is_directory?
    @dir = candidate
  end

  # Return a list of error messages for this Aggregator.
  #
  # The list is reset at the start of every call to #aggregate.  Nothing
  # in this class currently calls +add_processing_error+, so the list
  # stays empty unless a subclass records errors.
  #
  # @return [Array] the error messages
  def errors
    @errors ||= []
  end

  # Was this aggregator successful (did it not have any errors)?
  #
  # @return [true, false]
  def success?
    errors.empty?
  end

  # Aggregate the given inputs into this Aggregator's +dir+.
  #
  # Local inputs are handed to +aggregate_local_input+.  Remote inputs
  # are downloaded into +dir+; when the download is itself compressed or
  # an archive it is then processed like a local input and the
  # downloaded file is removed.
  #
  # @param [Array<IMW::Resource,String>] paths_or_inputs
  # @return [IMW::Tools::Aggregator] this Aggregator, so calls can be chained
  def aggregate *paths_or_inputs
    @errors = []
    paths_or_inputs.each do |path_or_input|
      input = IMW.open(path_or_input)
      if input.is_local?
        aggregate_local_input(input)
      else
        download = download_remote_input(input)
        # A plain download is already where it belongs.
        next unless download.is_compressed? || download.is_archive?
        aggregate_local_input(download)
        download.rm!
      end
    end
    self
  end

  protected

  # Aggregate a local input.
  #
  # Will extract archives, decompress compressed files, and copy
  # regular files and directories (but will not recurse into
  # directories to find archives or compressed files).
  #
  # @param [IMW::Resource] input
  def aggregate_local_input input
    new_path = File.join(dir.path, input.basename)
    if input.is_archive?
      IMW.announce_if_verbose("Aggregating and extracting #{input} to #{dir}...")
      # Extraction happens from inside the aggregation directory; the
      # block form of cd restores the previous working directory.
      FileUtils.cd(dir.path) do
        input.extract
      end
    elsif input.is_compressed?
      IMW.announce_if_verbose("Decompressing #{input}...")
      input.cp(new_path).decompress!
    else
      IMW.announce_if_verbose("Copying #{input}...")
      input.cp(new_path)
    end
  end

  # Download a remote input to this Aggregator's +dir+.
  #
  # @param [IMW::Resource] input
  # @return the result of copying +input+ into +dir+ (used by #aggregate
  #   as the downloaded resource)
  def download_remote_input input
    IMW.announce_if_verbose("Downloading #{input}...")
    input.cp(File.join(dir.path, input.effective_basename))
  end

  # Log +error+ as a warning and record it in #errors.
  def add_processing_error error # :nodoc:
    IMW.logger.warn error
    errors << error
  end

end
147
+ end
148
+ end
data/lib/imw/tools.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  module IMW
2
2
  module Tools
3
+ autoload :Aggregator, 'imw/tools/aggregator'
3
4
  autoload :Archiver, 'imw/tools/archiver'
4
5
  autoload :Transferer, 'imw/tools/transferer'
5
6
  autoload :Summarizer, 'imw/tools/summarizer'
@@ -22,7 +22,13 @@ module IMW
22
22
  # Error communicating with a remote entity.
23
23
  NetworkError = Class.new(Error)
24
24
 
25
- # Error communicating with a remote entity.
25
+ # Raised when a resource is of the wrong scheme.
26
+ SchemeError = Class.new(Error)
27
+
28
+ # Raised when a resource is of the wrong (or malformed) format.
29
+ FormatError = Class.new(Error)
30
+
31
+ # Bad argument.
26
32
  ArgumentError = Class.new(Error)
27
33
 
28
34
  # Error in defining or matching a schema.
@@ -0,0 +1,71 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+
3
describe IMW::Tools::Aggregator do
  before do
    # Directory that every example aggregates into.
    @dir = 'agg_dir'
    FileUtils.mkdir_p(@dir)

    # Remote inputs (fetched over HTTP by the examples that use them).
    @homepage     = "http://www.google.com"
    @website      = "http://www.google.com/support/"
    @remote_files = [@homepage, @website]

    # Plain local files.
    @csv   = "foobar-csv.csv"
    @xml   = "foobar-xml.xml"
    @txt   = "foobar-txt.txt"
    @blah  = "foobar"
    @files = [@csv, @xml, @txt, @blah]

    # Local compressed files.
    @bz2              = "foobar-bz2.bz2"
    @gz               = "foobar-gz.gz"
    @compressed_files = [@bz2, @gz]

    # Local archives.
    @zip      = "foobar-zip.zip"
    @tarbz2   = "foobar-tarbz2.tar.bz2"
    @targz    = "foobar-targz.tar.gz"
    @tar      = "foobar-tar.tar"
    @rar      = "foobar-rar.rar"
    @archives = [@zip, @tarbz2, @targz, @rar, @tar]

    @local_files = @files + @compressed_files + @archives
    @all_files   = @remote_files + @local_files

    # Presumably writes a random fixture at each local path -- see IMWTest::Random.
    @local_files.each { |path| IMWTest::Random.file(path) }

    @aggregator = IMW::Tools::Aggregator.new(@dir)
  end

  it "should copy regular files to its directory" do
    @aggregator.aggregate(*@files)
    @aggregator.dir.path.should contain(*@files)
    # The originals must still be in place after aggregation.
    @files.each do |path|
      IMW.open(path).exist?.should be_true
    end
  end

  it "should copy remote files to its archive directory" do
    @aggregator.aggregate(*@remote_files)
    # _index from Http#effective_basename on http://www.google.com
    @aggregator.dir.path.should contain('_index', 'support')
  end

  it "should uncompress compressed files to its directory" do
    @aggregator.aggregate(*@compressed_files)
    @aggregator.dir.path.should contain('foobar-bz2', 'foobar-gz')
    @aggregator.dir.path.should_not contain(*@compressed_files)
  end

  it "should copy the content of archive files to its archive directory (but not the actual archives)" do
    @aggregator.aggregate(*@archives)
    @archives.each do |archive|
      @aggregator.dir.path.should_not contain(archive)
      @aggregator.dir.path.should contain(*IMW.open(archive).contents)
    end
  end

end
70
+
71
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: imw
3
3
  version: !ruby/object:Gem::Version
4
- hash: 5
4
+ hash: 3
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 9
10
- version: 0.2.9
9
+ - 10
10
+ version: 0.2.10
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dhruv Bansal
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-10-26 00:00:00 -05:00
19
+ date: 2010-10-29 00:00:00 -05:00
20
20
  default_executable: imw
21
21
  dependencies: []
22
22
 
@@ -40,6 +40,7 @@ files:
40
40
  - bin/imw
41
41
  - etc/imwrc.rb
42
42
  - examples/dataset.rb
43
+ - examples/metadata.yml
43
44
  - lib/imw.rb
44
45
  - lib/imw/archives.rb
45
46
  - lib/imw/archives/rar.rb
@@ -85,6 +86,7 @@ files:
85
86
  - lib/imw/schemes/s3.rb
86
87
  - lib/imw/schemes/sql.rb
87
88
  - lib/imw/tools.rb
89
+ - lib/imw/tools/aggregator.rb
88
90
  - lib/imw/tools/archiver.rb
89
91
  - lib/imw/tools/downloader.rb
90
92
  - lib/imw/tools/extension_analyzer.rb
@@ -164,6 +166,7 @@ files:
164
166
  - spec/imw/schemes/remote_spec.rb
165
167
  - spec/imw/schemes/s3_spec.rb
166
168
  - spec/imw/schemes/sql_spec.rb
169
+ - spec/imw/tools/aggregator_spec.rb
167
170
  - spec/imw/tools/archiver_spec.rb
168
171
  - spec/imw/tools/summarizer_spec.rb
169
172
  - spec/imw/tools/transferer_spec.rb
@@ -222,6 +225,7 @@ test_files:
222
225
  - spec/imw/tools/archiver_spec.rb
223
226
  - spec/imw/tools/summarizer_spec.rb
224
227
  - spec/imw/tools/transferer_spec.rb
228
+ - spec/imw/tools/aggregator_spec.rb
225
229
  - spec/imw/compressed_files/compressible_spec.rb
226
230
  - spec/imw/compressed_files/bz2_spec.rb
227
231
  - spec/imw/compressed_files/gz_spec.rb