imw 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.9
1
+ 0.2.10
@@ -0,0 +1,10 @@
1
+ ---
2
+
3
+ metadata:
4
+ /path/to/my/data.tsv:
5
+ - name: foobar
6
+ datatype: foobar
7
+ doc: foobar
8
+
9
+
10
+
@@ -0,0 +1,148 @@
1
+ require 'imw/resource'
2
+
3
+ module IMW
4
+ module Tools
5
+
6
+ # Aggregates resources into a single local directory.
7
+ #
8
+ # The directory should already exist.
9
+ #
10
+ # Any local resources will be copied into the directory.
11
+ #
12
+ # Any remote resources will be downloaded into the directory.
13
+ #
14
+ # If any of the resources are (local and) archives, they will first be
15
+ # extracted, with only their contents winding up in the final
16
+ # directory (the file hierarchy of the archive will be preserved).
17
+ #
18
+ # If any of the resources are (local and) compressed, they will
19
+ # first be uncompressed before being added to the directory.
20
+ #
21
+ # As an example:
22
+ #
23
+ # aggregator = IMW::Tools::Aggregator.new '/path/to/agg_dir'
24
+ # aggregator.aggregate '/path/to/my/regular_file.tsv', '/path/to/an/archive.tar.bz2', '/path/to/my_compressed_file.gz', 'http://mywebsite.com/index.html'
25
+ #
26
+ # This will create a directory at <tt>/path/to/agg_dir</tt> which
27
+ # looks like
28
+ #
29
+ # path_to_agg_dir
30
+ # |-- regular_file.tsv
31
+ # |-- archive
32
+ # | |-- internal_archive_file_1
33
+ # | |-- internal_archive_file_2
34
+ # | ...
35
+ # | `-- internal_archive_file_N
36
+ # |-- my_compressed_file
37
+ # `-- index.html
38
+ #
39
+ # Notice that
40
+ #
41
+ # - the local file was copied over
42
+ #
43
+ # - the remote file was downloaded and copied over
44
+ #
45
+ # - the tar archive was first extracted
46
+ #
47
+ # - the compressed file was first decompressed
48
+ #
49
+ # This process can take a while when the constituent files are
50
+ # large.
51
+ class Aggregator
52
+
53
+ attr_reader :dir
54
+
55
+ def initialize dir # build an Aggregator for +dir+ (anything IMW.open accepts)
56
+ self.dir = IMW.open(dir) # assigned through #dir= so the directory checks there are applied
57
+ end
58
+
59
+ # Set the directory for this Aggregator.
60
+ #
61
+ # Will raise unless +new_dir+ is an existing, local directory.
62
+ #
63
+ # @param [String, IMW::Resource] new_dir
64
+ # @return [IMW::Resource]
65
+ def dir= new_dir
66
+ @dir = IMW.open(new_dir)
67
+ raise IMW::SchemError.new("Aggregator requires a local directory, not #{@dir}") unless @dir.is_local?
68
+ @dir.should_exist! "Aggregator requires the aggregation directory to already exist"
69
+ raise IMW::PathError.new("Aggregator requires a directory, not #{@dir}") unless @dir.is_directory?
70
+ @dir
71
+ end
72
+
73
+ # Return a list of error messages for this Aggregator.
74
+ #
75
+ # @return [Array] the error messages
76
+ def errors
77
+ @errors ||= [] # created on first access; #aggregate resets it to an empty array
78
+ end
79
+
80
+ # Was this aggregator successful (did it not have any errors)?
81
+ #
82
+ # @return [true, false]
83
+ def success?
84
+ errors.empty? # true when no error messages have been recorded
85
+ end
86
+
87
+ # Aggregate the given inputs into this Aggregator's +dir+.
88
+ #
89
+ # @param [Array<IMW::Resource,String>] paths_or_inputs
90
+ # @return [IMW::Tools::Aggregator]
91
+ def aggregate *paths_or_inputs
92
+ @errors = []
93
+ paths_or_inputs.each do |path_or_input|
94
+ input = IMW.open(path_or_input)
95
+ if input.is_local?
96
+ aggregate_local_input(input)
97
+ else
98
+ download = download_remote_input(input)
99
+ if download.is_compressed? || download.is_archive?
100
+ aggregate_local_input(download)
101
+ download.rm!
102
+ end
103
+ end
104
+ end
105
+ end
106
+
107
+ protected
108
+
109
+ # Aggregate a local input.
110
+ #
111
+ # Will extract archives, decompress compressed files, and copy
112
+ # regular files and directories (but will not recurse into
113
+ # directories to find archives or compressed files).
114
+ #
115
+ # @param [IMW::Resource] input
116
+ def aggregate_local_input input
117
+ new_path = File.join(dir.path, input.basename) # destination inside the aggregation directory (unused by the archive branch)
118
+ case
119
+ when input.is_archive?
120
+ IMW.announce_if_verbose("Aggregating and extracting #{input} to #{dir}...")
121
+ FileUtils.cd(dir.path) do # run the extraction with +dir+ as the working directory; cd with a block restores the previous one
122
+ input.extract
123
+ end
124
+ when input.is_compressed?
125
+ IMW.announce_if_verbose("Decompressing #{input}...")
126
+ input.cp(new_path).decompress! # decompress! is called on whatever cp returns (presumably the copy at new_path)
127
+ else
128
+ IMW.announce_if_verbose("Copying #{input}...")
129
+ input.cp(new_path)
130
+ end
131
+ end
132
+
133
+ # Download a remote input to this Aggregator's +dir+.
134
+ #
135
+ # @param [IMW::Resource] input
136
+ def download_remote_input input
137
+ IMW.announce_if_verbose("Downloading #{input}...")
138
+ input.cp(File.join(dir.path, input.effective_basename)) # the result of cp is returned; #aggregate treats it as the downloaded resource
139
+ end
140
+
141
+ def add_processing_error error # :nodoc:
142
+ IMW.logger.warn error # report the problem through the IMW logger
143
+ errors << error # keep it so that #success? returns false afterwards
144
+ end
145
+
146
+ end
147
+ end
148
+ end
data/lib/imw/tools.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  module IMW
2
2
  module Tools
3
+ autoload :Aggregator, 'imw/tools/aggregator'
3
4
  autoload :Archiver, 'imw/tools/archiver'
4
5
  autoload :Transferer, 'imw/tools/transferer'
5
6
  autoload :Summarizer, 'imw/tools/summarizer'
@@ -22,7 +22,13 @@ module IMW
22
22
  # Error communicating with a remote entity.
23
23
  NetworkError = Class.new(Error)
24
24
 
25
- # Error communicating with a remote entity.
25
+ # Raised when a resource is of the wrong scheme.
26
+ SchemeError = Class.new(Error)
27
+
28
+ # Raised when a resource is of the wrong (or malformed) format.
29
+ FormatError = Class.new(Error)
30
+
31
+ # Bad argument.
26
32
  ArgumentError = Class.new(Error)
27
33
 
28
34
  # Error in defining or matching a schema.
@@ -0,0 +1,71 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+
3
+ describe IMW::Tools::Aggregator do
4
+ before do
5
+ @dir = 'agg_dir' # relative path, created if missing by the mkdir_p below
6
+ FileUtils.mkdir_p(@dir)
7
+
8
+ # remote files (real URLs; aggregating them requires network access)
9
+ @homepage = "http://www.google.com"
10
+ @website = "http://www.google.com/support/"
11
+ @remote_files = [@homepage, @website]
12
+
13
+ # regular files
14
+ @csv = "foobar-csv.csv"
15
+ @xml = "foobar-xml.xml"
16
+ @txt = "foobar-txt.txt"
17
+ @blah = "foobar"
18
+ @files = [@csv, @xml, @txt, @blah]
19
+
20
+ # compressed files
21
+ @bz2 = "foobar-bz2.bz2"
22
+ @gz = "foobar-gz.gz"
23
+ @compressed_files = [@bz2, @gz]
24
+
25
+ # archives
26
+ @zip = "foobar-zip.zip"
27
+ @tarbz2 = "foobar-tarbz2.tar.bz2"
28
+ @targz = "foobar-targz.tar.gz"
29
+ @tar = "foobar-tar.tar"
30
+ @rar = "foobar-rar.rar"
31
+ @archives = [@zip, @tarbz2, @targz, @rar, @tar]
32
+
33
+ @local_files = @files + @compressed_files + @archives
34
+
35
+ @all_files = @remote_files + @local_files
36
+
37
+ @local_files.each do |path|
38
+ IMWTest::Random.file path # presumably generates a fixture file at +path+; helper is defined outside this spec
39
+ end
40
+
41
+ @aggregator = IMW::Tools::Aggregator.new @dir
42
+ end
43
+
44
+ it "should copy regular files to its directory" do
45
+ @aggregator.aggregate *@files
46
+ @aggregator.dir.path.should contain(*@files)
47
+ @files.each { |path| IMW.open(path).exist?.should be_true } # the source files must still exist afterwards
48
+ end
49
+
50
+ it "should copy remote files to its archive directory" do
51
+ @aggregator.aggregate *@remote_files # NOTE(review): fetches the real URLs set up in the before block, so this example needs network access
52
+ @aggregator.dir.path.should contain('_index', 'support') # _index from Http#effective_basename on http://www.google.com
53
+ end
54
+
55
+ it "should uncompress compressed files to its directory" do
56
+ @aggregator.aggregate *@compressed_files
57
+ @aggregator.dir.path.should contain('foobar-bz2', 'foobar-gz') # names with the compression extension removed
58
+ @aggregator.dir.path.should_not contain(*@compressed_files) # the compressed copies must not remain
59
+ end
60
+
61
+ it "should copy the content of archive files to its archive directory (but not the actual archives)" do
62
+ @aggregator.aggregate *@archives
63
+ @archives.each do |archive|
64
+ @aggregator.dir.path.should_not contain(archive) # the archive file itself is not kept
65
+ @aggregator.dir.path.should contain(*IMW.open(archive).contents) # every entry listed by the archive must be present
66
+ end
67
+ end
68
+
69
+ end
70
+
71
+
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: imw
3
3
  version: !ruby/object:Gem::Version
4
- hash: 5
4
+ hash: 3
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 9
10
- version: 0.2.9
9
+ - 10
10
+ version: 0.2.10
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dhruv Bansal
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2010-10-26 00:00:00 -05:00
19
+ date: 2010-10-29 00:00:00 -05:00
20
20
  default_executable: imw
21
21
  dependencies: []
22
22
 
@@ -40,6 +40,7 @@ files:
40
40
  - bin/imw
41
41
  - etc/imwrc.rb
42
42
  - examples/dataset.rb
43
+ - examples/metadata.yml
43
44
  - lib/imw.rb
44
45
  - lib/imw/archives.rb
45
46
  - lib/imw/archives/rar.rb
@@ -85,6 +86,7 @@ files:
85
86
  - lib/imw/schemes/s3.rb
86
87
  - lib/imw/schemes/sql.rb
87
88
  - lib/imw/tools.rb
89
+ - lib/imw/tools/aggregator.rb
88
90
  - lib/imw/tools/archiver.rb
89
91
  - lib/imw/tools/downloader.rb
90
92
  - lib/imw/tools/extension_analyzer.rb
@@ -164,6 +166,7 @@ files:
164
166
  - spec/imw/schemes/remote_spec.rb
165
167
  - spec/imw/schemes/s3_spec.rb
166
168
  - spec/imw/schemes/sql_spec.rb
169
+ - spec/imw/tools/aggregator_spec.rb
167
170
  - spec/imw/tools/archiver_spec.rb
168
171
  - spec/imw/tools/summarizer_spec.rb
169
172
  - spec/imw/tools/transferer_spec.rb
@@ -222,6 +225,7 @@ test_files:
222
225
  - spec/imw/tools/archiver_spec.rb
223
226
  - spec/imw/tools/summarizer_spec.rb
224
227
  - spec/imw/tools/transferer_spec.rb
228
+ - spec/imw/tools/aggregator_spec.rb
225
229
  - spec/imw/compressed_files/compressible_spec.rb
226
230
  - spec/imw/compressed_files/bz2_spec.rb
227
231
  - spec/imw/compressed_files/gz_spec.rb