imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,119 +0,0 @@
1
- module IMW
2
- module Schemes
3
-
4
- # Contains modules which define methods appropriate for remote
5
- # resources, no matter the protocol.
6
- module Remote
7
-
8
- # Defines methods appropriate for accessing a remote resource,
9
- # no matter the protocol.
10
- module Base
11
-
12
- #
13
- # TODO -- self.extended should extend by RemoteDirectory when appropriate
14
- #
15
-
16
- def self.extended obj
17
- obj.extend(RemoteFile)
18
- end
19
-
20
- # Is this resource on a remote host?
21
- #
22
- # @return [true,false]
23
- def is_remote?
24
- true
25
- end
26
-
27
- # The host of this resource.
28
- #
29
- # @return [String]
30
- def host
31
- @host ||= uri.host
32
- end
33
-
34
- # Return the query string part of this resource's URI. Will
35
- # likely be +nil+ for local resources.
36
- #
37
- # @return [String]
38
- def query_string
39
- @query_string ||= uri.query
40
- end
41
-
42
- # Return the path part of this resource's URI. Will _not_
43
- # include the +query_string+ or +fragment+.
44
- #
45
- # @return [String]
46
- def path
47
- @path ||= uri.path
48
- end
49
-
50
- end
51
-
52
- module RemoteFile
53
-
54
- # Return the IO object for this remote file.
55
- #
56
- # The mode of this resource is ignored.
57
- #
58
- # @return [StringIO]
59
- def io
60
- require 'open-uri'
61
- @io ||= open(uri.to_s) # ignore mode
62
- end
63
-
64
- # Read the contents of this remote file.
65
- #
66
- # @return [String]
67
- def read
68
- io.read
69
- end
70
-
71
- # Return the lines of this remote file.
72
- #
73
- # If passed a block then yield each line to the block.
74
- #
75
- # @return [Array] the lines of this remote file
76
- # @yield [String] each line of this remote file
77
- def load &block
78
- if block_given?
79
- io.each do |line|
80
- yield line
81
- end
82
- else
83
- read.split("\n")
84
- end
85
- end
86
-
87
- # Map over the lines in this remote file.
88
- #
89
- # @yield [String] each line of the file
90
- def map &block
91
- io.map(&block)
92
- end
93
- end
94
-
95
-
96
- module RemoteDirectory
97
-
98
- # Return the resource at the base path of this resource joined
99
- # to +path+.
100
- #
101
- # IMW.open('http://example.com/path/to/dir').join('subdir')
102
- # #=> IMW::Resource at 'http://example.com/path/to/dir/subdir'
103
- #
104
- # @param [Array<String>] paths
105
- # @return [IMW::Resource]
106
- def join *paths
107
- IMW.open(File.join(stripped_uri.to_s, *paths))
108
- end
109
-
110
- #
111
- # TODO -- bloody everything. what's the best way to tell if
112
- # the remote URL is a directory?
113
- #
114
-
115
-
116
- end
117
- end
118
- end
119
- end
@@ -1,143 +0,0 @@
1
- module IMW
2
- module Schemes
3
-
4
- # Defines methods for reading and writing data to {Amazon
5
- # S3}[http://aws.amazon.com/s3] buckets.
6
- #
7
- # IMW.open('s3://my_bucket/path/to/some/file.csv')
8
- #
9
- # Learn more about {Amazon Web Services}[http://aws.amazon.com].
10
- module S3
11
-
12
- # For an S3 resource, the bucket is just the hostname.
13
- #
14
- # @return [String]
15
- def bucket
16
- host
17
- end
18
-
19
- # Is this resource an S3 resource?
20
- #
21
- # @return [true, false]
22
- def on_s3?
23
- true
24
- end
25
- alias_method :is_s3?, :on_s3?
26
-
27
- # Copy this resource to the +new_uri+.
28
- #
29
- # @param [String, IMW::Resource] new_uri
30
- # @return [IMW::Resource] the new resource
31
- def cp new_uri
32
- #IMW::Tools::Transferer.new(:cp, self, new_uri).transfer!
33
- IMW::Schemes::S3.get(self, new_uri)
34
- end
35
-
36
- # Does this resource exist on S3?
37
- #
38
- # @return [true, false]
39
- def exist?
40
- AWS::S3::S3Object.exists?(raw_path, bucket)
41
- end
42
- alias_method :exists?, :exist?
43
-
44
- # Remove this resource from S3.
45
- #
46
- # @return [IMW::Resource] the deleted object
47
- def rm
48
- AWS::S3::S3Object.delete(raw_path, bucket)
49
- end
50
- alias_method :rm!, :rm
51
-
52
- # Return the S3N URL for this S3 object
53
- #
54
- # resource = IMW.open('s3://my_bucket/path/to/some/obj')
55
- # resource.s3n_url
56
- # => 's3n://my_bucket/path/to/some/obj'
57
- #
58
- # @return [String]
59
- def s3n_url
60
- uri.to_s.gsub(/^s3:/, 's3n:')
61
- end
62
-
63
- # Return the contents of this S3 object.
64
- #
65
- # @return [String]
66
- def read
67
- AWS::S3::S3Object.value(raw_path, bucket)
68
- end
69
-
70
- # Store +source+ into +destination+.
71
- #
72
- # @param [String, IMW::Resource, #io] source
73
- # @param [String, IMW::Resource, #path, #bucket] destination
74
- # @return [IMW::Resource] the new S3 object
75
- def self.put source, destination
76
- source = IMW.open(source)
77
- destintation = IMW.open(destination)
78
- raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
79
- make_connection!
80
- AWS::S3::S3Object.store(destination.raw_path, source.io, destination.bucket)
81
- destination
82
- end
83
-
84
- # Download +source+ from S3 into +destination+.
85
- #
86
- # @param [String, IMW::Resource, #path, #bucket] source
87
- # @param [String, IMW::Resource, #write] destination
88
- # @return [IMW::Resource] the new resource
89
- def self.get source, destination
90
- source = IMW.open(source)
91
- destination = IMW.open!(destination)
92
- raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
93
- make_connection!
94
- AWS::S3::S3Object.stream(source.raw_path, source.bucket) do |chunk|
95
- destination.write(chunk)
96
- end
97
- destination.close
98
- destination.reopen
99
- end
100
-
101
- # Copy S3 resource +source+ to +destination+.
102
- #
103
- # @param [String, IMW::Resource, #path, #bucket] source
104
- # @param [String, IMW::Resource, #path, #bucket] destination
105
- # @return [IMW::Resource] the new resource
106
- def self.copy source, destination
107
- source = IMW.open(source)
108
- destination = IMW.open(destination)
109
- raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
110
- make_connection!
111
- AWS::S3::Object.copy(source.raw_path, destination.raw_path, destination.bucket)
112
- destination
113
- end
114
-
115
- # Return the resource at the base path of this resource joined
116
- # to +path+.
117
- #
118
- # IMW.open('s3://bucket/path/to/dir').join('subdir')
119
- # #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
120
- #
121
- # @param [Array<String>] paths
122
- # @return [IMW::Resource]
123
- def join *paths
124
- IMW.open(File.join(stripped_uri.to_s, *paths))
125
- end
126
-
127
- protected
128
- # Make an S3 connection.
129
- #
130
- # Uses settings defined in IMW::AWS_CREDENTIALS.
131
- #
132
- # @return [Object] the (cached) connection returned by AWS::S3::Base.establish_connection!
133
- def self.make_connection!
134
- return @connection if @connection
135
- raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
136
- require 'aws/s3'
137
- @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
138
- end
139
-
140
- end
141
- end
142
- end
143
-
@@ -1,129 +0,0 @@
1
- require 'dbi'
2
-
3
- module IMW
4
- module Schemes
5
-
6
- # Encapsulates a connection to a relational database.
7
- #
8
- # Calling
9
- #
10
- # IMW.open('sql://host:port/database_name')
11
- #
12
- # should create a connection to a database at the given +port+ on
13
- # the given +host+ using the given +database_name+.
14
- module SQL
15
-
16
- # A base implementation of a connection to a relational
17
- # database.
18
- #
19
- # The Base#extended method will examine the +scheme+ of an
20
- # object extended with this module and choose a more specific
21
- # database adaptor module to extend with as well.
22
- module Base
23
-
24
- # When an IMW::Resource is extended use URI's scheme to choose
25
- # which other module inside IMW::Schemes::SQL to extend with.
26
- def self.extended obj
27
- case obj.scheme
28
- when 'mysql' then obj.extend(IMW::Schemes::SQL::MySQL)
29
- when 'postgresql' then obj.extend(IMW::Schemes::SQL::PostgreSQL)
30
- else raise IMW::ArgumentError.new("Unknown database type: #{obj.scheme}")
31
- end
32
- end
33
-
34
- # For an SQL connection the database will be the same as the
35
- # path.
36
- #
37
- # @return [String]
38
- def database
39
- @database ||= path.tr('/','')
40
- end
41
-
42
- # Redefine each method inappropriate for databases.
43
- [:dirname, :basename, :extname, :extension, :name].each do |method|
44
- define_method(method) do
45
- nil
46
- end
47
- end
48
-
49
- # Return a summary of this database.
50
- #
51
- # Purposefully does not call +super+.
52
- #
53
- # @return [Hash]
54
- def external_summary
55
- {
56
- :uri => uri.to_s,
57
- :database => database
58
- }
59
- end
60
-
61
- # The (cached) database connection for this resource.
62
- #
63
- # @return [DBI::DatabaseHandle]
64
- def connection
65
- @connection ||= DBI.connect("#{dbi_module}:#{database}:#{host}", user, password)
66
- end
67
-
68
- # Return the password associated with user's account on the
69
- # given database.
70
- #
71
- # @return [String]
72
- def password
73
- @password ||= resource_options[:password]
74
- end
75
-
76
- # Return an array of the table names in the current database.
77
- #
78
- # @return [Array<String>]
79
- def tables
80
- [].tap do |table_names|
81
- execute("SHOW TABLES") do |row|
82
- table_names << row.first
83
- end
84
- end
85
- end
86
-
87
- # Execute the (joined) +query_string_parts+ using this
88
- # resource's cached connection.
89
- #
90
- # If passed a block, yield each row of the result set to the
91
- # block.
92
- #
93
- # @param [Array<String>] query_string_parts
94
- # @yield [DBI::Row]
95
- # @return [DBI::StatementHandle]
96
- def execute *query_string_parts, &block
97
- query = query_string_parts.join(' ')
98
- IMW.announce_if_verbose "Querying #{self}: #{query}"
99
- statement = connection.execute(query)
100
- block_given? ? statement.fetch(&block) : statement
101
- end
102
- end
103
-
104
- # Module for MySQL databases.
105
- module MySQL
106
-
107
- # Return the name of the DBI module used to connect to MySQL.
108
- #
109
- # @return [String]
110
- def dbi_module
111
- "DBI:Mysql"
112
- end
113
- end
114
-
115
- # Module for PostgreSQL databases.
116
- module PostgreSQL
117
-
118
- # Return the name of the DBI module used to connect to PostgreSQL.
119
- #
120
- # @return [String]
121
- def dbi_module
122
- "DBI:Pg"
123
- end
124
- end
125
-
126
- end
127
- end
128
- end
129
-
@@ -1,12 +0,0 @@
1
- module IMW
2
- module Tools
3
- autoload :Aggregator, 'imw/tools/aggregator'
4
- autoload :Archiver, 'imw/tools/archiver'
5
- autoload :Transferer, 'imw/tools/transferer'
6
- autoload :Summarizer, 'imw/tools/summarizer'
7
- autoload :ExtensionAnalyzer, 'imw/tools/extension_analyzer'
8
- autoload :Downloader, 'imw/tools/downloader'
9
- end
10
- end
11
-
12
-
@@ -1,148 +0,0 @@
1
- require 'imw/resource'
2
-
3
- module IMW
4
- module Tools
5
-
6
- # Aggregates resources into a single local directory.
7
- #
8
- # The directory should already exist.
9
- #
10
- # Any local resources will be copied into the directory.
11
- #
12
- # Any remote resources will be downloaded into the directory.
13
- #
14
- # If any of the resources are archives, they will first be
15
- # extracted, with only their contents winding up in the final
16
- # directory (the file hierarchy of the archive will be preserved).
17
- #
18
- # If any of the resources are compressed, they will first be
19
- # uncompressed before being added to the directory.
20
- #
21
- # As an example:
22
- #
23
- # aggregator = IMW::Tools::Aggregator.new '/path/to/agg_dir'
24
- # aggregator.aggregate '/path/to/my/regular_file.tsv', '/path/to/an/archive.tar.bz2', '/path/to/my_compressed_file.gz', 'http://mywebsite.com/index.html'
25
- #
26
- # This will create a directory at <tt>/path/to/agg_dir</tt> which
27
- # looks like
28
- #
29
- # path_to_agg_dir
30
- # |-- regular_file.tsv
31
- # |-- archive
32
- # | |-- internal_archive_file_1
33
- # | |-- internal_archive_file_2
34
- # | ...
35
- # | `-- internal_archive_file_N
36
- # |-- my_compressed_file
37
- # `-- index.html
38
- #
39
- # Notice that
40
- #
41
- # - the local file was copied over
42
- #
43
- # - the remote file was downloaded and copied over
44
- #
45
- # - the tar archive was first extracted
46
- #
47
- # - the compressed file was first decompressed
48
- #
49
- # This process can take a while when the constituent files are
50
- # large.
51
- class Aggregator
52
-
53
- attr_reader :dir
54
-
55
- def initialize dir
56
- self.dir = IMW.open(dir)
57
- end
58
-
59
- # Set the directory for this Aggregator.
60
- #
61
- # Will raise unless +new_dir+ is an existing, local directory.
62
- #
63
- # @param [String, IMW::Resource] new_dir
64
- # @return [IMW::Resource]
65
- def dir= new_dir
66
- @dir = IMW.open(new_dir)
67
- raise IMW::SchemError.new("Aggregator requires a local directory, not #{@dir}") unless @dir.is_local?
68
- @dir.should_exist! "Aggregator requires the aggregation directory to already exist"
69
- raise IMW::PathError.new("Aggregator requires a directory, not #{@dir}") unless @dir.is_directory?
70
- @dir
71
- end
72
-
73
- # Return a list of error messages for this Aggregator.
74
- #
75
- # @return [Array] the error messages
76
- def errors
77
- @errors ||= []
78
- end
79
-
80
- # Was this aggregator successful (did it not have any errors)?
81
- #
82
- # @return [true, false]
83
- def success?
84
- errors.empty?
85
- end
86
-
87
- # Aggregate the given inputs into this Aggregator's +dir+.
88
- #
89
- # @param [Array<IMW::Resource,String>] paths_or_inputs
90
- # @return [IMW::Tools::Aggregator]
91
- def aggregate *paths_or_inputs
92
- @errors = []
93
- paths_or_inputs.flatten.compact.each do |path_or_input|
94
- input = IMW.open(path_or_input)
95
- if input.is_local?
96
- aggregate_local_input(input)
97
- else
98
- download = download_remote_input(input)
99
- if download.is_compressed? || download.is_archive?
100
- aggregate_local_input(download)
101
- download.rm!
102
- end
103
- end
104
- end
105
- end
106
-
107
- protected
108
-
109
- # Aggregate a local input.
110
- #
111
- # Will extract archives, decompress compressed files, and copy
112
- # regular files and directories (but will not recurse into
113
- # directories to find archives or compressed files).
114
- #
115
- # @param [IMW::Resource] input
116
- def aggregate_local_input input
117
- new_path = File.join(dir.path, input.basename)
118
- case
119
- when input.is_archive?
120
- IMW.announce_if_verbose("Aggregating and extracting #{input} to #{dir}...")
121
- FileUtils.cd(dir.path) do
122
- input.extract
123
- end
124
- when input.is_compressed?
125
- IMW.announce_if_verbose("Decompressing #{input}...")
126
- input.cp(new_path).decompress!
127
- else
128
- IMW.announce_if_verbose("Copying #{input}...")
129
- input.cp(new_path)
130
- end
131
- end
132
-
133
- # Download a remote input to this Aggregator's +dir+.
134
- #
135
- # @param [IMW::Resource] input
136
- def download_remote_input input
137
- IMW.announce_if_verbose("Downloading #{input}...")
138
- input.cp(File.join(dir.path, input.effective_basename))
139
- end
140
-
141
- def add_processing_error error # :nodoc:
142
- IMW.logger.warn error
143
- errors << error
144
- end
145
-
146
- end
147
- end
148
- end