imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,119 +0,0 @@
1
module IMW
  module Schemes

    # Contains modules which define methods appropriate for remote
    # resources, no matter the protocol.
    module Remote

      # Defines methods appropriate for accessing a remote resource,
      # no matter the protocol.
      #
      # Every method here reads from +uri+, so the extended object
      # must respond to +uri+ with something that provides +host+,
      # +query+, +path+ and +to_s+.
      module Base

        #
        # TODO -- self.extended should extend by RemoteDirectory when appropriate
        #

        # Hook invoked when an object is extended with Base.  The
        # object is additionally extended with RemoteFile (see the
        # TODO above -- remote directories are not detected yet).
        #
        # @param [Object] obj the object being extended
        def self.extended obj
          obj.extend(RemoteFile)
        end

        # Is this resource on a remote host?
        #
        # @return [true,false] always +true+ for this module
        def is_remote?
          true
        end

        # The host of this resource (memoized from the URI).
        #
        # @return [String]
        def host
          @host ||= uri.host
        end

        # Return the query string part of this resource's URI.  Will
        # likely be +nil+ for local resources.
        #
        # @return [String]
        def query_string
          @query_string ||= uri.query
        end

        # Return the path part of this resource's URI.  Will _not_
        # include the +query_string+ or +fragment+.
        #
        # @return [String]
        def path
          @path ||= uri.path
        end

      end

      # Read-only access to the contents of a remote file through
      # open-uri.
      module RemoteFile

        # Return the IO object for this remote file.
        #
        # The mode of this resource is ignored.
        #
        # The stream is opened once and memoized; it is never rewound
        # here, so successive reads continue from wherever the
        # previous one stopped.
        #
        # @return [StringIO] (open-uri may also hand back a Tempfile
        #   for large responses)
        def io
          return @io if @io
          # Loaded lazily so that open-uri is only pulled in when a
          # remote file is actually read.
          require 'open-uri'
          target = uri.to_s
          # Kernel#open no longer accepts URLs on Ruby >= 3.0; URI.open
          # is the supported entry point wherever open-uri defines it.
          # Older Rubies fall back to the patched Kernel#open.
          @io = URI.respond_to?(:open) ? URI.open(target) : open(target) # ignore mode
        end

        # Read the contents of this remote file.
        #
        # @return [String]
        def read
          io.read
        end

        # Return the lines of this remote file.
        #
        # If passed a block then yield each line to the block.
        #
        # @return [Array] the lines of this remote file (split on
        #   "\n") when no block is given
        # @yield [String] each line of this remote file
        def load &block
          if block_given?
            io.each do |line|
              yield line
            end
          else
            read.split("\n")
          end
        end

        # Map over the lines in this remote file.
        #
        # @yield [String] each line of the file
        # @return [Array] the block's result for each line
        def map &block
          io.map(&block)
        end
      end


      # Methods for remote resources that act like directories.
      module RemoteDirectory

        # Return the resource at the base path of this resource joined
        # to +path+.
        #
        #   IMW.open('http://example.com/path/to/dir').join('subdir')
        #   #=> IMW::Resource at 'http://example.com/path/to/dir/subdir'
        #
        # @param [Array<String>] paths
        # @return [IMW::Resource]
        def join *paths
          IMW.open(File.join(stripped_uri.to_s, *paths))
        end

        #
        # TODO -- bloody everything.  what's the best way to tell if
        # the remote URL is a directory?
        #


      end
    end
  end
end
@@ -1,143 +0,0 @@
1
module IMW
  module Schemes

    # Defines methods for reading and writing data to {Amazon
    # S3}[http://aws.amazon.com/s3] buckets.
    #
    #   IMW.open('s3://my_bucket/path/to/some/file.csv')
    #
    # Learn more about {Amazon Web Services}[http://aws.amazon.com].
    #
    # The instance methods rely on the extended object providing
    # +host+, +uri+, +raw_path+ and +stripped_uri+.
    module S3

      # For an S3 resource, the bucket is just the hostname.
      #
      # @return [String]
      def bucket
        host
      end

      # Is this resource an S3 resource?
      #
      # @return [true, false] always +true+ for this module
      def on_s3?
        true
      end
      alias_method :is_s3?, :on_s3?

      # Copy this resource to the +new_uri+.
      #
      # Delegates to IMW::Schemes::S3.get, i.e. the object is
      # downloaded from S3 into +new_uri+.
      #
      # @param [String, IMW::Resource] new_uri
      # @return [IMW::Resource] the new resource
      def cp new_uri
        #IMW::Tools::Transferer.new(:cp, self, new_uri).transfer!
        IMW::Schemes::S3.get(self, new_uri)
      end

      # Does this resource exist on S3?
      #
      # NOTE(review): unlike the module-level helpers this does not
      # call make_connection! first -- presumably a connection is
      # established elsewhere; confirm.
      #
      # @return [true, false]
      def exist?
        AWS::S3::S3Object.exists?(raw_path, bucket)
      end
      alias_method :exists?, :exist?

      # Remove this resource from S3.
      #
      # @return [IMW::Resource] the deleted object
      def rm
        AWS::S3::S3Object.delete(raw_path, bucket)
      end
      alias_method :rm!, :rm

      # Return the S3N URL for this S3 object
      #
      #   resource = IMW.open('s3://my_bucket/path/to/some/obj')
      #   resource.s3n_url
      #   => 's3n://my_bucket/path/to/some/obj'
      #
      # @return [String]
      def s3n_url
        # Only the leading scheme is rewritten: \A anchors to the start
        # of the string (^ would match after any newline) and a single
        # substitution is all that can ever apply.
        uri.to_s.sub(/\As3:/, 's3n:')
      end

      # Return the contents of this S3 object.
      #
      # @return [String]
      def read
        AWS::S3::S3Object.value(raw_path, bucket)
      end

      # Store +source+ into +destination+.
      #
      # @param [String, IMW::Resource, #io] source
      # @param [String, IMW::Resource, #path, #bucket] destination
      # @return [IMW::Resource] the new S3 object
      # @raise [IMW::ArgumentError] if +destination+ is not on S3
      def self.put source, destination
        source      = IMW.open(source)
        # Was assigned to a misspelled local, which left +destination+
        # unopened for every line below.
        destination = IMW.open(destination)
        raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
        make_connection!
        AWS::S3::S3Object.store(destination.raw_path, source.io, destination.bucket)
        destination
      end

      # Download +source+ from S3 into +destination+.
      #
      # @param [String, IMW::Resource, #path, #bucket] source
      # @param [String, IMW::Resource, #write] destination
      # @return [IMW::Resource] the new resource
      # @raise [IMW::ArgumentError] if +source+ is not on S3
      def self.get source, destination
        source      = IMW.open(source)
        destination = IMW.open!(destination)
        raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
        make_connection!
        # Stream chunk by chunk so the object is never held in memory
        # as a whole.
        AWS::S3::S3Object.stream(source.raw_path, source.bucket) do |chunk|
          destination.write(chunk)
        end
        destination.close
        destination.reopen
      end

      # Copy S3 resource +source+ to +destination+.
      #
      # Both resources must live in the same, non-blank bucket.
      #
      # @param [String, IMW::Resource, #path, #bucket] source
      # @param [String, IMW::Resource, #path, #bucket] destination
      # @return [IMW::Resource] the new resource
      # @raise [IMW::PathError] if the buckets are blank or differ
      def self.copy source, destination
        source      = IMW.open(source)
        destination = IMW.open(destination)
        raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
        make_connection!
        # S3Object is the class used by every other call in this
        # module; AWS::S3::Object is not referenced anywhere else.
        AWS::S3::S3Object.copy(source.raw_path, destination.raw_path, destination.bucket)
        destination
      end

      # Return the resource at the base path of this resource joined
      # to +path+.
      #
      #   IMW.open('s3://bucket/path/to/dir').join('subdir')
      #   #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
      #
      # @param [Array<String>] paths
      # @return [IMW::Resource]
      def join *paths
        IMW.open(File.join(stripped_uri.to_s, *paths))
      end

      # Make an S3 connection.
      #
      # Uses settings defined in IMW::AWS_CREDENTIALS.  The result is
      # cached, so repeated calls are cheap.
      #
      # This is a module-level method, so a bare +protected+ keyword
      # would not restrict it; it is deliberately left callable as
      # IMW::Schemes::S3.make_connection!.
      #
      # @return [Object] the cached result of
      #   AWS::S3::Base.establish_connection!
      # @raise [IMW::Error] if IMW::AWS_CREDENTIALS is not defined
      def self.make_connection!
        return @connection if @connection
        raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
        require 'aws/s3'
        @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
      end

    end
  end
end
143
-
@@ -1,129 +0,0 @@
1
- require 'dbi'
2
-
3
module IMW
  module Schemes

    # Encapsulates a connection to a relational database.
    #
    # Calling
    #
    #   IMW.open('sql://host:port/database_name')
    #
    # should create a connection to a database at the given +port+ on
    # the given +host+ using the given +database_name+.
    module SQL

      # A base implementation of a connection to a relational
      # database.
      #
      # The Base#extended method will examine the +scheme+ of an
      # object extended with this module and choose a more specific
      # database adaptor module to extend with as well.
      #
      # The methods below additionally rely on the extended object
      # providing +path+, +host+, +user+, +uri+ and +resource_options+.
      module Base

        # When an IMW::Resource is extended use URI's scheme to choose
        # which other module inside IMW::Schemes::SQL to extend with.
        #
        # @param [Object] obj the object being extended
        # @raise [IMW::ArgumentError] if the scheme is neither
        #   'mysql' nor 'postgresql'
        def self.extended obj
          case obj.scheme
          when 'mysql'      then obj.extend(IMW::Schemes::SQL::MySQL)
          when 'postgresql' then obj.extend(IMW::Schemes::SQL::PostgreSQL)
          else raise IMW::ArgumentError.new("Unknown database type: #{obj.scheme}")
          end
        end

        # For an SQL connection the database will be the same as the
        # path (with every '/' removed).
        #
        # @return [String]
        def database
          @database ||= path.tr('/','')
        end

        # Redefine each method inappropriate for databases so that it
        # returns +nil+.
        [:dirname, :basename, :extname, :extension, :name].each do |method|
          define_method(method) do
            nil
          end
        end

        # Return a summary of this database.
        #
        # Purposefully does not call +super+.
        #
        # @return [Hash] with keys +:uri+ and +:database+
        def external_summary
          {
            :uri      => uri.to_s,
            :database => database
          }
        end

        # The (cached) database connection for this resource.
        #
        # The DSN is assembled as "<dbi_module>:<database>:<host>",
        # where +dbi_module+ comes from the adaptor module chosen in
        # Base.extended.
        #
        # @return [DBI::DatabaseHandle]
        def connection
          @connection ||= DBI.connect("#{dbi_module}:#{database}:#{host}", user, password)
        end

        # Return the password associated with user's account on the
        # given database.
        #
        # @return [String]
        def password
          @password ||= resource_options[:password]
        end

        # Return an array of the table names in the current database.
        #
        # Asks the DBI handle rather than issuing "SHOW TABLES", which
        # is MySQL-only syntax and fails on the PostgreSQL adaptor this
        # module also supports.
        #
        # @return [Array<String>]
        def tables
          connection.tables
        end

        # Execute the (joined) +query_string_parts+ using this
        # resource's cached connection.
        #
        # If passed a block, yield each row of the result set to the
        # block.
        #
        # @param [Array<String>] query_string_parts joined with a
        #   single space to form the query
        # @yield [DBI::Row]
        # @return [DBI::StatementHandle] when no block is given;
        #   otherwise whatever +fetch+ returns
        def execute *query_string_parts, &block
          query = query_string_parts.join(' ')
          IMW.announce_if_verbose "Querying #{self}: #{query}"
          statement = connection.execute(query)
          block_given? ? statement.fetch(&block) : statement
        end
      end

      # Module for MySQL databases.
      module MySQL

        # Return the name of the DBI module used to connect to MySQL.
        #
        # @return [String]
        def dbi_module
          "DBI:Mysql"
        end
      end

      # Module for PostgreSQL databases.
      module PostgreSQL

        # Return the name of the DBI module used to connect to PostgreSQL.
        #
        # @return [String]
        def dbi_module
          "DBI:Pg"
        end
      end

    end
  end
end
129
-
@@ -1,12 +0,0 @@
1
module IMW
  # Namespace for IMW's tools.  Each constant is registered for
  # autoload, so its file is only required on first reference.
  module Tools
    [
      [:Aggregator,        'imw/tools/aggregator'],
      [:Archiver,          'imw/tools/archiver'],
      [:Transferer,        'imw/tools/transferer'],
      [:Summarizer,        'imw/tools/summarizer'],
      [:ExtensionAnalyzer, 'imw/tools/extension_analyzer'],
      [:Downloader,        'imw/tools/downloader']
    ].each do |constant_name, feature_path|
      autoload constant_name, feature_path
    end
  end
end
11
-
12
-
@@ -1,148 +0,0 @@
1
- require 'imw/resource'
2
-
3
module IMW
  module Tools

    # Aggregates resources into a single local directory.
    #
    # The directory should already exist.
    #
    # Any local resources will be copied into the directory.
    #
    # Any remote resources will be downloaded into the directory.
    #
    # If any of the resources are archives, they will first be
    # extracted, with only their contents winding up in the final
    # directory (the file hierarchy of the archive will be preserved).
    #
    # If any of the resources are compressed, they will first be
    # uncompressed before being added to the directory.
    #
    # As an example:
    #
    #   aggregator = IMW::Tools::Aggregator.new '/path/to/agg_dir'
    #   aggregator.aggregate '/path/to/my/regular_file.tsv', '/path/to/an/archive.tar.bz2', '/path/to/my_compressed_file.gz', 'http://mywebsite.com/index.html'
    #
    # This will create a directory at <tt>/path/to/agg_dir</tt> which
    # looks like
    #
    #   path_to_agg_dir
    #   |-- regular_file.tsv
    #   |-- archive
    #   |   |-- internal_archive_file_1
    #   |   |-- internal_archive_file_2
    #   |   ...
    #   |   `-- internal_archive_file_N
    #   |-- my_compressed_file
    #   `-- index.html
    #
    # Notice that
    #
    # - the local file was copied over
    #
    # - the remote file was downloaded and copied over
    #
    # - the tar archive was first extracted
    #
    # - the compressed file was aggregated
    #
    # This process can take a while when the constituent files are
    # large.
    class Aggregator

      attr_reader :dir

      # Create an Aggregator that collects into +dir+.
      #
      # @param [String, IMW::Resource] dir an existing local directory
      def initialize dir
        # dir= opens and validates the directory itself, so it is not
        # opened here a second time.
        self.dir = dir
      end

      # Set the directory for this Aggregator.
      #
      # Will raise unless +new_dir+ is an existing, local directory.
      #
      # @param [String, IMW::Resource] new_dir
      # @return [IMW::Resource]
      # @raise [IMW::SchemeError] if the directory is not local
      # @raise [IMW::PathError] if the resource is not a directory
      def dir= new_dir
        @dir = IMW.open(new_dir)
        raise IMW::SchemeError.new("Aggregator requires a local directory, not #{@dir}") unless @dir.is_local?
        @dir.should_exist! "Aggregator requires the aggregation directory to already exist"
        raise IMW::PathError.new("Aggregator requires a directory, not #{@dir}") unless @dir.is_directory?
        @dir
      end

      # Return a list of error messages for this Aggregator.
      #
      # @return [Array] the error messages
      def errors
        @errors ||= []
      end

      # Was this aggregator successful (did it not have any errors)?
      #
      # @return [true, false]
      def success?
        errors.empty?
      end

      # Aggregate the given inputs into this Aggregator's +dir+.
      #
      # The error list is reset at the start of every call.
      #
      # Remote inputs are downloaded into +dir+ first.  A download
      # that turns out to be compressed or an archive is then
      # processed like a local input and the downloaded file removed.
      #
      # @param [Array<IMW::Resource,String>] paths_or_inputs nested
      #   arrays are flattened and +nil+ entries skipped
      # @return [Array] the flattened, compacted inputs (the value of
      #   the +each+ call).  NOTE(review): this was previously
      #   documented as returning the Aggregator -- confirm which
      #   callers expect.
      def aggregate *paths_or_inputs
        @errors = []
        paths_or_inputs.flatten.compact.each do |path_or_input|
          input = IMW.open(path_or_input)
          if input.is_local?
            aggregate_local_input(input)
          else
            download = download_remote_input(input)
            if download.is_compressed? || download.is_archive?
              aggregate_local_input(download)
              download.rm!
            end
          end
        end
      end

      protected

      # Aggregate a local input.
      #
      # Will extract archives, decompress compressed files, and copy
      # regular files and directories (but will not recurse into
      # directories to find archives or compressed files).
      #
      # @param [IMW::Resource] input
      def aggregate_local_input input
        new_path = File.join(dir.path, input.basename)
        case
        when input.is_archive?
          IMW.announce_if_verbose("Aggregating and extracting #{input} to #{dir}...")
          # Extraction happens from inside the aggregation directory.
          FileUtils.cd(dir.path) do
            input.extract
          end
        when input.is_compressed?
          IMW.announce_if_verbose("Decompressing #{input}...")
          input.cp(new_path).decompress!
        else
          IMW.announce_if_verbose("Copying #{input}...")
          input.cp(new_path)
        end
      end

      # Download a remote input to this Aggregator's +dir+.
      #
      # @param [IMW::Resource] input
      # @return [IMW::Resource] presumably the downloaded local copy
      #   (whatever +cp+ returns)
      def download_remote_input input
        IMW.announce_if_verbose("Downloading #{input}...")
        input.cp(File.join(dir.path, input.effective_basename))
      end

      def add_processing_error error # :nodoc:
        IMW.logger.warn error
        errors << error
      end

    end
  end
end