imw 0.2.18 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
data/lib/imw/schemes/remote.rb
DELETED
|
@@ -1,119 +0,0 @@
|
|
|
1
|
-
module IMW
  module Schemes

    # Contains modules which define methods appropriate for remote
    # resources, no matter the protocol.
    module Remote

      # Defines methods appropriate for accessing a remote resource,
      # no matter the protocol.
      #
      # Each accessor reads from the +uri+ of the extended object and
      # memoizes the answer in an instance variable.
      module Base

        #
        # TODO -- self.extended should extend by RemoteDirectory when appropriate
        #

        # Hook run when an object is extended with this module: the
        # object is additionally extended with RemoteFile.
        #
        # @param [Object] obj the object being extended
        def self.extended obj
          obj.extend(RemoteFile)
        end

        # Is this resource on a remote host?
        #
        # @return [true,false]
        def is_remote?
          true
        end

        # The host of this resource.
        #
        # @return [String]
        def host
          @host ||= uri.host
        end

        # Return the query string part of this resource's URI.  Will
        # likely be +nil+ for local resources.
        #
        # @return [String]
        def query_string
          @query_string ||= uri.query
        end

        # Return the path part of this resource's URI.  Will _not_
        # include the +query_string+ or +fragment+.
        #
        # @return [String]
        def path
          @path ||= uri.path
        end

      end

      # Methods for reading the contents of a remote file through
      # open-uri.
      module RemoteFile

        # Return the (memoized) IO object for this remote file.
        #
        # The mode of this resource is ignored.
        #
        # The parsed URI is asked to open itself because Kernel#open
        # stopped opening URLs in Ruby 3.0.  A URI whose scheme
        # open-uri does not handle falls back to a plain file open,
        # which is what open-uri's Kernel#open used to do.
        #
        # @return [StringIO, Tempfile, File]
        def io
          require 'open-uri'
          @io ||= begin
            location = URI.parse(uri.to_s)
            location.respond_to?(:open) ? location.open : File.open(uri.to_s)
          end
        end

        # Read the contents of this remote file.
        #
        # @return [String]
        def read
          io.read
        end

        # Return the lines of this remote file.
        #
        # If passed a block then yield each line to the block.
        #
        # @return [Array] the lines of this remote file
        # @yield [String] each line of this remote file
        def load &block
          if block_given?
            io.each(&block)
          else
            read.split("\n")
          end
        end

        # Map over the lines in this remote file.
        #
        # @yield [String] each line of the file
        def map &block
          io.map(&block)
        end
      end


      # Methods for treating a remote resource as a directory.
      module RemoteDirectory

        # Return the resource at the base path of this resource joined
        # to +path+.
        #
        #   IMW.open('http://example.com/path/to/dir').join('subdir')
        #   #=> IMW::Resource at 'http://example.com/path/to/dir/subdir'
        #
        # @param [Array<String>] paths
        # @return [IMW::Resource]
        def join *paths
          IMW.open(File.join(stripped_uri.to_s, *paths))
        end

        #
        # TODO -- bloody everything.  what's the best way to tell if
        # the remote URL is a directory?
        #

      end
    end
  end
end
|
data/lib/imw/schemes/s3.rb
DELETED
|
@@ -1,143 +0,0 @@
|
|
|
1
|
-
module IMW
  module Schemes

    # Defines methods for reading and writing data to {Amazon
    # S3}[http://aws.amazon.com/s3] buckets.
    #
    #   IMW.open('s3://my_bucket/path/to/some/file.csv')
    #
    # Learn more about {Amazon Web Services}[http://aws.amazon.com].
    #
    # NOTE(review): the instance methods +exist?+, +rm+ and +read+ talk
    # to AWS::S3::S3Object without calling +make_connection!+ first;
    # presumably a connection is expected to exist already -- confirm.
    module S3

      # For an S3 resource, the bucket is just the hostname.
      #
      # @return [String]
      def bucket
        host
      end

      # Is this resource an S3 resource?
      #
      # @return [true, false]
      def on_s3?
        true
      end
      alias_method :is_s3?, :on_s3?

      # Copy this resource to the +new_uri+.
      #
      # @param [String, IMW::Resource] new_uri
      # @return [IMW::Resource] the new resource
      def cp new_uri
        #IMW::Tools::Transferer.new(:cp, self, new_uri).transfer!
        IMW::Schemes::S3.get(self, new_uri)
      end

      # Does this resource exist on S3?
      #
      # @return [true, false]
      def exist?
        AWS::S3::S3Object.exists?(raw_path, bucket)
      end
      alias_method :exists?, :exist?

      # Remove this resource from S3.
      #
      # @return whatever AWS::S3::S3Object.delete returns
      def rm
        AWS::S3::S3Object.delete(raw_path, bucket)
      end
      alias_method :rm!, :rm

      # Return the S3N URL for this S3 object
      #
      #   resource = IMW.open('s3://my_bucket/path/to/some/obj')
      #   resource.s3n_url
      #   => 's3n://my_bucket/path/to/some/obj'
      #
      # @return [String]
      def s3n_url
        # Only the leading scheme is rewritten, so anchor at the very
        # start of the string and substitute once.
        uri.to_s.sub(/\As3:/, 's3n:')
      end

      # Return the contents of this S3 object.
      #
      # @return [String]
      def read
        AWS::S3::S3Object.value(raw_path, bucket)
      end

      # Store +source+ into +destination+.
      #
      # Both arguments are passed through IMW.open, so plain URI
      # strings are accepted as well as resources.
      #
      # @param [String, IMW::Resource, #io] source
      # @param [String, IMW::Resource, #path, #bucket] destination
      # @return [IMW::Resource] the new S3 object
      # @raise [IMW::ArgumentError] if +destination+ is not on S3
      def self.put source, destination
        source      = IMW.open(source)
        destination = IMW.open(destination)
        raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
        make_connection!
        AWS::S3::S3Object.store(destination.raw_path, source.io, destination.bucket)
        destination
      end

      # Download +source+ from S3 into +destination+.
      #
      # The object is streamed chunk by chunk into +destination+,
      # which is then closed and reopened.
      #
      # @param [String, IMW::Resource, #path, #bucket] source
      # @param [String, IMW::Resource, #write] destination
      # @return [IMW::Resource] the new resource
      # @raise [IMW::ArgumentError] if +source+ is not on S3
      def self.get source, destination
        source      = IMW.open(source)
        destination = IMW.open!(destination)
        raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
        make_connection!
        AWS::S3::S3Object.stream(source.raw_path, source.bucket) do |chunk|
          destination.write(chunk)
        end
        destination.close
        destination.reopen
      end

      # Copy S3 resource +source+ to +destination+.
      #
      # Both resources must live in the same, non-blank bucket.
      #
      # @param [String, IMW::Resource, #path, #bucket] source
      # @param [String, IMW::Resource, #path, #bucket] destination
      # @return [IMW::Resource] the new resource
      # @raise [IMW::PathError] if either bucket is blank or the buckets differ
      def self.copy source, destination
        source      = IMW.open(source)
        destination = IMW.open(destination)
        buckets     = [source.bucket, destination.bucket]
        # A bucket is usable when it is neither nil nor whitespace-only.
        usable      = buckets.none? { |name| name.to_s.strip.empty? }
        raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless usable && buckets.first == buckets.last
        make_connection!
        AWS::S3::S3Object.copy(source.raw_path, destination.raw_path, destination.bucket)
        destination
      end

      # Return the resource at the base path of this resource joined
      # to +path+.
      #
      #   IMW.open('s3://bucket/path/to/dir').join('subdir')
      #   #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
      #
      # @param [Array<String>] paths
      # @return [IMW::Resource]
      def join *paths
        IMW.open(File.join(stripped_uri.to_s, *paths))
      end

      # Make an S3 connection.
      #
      # Uses settings defined in IMW::AWS_CREDENTIALS.  The result is
      # memoized on this module, and the 'aws/s3' library is only
      # loaded the first time a connection is needed.
      #
      # Visibility keywords such as +protected+ do not apply to
      # methods defined with <tt>def self.</tt>, so this method is
      # callable from outside the module.
      #
      # @return whatever AWS::S3::Base.establish_connection! returns
      # @raise [IMW::Error] if IMW::AWS_CREDENTIALS is not defined
      def self.make_connection!
        return @connection if @connection
        raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
        require 'aws/s3'
        @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
      end

    end
  end
end
|
|
143
|
-
|
data/lib/imw/schemes/sql.rb
DELETED
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
require 'dbi'
|
|
2
|
-
|
|
3
|
-
module IMW
  module Schemes

    # Encapsulates a connection to a relational database.
    #
    # Calling
    #
    #   IMW.open('sql://host:port/database_name')
    #
    # should create a connection to a database at the given +port+ on
    # the given +host+ using the given +database_name+.
    module SQL

      # A base implementation of a connection to a relational
      # database.
      #
      # The Base#extended method will examine the +scheme+ of an
      # object extended with this module and choose a more specific
      # database adaptor module to extend with as well.
      module Base

        # When an IMW::Resource is extended use URI's scheme to choose
        # which other module inside IMW::Schemes::SQL to extend with.
        #
        # @param [Object] obj the object being extended
        # @raise [IMW::ArgumentError] if the scheme is neither 'mysql' nor 'postgresql'
        def self.extended obj
          case obj.scheme
          when 'mysql'      then obj.extend(IMW::Schemes::SQL::MySQL)
          when 'postgresql' then obj.extend(IMW::Schemes::SQL::PostgreSQL)
          else raise IMW::ArgumentError.new("Unknown database type: #{obj.scheme}")
          end
        end

        # For an SQL connection the database will be the same as the
        # path (with every slash removed).
        #
        # @return [String]
        def database
          @database ||= path.tr('/','')
        end

        # Redefine each method inappropriate for databases so that it
        # returns +nil+.
        [:dirname, :basename, :extname, :extension, :name].each do |method|
          define_method(method) do
            nil
          end
        end

        # Return a summary of this database.
        #
        # Purposefully does not call +super+.
        #
        # @return [Hash]
        def external_summary
          {
            :uri      => uri.to_s,
            :database => database
          }
        end

        # The (cached) database connection for this resource.
        #
        # @return [DBI::DatabaseHandle]
        def connection
          @connection ||= DBI.connect("#{dbi_module}:#{database}:#{host}", user, password)
        end

        # Return the password associated with user's account on the
        # given database.
        #
        # @return [String]
        def password
          @password ||= resource_options[:password]
        end

        # Return an array of the table names in the current database.
        #
        # NOTE(review): "SHOW TABLES" looks like MySQL syntax -- confirm
        # this behaves as intended for PostgreSQL resources.
        #
        # @return [Array<String>]
        def tables
          [].tap do |table_names|
            execute("SHOW TABLES") do |row|
              table_names << row.first
            end
          end
        end

        # Execute the (joined) +query_string_parts+ using this
        # resource's cached connection.
        #
        # If passed a block, yield each row of the result set to the
        # block and then finish the statement handle (even when the
        # block raises) so that it is not leaked.  Without a block the
        # open statement handle is returned and the caller is
        # responsible for finishing it.
        #
        # @param [Array<String>] query_string_parts
        # @yield [DBI::Row]
        # @return [DBI::StatementHandle] when no block is given;
        #   otherwise whatever the statement's +fetch+ returns
        def execute *query_string_parts, &block
          query = query_string_parts.join(' ')
          IMW.announce_if_verbose "Querying #{self}: #{query}"
          statement = connection.execute(query)
          return statement unless block_given?
          begin
            statement.fetch(&block)
          ensure
            statement.finish
          end
        end
      end

      # Module for MySQL databases.
      module MySQL

        # Return the name of the DBI module used to connect to MySQL.
        #
        # @return [String]
        def dbi_module
          "DBI:Mysql"
        end
      end

      # Module for PostgreSQL databases.
      module PostgreSQL

        # Return the name of the DBI module used to connect to PostgreSQL.
        #
        # @return [String]
        def dbi_module
          "DBI:Pg"
        end
      end

    end
  end
end
|
|
129
|
-
|
data/lib/imw/tools.rb
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
module IMW
  # Namespace for IMW's tools.  Each constant below is registered with
  # +autoload+, so its file is only loaded on first reference.
  module Tools
    [
      [:Aggregator,        'imw/tools/aggregator'],
      [:Archiver,          'imw/tools/archiver'],
      [:Transferer,        'imw/tools/transferer'],
      [:Summarizer,        'imw/tools/summarizer'],
      [:ExtensionAnalyzer, 'imw/tools/extension_analyzer'],
      [:Downloader,        'imw/tools/downloader']
    ].each do |tool, feature|
      autoload tool, feature
    end
  end
end
|
|
11
|
-
|
|
12
|
-
|
data/lib/imw/tools/aggregator.rb
DELETED
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
require 'imw/resource'
|
|
2
|
-
|
|
3
|
-
module IMW
  module Tools

    # Aggregates resources into a single local directory.
    #
    # The directory should already exist.
    #
    # Any local resources will be copied into the directory.
    #
    # Any remote resources will be downloaded into the directory.
    #
    # If any of the resources are archives, they will first be
    # extracted, with only their contents winding up in the final
    # directory (the file hierarchy of the archive will be preserved).
    #
    # If any of the resources are compressed, they will first be
    # uncompressed before being added to the directory.
    #
    # As an example:
    #
    #   aggregator = IMW::Tools::Aggregator.new '/path/to/agg_dir'
    #   aggregator.aggregate '/path/to/my/regular_file.tsv', '/path/to/an/archive.tar.bz2', '/path/to/my_compressed_file.gz', 'http://mywebsite.com/index.html'
    #
    # This will create a directory at <tt>/path/to/agg_dir</tt> which
    # looks like
    #
    #   path_to_agg_dir
    #   |-- regular_file.tsv
    #   |-- archive
    #   |   |-- internal_archive_file_1
    #   |   |-- internal_archive_file_2
    #   |   ...
    #   |   `-- internal_archive_file_N
    #   |-- my_compressed_file
    #   `-- index.html
    #
    # Notice that
    #
    #   - the local file was copied over
    #
    #   - the remote file was downloaded and copied over
    #
    #   - the tar archive was first extracted
    #
    #   - the compressed file was aggregated
    #
    # This process can take a while when the constituent files are
    # large.
    class Aggregator

      attr_reader :dir

      # Create an Aggregator which collects resources into +dir+.
      #
      # @param [String, IMW::Resource] dir an existing, local directory
      def initialize dir
        self.dir = dir
      end

      # Set the directory for this Aggregator.
      #
      # Will raise unless +new_dir+ is an existing, local directory.
      #
      # @param [String, IMW::Resource] new_dir
      # @return [IMW::Resource]
      # @raise [IMW::SchemeError] if +new_dir+ is not local
      # @raise [IMW::PathError] if +new_dir+ is not a directory
      def dir= new_dir
        @dir = IMW.open(new_dir)
        # NOTE(review): the original referenced IMW::SchemError, which
        # looks like a misspelling of IMW::SchemeError -- confirm the
        # constant name against IMW's error definitions.
        raise IMW::SchemeError.new("Aggregator requires a local directory, not #{@dir}") unless @dir.is_local?
        @dir.should_exist! "Aggregator requires the aggregation directory to already exist"
        raise IMW::PathError.new("Aggregator requires a directory, not #{@dir}") unless @dir.is_directory?
        @dir
      end

      # Return a list of error messages for this Aggregator.
      #
      # @return [Array] the error messages
      def errors
        @errors ||= []
      end

      # Was this aggregator successful (did it not have any errors)?
      #
      # @return [true, false]
      def success?
        errors.empty?
      end

      # Aggregate the given inputs into this Aggregator's +dir+.
      #
      # Nested arrays are flattened and +nil+ entries are ignored.
      # The list of errors is reset at the start of every call.
      # Local inputs are aggregated directly.  Remote inputs are
      # downloaded first; a download which turns out to be an archive
      # or a compressed file is then aggregated like a local input and
      # the downloaded file itself is removed.
      #
      # @param [Array<IMW::Resource,String>] paths_or_inputs
      # @return [IMW::Tools::Aggregator] this aggregator
      def aggregate *paths_or_inputs
        @errors = []
        paths_or_inputs.flatten.compact.each do |path_or_input|
          input = IMW.open(path_or_input)
          if input.is_local?
            aggregate_local_input(input)
          else
            download = download_remote_input(input)
            if download.is_compressed? || download.is_archive?
              aggregate_local_input(download)
              download.rm!
            end
          end
        end
        self
      end

      protected

      # Aggregate a local input.
      #
      # Will extract archives, decompress compressed files, and copy
      # regular files and directories (but will not recurse into
      # directories to find archives or compressed files).
      #
      # @param [IMW::Resource] input
      def aggregate_local_input input
        new_path = File.join(dir.path, input.basename)
        case
        when input.is_archive?
          IMW.announce_if_verbose("Aggregating and extracting #{input} to #{dir}...")
          # Extract from inside the aggregation directory.
          FileUtils.cd(dir.path) do
            input.extract
          end
        when input.is_compressed?
          IMW.announce_if_verbose("Decompressing #{input}...")
          input.cp(new_path).decompress!
        else
          IMW.announce_if_verbose("Copying #{input}...")
          input.cp(new_path)
        end
      end

      # Download a remote input to this Aggregator's +dir+.
      #
      # @param [IMW::Resource] input
      # @return whatever +input.cp+ returns
      def download_remote_input input
        IMW.announce_if_verbose("Downloading #{input}...")
        input.cp(File.join(dir.path, input.effective_basename))
      end

      # Log +error+ as a warning and append it to +errors+.
      def add_processing_error error # :nodoc:
        IMW.logger.warn error
        errors << error
      end

    end
  end
end
|