imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,118 +0,0 @@
1
- require 'imw'
2
- require 'optparse'
3
-
4
- module IMW
5
-
6
- RunnerError = Class.new(IMW::Error)
7
-
8
- class Runner
9
-
10
- DEFAULT_OPTIONS = {
11
- :requires => [],
12
- :selectors => [],
13
- :dry_run => false
14
- }
15
-
16
- attr_reader :args, :options
17
-
18
- def initialize *args
19
- @args = args
20
- @options = DEFAULT_OPTIONS.dup
21
- parser.parse!(args)
22
- end
23
-
24
- def parser
25
- OptionParser.new do |opts|
26
- opts.banner = "usage: imw [OPTIONS] TASK"
27
- opts.separator <<EOF
28
-
29
- Run TASK for all datasets in the repository. IMW will read any
30
- *.imw files in the current directory by default.
31
-
32
- Options include
33
-
34
- EOF
35
-
36
- opts.on('-v', '--verbose', "Print verbose output") do
37
- IMW.verbose = true # class level, see IMW::Runner.verbose?
38
- end
39
-
40
- opts.on('-d', '--skip-dependencies', "Execute given tasks without invoking dependencies first") do
41
- options[:execute] = true
42
- end
43
-
44
- opts.on('-l', '--list', "List datasets in repository") do
45
- options[:list] = true
46
- end
47
-
48
- opts.on('-s', '--selector SELECTOR', "Filter datasets by regexp SELECTOR. Can be given more than once.") do |selector|
49
- options[:selectors] << selector
50
- end
51
-
52
- opts.on('-r', '--require PATH', "Require PATH. Can be given more than once.") do |path|
53
- options[:requires] << path
54
- end
55
-
56
- end
57
- end
58
-
59
- def require_files
60
- Dir['*.imw'].each { |path| load File.expand_path(path) }
61
- Dir['*.rb'].each { |path| require path.gsub(/\.rb$/,'') }
62
- options[:requires].each do |path|
63
- IMW.open(path) do |requireable|
64
- if requireable.directory?
65
- requireable["**/*.rb"].each { |file| require file }
66
- requireable["**/*.imw"].each { |file| load file }
67
- else
68
- require requireable.path
69
- end
70
- end
71
- end
72
- end
73
-
74
- def task
75
- args.first
76
- end
77
-
78
- def handles
79
- if options[:selectors].blank?
80
- IMW.repository.keys.sort
81
- else
82
- IMW.repository.handles.map do |handle|
83
- handle if options[:selectors].all? { |selector| handle.to_s =~ Regexp.new(selector) }
84
- end.compact.sort
85
- end
86
- end
87
-
88
- def datasets
89
- handles.map { |handle| IMW.repository[handle] }
90
- end
91
-
92
- def list!
93
- puts handles
94
- exit
95
- end
96
-
97
- def run_task!
98
- datasets.each do |dataset|
99
- dataset[task].send(options[:execute] ? :execute : :invoke)
100
- end
101
- exit
102
- end
103
-
104
- def run!
105
- require_files
106
- case
107
- when options[:list]
108
- list!
109
- when task.blank?
110
- puts parser
111
- exit 1
112
- else
113
- run_task!
114
- end
115
- end
116
- end
117
- end
118
-
@@ -1,23 +0,0 @@
1
- module IMW
2
- module Schemes
3
- autoload :Local, 'imw/schemes/local'
4
- autoload :Remote, 'imw/schemes/remote'
5
- autoload :S3, 'imw/schemes/s3'
6
- autoload :HTTP, 'imw/schemes/http'
7
- autoload :HTTPS, 'imw/schemes/http'
8
- autoload :HDFS, 'imw/schemes/hdfs'
9
- autoload :SQL, 'imw/schemes/sql'
10
-
11
- HANDLERS = [
12
- ["Schemes::Local::Base", Proc.new { |resource| resource.scheme == 'file' || resource.scheme.blank? } ],
13
- ["Schemes::Remote::Base", Proc.new { |resource| resource.scheme != 'file' && resource.scheme.present? } ],
14
- ["Schemes::S3", %r{^s3://}i ],
15
- ["Schemes::HTTP", %r{^http://}i ],
16
- ["Schemes::HTTPS", %r{^https://}i ],
17
- ["Schemes::HDFS", %r{^hdfs://}i ],
18
- ["Schemes::SQL::Base", %r{^\w+sql://}i ]
19
- ]
20
- end
21
- end
22
-
23
-
@@ -1,142 +0,0 @@
1
- module IMW
2
- module Schemes
3
-
4
- # Defines methods for reading and writing data from an FTP server.
5
- #
6
- # IMW.open('ftp://user:pass@my_bucket/path/to/some/file.csv')
7
- #
8
- # Learn more about {Amazon Web Services}[http://aws.amazon.com].
9
- module FTP
10
-
11
- module Base
12
-
13
- # Is this resource an FTP resource?
14
- #
15
- # @return [true, false]
16
- def on_ftp?
17
- true
18
- end
19
- alias_method :is_ftp?, :on_ftp?
20
-
21
- # Copy this resource to the +new_uri+.
22
- #
23
- # @param [String, IMW::Resource] new_uri
24
- # @return [IMW::Resource] the new resource
25
- def cp new_uri
26
- local_obj = IMW.open(new_uri)
27
- raise IMW::PathError.new("FTP resources (#{self}) can only be downloaded to a local path") unless local_obj.is_local?
28
- local_obj.dir.should_exist!
29
- FTP.open(host, user, password) do |ftp|
30
- ftp.get(path, local_obj.path)
31
- end
32
- local_obj
33
- end
34
-
35
- # Does this resource exist on S3?
36
- #
37
- # @return [true, false]
38
- def exist?
39
- s3_object.exists?
40
- end
41
- alias_method :exists?, :exist?
42
-
43
- # Remove this resource from S3.
44
- #
45
- # @return [IMW::Resource] the deleted object
46
- def rm
47
- s3_object.delete
48
- end
49
- alias_method :rm!, :rm
50
-
51
- # Return the S3N URL for this S3 object
52
- #
53
- # resource = IMW.open('s3://my_bucket/path/to/some/obj')
54
- # resource.s3n_url
55
- # => 's3n://my_bucket/path/to/some/obj'
56
- #
57
- # @return [String]
58
- def s3n_url
59
- uri.to_s.gsub(/^s3:/, 's3n:')
60
- end
61
-
62
- # Return the contents of this S3 object.
63
- #
64
- # @return [String]
65
- def read
66
- s3_object.value
67
- end
68
-
69
- # Store +source+ into +destination+.
70
- #
71
- # @param [String, IMW::Resource, #io] source
72
- # @param [String, IMW::Resource, #path, #bucket] destination
73
- # @return [IMW::Resource] the new S3 object
74
- def self.put source, destination
75
- source = IMW.open(source)
76
- destintation = IMW.open(destination)
77
- raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
78
- make_connection!
79
- AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
80
- destination
81
- end
82
-
83
- # Download +source+ from S3 into +destination+.
84
- #
85
- # @param [String, IMW::Resource, #path, #bucket] source
86
- # @param [String, IMW::Resource, #write] destination
87
- # @return [IMW::Resource] the new resource
88
- def self.get source, destination
89
- source = IMW.open(source)
90
- destination = IMW.open!(destination)
91
- raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
92
- make_connection!
93
- AWS::S3::S3Object.stream(source.path, source.bucket) do |chunk|
94
- destination.write(chunk)
95
- end
96
- destination.close
97
- destination.reopen
98
- end
99
-
100
- # Copy S3 resource +source+ to +destination+.
101
- #
102
- # @param [String, IMW::Resource, #path, #bucket] source
103
- # @param [String, IMW::Resource, #path, #bucket] destination
104
- # @return [IMW::Resource] the new resource
105
- def self.copy source, destination
106
- source = IMW.open(source)
107
- destination = IMW.open(destination)
108
- raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
109
- make_connection!
110
- AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
111
- destination
112
- end
113
-
114
- # Return the resource at the base path of this resource joined
115
- # to +path+.
116
- #
117
- # IMW.open('s3:://bucket/path/to/dir').join('subdir')
118
- # #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
119
- #
120
- # @param [Array<String>] paths
121
- # @return [IMW::Resource]
122
- def join *paths
123
- IMW.open(File.join(stripped_uri.to_s, *paths))
124
- end
125
-
126
- protected
127
- # Make an S3 connection.
128
- #
129
- # Uses settings defined in IMW::AWS_CREDENTIALS.
130
- #
131
- # @return [AWS
132
- def self.make_connection!
133
- return @connection if @connection
134
- raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
135
- require 'aws/s3'
136
- @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
137
- end
138
-
139
- end
140
- end
141
- end
142
-
@@ -1,251 +0,0 @@
1
- module IMW
2
- module Schemes
3
-
4
- # Defines methods for reading and writing data to/from an
5
- # HDFS[http://hadoop.apache.org/common/docs/current/hdfs_design.html]]
6
- #
7
- # Learn more about Hadoop[http://hadoop.apache.org] and the
8
- # {Hadoop Distributed
9
- # Filesystem}[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
10
- module HDFS
11
-
12
- # Checks to see if this is a file or directory
13
- def self.extended obj
14
- obj.extend(obj.is_directory? ? HDFSDirectory : HDFSFile)
15
- end
16
-
17
- # Is this resource an HDFS resource?
18
- #
19
- # @return [true, false]
20
- def on_hdfs?
21
- true
22
- end
23
- alias_method :is_hdfs?, :on_hdfs?
24
-
25
- # Copy this resource to the +new_uri+.
26
- #
27
- # @param [String, IMW::Resource] new_uri
28
- # @return [IMW::Resource] the new resource
29
- def cp new_uri
30
- IMW::Tools::Transferer.new(:cp, self, new_uri).transfer!
31
- end
32
-
33
- # Move this resource to the +new_uri+.
34
- #
35
- # @param [String, IMW::Resource] new_uri
36
- # @return [IMW::Resource] the new resource
37
- def mv new_uri
38
- IMW::Tools::Transferer.new(:mv, self, new_uri).transfer!
39
- end
40
-
41
- # Delete this resource from the HDFS.
42
- #
43
- # @option options [true,false] :skip_trash
44
- def rm options={}
45
- should_exist!("Cannot delete.")
46
- args = [:rm]
47
- args << '-skipTrash' if options[:skip] || options[:skip_trash] || options[:skipTrash]
48
- args << path
49
- HDFS.fs(*args)
50
- self
51
- end
52
- alias_method :rm!, :rm
53
-
54
-
55
- # Does this path exist on the HDFS?
56
- #
57
- # @return [true, false]
58
- def exist?
59
- return @exist unless @exist.nil?
60
- refresh!
61
- @exist
62
- end
63
- alias_method :exists?, :exist?
64
-
65
-
66
- # Return the size (in bytes) of this resource on the HDFS.
67
- #
68
- # This value is cached. Call +refresh+ to refresh the cache
69
- # manually.
70
- #
71
- # @return [Fixnum]
72
- def size
73
- return @size unless @size.nil?
74
- refresh!
75
- should_exist!("Cannot report size")
76
- @size
77
- end
78
-
79
- # Return the number of directories contained at or below this
80
- # path on the HDFS.
81
- #
82
- # This value is cached. Call +refresh+ to refresh the cache
83
- # manually.
84
- #
85
- # @return [Fixnum]
86
- def num_dirs
87
- return @num_dirs unless @num_dirs.nil?
88
- refresh!
89
- should_exist!("Cannot report number of directories.")
90
- @num_dirs
91
- end
92
-
93
- # Return the number of files contained at or below this path
94
- # on the HDFS.
95
- #
96
- # This value is cached. Call +refresh+ to refresh the cache
97
- # manually.
98
- #
99
- # @return [Fixnum]
100
- def num_files
101
- return @num_files unless @num_files.nil?
102
- refresh!
103
- should_exist!("Cannot report number of files.")
104
- @num_files
105
- end
106
-
107
- # Is this resource an HDFS directory?
108
- #
109
- # @return [true, false]
110
- def is_directory?
111
- exist? && num_dirs > 0
112
- end
113
-
114
- # Refresh the cached file properties.
115
- #
116
- # @return [IMW::Resource] this resource
117
- def refresh!
118
- response = HDFS.fs(:count, path)
119
- if response.blank? || response =~ /^Can not find listing for/
120
- @exist = false
121
- @num_dirs, @num_files, @size, @hdfs_path = false, false, false, false
122
- else
123
- @exist = true
124
- parts = response.split
125
- @num_dirs, @num_files, @size = parts[0..2].map(&:to_i)
126
- @hdfs_path = parts.last
127
- end
128
- self
129
- end
130
-
131
- # Execute +command+ with +args+ on the Hadoop Distributed
132
- # Filesystem (HDFS).
133
- #
134
- # If passed a block, yield each line of the output from the
135
- # command, else just return the output.
136
- #
137
- # Try running `hadoop fs -help' for more information.
138
- #
139
- # @param [String, Symbol] command the command to run.
140
- # @param [String, Symbol] args the arguments to pass the command
141
- # @yield [String] each line of the command's output
142
- # @return [String] the command's output
143
- def self.fs command, *args
144
- command_string = "#{executable} fs -#{command} #{args.compact.map(&:to_str).join(' ')}"
145
- command_string += " 2>&1" if command == :count # FIXME or else it just spams the screen when we do HDFS#refresh!
146
- output = `#{command_string}`.chomp
147
- if block_given?
148
- output.split("\n").each do |line|
149
- yield line
150
- end
151
- else
152
- output
153
- end
154
- end
155
-
156
- protected
157
- # Returns the path to the Hadoop executable.
158
- #
159
- # @return [String]
160
- def self.executable
161
- @executable ||= begin
162
- string = `which hadoop`.chomp
163
- raise IMW::Error.new("Could not find hadoop command. Is Hadoop installed?") if string.blank?
164
- string
165
- end
166
- end
167
- end
168
-
169
- # Defines methods for reading data from HDFS files.
170
- module HDFSFile
171
-
172
- # Return the contents of this HDFS file as a string.
173
- #
174
- # Be VERY careful how you use this!
175
- #
176
- # @return [String]
177
- def read
178
- HDFS.fs(:cat, path)
179
- end
180
-
181
- # Iterate through each line of this HDFS resource.
182
- #
183
- # @yield [String] each line of the file
184
- def each &block
185
- HDFS.fs(:cat, path, &block)
186
- end
187
-
188
- # Return a handle on a StringIO object representing the
189
- # content in this HDFS file.
190
- #
191
- # Be VERY careful how you use this! It is a StringIO object
192
- # so the whole HDFS file is read into a string before
193
- # returning the handle.
194
- #
195
- # @return [StringIO]
196
- def io
197
- @io ||= StringIO.new(read)
198
- end
199
-
200
- # Map over the lines of this HDFS resource.
201
- #
202
- # @yield [String] each line of the file
203
- # @return [Array] the result of the block on each line
204
- def map &block
205
- [].tap do |output|
206
- HDFS.fs(:cat, path) do |line|
207
- output << block.call(line)
208
- end
209
- end
210
- end
211
-
212
- end
213
-
214
- # Defines methods for listing contents of HDFS directories.
215
- module HDFSDirectory
216
-
217
- # Return the paths of all files and directories directly below
218
- # this directory on the HDFS.
219
- #
220
- # @return [Array<String>]
221
- def contents
222
- [].tap do |paths|
223
- HDFS.fs(:ls, path) do |line|
224
- next if line =~ /^Found.*items$/
225
- paths << line.split.last
226
- end
227
- end
228
- end
229
-
230
- # Return the resources directly below this directory on the
231
- # HDFS.
232
- #
233
- # @return [Array<IMW::Resource>]
234
- def resources
235
- contents.map { |path| IMW.open(path) }
236
- end
237
-
238
- # Return the resource at the base path of this resource joined
239
- # to +path+.
240
- #
241
- # IMW.open('hdfs:///path/to/dir').join('subdir')
242
- # #=> IMW::Resource at 'hdfs:///path/to/dir/subdir'
243
- #
244
- # @param [Array<String>] paths
245
- # @return [IMW::Resource]
246
- def join *paths
247
- IMW.open(File.join(stripped_uri.to_s, *paths))
248
- end
249
- end
250
- end
251
- end