imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,118 +0,0 @@
1
- require 'imw'
2
- require 'optparse'
3
-
4
- module IMW
5
-
6
- RunnerError = Class.new(IMW::Error)
7
-
8
- class Runner
9
-
10
- DEFAULT_OPTIONS = {
11
- :requires => [],
12
- :selectors => [],
13
- :dry_run => false
14
- }
15
-
16
- attr_reader :args, :options
17
-
18
- def initialize *args
19
- @args = args
20
- @options = DEFAULT_OPTIONS.dup
21
- parser.parse!(args)
22
- end
23
-
24
- def parser
25
- OptionParser.new do |opts|
26
- opts.banner = "usage: imw [OPTIONS] TASK"
27
- opts.separator <<EOF
28
-
29
- Run TASK for all datasets in the repository. IMW will read any
30
- *.imw files in the current directory by default.
31
-
32
- Options include
33
-
34
- EOF
35
-
36
- opts.on('-v', '--verbose', "Print verbose output") do
37
- IMW.verbose = true # class level, see IMW::Runner.verbose?
38
- end
39
-
40
- opts.on('-d', '--skip-dependencies', "Execute given tasks without invoking dependencies first") do
41
- options[:execute] = true
42
- end
43
-
44
- opts.on('-l', '--list', "List datasets in repository") do
45
- options[:list] = true
46
- end
47
-
48
- opts.on('-s', '--selector SELECTOR', "Filter datasets by regexp SELECTOR. Can be given more than once.") do |selector|
49
- options[:selectors] << selector
50
- end
51
-
52
- opts.on('-r', '--require PATH', "Require PATH. Can be given more than once.") do |path|
53
- options[:requires] << path
54
- end
55
-
56
- end
57
- end
58
-
59
- def require_files
60
- Dir['*.imw'].each { |path| load File.expand_path(path) }
61
- Dir['*.rb'].each { |path| require path.gsub(/\.rb$/,'') }
62
- options[:requires].each do |path|
63
- IMW.open(path) do |requireable|
64
- if requireable.directory?
65
- requireable["**/*.rb"].each { |file| require file }
66
- requireable["**/*.imw"].each { |file| load file }
67
- else
68
- require requireable.path
69
- end
70
- end
71
- end
72
- end
73
-
74
- def task
75
- args.first
76
- end
77
-
78
- def handles
79
- if options[:selectors].blank?
80
- IMW.repository.keys.sort
81
- else
82
- IMW.repository.handles.map do |handle|
83
- handle if options[:selectors].all? { |selector| handle.to_s =~ Regexp.new(selector) }
84
- end.compact.sort
85
- end
86
- end
87
-
88
- def datasets
89
- handles.map { |handle| IMW.repository[handle] }
90
- end
91
-
92
- def list!
93
- puts handles
94
- exit
95
- end
96
-
97
- def run_task!
98
- datasets.each do |dataset|
99
- dataset[task].send(options[:execute] ? :execute : :invoke)
100
- end
101
- exit
102
- end
103
-
104
- def run!
105
- require_files
106
- case
107
- when options[:list]
108
- list!
109
- when task.blank?
110
- puts parser
111
- exit 1
112
- else
113
- run_task!
114
- end
115
- end
116
- end
117
- end
118
-
@@ -1,23 +0,0 @@
1
- module IMW
2
- module Schemes
3
- autoload :Local, 'imw/schemes/local'
4
- autoload :Remote, 'imw/schemes/remote'
5
- autoload :S3, 'imw/schemes/s3'
6
- autoload :HTTP, 'imw/schemes/http'
7
- autoload :HTTPS, 'imw/schemes/http'
8
- autoload :HDFS, 'imw/schemes/hdfs'
9
- autoload :SQL, 'imw/schemes/sql'
10
-
11
- HANDLERS = [
12
- ["Schemes::Local::Base", Proc.new { |resource| resource.scheme == 'file' || resource.scheme.blank? } ],
13
- ["Schemes::Remote::Base", Proc.new { |resource| resource.scheme != 'file' && resource.scheme.present? } ],
14
- ["Schemes::S3", %r{^s3://}i ],
15
- ["Schemes::HTTP", %r{^http://}i ],
16
- ["Schemes::HTTPS", %r{^https://}i ],
17
- ["Schemes::HDFS", %r{^hdfs://}i ],
18
- ["Schemes::SQL::Base", %r{^\w+sql://}i ]
19
- ]
20
- end
21
- end
22
-
23
-
@@ -1,142 +0,0 @@
1
- module IMW
2
- module Schemes
3
-
4
- # Defines methods for reading and writing data from an FTP server.
5
- #
6
- # IMW.open('ftp://user:pass@my_bucket/path/to/some/file.csv')
7
- #
8
- # Learn more about {Amazon Web Services}[http://aws.amazon.com].
9
- module FTP
10
-
11
- module Base
12
-
13
- # Is this resource an FTP resource?
14
- #
15
- # @return [true, false]
16
- def on_ftp?
17
- true
18
- end
19
- alias_method :is_ftp?, :on_ftp?
20
-
21
- # Copy this resource to the +new_uri+.
22
- #
23
- # @param [String, IMW::Resource] new_uri
24
- # @return [IMW::Resource] the new resource
25
- def cp new_uri
26
- local_obj = IMW.open(new_uri)
27
- raise IMW::PathError.new("FTP resources (#{self}) can only be downloaded to a local path") unless local_obj.is_local?
28
- local_obj.dir.should_exist!
29
- FTP.open(host, user, password) do |ftp|
30
- ftp.get(path, local_obj.path)
31
- end
32
- local_obj
33
- end
34
-
35
- # Does this resource exist on S3?
36
- #
37
- # @return [true, false]
38
- def exist?
39
- s3_object.exists?
40
- end
41
- alias_method :exists?, :exist?
42
-
43
- # Remove this resource from S3.
44
- #
45
- # @return [IMW::Resource] the deleted object
46
- def rm
47
- s3_object.delete
48
- end
49
- alias_method :rm!, :rm
50
-
51
- # Return the S3N URL for this S3 object
52
- #
53
- # resource = IMW.open('s3://my_bucket/path/to/some/obj')
54
- # resource.s3n_url
55
- # => 's3n://my_bucket/path/to/some/obj'
56
- #
57
- # @return [String]
58
- def s3n_url
59
- uri.to_s.gsub(/^s3:/, 's3n:')
60
- end
61
-
62
- # Return the contents of this S3 object.
63
- #
64
- # @return [String]
65
- def read
66
- s3_object.value
67
- end
68
-
69
- # Store +source+ into +destination+.
70
- #
71
- # @param [String, IMW::Resource, #io] source
72
- # @param [String, IMW::Resource, #path, #bucket] destination
73
- # @return [IMW::Resource] the new S3 object
74
- def self.put source, destination
75
- source = IMW.open(source)
76
- destintation = IMW.open(destination)
77
- raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
78
- make_connection!
79
- AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
80
- destination
81
- end
82
-
83
- # Download +source+ from S3 into +destination+.
84
- #
85
- # @param [String, IMW::Resource, #path, #bucket] source
86
- # @param [String, IMW::Resource, #write] destination
87
- # @return [IMW::Resource] the new resource
88
- def self.get source, destination
89
- source = IMW.open(source)
90
- destination = IMW.open!(destination)
91
- raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
92
- make_connection!
93
- AWS::S3::S3Object.stream(source.path, source.bucket) do |chunk|
94
- destination.write(chunk)
95
- end
96
- destination.close
97
- destination.reopen
98
- end
99
-
100
- # Copy S3 resource +source+ to +destination+.
101
- #
102
- # @param [String, IMW::Resource, #path, #bucket] source
103
- # @param [String, IMW::Resource, #path, #bucket] destination
104
- # @return [IMW::Resource] the new resource
105
- def self.copy source, destination
106
- source = IMW.open(source)
107
- destination = IMW.open(destination)
108
- raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
109
- make_connection!
110
- AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
111
- destination
112
- end
113
-
114
- # Return the resource at the base path of this resource joined
115
- # to +path+.
116
- #
117
- # IMW.open('s3:://bucket/path/to/dir').join('subdir')
118
- # #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
119
- #
120
- # @param [Array<String>] paths
121
- # @return [IMW::Resource]
122
- def join *paths
123
- IMW.open(File.join(stripped_uri.to_s, *paths))
124
- end
125
-
126
- protected
127
- # Make an S3 connection.
128
- #
129
- # Uses settings defined in IMW::AWS_CREDENTIALS.
130
- #
131
- # @return [AWS
132
- def self.make_connection!
133
- return @connection if @connection
134
- raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
135
- require 'aws/s3'
136
- @connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
137
- end
138
-
139
- end
140
- end
141
- end
142
-
@@ -1,251 +0,0 @@
1
- module IMW
2
- module Schemes
3
-
4
- # Defines methods for reading and writing data to/from an
5
- # HDFS[http://hadoop.apache.org/common/docs/current/hdfs_design.html]]
6
- #
7
- # Learn more about Hadoop[http://hadoop.apache.org] and the
8
- # {Hadoop Distributed
9
- # Filesystem}[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
10
- module HDFS
11
-
12
- # Checks to see if this is a file or directory
13
- def self.extended obj
14
- obj.extend(obj.is_directory? ? HDFSDirectory : HDFSFile)
15
- end
16
-
17
- # Is this resource an HDFS resource?
18
- #
19
- # @return [true, false]
20
- def on_hdfs?
21
- true
22
- end
23
- alias_method :is_hdfs?, :on_hdfs?
24
-
25
- # Copy this resource to the +new_uri+.
26
- #
27
- # @param [String, IMW::Resource] new_uri
28
- # @return [IMW::Resource] the new resource
29
- def cp new_uri
30
- IMW::Tools::Transferer.new(:cp, self, new_uri).transfer!
31
- end
32
-
33
- # Move this resource to the +new_uri+.
34
- #
35
- # @param [String, IMW::Resource] new_uri
36
- # @return [IMW::Resource] the new resource
37
- def mv new_uri
38
- IMW::Tools::Transferer.new(:mv, self, new_uri).transfer!
39
- end
40
-
41
- # Delete this resource from the HDFS.
42
- #
43
- # @option options [true,false] :skip_trash
44
- def rm options={}
45
- should_exist!("Cannot delete.")
46
- args = [:rm]
47
- args << '-skipTrash' if options[:skip] || options[:skip_trash] || options[:skipTrash]
48
- args << path
49
- HDFS.fs(*args)
50
- self
51
- end
52
- alias_method :rm!, :rm
53
-
54
-
55
- # Does this path exist on the HDFS?
56
- #
57
- # @return [true, false]
58
- def exist?
59
- return @exist unless @exist.nil?
60
- refresh!
61
- @exist
62
- end
63
- alias_method :exists?, :exist?
64
-
65
-
66
- # Return the size (in bytes) of this resource on the HDFS.
67
- #
68
- # This value is cached. Call +refresh+ to refresh the cache
69
- # manually.
70
- #
71
- # @return [Fixnum]
72
- def size
73
- return @size unless @size.nil?
74
- refresh!
75
- should_exist!("Cannot report size")
76
- @size
77
- end
78
-
79
- # Return the number of directories contained at or below this
80
- # path on the HDFS.
81
- #
82
- # This value is cached. Call +refresh+ to refresh the cache
83
- # manually.
84
- #
85
- # @return [Fixnum]
86
- def num_dirs
87
- return @num_dirs unless @num_dirs.nil?
88
- refresh!
89
- should_exist!("Cannot report number of directories.")
90
- @num_dirs
91
- end
92
-
93
- # Return the number of files contained at or below this path
94
- # on the HDFS.
95
- #
96
- # This value is cached. Call +refresh+ to refresh the cache
97
- # manually.
98
- #
99
- # @return [Fixnum]
100
- def num_files
101
- return @num_files unless @num_files.nil?
102
- refresh!
103
- should_exist!("Cannot report number of files.")
104
- @num_files
105
- end
106
-
107
- # Is this resource an HDFS directory?
108
- #
109
- # @return [true, false]
110
- def is_directory?
111
- exist? && num_dirs > 0
112
- end
113
-
114
- # Refresh the cached file properties.
115
- #
116
- # @return [IMW::Resource] this resource
117
- def refresh!
118
- response = HDFS.fs(:count, path)
119
- if response.blank? || response =~ /^Can not find listing for/
120
- @exist = false
121
- @num_dirs, @num_files, @size, @hdfs_path = false, false, false, false
122
- else
123
- @exist = true
124
- parts = response.split
125
- @num_dirs, @num_files, @size = parts[0..2].map(&:to_i)
126
- @hdfs_path = parts.last
127
- end
128
- self
129
- end
130
-
131
- # Execute +command+ with +args+ on the Hadoop Distributed
132
- # Filesystem (HDFS).
133
- #
134
- # If passed a block, yield each line of the output from the
135
- # command, else just return the output.
136
- #
137
- # Try running `hadoop fs -help' for more information.
138
- #
139
- # @param [String, Symbol] command the command to run.
140
- # @param [String, Symbol] args the arguments to pass the command
141
- # @yield [String] each line of the command's output
142
- # @return [String] the command's output
143
- def self.fs command, *args
144
- command_string = "#{executable} fs -#{command} #{args.compact.map(&:to_str).join(' ')}"
145
- command_string += " 2>&1" if command == :count # FIXME or else it just spams the screen when we do HDFS#refresh!
146
- output = `#{command_string}`.chomp
147
- if block_given?
148
- output.split("\n").each do |line|
149
- yield line
150
- end
151
- else
152
- output
153
- end
154
- end
155
-
156
- protected
157
- # Returns the path to the Hadoop executable.
158
- #
159
- # @return [String]
160
- def self.executable
161
- @executable ||= begin
162
- string = `which hadoop`.chomp
163
- raise IMW::Error.new("Could not find hadoop command. Is Hadoop installed?") if string.blank?
164
- string
165
- end
166
- end
167
- end
168
-
169
- # Defines methods for reading data from HDFS files.
170
- module HDFSFile
171
-
172
- # Return the contents of this HDFS file as a string.
173
- #
174
- # Be VERY careful how you use this!
175
- #
176
- # @return [String]
177
- def read
178
- HDFS.fs(:cat, path)
179
- end
180
-
181
- # Iterate through each line of this HDFS resource.
182
- #
183
- # @yield [String] each line of the file
184
- def each &block
185
- HDFS.fs(:cat, path, &block)
186
- end
187
-
188
- # Return a handle on a StringIO object representing the
189
- # content in this HDFS file.
190
- #
191
- # Be VERY careful how you use this! It is a StringIO object
192
- # so the whole HDFS file is read into a string before
193
- # returning the handle.
194
- #
195
- # @return [StringIO]
196
- def io
197
- @io ||= StringIO.new(read)
198
- end
199
-
200
- # Map over the lines of this HDFS resource.
201
- #
202
- # @yield [String] each line of the file
203
- # @return [Array] the result of the block on each line
204
- def map &block
205
- [].tap do |output|
206
- HDFS.fs(:cat, path) do |line|
207
- output << block.call(line)
208
- end
209
- end
210
- end
211
-
212
- end
213
-
214
- # Defines methods for listing contents of HDFS directories.
215
- module HDFSDirectory
216
-
217
- # Return the paths of all files and directories directly below
218
- # this directory on the HDFS.
219
- #
220
- # @return [Array<String>]
221
- def contents
222
- [].tap do |paths|
223
- HDFS.fs(:ls, path) do |line|
224
- next if line =~ /^Found.*items$/
225
- paths << line.split.last
226
- end
227
- end
228
- end
229
-
230
- # Return the resources directly below this directory on the
231
- # HDFS.
232
- #
233
- # @return [Array<IMW::Resource>]
234
- def resources
235
- contents.map { |path| IMW.open(path) }
236
- end
237
-
238
- # Return the resource at the base path of this resource joined
239
- # to +path+.
240
- #
241
- # IMW.open('hdfs:///path/to/dir').join('subdir')
242
- # #=> IMW::Resource at 'hdfs:///path/to/dir/subdir'
243
- #
244
- # @param [Array<String>] paths
245
- # @return [IMW::Resource]
246
- def join *paths
247
- IMW.open(File.join(stripped_uri.to_s, *paths))
248
- end
249
- end
250
- end
251
- end