swineherd-fs 0.0.2
- data/Gemfile +2 -0
- data/LICENSE +188 -0
- data/README.textile +66 -0
- data/VERSION +1 -0
- data/lib/swineherd-fs/hadoopfilesystem.rb +249 -0
- data/lib/swineherd-fs/localfilesystem.rb +81 -0
- data/lib/swineherd-fs/s3filesystem.rb +311 -0
- data/lib/swineherd-fs.rb +91 -0
- data/rspec.watchr +19 -0
- data/spec/filesystem_spec.rb +186 -0
- data/spec/spec_helper.rb +2 -0
- data/swineherd-fs.gemspec +23 -0
- metadata +121 -0
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,188 @@
Copyright 2011 Infochimps, Inc

Apache License Version 2.0, January 2004, http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
data/README.textile
ADDED
@@ -0,0 +1,66 @@
h1. Swineherd-fs

* @file@ - Local file system. Only thoroughly tested on Ubuntu Linux.
* @hdfs@ - Hadoop distributed file system. Uses the Apache Hadoop 0.20 API. Requires JRuby.
* @s3@ - Amazon Simple Storage Service (S3).
* @ftp@ - FTP (not yet implemented)

All filesystem abstractions implement the following core functions, many taken from the UNIX filesystem:

* @mv@
* @cp@
* @cp_r@
* @rm@
* @rm_r@
* @open@
* @exists?@
* @directory?@
* @ls@
* @ls_r@
* @mkdir_p@

Note: Since S3 is just a key-value store, it is difficult to preserve the notion of a directory. There can be no empty directories, so @mkdir_p@ has little to do: it currently only ensures that the bucket exists. This also means the @directory?@ test only succeeds for a non-empty directory, which clashes with the UNIX notion of a directory.

Additionally, the S3 and HDFS abstractions implement functions for moving files to and from the local filesystem:

* @copy_to_local@
* @copy_from_local@

Note: For these methods the destination and source path, respectively, are assumed to be local, so they do not have to be prefixed with a file scheme.

The @Swineherd::FileSystem@ module implements a generic filesystem abstraction using schemed filepaths (hdfs://, s3://, file://).

Currently only the following methods are supported by @Swineherd::FileSystem@:

* @cp@
* @exists?@

For example, instead of doing the following:<pre><code>hdfs = Swineherd::HadoopFileSystem.new
localfs = Swineherd::LocalFileSystem.new
hdfs.copy_to_local('foo/bar/baz.txt', 'foo/bar/baz.txt') unless localfs.exists? 'foo/bar/baz.txt'
</code></pre>

You can do:<pre><code>fs = Swineherd::FileSystem
fs.cp('hdfs://foo/bar/baz.txt','foo/bar/baz.txt') unless fs.exists?('foo/bar/baz.txt')
</code></pre>

Note: A path without a scheme is treated as a path on the local filesystem; use the explicit file:// scheme for clarity if you prefer. The following are equivalent:

<pre><code>fs.exists?('foo/bar/baz.txt')
fs.exists?('file://foo/bar/baz.txt')
</code></pre>

h4. Config

* In order to use the @S3FileSystem@, Swineherd requires AWS S3 access credentials.

* In @~/.swineherd.yaml@ or @/etc/swineherd.yaml@:

<pre><code>aws:
  access_key: my_access_key
  secret_key: my_secret_key
</code></pre>

* Or just pass them in when creating the instance:

<pre><code>s3 = Swineherd::S3FileSystem.new(:aws_access_key => "my_access_key", :aws_secret_key => "my_secret_key")</code></pre>
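The Config section above can be exercised end to end with the S3 filesystem as it is defined later in this listing. Below is a minimal, illustrative sketch (not one of the gem's files); the bucket name, key, and local path are hypothetical placeholders:

    require 'swineherd-fs'

    s3 = Swineherd::S3FileSystem.new(:aws_access_key => "my_access_key",
                                     :aws_secret_key => "my_secret_key")

    s3.mkdir_p("s3://example-bucket")                                     # only ensures the bucket exists
    s3.copy_from_local("/tmp/baz.txt", "s3://example-bucket/foo/baz.txt") # source path is local, no scheme needed
    s3.exists?("s3://example-bucket/foo/baz.txt")                         # => true
    s3.ls("s3://example-bucket/foo")                                      # => ["example-bucket/foo/baz.txt"]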
data/VERSION
ADDED
@@ -0,0 +1 @@
0.0.2
data/lib/swineherd-fs/hadoopfilesystem.rb
ADDED
@@ -0,0 +1,249 @@
module Swineherd

  #
  # Methods for dealing with the Hadoop distributed file system (hdfs). This class
  # requires that you run with JRuby as it makes use of the native Java Hadoop
  # libraries.
  #
  class HadoopFileSystem

    attr_accessor :conf, :hdfs

    def initialize *args
      set_hadoop_environment if running_jruby?

      @conf = Java::org.apache.hadoop.conf.Configuration.new

      if Swineherd.config[:aws]
        @conf.set("fs.s3.awsAccessKeyId",  Swineherd.config[:aws][:access_key])
        @conf.set("fs.s3.awsSecretAccessKey",  Swineherd.config[:aws][:secret_key])

        @conf.set("fs.s3n.awsAccessKeyId", Swineherd.config[:aws][:access_key])
        @conf.set("fs.s3n.awsSecretAccessKey", Swineherd.config[:aws][:secret_key])
      end

      @hdfs = Java::org.apache.hadoop.fs.FileSystem.get(@conf)
    end

    def open path, mode="r", &blk
      HadoopFile.new(path, mode, self, &blk)
    end

    def size path
      ls_r(path).inject(0){|sz,filepath| sz += @hdfs.get_file_status(Path.new(filepath)).get_len}
    end

    def ls path
      (@hdfs.list_status(Path.new(path)) || []).map{|path| path.get_path.to_s}
    end

    # list directories recursively, similar to unix 'ls -R'
    def ls_r path
      ls(path).inject([]){|rec_paths,path| rec_paths << path; rec_paths << ls(path) unless file?(path); rec_paths}.flatten
    end

    def rm path
      begin
        @hdfs.delete(Path.new(path), false)
      rescue java.io.IOException => e
        raise Errno::EISDIR, e.message
      end
    end

    def rm_r path
      @hdfs.delete(Path.new(path), true)
    end

    def exists? path
      @hdfs.exists(Path.new(path))
    end

    def directory? path
      exists?(path) && @hdfs.get_file_status(Path.new(path)).is_dir?
    end

    def file? path
      exists?(path) && @hdfs.isFile(Path.new(path))
    end

    def mv srcpath, dstpath
      @hdfs.rename(Path.new(srcpath), Path.new(dstpath))
    end

    # supports s3://, s3n://, and hdfs:// in @srcpath@ and @dstpath@
    def cp srcpath, dstpath
      @src_fs  = Java::org.apache.hadoop.fs.FileSystem.get(Java::JavaNet::URI.create(srcpath), @conf)
      @dest_fs = Java::org.apache.hadoop.fs.FileSystem.get(Java::JavaNet::URI.create(dstpath), @conf)
      FileUtil.copy(@src_fs, Path.new(srcpath), @dest_fs, Path.new(dstpath), false, @conf)
    end

    def cp_r srcpath, dstpath
      cp srcpath, dstpath
    end

    def mkdir_p path
      @hdfs.mkdirs(Path.new(path))
    end

    #
    # Copy hdfs file to local filesystem
    #
    def copy_to_local srcfile, dstfile
      @hdfs.copy_to_local_file(Path.new(srcfile), Path.new(dstfile))
    end
    # alias :get :copy_to_local

    #
    # Copy local file to hdfs filesystem
    #
    def copy_from_local srcfile, dstfile
      @hdfs.copy_from_local_file(Path.new(srcfile), Path.new(dstfile))
    end
    # alias :put :copy_from_local

    #
    # Merge all part files in a directory into one file.
    #
    def merge srcdir, dstfile
      FileUtil.copy_merge(@hdfs, Path.new(srcdir), @hdfs, Path.new(dstfile), false, @conf, "")
    end

    #
    # This is hackety. Use with caution.
    #
    def stream input, output
      input_fs_scheme  = (Java::JavaNet::URI.create(input).scheme  || "file") + "://"
      output_fs_scheme = (Java::JavaNet::URI.create(output).scheme || "file") + "://"
      system("#{@hadoop_home}/bin/hadoop \\
        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
        -D mapred.job.name=\"Stream { #{input_fs_scheme}(#{File.basename(input)}) -> #{output_fs_scheme}(#{File.basename(output)}) }\" \\
        -D mapred.min.split.size=1000000000 \\
        -D mapred.reduce.tasks=0 \\
        -mapper \"/bin/cat\" \\
        -input \"#{input}\" \\
        -output \"#{output}\"")
    end

    #
    # BZIP
    #
    def bzip input, output
      system("#{@hadoop_home}/bin/hadoop \\
        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
        -D mapred.output.compress=true \\
        -D mapred.output.compression.codec=org.apache.hadoop.io.compress.BZip2Codec \\
        -D mapred.reduce.tasks=1 \\
        -mapper \"/bin/cat\" \\
        -reducer \"/bin/cat\" \\
        -input \"#{input}\" \\
        -output \"#{output}\"")
    end

    #
    # Merges many input files into :reduce_tasks amount of output files
    #
    def dist_merge inputs, output, options = {}
      options[:reduce_tasks]     ||= 25
      options[:partition_fields] ||= 2
      options[:sort_fields]      ||= 2
      options[:field_separator]  ||= '/t'
      names = inputs.map{|inp| File.basename(inp)}.join(',')
      cmd = "#{@hadoop_home}/bin/hadoop \\
        jar #{@hadoop_home}/contrib/streaming/hadoop-*streaming*.jar \\
        -D mapred.job.name=\"Swineherd Merge (#{names} -> #{output})\" \\
        -D num.key.fields.for.partition=\"#{options[:partition_fields]}\" \\
        -D stream.num.map.output.key.fields=\"#{options[:sort_fields]}\" \\
        -D mapred.text.key.partitioner.options=\"-k1,#{options[:partition_fields]}\" \\
        -D stream.map.output.field.separator=\"'#{options[:field_separator]}'\" \\
        -D mapred.min.split.size=1000000000 \\
        -D mapred.reduce.tasks=#{options[:reduce_tasks]} \\
        -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \\
        -mapper \"/bin/cat\" \\
        -reducer \"/usr/bin/uniq\" \\
        -input \"#{inputs.join(',')}\" \\
        -output \"#{output}\""
      puts cmd
      system cmd
    end

    class HadoopFile
      attr_accessor :handle

      #
      # In order to open input and output streams we must pass around the hadoop fs object itself
      #
      def initialize path, mode, fs, &blk
        raise Errno::EISDIR, "#{path} is a directory" if fs.directory?(path)
        @path = Path.new(path)
        case mode
        when "r"
          @handle = fs.hdfs.open(@path).to_io(&blk)
        when "w"
          @handle = fs.hdfs.create(@path).to_io.to_outputstream
          if block_given?
            yield self
            self.close
          end
        end
      end

      def path
        @path.toString()
      end

      def read
        @handle.read
      end

      def write string
        @handle.write(string.to_java_string.get_bytes)
      end

      def close
        @handle.close
      end

    end

    private

    # Check that we are running with jruby, check for hadoop home.
    def running_jruby?
      begin
        require 'java'
      rescue LoadError => e
        raise "\nJava not found, are you sure you're running with JRuby?\n" + e.message
      end
      @hadoop_home = ENV['HADOOP_HOME']
      raise "\nHadoop installation not found, try setting $HADOOP_HOME\n" unless @hadoop_home && (File.exist? @hadoop_home)
      true
    end

    #
    # Place hadoop jars in class path, require appropriate jars, set hadoop conf
    #
    def set_classpath
      hadoop_conf = (ENV['HADOOP_CONF_DIR'] || File.join(@hadoop_home, 'conf'))
      hadoop_conf += "/" unless hadoop_conf.end_with? "/"
      $CLASSPATH << hadoop_conf unless $CLASSPATH.include?(hadoop_conf)
    end

    def import_classes
      Dir["#{@hadoop_home}/hadoop*.jar", "#{@hadoop_home}/lib/*.jar"].each{|jar| require jar}
      ['org.apache.hadoop.fs.Path',
       'org.apache.hadoop.fs.FileUtil',
       'org.apache.hadoop.mapreduce.lib.input.FileInputFormat',
       'org.apache.hadoop.mapreduce.lib.output.FileOutputFormat',
       'org.apache.hadoop.fs.FSDataOutputStream',
       'org.apache.hadoop.fs.FSDataInputStream'].map{|j_class| java_import(j_class) }
    end

    def set_hadoop_environment
      set_classpath
      import_classes
    end

  end
end
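For orientation, here is a small usage sketch for the class above (illustrative only, not part of the gem's files): it assumes JRuby, $HADOOP_HOME pointing at a Hadoop 0.20 install, and hypothetical HDFS and local paths.

    require 'swineherd-fs'

    hdfs = Swineherd::HadoopFileSystem.new
    hdfs.mkdir_p("/tmp/swineherd_example")
    hdfs.open("/tmp/swineherd_example/greeting.txt", "w") { |f| f.write("hello from hdfs") }
    hdfs.open("/tmp/swineherd_example/greeting.txt").read                            # => "hello from hdfs"
    hdfs.copy_to_local("/tmp/swineherd_example/greeting.txt", "/tmp/greeting.txt")   # destination is a local path
    hdfs.rm_r("/tmp/swineherd_example")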
data/lib/swineherd-fs/localfilesystem.rb
ADDED
@@ -0,0 +1,81 @@
module Swineherd
  class LocalFileSystem
    #include Swineherd::BaseFileSystem

    def initialize *args
    end

    def open path, mode="r", &blk
      File.open(path, mode, &blk)
    end

    # Globs for files at @path@, append '**/*' to glob recursively
    def size path
      Dir[path].inject(0){|s,f| s += File.size(f)}
    end

    # A leaky abstraction, should be called rm_rf if it calls rm_rf
    def rm_r path
      FileUtils.rm_rf path
    end

    def rm path
      FileUtils.rm path
    end

    def exists? path
      File.exists?(path)
    end

    def directory? path
      File.directory? path
    end

    def mv srcpath, dstpath
      FileUtils.mv(srcpath, dstpath)
    end

    def cp srcpath, dstpath
      FileUtils.cp(srcpath, dstpath)
    end

    def cp_r srcpath, dstpath
      FileUtils.cp_r(srcpath, dstpath)
    end

    def mkdir_p path
      FileUtils.mkdir_p path
    end

    # List directory contents, similar to unix `ls`
    # Dir[@path@/*] to return files in immediate directory of @path@
    def ls path
      if exists?(path)
        if !directory?(path)
          [path]
        else
          path += '/' unless path =~ /\/$/
          Dir[path + '*']
        end
      else
        raise Errno::ENOENT, "No such file or directory - #{path}"
      end
    end

    # Recursively list directory contents
    # Dir[@path@/**/*], similar to unix `ls -R`
    def ls_r path
      if exists?(path)
        if !directory?(path)
          [path]
        else
          path += '/' unless path =~ /\/$/
          Dir[path + '**/*']
        end
      else
        raise Errno::ENOENT, "No such file or directory - #{path}"
      end
    end

  end
end
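A quick sketch of how the glob-based helpers above behave (illustrative only; the paths are hypothetical):

    localfs = Swineherd::LocalFileSystem.new
    localfs.mkdir_p("/tmp/swineherd_example/b")
    localfs.open("/tmp/swineherd_example/d.txt", "w") { |f| f.write("foo") }
    localfs.ls("/tmp/swineherd_example")           # immediate children: .../b and .../d.txt
    localfs.ls_r("/tmp/swineherd_example")         # recursive listing via Dir[path + '**/*']
    localfs.size("/tmp/swineherd_example/d.txt")   # => 3 (size takes a glob, here matching one file)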
data/lib/swineherd-fs/s3filesystem.rb
ADDED
@@ -0,0 +1,311 @@
module Swineherd

  #
  # Methods for interacting with Amazon's Simple Storage Service (S3).
  #
  class S3FileSystem

    attr_accessor :s3

    def initialize options={}
      aws_access_key = options[:aws_access_key] || (Swineherd.config[:aws] && Swineherd.config[:aws][:access_key])
      aws_secret_key = options[:aws_secret_key] || (Swineherd.config[:aws] && Swineherd.config[:aws][:secret_key])
      raise "Missing AWS keys" unless aws_access_key && aws_secret_key
      @s3 = RightAws::S3.new(aws_access_key, aws_secret_key, :logger => Logger.new(nil)) #FIXME: Just wanted it to shut up
    end

    def open path, mode="r", &blk
      S3File.new(path, mode, self, &blk)
    end

    def size path
      if directory?(path)
        ls_r(path).inject(0){|sum,file| sum += filesize(file)}
      else
        filesize(path)
      end
    end

    def rm path
      bkt,key = split_path(path)
      if key.empty? || directory?(path)
        raise Errno::EISDIR, "#{path} is a directory or bucket, use rm_r or rm_bucket"
      else
        @s3.interface.delete(bkt, key)
      end
    end

    # rm_r - Remove recursively. Does not delete buckets, use rm_bucket
    # params: @path@ - Path of file or folder to delete
    # returns: Array - Array of paths which were deleted
    def rm_r path
      bkt,key = split_path(path)
      if key.empty?
        # only the bucket was passed in
      else
        if directory?(path)
          @s3.interface.delete_folder(bkt, key).flatten
        else
          @s3.interface.delete(bkt, key)
          [path]
        end
      end
    end

    def rm_bucket bucket_name
      @s3.interface.force_delete_bucket(bucket_name)
    end

    def exists? path
      bucket,key = split_path(path)
      begin
        if key.empty? # only a bucket was passed in, check if it exists
          # FIXME: there may be a better way to test, relying on error to be raised here
          @s3.interface.bucket_location(bucket) && true
        elsif file?(path) # simply test for existence of the file
          true
        else # treat as directory and see if there are files beneath it
          # if it's not a file, it is harmless to add '/'.
          # the prefix search may return files with the same root extension,
          # ie. foo.txt and foo.txt.bak, if we leave off the trailing slash
          key += "/" unless key =~ /\/$/
          @s3.interface.list_bucket(bucket, :prefix => key).size > 0
        end
      rescue RightAws::AwsError => error
        if error.message =~ /nosuchbucket/i
          false
        elsif error.message =~ /not found/i
          false
        else
          raise
        end
      end
    end

    def directory? path
      exists?(path) && !file?(path)
    end

    def file? path
      bucket,key = split_path(path)
      begin
        return false if (key.nil? || key.empty?) # buckets are not files
        # FIXME: there may be a better way to test, relying on error to be raised
        @s3.interface.head(bucket, key) && true
      rescue RightAws::AwsError => error
        if error.message =~ /nosuchbucket/i
          false
        elsif error.message =~ /not found/i
          false
        else
          raise
        end
      end
    end

    def mv srcpath, dstpath
      src_bucket,src_key_path = split_path(srcpath)
      dst_bucket,dst_key_path = split_path(dstpath)
      mkdir_p(dstpath) unless exists?(dstpath)
      if directory? srcpath
        paths_to_copy = ls_r(srcpath)
        common_dir = common_directory(paths_to_copy)
        paths_to_copy.each do |path|
          bkt,key = split_path(path)
          src_key = key
          dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
          @s3.interface.move(src_bucket, src_key, dst_bucket, dst_key)
        end
      else
        @s3.interface.move(src_bucket, src_key_path, dst_bucket, dst_key_path)
      end
    end

    def cp srcpath, dstpath
      src_bucket,src_key_path = split_path(srcpath)
      dst_bucket,dst_key_path = split_path(dstpath)
      mkdir_p(dstpath) unless exists?(dstpath)
      if src_key_path.empty? || directory?(srcpath)
        raise Errno::EISDIR, "#{srcpath} is a directory or bucket, use cp_r"
      else
        @s3.interface.copy(src_bucket, src_key_path, dst_bucket, dst_key_path)
      end
    end

    # mv is just a special case of cp_r...this is a waste
    def cp_r srcpath, dstpath
      src_bucket,src_key_path = split_path(srcpath)
      dst_bucket,dst_key_path = split_path(dstpath)
      mkdir_p(dstpath) unless exists?(dstpath)
      if directory? srcpath
        paths_to_copy = ls_r(srcpath)
        common_dir = common_directory(paths_to_copy)
        paths_to_copy.each do |path|
          bkt,key = split_path(path)
          src_key = key
          dst_key = File.join(dst_key_path, path.gsub(common_dir, ''))
          @s3.interface.copy(src_bucket, src_key, dst_bucket, dst_key)
        end
      else
        @s3.interface.copy(src_bucket, src_key_path, dst_bucket, dst_key_path)
      end
    end

    # This is a bit funny, there's actually no need to create a 'path' since
    # s3 is nothing more than a glorified key-value store. When you create a
    # 'file' (key) the 'path' will be created for you. All we do here is create
    # the bucket unless it already exists.
    def mkdir_p path
      bkt,key = split_path(path)
      @s3.interface.create_bucket(bkt) unless exists? path
    end

    def ls path
      if exists?(path)
        bkt,prefix = split_path(path)
        prefix += '/' if directory?(path) && !(prefix =~ /\/$/) && !prefix.empty?
        contents = []
        @s3.interface.incrementally_list_bucket(bkt, {'prefix' => prefix, :delimiter => '/'}) do |res|
          contents += res[:common_prefixes].map{|c| File.join(bkt, c)}
          contents += res[:contents].map{|c| File.join(bkt, c[:key])}
        end
        contents
      else
        raise Errno::ENOENT, "No such file or directory - #{path}"
      end
    end

    def ls_r path
      if(file?(path))
        [path]
      else
        ls(path).inject([]){|paths,path| paths << path if directory?(path); paths << ls_r(path)}.flatten
      end
    end

    # FIXME: Not implemented for directories
    # @srcpath@ is assumed to be on the local filesystem
    def copy_from_local srcpath, destpath
      bucket,key = split_path(destpath)
      if File.exists?(srcpath)
        if File.directory?(srcpath)
          raise "NotYetImplemented"
        else
          @s3.interface.put(bucket, key, File.open(srcpath))
        end
      else
        raise Errno::ENOENT, "No such file or directory - #{srcpath}"
      end
    end
    # alias :put :copy_from_local

    # FIXME: Not implemented for directories
    def copy_to_local srcpath, dstpath
      src_bucket,src_key_path = split_path(srcpath)
      dstfile = File.new(dstpath, 'w')
      @s3.interface.get(src_bucket, src_key_path) do |chunk|
        dstfile.write(chunk)
      end
      dstfile.close
    end
    # alias :get :copy_to_local

    def bucket path
      #URI.parse(path).path.split('/').reject{|x| x.empty?}.first
      split_path(path).first
    end

    def key_for path
      #File.join(URI.parse(path).path.split('/').reject{|x| x.empty?}[1..-1])
      split_path(path).last
    end

    def split_path path
      uri = URI.parse(path)
      base_uri = ""
      base_uri << uri.host if uri.scheme
      base_uri << uri.path
      path = base_uri.split('/').reject{|x| x.empty?}
      [path[0], path[1..-1].join("/")]
    end

    private

    # FIXME: This is dense
    def common_directory paths
      dirs     = paths.map{|path| path.split('/')}
      min_size = dirs.map{|splits| splits.size}.min
      dirs     = dirs.map{|splits| splits[0...min_size]}
      uncommon_idx = dirs.transpose.each_with_index.find{|dirnames, idx| dirnames.uniq.length > 1}.last
      dirs[0][0...uncommon_idx].join('/')
    end

    def filesize filepath
      bucket,key = split_path(filepath)
      header = @s3.interface.head(bucket, key)
      header['content-length'].to_i
    end

    class S3File
      attr_accessor :path, :handle, :fs

      #
      # In order to open input and output streams we must pass around the s3 fs object itself
      #
      def initialize path, mode, fs, &blk
        @fs   = fs
        @path = path
        case mode
        when "r" then
          # raise "#{fs.type(path)} is not a readable file - #{path}" unless fs.type(path) == "file"
        when "w" then
          # raise "Path #{path} is a directory." unless (fs.type(path) == "file") || (fs.type(path) == "unknown")
          @handle = Tempfile.new('s3filestream')
          if block_given?
            yield self
            close
          end
        end
      end

      #
      # Faster than iterating
      #
      def read
        bucket,key = fs.split_path(path)
        fs.s3.interface.get_object(bucket, key)
      end

      #
      # This is a little hackety. That is, once you call (.each) on the object the full object starts
      # downloading...
      #
      def readline
        bucket,key = fs.split_path(path)
        @handle ||= fs.s3.interface.get_object(bucket, key).each
        begin
          @handle.next
        rescue StopIteration, NoMethodError
          @handle = nil
          raise EOFError.new("end of file reached")
        end
      end

      def write string
        @handle.write(string)
      end

      def close
        bucket,key = fs.split_path(path)
        if @handle
          @handle.read
          fs.s3.interface.put(bucket, key, File.open(@handle.path, 'r'))
          @handle.close
        end
        @handle = nil
      end

    end

  end
end
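Most of the methods above funnel through split_path, so its behaviour is worth a quick sketch (illustrative only, not part of the gem; the bucket, keys, and placeholder credentials are hypothetical):

    s3 = Swineherd::S3FileSystem.new(:aws_access_key => "my_access_key",
                                     :aws_secret_key => "my_secret_key")

    s3.split_path("s3://example-bucket/foo/bar.txt")  # => ["example-bucket", "foo/bar.txt"]
    s3.split_path("example-bucket/foo/bar.txt")       # => ["example-bucket", "foo/bar.txt"] (scheme optional)
    s3.split_path("s3://example-bucket")              # => ["example-bucket", ""] (bare bucket, empty key)
    s3.bucket("s3://example-bucket/foo/bar.txt")      # => "example-bucket"
    s3.key_for("s3://example-bucket/foo/bar.txt")     # => "foo/bar.txt"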
data/lib/swineherd-fs.rb
ADDED
@@ -0,0 +1,91 @@
require 'configliere' ; Configliere.use(:commandline, :env_var, :define, :config_file)
require 'logger'

require 'fileutils'
require 'tempfile'
require 'right_aws'

require 'swineherd-fs/localfilesystem'
require 'swineherd-fs/s3filesystem'
require 'swineherd-fs/hadoopfilesystem'

# Merge in system and user settings
SYSTEM_CONFIG_PATH = "/etc/swineherd.yaml" unless defined?(SYSTEM_CONFIG_PATH)
USER_CONFIG_PATH   = File.join(ENV['HOME'], '.swineherd.yaml') unless defined?(USER_CONFIG_PATH)

module Swineherd

  def self.config
    return @config if @config
    config = Configliere::Param.new
    config.read SYSTEM_CONFIG_PATH if File.exists? SYSTEM_CONFIG_PATH
    config.read USER_CONFIG_PATH   if File.exists? USER_CONFIG_PATH
    @config ||= config
  end

  def self.logger
    return @log if @log
    @log ||= Logger.new(config[:log_file] || STDOUT)
    @log.formatter = proc { |severity, datetime, progname, msg|
      "[#{severity.upcase}] #{msg}\n"
    }
    @log
  end

  def self.logger= logger
    @log = logger
  end

  module FileSystem

    HDFS_SCHEME_REGEXP = /^hdfs:\/\//
    S3_SCHEME_REGEXP   = /^s3n?:\/\//

    FILESYSTEMS = {
      'file' => Swineherd::LocalFileSystem,
      'hdfs' => Swineherd::HadoopFileSystem,
      's3'   => Swineherd::S3FileSystem,
      's3n'  => Swineherd::S3FileSystem
    }

    # A factory function that returns an instance of the requested class
    def self.get scheme, *args
      begin
        FILESYSTEMS[scheme.to_s].new *args
      rescue NoMethodError => e
        raise "Filesystem with scheme #{scheme} does not exist.\n #{e.message}"
      end
    end

    def self.exists?(path)
      fs = self.get(scheme_for(path))
      Swineherd.logger.info "#exists? - #{fs.class} for '#{path}'"
      fs.exists?(path)
    end

    def self.cp(srcpath, destpath)
      src_fs  = scheme_for(srcpath)
      dest_fs = scheme_for(destpath)
      Swineherd.logger.info "#cp - #{src_fs} --> #{dest_fs}"
      if(src_fs.eql?(dest_fs))
        self.get(src_fs).cp(srcpath, destpath)
      elsif src_fs.eql?(:file)
        self.get(dest_fs).copy_from_local(srcpath, destpath)
      elsif dest_fs.eql?(:file)
        self.get(src_fs).copy_to_local(srcpath, destpath)
      else # cp between s3/s3n and hdfs can be handled by Hadoop::FileUtil in HadoopFileSystem
        self.get(:hdfs).cp(srcpath, destpath)
      end
    end

    private

    # defaults to local filesystem :file
    def self.scheme_for(path)
      scheme = URI.parse(path).scheme
      (scheme && scheme.to_sym) || :file
    end

  end

end
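Putting the factory and the scheme dispatch together, here is a minimal sketch of how Swineherd::FileSystem routes calls (illustrative only, not part of the gem; the paths and bucket are hypothetical):

    fs = Swineherd::FileSystem

    fs.exists?("/tmp/baz.txt")                                    # no scheme, defaults to the local filesystem
    fs.cp("/tmp/baz.txt", "hdfs://tmp/baz.txt")                   # local -> hdfs goes through copy_from_local
    fs.cp("hdfs://tmp/baz.txt", "/tmp/baz_copy.txt")              # hdfs -> local goes through copy_to_local
    fs.cp("hdfs://tmp/baz.txt", "s3n://example-bucket/baz.txt")   # two remote schemes delegate to HadoopFileSystem#cp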
data/rspec.watchr
ADDED
@@ -0,0 +1,19 @@
# -*- ruby -*-

def run_spec(file)
  unless File.exist?(file)
    puts "#{file} does not exist"
    return
  end
  puts "Running #{file}"
  system "rspec #{file}"
end

watch("spec/.*/*_spec\.rb") do |match|
  run_spec match[0]
end

watch("lib/swineherd-fs/(.*)\.rb") do |match|
  file = %{spec/#{match[1]}_spec.rb}
  run_spec file if File.exists?(file)
end
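This watchr script re-runs the matching spec whenever a spec file or a lib/swineherd-fs source file changes. Presumably it is driven with the watchr gem's command-line runner from the project root, e.g. `watchr rspec.watchr` (this invocation is an assumption; the gem does not document it).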
data/spec/filesystem_spec.rb
ADDED
@@ -0,0 +1,186 @@
require 'spec_helper'
FS_SPEC_ROOT   = File.dirname(__FILE__)
S3_TEST_BUCKET = 'swineherd-fs-test-bucket' # You'll have to set this to something else

shared_examples_for "an abstract filesystem" do

  let(:test_filename){ File.join(test_dirname, "filename.txt") }
  let(:test_string){ "foobarbaz" }

  let(:files){ ['d.txt','b/c.txt'].map{|f| File.join(test_dirname, f)} }
  let(:dirs){ %w(b).map{|d| File.join(test_dirname, d)} }

  it "implements #exists?" do
    fs.mkdir_p(test_dirname)
    expect{ fs.open(test_filename,'w'){|f| f.write(test_string)} }.to change{ fs.exists?(test_filename) }.from(false).to(true)
  end

  it "implements #directory?" do
    fs.mkdir_p(test_dirname)
    fs.open(test_filename, 'w'){|f| f.write(test_string)}
    fs.directory?(test_filename).should eql false
    fs.directory?(test_dirname).should eql true
  end

  it "implements #rm on files" do
    fs.mkdir_p(test_dirname)
    fs.open(test_filename, 'w'){|f| f.write(test_string)}
    expect{ fs.rm(test_filename) }.to change{ fs.exists?(test_filename) }.from(true).to(false)
  end

  it "raises error on #rm of non-empty directory" do
    fs.mkdir_p(test_dirname)
    fs.open(test_filename, 'w'){|f| f.write(test_string)}
    expect{ fs.rm(test_dirname) }.to raise_error
  end

  it "implements #rm_r" do
    fs.mkdir_p(test_dirname)
    fs.open(test_filename,'w'){|f| f.write(test_string)}
    expect{ fs.rm_r(test_dirname) }.to change{ fs.exists?(test_dirname) && fs.exists?(test_filename) }.from(true).to(false)
  end

  it "implements #ls" do
    dirs.each{ |dir| fs.mkdir_p(dir) }
    files.each{|filename| fs.open(filename,"w"){|f| f.write(test_string) }}
    fs.ls(test_dirname).length.should eql 2
  end

  it "implements #ls_r" do
    dirs.each{ |dir| fs.mkdir_p(dir) }
    files.each{|filename| fs.open(filename,"w"){|f| f.write(test_string) }}
    fs.ls_r(test_dirname).length.should eql 3
  end

  it "implements #size" do
    fs.mkdir_p(test_dirname)
    fs.open(test_filename,'w'){|f| f.write(test_string)}
    test_string.length.should eql(fs.size(test_filename))
  end

  it "implements #mkdir_p" do
    expect{ fs.mkdir_p(test_dirname) }.to change{ fs.directory?(test_dirname) }.from(false).to(true)
  end

  it "implements #mv" do
    fs.mkdir_p(test_dirname)
    fs.open(test_filename, 'w'){|f| f.write(test_string)}
    filename2 = File.join(test_dirname, "new_file.txt")
    expect{ fs.mv(test_filename, filename2) }.to change{ fs.exists?(filename2) }.from(false).to(true)
    fs.exists?(test_filename).should eql false
    fs.open(filename2,"r").read.should eql test_string
  end

  it "implements #cp" do
    fs.mkdir_p(test_dirname)
    fs.open(test_filename, 'w'){|f| f.write(test_string)}
    filename2 = File.join(test_dirname, "new_file.txt")
    expect{ fs.cp(test_filename, filename2) }.to change{ fs.exists?(filename2) }.from(false).to(true)
    fs.open(test_filename,"r").read.should eql fs.open(filename2,"r").read
  end

  it "implements #cp_r"

  it "implements #open" do
    fs.mkdir_p(test_dirname)
    expect{
      file = fs.open(test_filename, 'w')
      file.write(test_string)
      file.close
    }.to change{ fs.exists?(test_filename) }.from(false).to(true)
  end

  it "implements #open with &blk" do
    fs.mkdir_p(test_dirname)
    expect{ fs.open(test_filename, 'w'){|f| f.write(test_string)} }.to change{ fs.exists?(test_filename) }.from(false).to(true)
  end

  describe "with a new file" do

    it "implements path" do
      fs.mkdir_p(test_dirname)
      file = fs.open(test_filename,'w')
      file.path.should eql test_filename
    end

    it "implements write" do
      fs.mkdir_p(test_dirname)
      fs.open(test_filename,'w'){|f| f.write(test_string)}
    end

    it "should not allow write after close" do
      fs.mkdir_p(test_dirname)
      file = fs.open(test_filename,'w')
      file.write(test_string)
      file.close
      lambda{ file.write(test_string) }.should raise_error
    end

    it "implements read" do
      fs.mkdir_p(test_dirname)
      fs.open(test_filename,'w'){|f| f.write(test_string)}
      fs.open(test_filename,'r').read.should eql test_string
    end

  end

  after do
    fs.rm_r(test_dirname) if fs.exists?(test_dirname)
  end

end

describe Swineherd::FileSystem do
  let(:fs){ Swineherd::FileSystem }
  let(:test_dirname){ FS_SPEC_ROOT + "/tmp/test_dir" }
  let(:test_filename){ File.join(test_dirname, "filename.txt") }
  let(:test_string){ "foobarbaz" }

  it "implements #cp" do
    localfs = Swineherd::LocalFileSystem.new
    s3_fs   = Swineherd::S3FileSystem.new
    localfs.mkdir_p(test_dirname)
    localfs.open(test_filename, 'w'){|f| f.write(test_string)}
    s3_filename = "s3://" + S3_TEST_BUCKET + "/new_file.txt"
    expect{ fs.cp(test_filename, s3_filename) }.to change{ fs.exists?(s3_filename) }.from(false).to(true)
    localfs.rm_r(test_dirname) if localfs.exists?(test_dirname)
    s3_fs.rm(s3_filename)
  end
end

describe Swineherd::LocalFileSystem do

  it_behaves_like "an abstract filesystem" do
    let(:fs){ Swineherd::LocalFileSystem.new }
    let(:test_dirname){ FS_SPEC_ROOT + "/tmp/test_dir" }
  end

end

describe Swineherd::S3FileSystem do

  # mkdir_p won't pass because there is no concept of a directory on s3

  it_behaves_like "an abstract filesystem" do
    let(:fs){ Swineherd::S3FileSystem.new }
    let(:test_dirname){ S3_TEST_BUCKET + "/tmp/test_dir" }
  end

  describe "an S3FileSystem" do
    let(:fs){ Swineherd::S3FileSystem.new }

    it "should return false for #file? on a bucket" do
      fs.file?(S3_TEST_BUCKET).should eql false
    end

  end
end

describe Swineherd::HadoopFileSystem do

  it_behaves_like "an abstract filesystem" do
    let(:fs){ Swineherd::HadoopFileSystem.new }
    let(:test_dirname){ "/tmp/test_dir" }
  end

end
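A note on running this spec, inferred from the files above rather than stated anywhere in the gem: the shared examples hit real backends, so the S3 examples need valid AWS credentials and a bucket you control (change S3_TEST_BUCKET accordingly), and the HadoopFileSystem examples need JRuby with $HADOOP_HOME set. With those in place, `rspec spec/filesystem_spec.rb` should exercise all three filesystems.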
data/swineherd-fs.gemspec
ADDED
@@ -0,0 +1,23 @@
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name        = %q{swineherd-fs}
  s.version     = "0.0.2"
  s.authors     = ["David Snyder", "Jacob Perkins"]
  s.date        = %q{2012-01-20}
  s.description = %q{A filesystem abstraction for Amazon S3 and Hadoop HDFS}
  s.summary     = %q{A filesystem abstraction for Amazon S3 and Hadoop HDFS}
  s.email       = %q{"david@infochimps.com"}
  s.homepage    = %q{http://github.com/infochimps-labs/swineherd-fs}

  s.files = ["LICENSE", "VERSION", "Gemfile", "swineherd-fs.gemspec", "rspec.watchr", "README.textile", "lib/swineherd-fs.rb", "lib/swineherd-fs/localfilesystem.rb", "lib/swineherd-fs/s3filesystem.rb", "lib/swineherd-fs/hadoopfilesystem.rb", "spec/spec_helper.rb", "spec/filesystem_spec.rb"]
  s.test_files = ["spec/spec_helper.rb", "spec/filesystem_spec.rb"]
  s.require_paths = ["lib"]

  s.add_development_dependency("rspec")
  s.add_development_dependency("watchr")
  s.add_runtime_dependency(%q<configliere>, [">= 0"])
  s.add_runtime_dependency(%q<right_aws>, [">= 0"])
  s.add_runtime_dependency(%q<jruby-openssl>, [">= 0"])
end
metadata
ADDED
@@ -0,0 +1,121 @@
--- !ruby/object:Gem::Specification
name: swineherd-fs
version: !ruby/object:Gem::Version
  prerelease:
  version: 0.0.2
platform: ruby
authors:
- David Snyder
- Jacob Perkins
autorequire:
bindir: bin
cert_chain: []

date: 2012-01-20 00:00:00 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: rspec
  prerelease: false
  requirement: &id001 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: "0"
  type: :development
  version_requirements: *id001
- !ruby/object:Gem::Dependency
  name: watchr
  prerelease: false
  requirement: &id002 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: "0"
  type: :development
  version_requirements: *id002
- !ruby/object:Gem::Dependency
  name: configliere
  prerelease: false
  requirement: &id003 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: "0"
  type: :runtime
  version_requirements: *id003
- !ruby/object:Gem::Dependency
  name: right_aws
  prerelease: false
  requirement: &id004 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: "0"
  type: :runtime
  version_requirements: *id004
- !ruby/object:Gem::Dependency
  name: jruby-openssl
  prerelease: false
  requirement: &id005 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: "0"
  type: :runtime
  version_requirements: *id005
description: A filesystem abstraction for Amazon S3 and Hadoop HDFS
email: "\"david@infochimps.com\""
executables: []

extensions: []

extra_rdoc_files: []

files:
- LICENSE
- VERSION
- Gemfile
- swineherd-fs.gemspec
- rspec.watchr
- README.textile
- lib/swineherd-fs.rb
- lib/swineherd-fs/localfilesystem.rb
- lib/swineherd-fs/s3filesystem.rb
- lib/swineherd-fs/hadoopfilesystem.rb
- spec/spec_helper.rb
- spec/filesystem_spec.rb
homepage: http://github.com/infochimps-labs/swineherd-fs
licenses: []

post_install_message:
rdoc_options: []

require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
requirements: []

rubyforge_project:
rubygems_version: 1.8.15
signing_key:
specification_version: 3
summary: A filesystem abstraction for Amazon S3 and Hadoop HDFS
test_files:
- spec/spec_helper.rb
- spec/filesystem_spec.rb