vayacondios-server 0.0.4

Files changed (46)
  1. data/.gitignore +61 -0
  2. data/.travis.yml +11 -0
  3. data/.yardopts +10 -0
  4. data/CHANGELOG.md +0 -0
  5. data/Gemfile +4 -0
  6. data/Guardfile +41 -0
  7. data/LICENSE.md +99 -0
  8. data/Procfile +2 -0
  9. data/README.md +183 -0
  10. data/Rakefile +6 -0
  11. data/app/http_shim.rb +67 -0
  12. data/bin/vcd.sh +27 -0
  13. data/config/http_shim.rb +43 -0
  14. data/config/vayacondios.example.yaml +4 -0
  15. data/config/vayacondios.yaml +4 -0
  16. data/lib/tasks/publish.rake +23 -0
  17. data/lib/tasks/spec.rake +9 -0
  18. data/lib/tasks/yard.rake +2 -0
  19. data/lib/vayacondios/client/configliere.rb +38 -0
  20. data/lib/vayacondios/client/http_client.rb +49 -0
  21. data/lib/vayacondios/client/notifier.rb +84 -0
  22. data/lib/vayacondios/server/handlers/config_handler.rb +35 -0
  23. data/lib/vayacondios/server/handlers/event_handler.rb +30 -0
  24. data/lib/vayacondios/server/model/config_document.rb +94 -0
  25. data/lib/vayacondios/server/model/document.rb +25 -0
  26. data/lib/vayacondios/server/model/event_document.rb +94 -0
  27. data/lib/vayacondios/version.rb +3 -0
  28. data/lib/vayacondios-client.rb +20 -0
  29. data/lib/vayacondios-server.rb +18 -0
  30. data/scripts/hadoop_monitor/configurable.rb +74 -0
  31. data/scripts/hadoop_monitor/hadoop_client.rb +249 -0
  32. data/scripts/hadoop_monitor/hadoop_monitor.rb +91 -0
  33. data/scripts/hadoop_monitor/hadoopable.rb +65 -0
  34. data/scripts/hadoop_monitor/machine_monitor.rb +115 -0
  35. data/scripts/s3_cataloger/buckets +33 -0
  36. data/scripts/s3_cataloger/foreach_bucket +88 -0
  37. data/scripts/s3_cataloger/parse_ls.py +391 -0
  38. data/spec/client/notifier_spec.rb +120 -0
  39. data/spec/server/config_spec.rb +55 -0
  40. data/spec/server/event_spec.rb +44 -0
  41. data/spec/server/server_spec.rb +20 -0
  42. data/spec/spec_helper.rb +10 -0
  43. data/spec/support/mongo_cleaner.rb +26 -0
  44. data/vayacondios-client.gemspec +26 -0
  45. data/vayacondios-server.gemspec +30 -0
  46. metadata +216 -0
data/scripts/hadoop_monitor/hadoop_monitor.rb
@@ -0,0 +1,91 @@
+ #!/usr/bin/env jruby19
+
+ require_relative 'hadoop_client'
+ require_relative 'configurable'
+ require 'java'
+ require 'mongo'
+ require 'scanf'
+ require 'gorillib/hash/slice'
+ require 'thread'
+ require 'open-uri'
+ require 'json'
+
+ module Vayacondios
+
+   class HadoopMonitor
+     def initialize
+       init_settings
+
+       @hadoop = HadoopClient.new
+
+       @monitored_jobs = []
+
+       logger.debug "Creating mongo collections."
+       @conn = Mongo::Connection.new settings.mongo_ip
+       @db = @conn[settings.mongo_jobs_db]
+       @job_logs = @db.create_collection(settings.mongo_job_logs_collection)
+
+       # After we create the job_events collection, one of the machine
+       # monitors will create the machine stats collection.
+       @job_events = @db.create_collection(settings.mongo_job_events_collection,
+                                           :capped => true,
+                                           :size => settings.job_events_size)
+
+       @cluster_state = CLUSTER_QUIET
+     end
+
+     def run
+       loop do
+
+         logger.debug "In main event loop."
+
+         cur_running_jobs  = @hadoop.jobs_with_state HadoopClient::RUNNING
+         cur_cluster_state = (cur_running_jobs.size > 0) ? CLUSTER_BUSY : CLUSTER_QUIET
+
+         @hadoop.subtract(@monitored_jobs, cur_running_jobs).each do |job|
+           logger.debug "#{job.get_id.to_s} is complete."
+           update_job_stats job, Time.now
+         end
+         @hadoop.subtract(cur_running_jobs, @monitored_jobs).each do |job|
+           logger.debug "#{job.get_id.to_s} started."
+           update_job_properties job
+         end
+
+         (@monitored_jobs + cur_running_jobs).each{|job| update_job_stats job}
+
+         @monitored_jobs = cur_running_jobs
+         update_cluster_state cur_cluster_state
+
+         sleep settings.sleep_seconds
+
+       end
+     end
+
+     private
+
+     include Configurable
+
+     def update_cluster_state new_state
+       return if new_state == @cluster_state
+       @cluster_state = new_state
+       logger.info "Cluster state changed to #{@cluster_state}"
+       @job_events.insert(EVENT => @cluster_state, TIME => Time.now.to_i)
+     end
+
+     def update_job_properties job
+       properties = @hadoop.job_properties job
+       logger.debug "upserting #{JSON.generate properties}"
+       @job_logs.save(properties, upsert: true, safe: true)
+     end
+
+     def update_job_stats job, finish_time = nil
+       @hadoop.job_stats(job, finish_time || Time.now).each do |job_stat|
+         logger.debug "upserting #{JSON.generate job_stat}"
+         @job_logs.save(job_stat, upsert: true, safe: true)
+       end
+     end
+
+   end
+ end
+
+ Vayacondios::HadoopMonitor.new.run
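The monitor above is a standalone script: it polls the JobTracker through HadoopClient, upserts job properties and stats into the job_logs collection, and records busy/quiet transitions in the capped job_events collection. A minimal, illustrative way to launch it, assuming MongoDB is reachable at the configured mongo_ip and the Configurable mixin supplies the collection names (the path is relative to the gem's data directory):

    # Illustrative invocation; requires JRuby (see the jruby19 shebang) and a running MongoDB.
    jruby19 scripts/hadoop_monitor/hadoop_monitor.rb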
data/scripts/hadoop_monitor/hadoopable.rb
@@ -0,0 +1,65 @@
+ require 'stringio'
+
+ module Vayacondios
+
+   module Hadoopable
+
+     include Configurable
+
+     #--------------------------------------------------------------------------------
+     # Initialize jruby and tell it about hadoop.
+     #--------------------------------------------------------------------------------
+
+     begin
+       require 'java'
+     rescue LoadError => e
+       raise "\nJava not found. Are you sure you're running with JRuby?\n#{e.message}"
+     end
+
+     hadoop_home = ENV['HADOOP_HOME'] || '/usr/lib/hadoop'
+
+     raise "\nHadoop installation not found. Try setting $HADOOP_HOME\n" unless (hadoop_home and (File.exist? hadoop_home))
+
+     $CLASSPATH << File.join(File.join(hadoop_home, 'conf') || ENV['HADOOP_CONF_DIR'],
+                             '') # add trailing slash
+
+     Dir["#{hadoop_home}/{hadoop*.jar,lib/*.jar}"].each{|jar| require jar}
+
+     include_class org.apache.hadoop.mapred.JobConf
+     include_class org.apache.hadoop.mapred.JobClient
+     include_class org.apache.hadoop.mapred.JobStatus
+     include_class org.apache.hadoop.mapred.TIPStatus
+     include_class org.apache.hadoop.conf.Configuration
+     #--------------------------------------------------------------------------------
+
+     def get_hadoop_conf
+       logger.debug "Getting hadoop configuration"
+
+       stderr, $stderr = $stderr, StringIO.new
+
+       conf = Configuration.new
+
+       # per-site defaults
+       %w[capacity-scheduler.xml core-site.xml hadoop-policy.xml hadoop-site.xml hdfs-site.xml mapred-site.xml].each do |conf_file|
+         conf.addResource conf_file
+       end
+
+       conf.reload_configuration
+
+       # per-user overrides
+       if Swineherd.config[:aws]
+         conf.set("fs.s3.awsAccessKeyId",Swineherd.config[:aws][:access_key])
+         conf.set("fs.s3.awsSecretAccessKey",Swineherd.config[:aws][:secret_key])
+
+         conf.set("fs.s3n.awsAccessKeyId",Swineherd.config[:aws][:access_key])
+         conf.set("fs.s3n.awsSecretAccessKey",Swineherd.config[:aws][:secret_key])
+       end
+
+       return conf
+     ensure
+       stderr_lines = $stderr.string.split("\n")
+       $stderr = stderr
+       stderr_lines.each{|line| logger.debug line}
+     end
+   end
+ end
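Hadoopable locates the Hadoop jars and configuration at load time, so the environment has to be in place before the monitor scripts start under JRuby. A sketch of the assumed environment (the default install path comes from the code above; the conf directory is site-specific and only used as a classpath fallback):

    # Sketch only: Hadoopable globs hadoop*.jar and lib/*.jar under $HADOOP_HOME,
    # falling back to /usr/lib/hadoop when the variable is unset.
    export HADOOP_HOME=/usr/lib/hadoop
    export HADOOP_CONF_DIR=$HADOOP_HOME/conf
    jruby19 scripts/hadoop_monitor/hadoop_monitor.rb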
data/scripts/hadoop_monitor/machine_monitor.rb
@@ -0,0 +1,115 @@
+ #!/usr/bin/env ruby
+
+ require_relative 'configurable'
+ require 'thread'
+ require 'socket'
+ require 'scanf'
+ require 'json'
+ require 'mongo'
+
+ module Vayacondios
+
+   class StatServer
+
+     include Configurable
+
+     def initialize
+       unless get_conf.mongo_ip
+         raise "The IP address of the mongo server must be set!"
+       end
+
+       logger.info "Connecting to Mongo server at ip #{get_conf.mongo_ip}"
+       conn = Mongo::Connection.new get_conf.mongo_ip
+       logger.debug "Getting job database #{get_conf.mongo_jobs_db}"
+       @db = conn[get_conf.mongo_jobs_db]
+     end
+
+     def run
+
+       # TODO: This entire script should be replaced by calls to zabbix
+       # initiated by the main loop of the hadoop_monitor.
+
+       logger.debug "Waiting for hadoop monitor to create the event collection."
+       sleep get_conf.sleep_seconds until
+         @db.collection_names.index get_conf.mongo_job_events_collection
+
+       job_events = @db[get_conf.mongo_job_events_collection]
+
+       logger.debug "Got the event collection. Creating machine stats collection."
+       machine_stats = @db.
+         create_collection(get_conf.mongo_machine_stats_collection)
+
+       logger.debug "Querying job_events until we see an insertion."
+       # Keep querying the job_events collection until there's an
+       # event. Don't just use the cursor from .find without checking,
+       # because if hadoop_monitor inserts an event into an empty
+       # database, this cursor will no longer work, even if it's
+       # tailable. Not quite sure why Mongo does it that way.
+       events = job_events.find
+       events.add_option 0x02 # tailable
+       until events.has_next?
+         sleep get_conf.sleep_seconds
+         events = job_events.find
+         events.add_option 0x02 # tailable
+       end
+
+       logger.debug "Priming main event loop. Waiting to see if the cluster is busy."
+
+       # Get up-to-date on the state of the cluster. Assume quiet to start.
+       cluster_busy = self.class.next_state(events, false, get_conf.event)
+
+       # main loop
+       loop do
+
+         logger.debug "In main event loop. Waiting to see if the cluster is busy."
+
+         # Get up-to-date on the state of the cluster.
+         cluster_busy = self.class.next_state(events, cluster_busy, get_conf.event)
+
+         # Don't grab stats unless the cluster is busy.
+         unless cluster_busy
+           sleep get_conf.sleep_seconds
+           next
+         end
+
+         logger.debug "Grabbing stats and pushing them into the collection."
+
+         # Grab the stats!
+         # ifstat's delay will function as our heartbeat timer.
+         is, ignore, rw = `ifstat 1 1`.split("\n").map(&:split)
+         headers, *disks = `iostat -x`.split("\n")[5..-1].map(&:split)
+         cpu, mem, swap, proc_headers, *procs = `top -b -n 1`.
+           split("\n").map(&:strip).select{|x| not x.empty?}[2..-1]
+
+         # Write the stats into the mongo collection.
+         machine_stats.insert(
+           :net  => Hash[is.zip(rw.each_slice(2).map{|r,w| {:r => r, :w => w}})],
+           :disk => Hash[disks.map{|d| [d.first, Hash[headers.zip(d)]]}],
+           :cpu  => self.class.split_top_stats(cpu),
+           :mem  => self.class.split_top_stats(mem),
+           :swap => self.class.split_top_stats(swap))
+       end
+     end
+
+     private
+
+     def self.split_top_stats line
+       Hash[line.split(':', 2).last.split(',').map(&:strip).map do |stat|
+              stat.scanf("%f%*c%s").reverse
+            end]
+     end
+
+     def self.next_state events_cursor, current_state, event_attr_name
+       while current_event = events_cursor.next
+         current_state = case current_event[event_attr_name]
+                         when CLUSTER_BUSY  then true
+                         when CLUSTER_QUIET then false
+                         else current_state
+                         end
+       end
+       current_state
+     end
+   end
+ end
+
+ Vayacondios::StatServer.new.run
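The stat server shells out to ifstat, iostat, and top, so those tools must be on the PATH of every monitored machine. An illustrative pre-flight check before starting it (iostat usually ships with the sysstat package; exact package names vary by distro):

    # Sketch only: verify the external tools the monitor parses are installed.
    for tool in ifstat iostat top; do
        command -v "$tool" >/dev/null || echo "missing: $tool"
    done
    ruby scripts/hadoop_monitor/machine_monitor.rb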
data/scripts/s3_cataloger/buckets
@@ -0,0 +1,33 @@
+ export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+ bdump_and_bparse() {
+     bucket_name=$1
+     bdump $1 ; bparse $1
+ }
+
+ bparse_and_bload() {
+     bucket_name=$1
+     bparse $1 ; bload "$@"
+ }
+
+ bdump_and_bload() {
+     bucket_name=$1
+     bdump $1; bparse $1 ; bload $1
+ }
+
+ bdump() {
+     bucket_name=$1
+     s3cmd ls -r s3://$bucket_name/ >$bucket_name.ls
+ }
+
+ bparse() {
+     bucket_name=$1
+     $dir/parse_ls.py <$bucket_name.ls >$bucket_name.json
+ }
+
+ bload() {
+     bucket_name=$1
+     db=$2
+     collection=$3
+     mongoimport -d $db -c $collection $bucket_name.json
+ }
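These functions are meant to be sourced rather than executed; bdump, bparse, and bload chain together through intermediate BUCKET.ls and BUCKET.json files in the current directory. An illustrative session (the bucket, database, and collection names are placeholders):

    . scripts/s3_cataloger/buckets
    bdump my-bucket                     # s3cmd ls -r  -> my-bucket.ls
    bparse my-bucket                    # parse_ls.py  -> my-bucket.json
    bload my-bucket my_db s3_listings   # mongoimport into my_db.s3_listings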
data/scripts/s3_cataloger/foreach_bucket
@@ -0,0 +1,88 @@
+ #!/usr/bin/env bash
+
+ export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+ . $dir/buckets
+
+ case $1 in
+     -f)
+         bucket_file=$2
+         shift 2
+         ;;
+     -h|--help)
+         cat <<EOF
+ foreach_bucket [OPTIONS] COMMAND [ARGUMENTS]
+
+ This script does a recursive listing of an s3 bucket using s3cmd and
+ then jsonifies the output. It runs COMMAND on the buckets specified
+ in a file; on standard input; or, by default, on all buckets that can
+ be seen by s3cmd.
+
+ OPTIONS include the following:
+
+   -f BUCKET_FILE    file containing a bucket name on each line. If
+                     this is set to '-', then buckets are read from
+                     standard input.
+
+ COMMAND includes anything in the 'buckets' script. The main commands
+ are the following:
+
+   bdump             dumps BUCKET to a file BUCKET.ls in the current
+                     working directory
+
+   bparse            runs BUCKET.ls through a parser to jsonify it and
+                     outputs the result as BUCKET.json
+
+   bload             loads BUCKET.json into a mongo database. The first
+                     argument passed to this command specifies the
+                     mongo database, while the second specifies the
+                     collection.
+ EOF
+         exit 0
+         ;;
+     -*)
+         echo "Invalid option: $1"
+         exit 1
+         ;;
+ esac
+
+ command=$1
+ shift
+
+ buckets=()
+
+ ## no bucket file specified: read all s3 buckets
+ if [[ -z $bucket_file ]]
+ then
+     for bucket in `s3cmd ls | cut -d ' ' -f 4 | cut -d / -f 3`
+     do
+         buckets=("${buckets[@]}" "$bucket")
+     done
+
+ ## read buckets from standard input
+ elif [[ $bucket_file == "-" ]]
+ then
+     read bucket
+     until [[ $? -eq 1 ]]
+     do
+         buckets=("${buckets[@]}" "$bucket")
+         read bucket
+     done
+
+ ## read from bucket_file
+ else
+     tmpIFS=$IFS
+     IFS=$'\n'
+
+     for bucket in `cat $bucket_file`
+     do
+         buckets=("${buckets[@]}" "$bucket")
+     done
+
+     IFS=$tmpIFS
+ fi
+
+ for bucket in "${buckets[@]}"
+ do
+     ($command $bucket "$@")&
+ done
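Putting the two scripts together, foreach_bucket forks one subshell per bucket and passes any remaining arguments through to the chosen command. Two illustrative runs (the file, database, and collection names are placeholders):

    # Dump and parse every bucket visible to s3cmd.
    scripts/s3_cataloger/foreach_bucket bdump_and_bparse
    # Parse and load only the buckets listed in buckets.txt, one name per line.
    scripts/s3_cataloger/foreach_bucket -f buckets.txt bparse_and_bload my_db s3_listings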
data/scripts/s3_cataloger/parse_ls.py
@@ -0,0 +1,391 @@
+ #!/usr/bin/env python
+
+ import logging
+ import sys
+
+ # Crank this down to info for progress messages. Can also use
+ # "filename=" for that kind of thing. The only reason this is stderr
+ # is to allow for output redirection.
+ logging.basicConfig(stream=sys.stderr, level=logging.ERROR)
+
+ #-------------------------------------------------------------------------------
+
+ def calculate_sizes(parsedHierarchies):
+     """
+     @param parsedHierarchies dictionary mapping filenames to
+                              parsedHierarchies. This is in the same
+                              format as the 'subdirs' component of a
+                              parsedHierarchy.
+     """
+
+     from operator import add
+     return reduce(
+         add,
+         (
+             calculate_size(parsedHierarchies[name])
+             for name in parsedHierarchies.keys()))
+
+
+ def calculate_size(parsedHierarchy):
+     """
+     @param parsedHierarchy dictionary in the same format as the one
+                            operated on by insert_line
+     """
+
+     if 'subdirs' in parsedHierarchy:
+         parsedHierarchy['tree_size'] = calculate_sizes(parsedHierarchy['subdirs'])
+     elif parsedHierarchy['type'] == 'd':
+         parsedHierarchy['tree_size'] = 0
+
+     if 'tree_size' in parsedHierarchy:
+         return parsedHierarchy['tree_size']
+     else:
+         return parsedHierarchy['file_size']
+
+ #-------------------------------------------------------------------------------
+
+ from sys import stdout
+ def write_listing_in_json(listing, writer = stdout):
+     writer.write('{"basename":"%s"' % listing['basename'])
+
+     from operator import add
+     writer.write(reduce(add, (',"%s":%s' % (key,
+                                             '"%s"' % listing[key]
+                                             if isinstance(listing[key], str)
+                                             else listing[key])
+                               for key in listing.keys() if key != 'subdirs')))
+
+     writer.write('}\n')
+
+ #-------------------------------------------------------------------------------
+
+ def each_listing_in_hierarchy(parsedHierarchy):
+     """
+     @param parsedHierarchy dictionary in the same format as the one
+                            operated on by insert_line
+
+     @return one record for every file listing. Every parsedHierarchy
+     will have its 'subdirs' key deleted and will consequently be flat.
+     """
+
+     if 'subdirs' in parsedHierarchy:
+         subdirs = parsedHierarchy['subdirs']
+         del parsedHierarchy['subdirs']
+         return [parsedHierarchy] + each_listing_in_subdirs(subdirs)
+     else:
+         return [parsedHierarchy]
+
+ def each_listing_in_subdirs(parsedHierarchies):
+     keys = parsedHierarchies.keys()
+     keys.sort()
+     from operator import add
+
+     return reduce(add,
+                   [each_listing_in_hierarchy(parsedHierarchies[f])
+                    for f in keys])
+
+ #-------------------------------------------------------------------------------
+
+ def insert_line(parsedLine,
+                 parsedHierarchy,
+                 bucket_name,
+                 prefix='/',
+                 s3hdfs = False):
+     """
+     @param parsedHierarchy A parsed hierarchy is a dictionary that
+                            contains the size, date, type, path, and
+                            subdirs of a file. It has two special
+                            properties: the basename contains no /
+                            characters, and the "subdirs" key points to
+                            a dictionary that maps names to
+                            parsedHierarchies underneath this one.
+     """
+
+     def insert_subdir(parsedHierarchy, subdir, bucket_name, prefix):
+         if 'subdirs' not in parsedHierarchy:
+             parsedHierarchy['subdirs'] = {}
+         if subdir not in parsedHierarchy['subdirs']:
+             parsedHierarchy['subdirs'][subdir] = {}
+             parsedHierarchy['subdirs'][subdir]['basename'] = subdir
+             parsedHierarchy['subdirs'][subdir]['file_size'] = 0
+             parsedHierarchy['subdirs'][subdir]['type'] = 'd'
+
+             prot = 's3' if s3hdfs else 's3n'
+
+             parent_url = (parsedHierarchy['_id'] if '_id' in parsedHierarchy
+                           else '%s://%s/' % (prot, bucket_name))
+
+             parsedHierarchy['subdirs'][subdir]['parent_id'] = parent_url
+
+             url = '%s://%s%s%s' % (prot, bucket_name, prefix, subdir)
+             parsedHierarchy['subdirs'][subdir]['_id'] = url
+
+             import hashlib
+             sha1hasher = hashlib.new('sha1')
+             sha1hasher.update(url)
+
+             parsedHierarchy['subdirs'][subdir]['uuid'] = (
+                 sha1hasher.hexdigest().lower())
+
+     path = parsedLine['path']
+     # recursively insert rest of path after /
+     if path.find('/') != -1:
+         base, rest = path.split('/', 1)
+
+         insert_subdir(parsedHierarchy, base, bucket_name, prefix)
+
+         parsedLine['path'] = rest
+         insert_line(parsedLine,
+                     parsedHierarchy['subdirs'][base],
+                     bucket_name,
+                     prefix + base + '/')
+
+     # insert one file or directory into "subdirs"
+     else:
+         insert_subdir(parsedHierarchy, path, bucket_name, prefix)
+
+         # This will also overwrite the default 'type':'d' from insert_subdir
+         for k in parsedLine.keys():
+             parsedHierarchy['subdirs'][path][k] = parsedLine[k]
+
+         parsedHierarchy['subdirs'][path]['basename'] = \
+             parsedHierarchy['subdirs'][path]['path']
+         del parsedHierarchy['subdirs'][path]['path']
+
+ #-------------------------------------------------------------------------------
+
+ def json2ls(json, writer, prefix='/'):
+     """
+     Sanity check. Writes json back out to the command line in ls form.
+     """
+
+     from datetime import datetime
+     d = (datetime.fromtimestamp(json['datetime']).strftime("%Y-%m-%d %H:%M")
+          if 'datetime' in json else '1970-01-01 00:00')
+
+     writer.write("%s %9d %s\n" % (
+         d,
+         json['file_size'],
+         json['_id'].replace('s3n', 's3')))
+
+ #-------------------------------------------------------------------------------
+
+ def hdfs_parse_line(bucket_name):
+
+     import re
+
+     def line_parser(line):
+
+         components = re.compile(r"""
+
+         ^
+         (
+           [d\-]                 # directory bit
+         )
+         (?:[r\-][w\-][xs\-]){2}
+         [r\-][w\-][x\-]
+
+         [ \t]*
+
+         (?:-|[0-9]+)            # number of links. ignore.
+
+         [ \t]*
+
+         ([0-9]+)                # size
+
+         [ \t]*
+
+         (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
+
+         [ \t]*
+
+         (                       # path
+           [^ \t]
+           [^\n]*
+         )
+
+         .*
+
+         $
+
+         """, re.VERBOSE)
+
+         m = components.match(line)
+         if not m:
+             import sys
+             sys.stderr.write("couldn't parse line: %s\n" % (line))
+             return None
+
+         typ, fsize, datetime, path = m.groups()
+
+         if typ == '-': typ = 'f'
+         if path.startswith('/'): path = path[1:]
+
+         return datetime, fsize, bucket_name, path, typ
+
+     return line_parser
+
+ #-------------------------------------------------------------------------------
+
+ def s3_parse_line(line):
+
+     import re
+     components = re.compile(r"""
+
+     ^
+     (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
+
+     [ \t]*
+
+     ([0-9]+)
+
+     [ \t]*
+
+     (?:
+       (?:s3://)
+       ([^/]*)
+       /
+       ([^\n]*)
+     )
+
+     .*
+
+     $
+
+     """, re.VERBOSE)
+
+     m = components.match(line)
+     if not m:
+         import sys
+         sys.stderr.write("couldn't parse line: %s\n" % (line))
+         return None
+
+     datetime, fsize, bucket_name, parsed_line = m.groups()
+     typ = 'f'
+
+     return datetime, fsize, bucket_name, parsed_line, typ
+
+ #-------------------------------------------------------------------------------
+
+ def ls2json_subdirs(lines, line_parser):
+
+     parsedHierarchy = None
+
+     count = 0
+     for line in lines:
+         count = count + 1
+         if count % 1000 == 0:
+             logging.info("inserting line %d" % (count))
+
+         line_tuple = line_parser(line)
+
+         if not line_tuple:
+             continue
+
+         parsedLine = {}
+
+         (
+             parsedLine['datetime'],
+             parsedLine['file_size'],
+             bucket_name,
+             parsedLine['path'],
+             parsedLine['type']
+         ) = line_tuple
+
+         if not parsedHierarchy:
+             url = "s3n://%s" % (bucket_name)
+             import hashlib
+             sha1hasher = hashlib.new('sha1')
+             sha1hasher.update(url)
+
+             parsedHierarchy = {
+                 bucket_name : {
+                     "subdirs"   : {},
+                     "basename"  : bucket_name,
+                     "_id"       : url,
+                     "type"      : "d",
+                     "file_size" : 0,
+                     "uuid"      : sha1hasher.hexdigest(),
+                 }
+             }
+
+         parsedLine['file_size'] = int(parsedLine['file_size'])
+
+         if parsedLine['datetime'] == '1970-01-01 00:00':
+             del parsedLine['datetime']
+         else:
+             from datetime import datetime
+             parsedLine['datetime'] = int(datetime.strptime(
+                 parsedLine['datetime'],
+                 "%Y-%m-%d %H:%M").strftime("%s"))
+
+         if parsedLine['path'].endswith('/'):
+             parsedLine['path'] = parsedLine['path'][:-1]
+             parsedLine['type'] = 'd'
+
+         insert_line(parsedLine,
+                     parsedHierarchy[bucket_name],
+                     bucket_name)
+
+     if not parsedHierarchy: return []
+
+     logging.info("calculating sizes")
+     calculate_sizes(parsedHierarchy)
+
+     logging.info("converting hierarchies")
+     return each_listing_in_subdirs(parsedHierarchy)
+
+ #-------------------------------------------------------------------------------
+
+ if __name__ == '__main__':
+
+     from optparse import OptionParser
+     parser = OptionParser(usage = "usage: %prog [options] [s3hdfs bucket name]")
+     parser.add_option("-i", "--input", dest="infile", default = None,
+                       help="input file.")
+     parser.add_option("-o", "--output", dest="outfile", default = None,
+                       help="output file.")
+     parser.add_option("-t", "--test", dest="test", default = False,
+                       action="store_true",
+                       help="re-output in ls format, for debugging")
+
+     (options, args) = parser.parse_args()
+
+     import sys
+     if len(args) > 1:
+         parser.print_usage()
+         sys.exit(0)
+
+     if args:
+         bucket, = args
+         ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
+                                                        hdfs_parse_line(bucket))
+     else:
+         ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
+                                                        s3_parse_line)
+
+     def open_or_die(fname, flags="r"):
+         try:
+             return open(fname, flags)
+         except IOError as (errno, strerr):
+             sys.stderr.write("Couldn't open %s: %s\n" % (fname, strerr))
+             sys.exit(0)
+
+     from sys import stdin, stdout
+     instream = open_or_die(options.infile) if options.infile else stdin
+     outstream = open_or_die(options.outfile, 'w') if options.outfile else stdout
+
+     if options.test:
+         for listing in ls_converter(instream):
+             json2ls(listing, outstream)
+     else:
+         for listing in ls_converter(instream):
+             write_listing_in_json(listing, outstream)
+
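End to end, the cataloger turns a recursive listing into one JSON document per file or directory, each carrying an _id URL, parent_id, uuid, and rolled-up tree_size, ready for mongoimport. An illustrative standalone run of the s3cmd path (bucket and file names are placeholders):

    s3cmd ls -r s3://my-bucket/ > my-bucket.ls
    scripts/s3_cataloger/parse_ls.py -i my-bucket.ls -o my-bucket.json
    # or, equivalently, on stdin/stdout:
    scripts/s3_cataloger/parse_ls.py < my-bucket.ls > my-bucket.json
    # -t re-emits ls-style lines instead of JSON, as a sanity check.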