vayacondios-server 0.0.4
- data/.gitignore +61 -0
- data/.travis.yml +11 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +0 -0
- data/Gemfile +4 -0
- data/Guardfile +41 -0
- data/LICENSE.md +99 -0
- data/Procfile +2 -0
- data/README.md +183 -0
- data/Rakefile +6 -0
- data/app/http_shim.rb +67 -0
- data/bin/vcd.sh +27 -0
- data/config/http_shim.rb +43 -0
- data/config/vayacondios.example.yaml +4 -0
- data/config/vayacondios.yaml +4 -0
- data/lib/tasks/publish.rake +23 -0
- data/lib/tasks/spec.rake +9 -0
- data/lib/tasks/yard.rake +2 -0
- data/lib/vayacondios/client/configliere.rb +38 -0
- data/lib/vayacondios/client/http_client.rb +49 -0
- data/lib/vayacondios/client/notifier.rb +84 -0
- data/lib/vayacondios/server/handlers/config_handler.rb +35 -0
- data/lib/vayacondios/server/handlers/event_handler.rb +30 -0
- data/lib/vayacondios/server/model/config_document.rb +94 -0
- data/lib/vayacondios/server/model/document.rb +25 -0
- data/lib/vayacondios/server/model/event_document.rb +94 -0
- data/lib/vayacondios/version.rb +3 -0
- data/lib/vayacondios-client.rb +20 -0
- data/lib/vayacondios-server.rb +18 -0
- data/scripts/hadoop_monitor/configurable.rb +74 -0
- data/scripts/hadoop_monitor/hadoop_client.rb +249 -0
- data/scripts/hadoop_monitor/hadoop_monitor.rb +91 -0
- data/scripts/hadoop_monitor/hadoopable.rb +65 -0
- data/scripts/hadoop_monitor/machine_monitor.rb +115 -0
- data/scripts/s3_cataloger/buckets +33 -0
- data/scripts/s3_cataloger/foreach_bucket +88 -0
- data/scripts/s3_cataloger/parse_ls.py +391 -0
- data/spec/client/notifier_spec.rb +120 -0
- data/spec/server/config_spec.rb +55 -0
- data/spec/server/event_spec.rb +44 -0
- data/spec/server/server_spec.rb +20 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/support/mongo_cleaner.rb +26 -0
- data/vayacondios-client.gemspec +26 -0
- data/vayacondios-server.gemspec +30 -0
- metadata +216 -0
data/scripts/hadoop_monitor/hadoop_monitor.rb
@@ -0,0 +1,91 @@
#!/usr/bin/env jruby19

require_relative 'hadoop_client'
require_relative 'configurable'
require 'java'
require 'mongo'
require 'scanf'
require 'gorillib/hash/slice'
require 'thread'
require 'open-uri'
require 'json'

module Vayacondios

  class HadoopMonitor
    def initialize
      init_settings

      @hadoop = HadoopClient.new

      @monitored_jobs = []

      logger.debug "Creating mongo collections."
      @conn = Mongo::Connection.new settings.mongo_ip
      @db = @conn[settings.mongo_jobs_db]
      @job_logs = @db.create_collection(settings.mongo_job_logs_collection)

      # After we create the job_events database, one of the machine
      # monitors will create the machine stats database.
      @job_events = @db.create_collection(settings.mongo_job_events_collection,
                                          :capped => true,
                                          :size => settings.job_events_size)

      @cluster_state = CLUSTER_QUIET
    end

    def run
      loop do
        logger.debug "In main event loop."

        cur_running_jobs = @hadoop.jobs_with_state HadoopClient::RUNNING
        cur_cluster_state = (cur_running_jobs.size > 0) ? CLUSTER_BUSY : CLUSTER_QUIET

        @hadoop.subtract(@monitored_jobs, cur_running_jobs).each do |job|
          logger.debug "#{job.get_id.to_s} is complete."
          update_job_stats job, Time.now
        end
        @hadoop.subtract(cur_running_jobs, @monitored_jobs).each do |job|
          logger.debug "#{job.get_id.to_s} started."
          update_job_properties job
        end

        (@monitored_jobs + cur_running_jobs).each{|job| update_job_stats job}

        @monitored_jobs = cur_running_jobs
        update_cluster_state cur_cluster_state

        sleep settings.sleep_seconds
      end
    end

    private

    include Configurable

    def update_cluster_state new_state
      return if new_state == @cluster_state
      @cluster_state = new_state
      logger.info "Cluster state changed to #{@cluster_state}"
      @job_events.insert(EVENT => @cluster_state, TIME => Time.now.to_i)
    end

    def update_job_properties job
      properties = @hadoop.job_properties job
      logger.debug "upserting #{JSON.generate properties}"
      @job_logs.save(properties, upsert: true, safe: true)
    end

    def update_job_stats job, finish_time = nil
      @hadoop.job_stats(job, finish_time || Time.now).each do |job_stat|
        logger.debug "upserting #{JSON.generate job_stat}"
        @job_logs.save(job_stat, upsert: true, safe: true)
      end
    end

  end
end

Vayacondios::HadoopMonitor.new.run
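The monitor reads every connection parameter and collection name from Configliere settings via the Configurable mixin (settings.mongo_ip, settings.mongo_jobs_db, and so on). As a rough illustration only, the hash below sketches the kind of values those keys might hold; the key names come from the calls in hadoop_monitor.rb above, while the values are placeholder assumptions rather than the gem's shipped defaults.

    # Illustrative sketch only: keys taken from hadoop_monitor.rb, values assumed.
    settings = {
      :mongo_ip                    => '127.0.0.1',
      :mongo_jobs_db               => 'job_info',
      :mongo_job_logs_collection   => 'job_logs',
      :mongo_job_events_collection => 'job_events',
      :job_events_size             => 10 * 1024 * 1024, # capped collection size, in bytes
      :sleep_seconds               => 5
    }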
data/scripts/hadoop_monitor/hadoopable.rb
@@ -0,0 +1,65 @@
require 'stringio'

module Vayacondios

  module Hadoopable

    include Configurable

    #--------------------------------------------------------------------------------
    # Initialize jruby and tell it about hadoop.
    #--------------------------------------------------------------------------------

    begin
      require 'java'
    rescue LoadError => e
      raise "\nJava not found. Are you sure you're running with JRuby?\n#{e.message}"
    end

    hadoop_home = ENV['HADOOP_HOME'] || '/usr/lib/hadoop'

    raise "\nHadoop installation not found. Try setting $HADOOP_HOME\n" unless (hadoop_home and (File.exist? hadoop_home))

    $CLASSPATH << File.join(File.join(hadoop_home, 'conf') || ENV['HADOOP_CONF_DIR'],
                            '') # add trailing slash

    Dir["#{hadoop_home}/{hadoop*.jar,lib/*.jar}"].each{|jar| require jar}

    include_class org.apache.hadoop.mapred.JobConf
    include_class org.apache.hadoop.mapred.JobClient
    include_class org.apache.hadoop.mapred.JobStatus
    include_class org.apache.hadoop.mapred.TIPStatus
    include_class org.apache.hadoop.conf.Configuration
    #--------------------------------------------------------------------------------

    def get_hadoop_conf
      logger.debug "Getting hadoop configuration"

      stderr, $stderr = $stderr, StringIO.new

      conf = Configuration.new

      # per-site defaults
      %w[capacity-scheduler.xml core-site.xml hadoop-policy.xml hadoop-site.xml hdfs-site.xml mapred-site.xml].each do |conf_file|
        conf.addResource conf_file
      end

      conf.reload_configuration

      # per-user overrides
      if Swineherd.config[:aws]
        conf.set("fs.s3.awsAccessKeyId",Swineherd.config[:aws][:access_key])
        conf.set("fs.s3.awsSecretAccessKey",Swineherd.config[:aws][:secret_key])

        conf.set("fs.s3n.awsAccessKeyId",Swineherd.config[:aws][:access_key])
        conf.set("fs.s3n.awsSecretAccessKey",Swineherd.config[:aws][:secret_key])
      end

      return conf
    ensure
      stderr_lines = $stderr.string.split("\n")
      $stderr = stderr
      stderr_lines.each{|line| logger.debug line}
    end
  end
end
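Hadoopable does its JRuby and classpath setup when the file is loaded and then exposes get_hadoop_conf to whatever class mixes it in. Below is a minimal usage sketch, assuming a JRuby process and assuming Configurable supplies the logger and settings the module relies on; ExampleClient is a hypothetical name for illustration, not a class in this gem.

    # Hypothetical example, not part of the gem.
    class ExampleClient
      include Vayacondios::Hadoopable

      def job_client
        # JobConf and JobClient are the Hadoop classes imported by Hadoopable above.
        @job_client ||= JobClient.new(JobConf.new(get_hadoop_conf))
      end
    end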
data/scripts/hadoop_monitor/machine_monitor.rb
@@ -0,0 +1,115 @@
#!/usr/bin/env ruby

require_relative 'configure'
require 'thread'
require 'socket'
require 'scanf'
require 'json'
require 'mongo'

module Vayacondios

  class StatServer

    include Configurable

    def initialize
      unless get_conf.mongo_ip
        raise "The IP address of the mongo server must be set!"
      end

      logger.info "Connecting to Mongo server at ip #{get_conf.mongo_ip}"
      conn = Mongo::Connection.new get_conf.mongo_ip
      logger.debug "Getting job database #{get_conf.mongo_jobs_db}"
      @db = conn[get_conf.mongo_jobs_db]
    end

    def run

      # TODO: This entire script should be replaced by calls to zabbix
      # initiated by the main loop of the hadoop_monitor.

      logger.debug "Waiting for hadoop monitor to create the event collection."
      sleep get_conf.sleep_seconds until
        @db.collection_names.index get_conf.mongo_job_events_collection

      job_events = @db[get_conf.mongo_job_events_collection]

      logger.debug "Got the event collection. Creating machine stats collection."
      machine_stats = @db.
        create_collection(get_conf.mongo_machine_stats_collection)

      logger.debug "Querying job_events until we see an insertion."
      # Keep querying the job_events collection until there's an
      # event. Don't just use the cursor from .find without checking,
      # because if hadoop_monitor inserts an event into an empty
      # database, this cursor will no longer work, even if it's
      # tailable. not quite sure why Mongo does it that way.
      events = job_events.find
      events.add_option 0x02 # tailable
      until events.has_next?
        sleep get_conf.sleep_seconds
        events = job_events.find
        events.add_option 0x02 # tailable
      end

      logger.debug "Priming main event loop. Waiting to see if the cluster is busy."

      # Get up-to-date on the state of the cluster. assume quiet to start.
      cluster_busy = self.class.next_state(events, false, get_conf.event)

      # main loop
      loop do

        logger.debug "In main event loop. Waiting to see if the cluster is busy."

        # Get up-to-date on the state of the cluster.
        cluster_busy = self.class.next_state(events, cluster_busy, get_conf.event)

        # Don't grab stats unless the cluster is busy
        unless cluster_busy
          sleep get_conf.sleep_seconds
          next
        end

        logger.debug "Grabbing stats and pushing them into the collection."

        # Grab the stats!
        # ifstat's delay will function as our heartbeat timer.
        is, ignore, rw = `ifstat 1 1`.split("\n").map(&:split)
        headers, *disks = `iostat -x`.split("\n")[5..-1].map(&:split)
        cpu, mem, swap, proc_headers, *procs = `top -b -n 1`.
          split("\n").map(&:strip).select{|x| not x.empty?}[2..-1]

        # Write the stats into the mongo collection.
        machine_stats.insert(
          :net => Hash[is.zip(rw.each_slice(2).map{|r,w| {:r => r, :w => w}})],
          :disk => Hash[disks.map{|d| [d.first, Hash[headers.zip(d)]]}],
          :cpu => self.class.split_top_stats(cpu),
          :mem => self.class.split_top_stats(mem),
          :swap => self.class.split_top_stats(swap))
      end
    end

    private

    def self.split_top_stats line
      Hash[line.split(':', 2).last.split(',').map(&:strip).map do |stat|
        stat.scanf("%f%*c%s").reverse
      end]
    end

    def self.next_state events_cursor, current_state, event_attr_name
      while current_event = events_cursor.next
        current_state = case current_event[event_attr_name]
                        when CLUSTER_BUSY then true
                        when CLUSTER_QUIET then false
                        else current_state
                        end
      end
      current_state
    end
  end
end

Vayacondios::StatServer.new.run
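The documents this monitor inserts are plain hashes keyed by :net, :disk, :cpu, :mem, and :swap. As a small sketch, using the same old mongo gem API these scripts already use (Mongo::Connection), here is one way to read the most recently inserted stats document back out; the host, database, and collection names are assumptions and would need to match the configured settings.

    # Sketch only: host, database, and collection names are assumed values.
    require 'mongo'
    db     = Mongo::Connection.new('127.0.0.1')['job_info']
    stats  = db['machine_stats']
    latest = stats.find.sort([['$natural', -1]]).limit(1).to_a.first
    puts latest['cpu'] if latest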
data/scripts/s3_cataloger/buckets
@@ -0,0 +1,33 @@
export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

bdump_and_bparse() {
    bucket_name=$1
    bdump $1 ; bparse $1
}

bparse_and_bload() {
    bucket_name=$1
    bparse $1 ; bload "$@"
}

bdump_and_bload() {
    bucket_name=$1
    bdump $1; bparse $1 ; bload $1
}

bdump() {
    bucket_name=$1
    s3cmd ls -r s3://$bucket_name/ >$bucket_name.ls
}

bparse() {
    bucket_name=$1
    $dir/parse_ls.py <$bucket_name.ls >$bucket_name.json
}

bload() {
    bucket_name=$1
    db=$2
    collection=$3
    mongoimport -d $db -c $collection $bucket_name.json
}
data/scripts/s3_cataloger/foreach_bucket
@@ -0,0 +1,88 @@
#!/usr/bin/env bash

export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

. $dir/buckets

case $1 in
    -f)
        bucket_file=$2
        shift 2
        ;;
    -h|--help)
        cat <<EOF
foreach_bucket [OPTIONS] COMMAND [ARGUMENTS]

This script is used to do a recursive listing of an s3 bucket using
the s3cmd and then jsonify the output. It runs the COMMAND on the
buckets specified in a file; on standard input; or, by default, on all
buckets that can be seen by s3cmd.

OPTIONS include the following:

-f BUCKET_FILE    file containing a bucket name on each line. If
                  this is set to '-', then buckets are read from
                  standard input.

COMMAND includes anything in the 'buckets' script. The main commands
are the following:

bdump             dumps BUCKET to a file BUCKET.ls in the current
                  working directory

bparse            runs BUCKET.ls through a parser to jsonify it and
                  outputs the result as BUCKET.json

bload             loads BUCKET.json into a mongo database. The first
                  argument passed to this command specifies the
                  mongo database, while the second specifies the
                  collection.
EOF
        exit 0
        ;;
    -*)
        echo "Invalid option: $1"
        exit 1
        ;;
esac

command=$1
shift

buckets=()

## no bucket file specified; read all s3 buckets
if [[ -z $bucket_file ]]
then
    for bucket in `s3cmd ls | cut -d ' ' -f 4 | cut -d / -f 3`
    do
        buckets=("${buckets[@]}" "$bucket")
    done

## read buckets from standard input
elif [[ $bucket_file == "-" ]]
then
    read bucket
    until [[ $? -eq 1 ]]
    do
        buckets=("${buckets[@]}" "$bucket")
        read bucket
    done

## read from bucket_file
else
    tmpIFS=$IFS
    IFS=$'\n'

    for bucket in `cat $bucket_file`
    do
        buckets=("${buckets[@]}" "$bucket")
    done

    IFS=$tmpIFS
fi

for bucket in "${buckets[@]}"
do
    ($command $bucket "$@")&
done
data/scripts/s3_cataloger/parse_ls.py
@@ -0,0 +1,391 @@
#!/usr/bin/env python

import logging
import sys

# crank this down to info for progress messages. can also use
# "filename=" for that kind of thing. The only reason this is stderr
# is to allow for output redirection.
logging.basicConfig(stream=sys.stderr, level=logging.ERROR)

#-------------------------------------------------------------------------------

def calculate_sizes(parsedHierarchies):
    """
    @param parsedHierarchies dictionary mapping filenames to
                             parsedHierarchies. This is in the same
                             format as the 'subdirs' component of a
                             parsedHierarchy.
    """

    from operator import add
    return reduce(
        add,
        (
            calculate_size(parsedHierarchies[name])
            for name in parsedHierarchies.keys()))


def calculate_size(parsedHierarchy):
    """
    @param parsedHierarchy dictionary in the same format as the one
                           operated on by insert_line
    """

    if 'subdirs' in parsedHierarchy:
        parsedHierarchy['tree_size'] = calculate_sizes(parsedHierarchy['subdirs'])
    elif parsedHierarchy['type'] == 'd':
        parsedHierarchy['tree_size'] = 0

    if 'tree_size' in parsedHierarchy:
        return parsedHierarchy['tree_size']
    else:
        return parsedHierarchy['file_size']

#-------------------------------------------------------------------------------

from sys import stdout
def write_listing_in_json(listing, writer = stdout):
    writer.write('{"basename":"%s"' % listing['basename'])

    from operator import add
    writer.write(reduce(add, (',"%s":%s' % (key,
                                            '"%s"' % listing[key]
                                            if isinstance(listing[key],str)
                                            else listing[key])
                              for key in listing.keys() if key != 'subdirs')))

    writer.write('}\n')

#-------------------------------------------------------------------------------

def each_listing_in_hierarchy(parsedHierarchy):
    """
    @param parsedHierarchy dictionary mapping filenames to
                           parsedHierarchies. This is in the same
                           format as the 'subdirs' component of a
                           parsedHierarchy.

    @return one record for every file listing. Every parsedHierarchy
    will have its 'subdirs' key deleted and will consequently be flat.
    """

    if 'subdirs' in parsedHierarchy:
        subdirs = parsedHierarchy['subdirs']
        del parsedHierarchy['subdirs']
        return [parsedHierarchy] + each_listing_in_subdirs(subdirs)
    else:
        return [parsedHierarchy]

def each_listing_in_subdirs(parsedHierarchies):
    keys = parsedHierarchies.keys()
    keys.sort()
    from operator import add

    return reduce(add,
                  [each_listing_in_hierarchy(parsedHierarchies[f])
                   for f in keys])

#-------------------------------------------------------------------------------

def insert_line(parsedLine,
                parsedHierarchy,
                bucket_name,
                prefix='/',
                s3hdfs = False):
    """
    @param parsedHierarchy A parsed hierarchy is a dictionary that
                           contains the size, date, type, path, and
                           subdirs of a file. It has two special
                           properties: the basename contains no /
                           characters, and the "subdirs" points to a
                           dictionary that maps names to
                           parsedHierarchies underneath this one.
    """

    def insert_subdir(parsedHierarchy, subdir, bucket_name, prefix):
        if 'subdirs' not in parsedHierarchy:
            parsedHierarchy['subdirs'] = {}
        if subdir not in parsedHierarchy['subdirs']:
            parsedHierarchy['subdirs'][subdir] = {}
            parsedHierarchy['subdirs'][subdir]['basename'] = subdir
            parsedHierarchy['subdirs'][subdir]['file_size'] = 0
            parsedHierarchy['subdirs'][subdir]['type'] = 'd'

            prot = 's3' if s3hdfs else 's3n'

            parent_url = (parsedHierarchy['_id'] if '_id' in parsedHierarchy
                          else '%s://%s/' % (prot, bucket_name))

            parsedHierarchy['subdirs'][subdir]['parent_id'] = parent_url

            url = '%s://%s%s%s' % (prot, bucket_name, prefix, subdir)
            parsedHierarchy['subdirs'][subdir]['_id'] = url

            import hashlib
            sha1hasher = hashlib.new('sha1')
            sha1hasher.update(url)

            parsedHierarchy['subdirs'][subdir]['uuid'] = (
                sha1hasher.hexdigest().lower())

    path = parsedLine['path']
    # recursively insert rest of path after /
    if path.find('/') != -1:
        base,rest = path.split('/',1)

        insert_subdir(parsedHierarchy, base, bucket_name, prefix)

        parsedLine['path'] = rest
        insert_line(parsedLine,
                    parsedHierarchy['subdirs'][base],
                    bucket_name,
                    prefix + base + '/')

    # insert one file or directory into "subdirs"
    else:
        insert_subdir(parsedHierarchy, path, bucket_name, prefix)

        # This will also overwrite the default 'type':'d' from insert_subdir
        for k in parsedLine.keys():
            parsedHierarchy['subdirs'][path][k] = parsedLine[k]

        parsedHierarchy['subdirs'][path]['basename'] = \
            parsedHierarchy['subdirs'][path]['path']
        del parsedHierarchy['subdirs'][path]['path']

#-------------------------------------------------------------------------------

def json2ls(json, writer, prefix='/'):
    """
    sanity check. writes json back out to the command line in ls form
    """

    from datetime import datetime
    d =(datetime.fromtimestamp(json['datetime']).strftime("%Y-%m-%d %H:%M")
        if 'datetime' in json else '1970-01-01 00:00')

    writer.write("%s %9d %s\n" % (
        d,
        json['file_size'],
        json['_id'].replace('s3n', 's3')))

#-------------------------------------------------------------------------------

def hdfs_parse_line(bucket_name):

    import re

    def line_parser(line):

        components = re.compile(r"""

        ^
        (
          [d\-]              # directory bit
        )
        (?:[r\-][w\-][xs\-]){2}
        [r\-][w\-][x\-]

        [ \t]*

        (?:-|[0-9]+)         # number of links. ignore.

        [ \t]*

        ([0-9]+)             # size

        [ \t]*

        (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)

        [ \t]*

        (                    # path
          [^ \t]
          [^\n]*
        )

        .*

        $

        """, re.VERBOSE)

        m = components.match(line)
        if not m:
            import sys
            sys.stderr.write("couldn't parse line: %s\n" % (line))
            return None

        typ, fsize, datetime, path = m.groups()

        if typ == '-': typ = 'f'
        if path.startswith('/'): path = path[1:]

        return datetime, fsize, bucket_name, path, typ

    return line_parser

#-------------------------------------------------------------------------------

def s3_parse_line(line):

    import re
    components = re.compile(r"""

    ^
    (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)

    [ \t]*

    ([0-9]+)

    [ \t]*

    (?:
      (?:s3://)
      ([^/]*)
      /
      ([^\n]*)
    )

    .*

    $

    """, re.VERBOSE)

    m = components.match(line)
    if not m:
        import sys
        sys.stderr.write("couldn't parse line: %s\n" % (line))
        return None

    datetime, fsize, bucket_name, parsed_line = m.groups()
    typ = 'f'

    return datetime, fsize, bucket_name, parsed_line, typ

#-------------------------------------------------------------------------------

def ls2json_subdirs(lines, line_parser):

    parsedHierarchy = None

    count = 0
    for line in lines:
        count = count + 1
        if count % 1000 == 0:
            logging.info("inserting line %d" % (count))

        line_tuple = line_parser(line)

        if not line_tuple:
            continue

        parsedLine = {}

        (
            parsedLine['datetime'],
            parsedLine['file_size'],
            bucket_name,
            parsedLine['path'],
            parsedLine['type']
        ) = line_tuple

        if not parsedHierarchy:
            url = "s3n://%s" % (bucket_name)
            import hashlib
            sha1hasher = hashlib.new('sha1')
            sha1hasher.update(url)

            parsedHierarchy = {
                bucket_name : {
                    "subdirs" : {},
                    "basename" : bucket_name,
                    "_id" : url,
                    "type" : "d",
                    "file_size" : 0,
                    "uuid" : sha1hasher.hexdigest(),
                }
            }

        parsedLine['file_size'] = int(parsedLine['file_size'])

        if parsedLine['datetime'] == '1970-01-01 00:00':
            del parsedLine['datetime']
        else:
            from datetime import datetime
            parsedLine['datetime'] = int(datetime.strptime(
                parsedLine['datetime'],
                "%Y-%m-%d %H:%M").strftime("%s"))

        parsedLine['file_size'] = int(parsedLine['file_size'])

        if parsedLine['path'].endswith('/'):
            parsedLine['path'] = parsedLine['path'][:-1]
            parsedLine['type'] = 'd'

        insert_line(parsedLine,
                    parsedHierarchy[bucket_name],
                    bucket_name)

    if not parsedHierarchy: return []

    logging.info("calculating sizes")
    calculate_sizes(parsedHierarchy)

    logging.info("converting hierarchies")
    return each_listing_in_subdirs(parsedHierarchy)

#-------------------------------------------------------------------------------

if __name__ == '__main__':

    from optparse import OptionParser
    parser = OptionParser(usage = "usage: %prog [options] [s3hdfs bucket name]")
    parser.add_option("-i", "--input", dest="infile", default = None,
                      help="input file..")
    parser.add_option("-o", "--output", dest="outfile", default = None,
                      help="output file.")
    parser.add_option("-t", "--test", dest="test", default = False,
                      action="store_true",
                      help="reoutput in ls format. for debugging")

    (options, args) = parser.parse_args()

    import sys
    if len(args) > 1:
        parser.print_usage()
        sys.exit(0)

    if args:
        bucket, = args
        ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
                                                       hdfs_parse_line(bucket))
    else:
        ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
                                                       s3_parse_line)

    def open_or_die(fname, flags="r"):
        try:
            return open(fname, flags)
        except IOError as (errno, strerr):
            sys.stderr.write("Couldn't open %s: %s\n" % (fname, strerr))
            sys.exit(0)

    from sys import stdin, stdout
    instream = open_or_die(options.infile) if options.infile else stdin
    outstream = open_or_die(options.outfile, 'w') if options.outfile else stdout

    if options.test:
        for listing in ls_converter(instream):
            json2ls(listing, outstream)
    else:
        for listing in ls_converter(instream):
            write_listing_in_json(listing, outstream)