vayacondios-server 0.2.11 → 0.3.0
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/Gemfile +15 -9
- data/LICENSE.md +2 -6
- data/Procfile +1 -1
- data/README.md +656 -111
- data/Rakefile +89 -6
- data/bin/vcd +10 -0
- data/bin/vcd-server +8 -0
- data/config/database.yml +6 -0
- data/config/spec.example.yml +18 -0
- data/config/vayacondios.example.yml +15 -0
- data/config/vcd-server.rb +37 -0
- data/examples/configuration.rb +56 -0
- data/examples/event_stream.rb +19 -0
- data/examples/simple.rb +61 -0
- data/features/event.feature +319 -0
- data/features/events.feature +208 -0
- data/features/stash.feature +840 -0
- data/features/stashes.feature +492 -0
- data/features/step_definitions/stash_steps.rb +113 -0
- data/features/stream.feature +30 -0
- data/features/support/em.rb +14 -0
- data/features/support/env.rb +13 -0
- data/lib/vayacondios/configuration.rb +63 -0
- data/lib/vayacondios/server/api.rb +126 -0
- data/lib/vayacondios/server/api_options.rb +56 -0
- data/lib/vayacondios/server/configuration.rb +23 -0
- data/lib/vayacondios/server/driver.rb +71 -0
- data/lib/vayacondios/server/drivers/mongo.rb +126 -0
- data/lib/vayacondios/server/handlers/document_handler.rb +81 -0
- data/lib/vayacondios/server/handlers/event_handler.rb +31 -26
- data/lib/vayacondios/server/handlers/events_handler.rb +31 -0
- data/lib/vayacondios/server/handlers/stash_handler.rb +69 -0
- data/lib/vayacondios/server/handlers/stashes_handler.rb +49 -0
- data/lib/vayacondios/server/handlers/stream_handler.rb +39 -0
- data/lib/vayacondios/server/models/document.rb +87 -0
- data/lib/vayacondios/server/models/event.rb +198 -0
- data/lib/vayacondios/server/models/stash.rb +100 -0
- data/lib/vayacondios/server.rb +35 -0
- data/lib/vayacondios-server.rb +19 -13
- data/lib/vayacondios.rb +22 -0
- data/pom.xml +124 -4
- data/spec/configuration_spec.rb +41 -0
- data/spec/server/api_options_spec.rb +32 -0
- data/spec/server/api_spec.rb +279 -0
- data/spec/server/configuration_spec.rb +27 -0
- data/spec/server/drivers/mongo_spec.rb +107 -0
- data/spec/server/handlers/event_handler_spec.rb +62 -0
- data/spec/server/handlers/events_handler_spec.rb +51 -0
- data/spec/server/handlers/stash_handler_spec.rb +68 -0
- data/spec/server/handlers/stashes_handler_spec.rb +50 -0
- data/spec/server/handlers/stream_handler_spec.rb +5 -0
- data/spec/server/models/document_spec.rb +9 -0
- data/spec/server/models/event_spec.rb +185 -0
- data/spec/server/models/stash_spec.rb +95 -0
- data/spec/spec_helper.rb +23 -3
- data/spec/support/database_helper.rb +42 -0
- data/spec/support/log_helper.rb +19 -0
- data/spec/support/shared_context_for_events.rb +22 -0
- data/spec/support/shared_context_for_stashes.rb +24 -0
- data/spec/support/shared_examples_for_handlers.rb +32 -0
- data/src/main/java/com/infochimps/vayacondios/BaseClient.java +342 -0
- data/src/main/java/com/infochimps/vayacondios/HTTPClient.java +426 -0
- data/src/main/java/com/infochimps/vayacondios/VayacondiosClient.java +487 -65
- data/src/main/java/com/infochimps/vayacondios/test/IntegrationTest.java +3 -0
- data/src/test/java/com/infochimps/vayacondios/BaseClientTest.java +50 -0
- data/src/test/java/com/infochimps/vayacondios/HTTPClientIT.java +267 -0
- data/vayacondios-server.gemspec +9 -9
- metadata +127 -122
- checksums.yaml +0 -15
- data/.rspec +0 -2
- data/.yardopts +0 -10
- data/Guardfile +0 -41
- data/app/http_shim.rb +0 -71
- data/bin/vcd.sh +0 -27
- data/config/http_shim.rb +0 -43
- data/config/vayacondios.example.yaml +0 -7
- data/config/vayacondios.yaml +0 -7
- data/examples/java/ItemSetTest.java +0 -76
- data/lib/tasks/publish.rake +0 -23
- data/lib/tasks/spec.rake +0 -11
- data/lib/tasks/yard.rake +0 -2
- data/lib/vayacondios/client/config.rb +0 -7
- data/lib/vayacondios/client/configliere.rb +0 -38
- data/lib/vayacondios/client/cube_client.rb +0 -39
- data/lib/vayacondios/client/http_client.rb +0 -49
- data/lib/vayacondios/client/itemset.rb +0 -130
- data/lib/vayacondios/client/legacy_switch.rb +0 -43
- data/lib/vayacondios/client/notifier.rb +0 -123
- data/lib/vayacondios/client/zabbix_client.rb +0 -148
- data/lib/vayacondios/legacy_switch.rb +0 -43
- data/lib/vayacondios/server/errors/bad_request.rb +0 -6
- data/lib/vayacondios/server/errors/not_found.rb +0 -6
- data/lib/vayacondios/server/handlers/config_handler.rb +0 -32
- data/lib/vayacondios/server/handlers/itemset_handler.rb +0 -60
- data/lib/vayacondios/server/legacy_switch.rb +0 -43
- data/lib/vayacondios/server/model/config_document.rb +0 -89
- data/lib/vayacondios/server/model/document.rb +0 -25
- data/lib/vayacondios/server/model/event_document.rb +0 -94
- data/lib/vayacondios/server/model/itemset_document.rb +0 -126
- data/lib/vayacondios/server/rack/extract_methods.rb +0 -35
- data/lib/vayacondios/server/rack/jsonize.rb +0 -43
- data/lib/vayacondios/server/rack/params.rb +0 -50
- data/lib/vayacondios/server/rack/path.rb +0 -23
- data/lib/vayacondios/server/rack/path_validation.rb +0 -22
- data/lib/vayacondios/version.rb +0 -3
- data/lib/vayacondios-client.rb +0 -22
- data/scripts/hadoop_monitor/configurable.rb +0 -66
- data/scripts/hadoop_monitor/hadoop_attempt_scraper.rb +0 -45
- data/scripts/hadoop_monitor/hadoop_client.rb +0 -273
- data/scripts/hadoop_monitor/hadoop_monitor.rb +0 -101
- data/scripts/hadoop_monitor/hadoopable.rb +0 -65
- data/scripts/hadoop_monitor/machine_monitor.rb +0 -115
- data/scripts/s3_cataloger/buckets +0 -33
- data/scripts/s3_cataloger/foreach_bucket +0 -88
- data/scripts/s3_cataloger/parse_ls.py +0 -391
- data/spec/client/itemset_legacy_spec.rb +0 -55
- data/spec/client/itemset_spec.rb +0 -60
- data/spec/client/notifier_spec.rb +0 -120
- data/spec/server/config_spec.rb +0 -113
- data/spec/server/event_spec.rb +0 -103
- data/spec/server/itemset_legacy_spec.rb +0 -320
- data/spec/server/itemset_spec.rb +0 -317
- data/spec/server/rack/extract_methods_spec.rb +0 -60
- data/spec/server/rack/path_spec.rb +0 -36
- data/spec/server/rack/path_validation_spec.rb +0 -22
- data/spec/server/server_spec.rb +0 -20
- data/spec/support/mongo_cleaner.rb +0 -32
- data/src/main/java/ItemSetTest.java +0 -76
- data/src/main/java/com/infochimps/util/CurrentClass.java +0 -26
- data/src/main/java/com/infochimps/util/DebugUtil.java +0 -38
- data/src/main/java/com/infochimps/util/HttpHelper.java +0 -181
- data/src/main/java/com/infochimps/vayacondios/ItemSets.java +0 -373
- data/src/main/java/com/infochimps/vayacondios/LinkToVCD.java +0 -18
- data/src/main/java/com/infochimps/vayacondios/MemoryVCDShim.java +0 -84
- data/src/main/java/com/infochimps/vayacondios/Organization.java +0 -62
- data/src/main/java/com/infochimps/vayacondios/PathBuilder.java +0 -13
- data/src/main/java/com/infochimps/vayacondios/StandardVCDLink.java +0 -218
- data/src/main/java/com/infochimps/vayacondios/VCDIntegrationTest.java +0 -108
- data/src/test/java/com/infochimps/vayacondios/TestVayacondiosInMemory.java +0 -78
- data/vayacondios-client.gemspec +0 -25
--- a/data/scripts/hadoop_monitor/machine_monitor.rb
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/usr/bin/env ruby
-
-require_relative 'configure'
-require 'thread'
-require 'socket'
-require 'scanf'
-require 'json'
-require 'mongo'
-
-class Vayacondios
-
-  class StatServer
-
-    include Configurable
-
-    def initialize
-      unless get_conf.mongo_ip
-        raise "The IP address of the mongo server must be set!"
-      end
-
-      logger.info "Connecting to Mongo server at ip #{get_conf.mongo_ip}"
-      conn = Mongo::Connection.new get_conf.mongo_ip
-      logger.debug "Getting job database #{get_conf.mongo_jobs_db}"
-      @db = conn[get_conf.mongo_jobs_db]
-    end
-
-    def run
-
-      # TODO: This entire script should be replaced by calls to zabbix
-      # initiated by the main loop of the hadoop_monitor.
-
-      logger.debug "Waiting for hadoop monitor to create the event collection."
-      sleep get_conf.sleep_seconds until
-        @db.collection_names.index get_conf.mongo_job_events_collection
-
-      job_events = @db[get_conf.mongo_job_events_collection]
-
-      logger.debug "Got the event collection. Creating machine stats collection."
-      machine_stats = @db.
-        create_collection(get_conf.mongo_machine_stats_collection)
-
-      logger.debug "Querying job_events until we see an insertion."
-      # Keep querying the job_events collection until there's an
-      # event. Don't just use the cursor from .find without checking,
-      # because if hadoop_monitor inserts an event into an empty
-      # database, this cursor will no longer work, even if it's
-      # tailable. not quite sure why Mongo does it that way.
-      events = job_events.find
-      events.add_option 0x02 # tailable
-      until events.has_next?
-        sleep get_conf.sleep_seconds
-        events = job_events.find
-        events.add_option 0x02 # tailable
-      end
-
-      logger.debug "Priming main event loop. Waiting to see if the cluster is busy."
-
-      # Get up-to-date on the state of the cluster. assume quiet to start.
-      cluster_busy = self.class.next_state(events, false, get_conf.event)
-
-      # main loop
-      loop do
-
-        logger.debug "In main event loop. Waiting to see if the cluster is busy."
-
-        # Get up-to-date on the state of the cluster.
-        cluster_busy = self.class.next_state(events, cluster_busy, get_conf.event)
-
-        # Don't grab stats unless the cluster is busy
-        unless cluster_busy
-          sleep get_conf.sleep_seconds
-          next
-        end
-
-        logger.debug "Grabbing stats and pushing them into the collection."
-
-        # Grab the stats!
-        # ifstat's delay will function as our heartbeat timer.
-        is, ignore, rw = `ifstat 1 1`.split("\n").map(&:split)
-        headers, *disks = `iostat -x`.split("\n")[5..-1].map(&:split)
-        cpu, mem, swap, proc_headers, *procs = `top -b -n 1`.
-          split("\n").map(&:strip).select{|x| not x.empty?}[2..-1]
-
-        # Write the stats into the mongo collection.
-        machine_stats.insert(
-          :net => Hash[is.zip(rw.each_slice(2).map{|r,w| {:r => r, :w => w}})],
-          :disk => Hash[disks.map{|d| [d.first, Hash[headers.zip(d)]]}],
-          :cpu => self.class.split_top_stats(cpu),
-          :mem => self.class.split_top_stats(mem),
-          :swap => self.class.split_top_stats(swap))
-      end
-    end
-
-    private
-
-    def self.split_top_stats line
-      Hash[line.split(':', 2).last.split(',').map(&:strip).map do |stat|
-        stat.scanf("%f%*c%s").reverse
-      end]
-    end
-
-    def self.next_state events_cursor, current_state, event_attr_name
-      while current_event = events_cursor.next
-        current_state = case current_event[event_attr_name]
-                        when CLUSTER_BUSY then true
-                        when CLUSTER_QUIET then false
-                        else current_state
-                        end
-      end
-      current_state
-    end
-  end
-end
-
-Vayacondios::StatServer.new.run
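The removed monitor tailed the hadoop_monitor's job-events collection and, while the cluster was busy, shelled out to ifstat, iostat, and top once per heartbeat. For reference, the insert call above produces one document per heartbeat shaped roughly as below; the interface, device, and field names are illustrative, not taken from the repo:

    {
      :net  => { "eth0" => { :r => "12.34", :w => "5.67" } },            # KB/s in and out per interface, from ifstat
      :disk => { "sda"  => { "rrqm/s" => "0.02", "await" => "1.30" } },  # one row per device, keyed by the iostat -x headers
      :cpu  => { "us" => 3.0, "sy" => 1.0, "id" => 95.5 },               # top's summary lines, parsed by split_top_stats
      :mem  => { "total" => 16433112.0, "used" => 812364.0 },
      :swap => { "total" => 999992.0, "used" => 0.0 }
    }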
--- a/data/scripts/s3_cataloger/buckets
+++ /dev/null
@@ -1,33 +0,0 @@
-export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-bdump_and_bparse() {
-    bucket_name=$1
-    bdump $1 ; bparse $1
-}
-
-bparse_and_bload() {
-    bucket_name=$1
-    bparse $1 ; bload "$@"
-}
-
-bdump_and_bload() {
-    bucket_name=$1
-    bdump $1; bparse $1 ; bload $1
-}
-
-bdump() {
-    bucket_name=$1
-    s3cmd ls -r s3://$bucket_name/ >$bucket_name.ls
-}
-
-bparse() {
-    bucket_name=$1
-    $dir/parse_ls.py <$bucket_name.ls >$bucket_name.json
-}
-
-bload() {
-    bucket_name=$1
-    db=$2
-    collection=$3
-    mongoimport -d $db -c $collection $bucket_name.json
-}
--- a/data/scripts/s3_cataloger/foreach_bucket
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env bash
-
-export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-. $dir/buckets
-
-case $1 in
-    -f)
-        bucket_file=$2
-        shift 2
-        ;;
-    -h|--help)
-        cat <<EOF
-foreach_bucket [OPTIONS] COMMAND [ARGUMENTS]
-
-This script is used to do a recursive listing of an s3 bucket using
-the s3cmd and then jsonify the output. It runs the COMMAND on the
-buckets specified in a file; on standard input; or, by default, on all
-buckets that can be seen by s3cmd.
-
-OPTIONS include the following:
-
--f BUCKET_FILE    file containing a bucket name on each line. If
-                  this is set to '-', then buckets are read from
-                  standard input.
-
-COMMAND includes anything in the 'buckets' script. The main commands
-are the following:
-
-bdump     dumps BUCKET to a file BUCKET.ls in the current
-          working directory
-
-bparse    runs BUCKET.ls through a parser to jsonify it and
-          outputs the result as BUCKET.json
-
-bload     loads BUCKET.json into a mongo database. The first
-          argument passed to this command specifies the
-          mongo database, while the second specifies the
-          collection.
-EOF
-        exit 0
-        ;;
-    -*)
-        echo "Invalid option: $1"
-        exit 1
-        ;;
-esac
-
-command=$1
-shift
-
-buckets=()
-
-## no bucket file specified: read all s3 buckets
-if [[ -z $bucket_file ]]
-then
-    for bucket in `s3cmd ls | cut -d ' ' -f 4 | cut -d / -f 3`
-    do
-        buckets=("${buckets[@]}" "$bucket")
-    done
-
-## read buckets from standard input
-elif [[ $bucket_file == "-" ]]
-then
-    read bucket
-    until [[ $? -eq 1 ]]
-    do
-        buckets=("${buckets[@]}" "$bucket")
-        read bucket
-    done
-
-## read from bucket_file
-else
-    tmpIFS=$IFS
-    IFS=$'\n'
-
-    for bucket in `cat $bucket_file`
-    do
-        buckets=("${buckets[@]}" "$bucket")
-    done
-
-    IFS=$tmpIFS
-fi
-
-for bucket in "${buckets[@]}"
-do
-    ($command $bucket "$@")&
-done
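Together, the two removed scripts form a dump, parse, load pipeline that fans out one background job per bucket. A typical session might look like the following sketch; the bucket file, database, and collection names are invented for illustration:

    # dump and jsonify every bucket named in buckets.txt
    ./foreach_bucket -f buckets.txt bdump_and_bparse
    # then load each resulting BUCKET.json into mongo
    ./foreach_bucket -f buckets.txt bload catalog s3_listings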
--- a/data/scripts/s3_cataloger/parse_ls.py
+++ /dev/null
@@ -1,391 +0,0 @@
-#!/usr/bin/env python
-
-import logging
-import sys
-
-# crank this down to info for progress messages. can also use
-# "filename=" for that kind of thing. The only reason this is stderr
-# is to allow for output redirection.
-logging.basicConfig(stream=sys.stderr, level=logging.ERROR)
-
-#-------------------------------------------------------------------------------
-
-def calculate_sizes(parsedHierarchies):
-    """
-    @param parsedHierarchies dictionary mapping filenames to
-                             parsedHierarchies. This is in the same
-                             format as the 'subdirs' component of a
-                             parsedHierarchy.
-    """
-
-    from operator import add
-    return reduce(
-        add,
-        (
-            calculate_size(parsedHierarchies[name])
-            for name in parsedHierarchies.keys()))
-
-
-def calculate_size(parsedHierarchy):
-    """
-    @param parsedHierarchy dictionary in the same format as the one
-                           operated on by insert_line
-    """
-
-    if 'subdirs' in parsedHierarchy:
-        parsedHierarchy['tree_size'] = calculate_sizes(parsedHierarchy['subdirs'])
-    elif parsedHierarchy['type'] == 'd':
-        parsedHierarchy['tree_size'] = 0
-
-    if 'tree_size' in parsedHierarchy:
-        return parsedHierarchy['tree_size']
-    else:
-        return parsedHierarchy['file_size']
-
-#-------------------------------------------------------------------------------
-
-from sys import stdout
-def write_listing_in_json(listing, writer = stdout):
-    writer.write('{"basename":"%s"' % listing['basename'])
-
-    from operator import add
-    writer.write(reduce(add, (',"%s":%s' % (key,
-                                            '"%s"' % listing[key]
-                                            if isinstance(listing[key],str)
-                                            else listing[key])
-                              for key in listing.keys() if key != 'subdirs')))
-
-    writer.write('}\n')
-
-#-------------------------------------------------------------------------------
-
-def each_listing_in_hierarchy(parsedHierarchy):
-    """
-    @param parsedHierarchy dictionary mapping filenames to
-                           parsedHierarchies. This is in the same
-                           format as the 'subdirs' component of a
-                           parsedHierarchy.
-
-    @return one record for every file listing. Every parsedHierarchy
-    will have its 'subdirs' key deleted and will consequently be flat.
-    """
-
-    if 'subdirs' in parsedHierarchy:
-        subdirs = parsedHierarchy['subdirs']
-        del parsedHierarchy['subdirs']
-        return [parsedHierarchy] + each_listing_in_subdirs(subdirs)
-    else:
-        return [parsedHierarchy]
-
-def each_listing_in_subdirs(parsedHierarchies):
-    keys = parsedHierarchies.keys()
-    keys.sort()
-    from operator import add
-
-    return reduce(add,
-                  [each_listing_in_hierarchy(parsedHierarchies[f])
-                   for f in keys])
-
-#-------------------------------------------------------------------------------
-
-def insert_line(parsedLine,
-                parsedHierarchy,
-                bucket_name,
-                prefix='/',
-                s3hdfs = False):
-    """
-    @param parsedHierarchy A parsed hierarchy is a dictionary that
-                           contains the size, date, type, path, and
-                           subdirs of a file. It has two special
-                           properties: the basename contains no /
-                           characters, and the "subdirs" points to a
-                           dictionary that maps names to
-                           parsedHierarchies underneath this one.
-    """
-
-    def insert_subdir(parsedHierarchy, subdir, bucket_name, prefix):
-        if 'subdirs' not in parsedHierarchy:
-            parsedHierarchy['subdirs'] = {}
-        if subdir not in parsedHierarchy['subdirs']:
-            parsedHierarchy['subdirs'][subdir] = {}
-            parsedHierarchy['subdirs'][subdir]['basename'] = subdir
-            parsedHierarchy['subdirs'][subdir]['file_size'] = 0
-            parsedHierarchy['subdirs'][subdir]['type'] = 'd'
-
-            prot = 's3' if s3hdfs else 's3n'
-
-            parent_url = (parsedHierarchy['_id'] if '_id' in parsedHierarchy
-                          else '%s://%s/' % (prot, bucket_name))
-
-            parsedHierarchy['subdirs'][subdir]['parent_id'] = parent_url
-
-
-            url = '%s://%s%s%s' % (prot, bucket_name, prefix, subdir)
-            parsedHierarchy['subdirs'][subdir]['_id'] = url
-
-            import hashlib
-            sha1hasher = hashlib.new('sha1')
-            sha1hasher.update(url)
-
-            parsedHierarchy['subdirs'][subdir]['uuid'] = (
-                sha1hasher.hexdigest().lower())
-
-    path = parsedLine['path']
-    # recursively insert rest of path after /
-    if path.find('/') != -1:
-        base,rest = path.split('/',1)
-
-        insert_subdir(parsedHierarchy, base, bucket_name, prefix)
-
-        parsedLine['path'] = rest
-        insert_line(parsedLine,
-                    parsedHierarchy['subdirs'][base],
-                    bucket_name,
-                    prefix + base + '/')
-
-    # insert one file or directory into "subdirs"
-    else:
-        insert_subdir(parsedHierarchy, path, bucket_name, prefix)
-
-        # This will also overwrite the default 'type':'d' from insert_subdir
-        for k in parsedLine.keys():
-            parsedHierarchy['subdirs'][path][k] = parsedLine[k]
-
-        parsedHierarchy['subdirs'][path]['basename'] = \
-            parsedHierarchy['subdirs'][path]['path']
-        del parsedHierarchy['subdirs'][path]['path']
-
-#-------------------------------------------------------------------------------
-
-def json2ls(json, writer, prefix='/'):
-    """
-    sanity check. writes json back out to the command line in ls form
-    """
-
-    from datetime import datetime
-    d =(datetime.fromtimestamp(json['datetime']).strftime("%Y-%m-%d %H:%M")
-        if 'datetime' in json else '1970-01-01 00:00')
-
-    writer.write("%s %9d %s\n" % (
-        d,
-        json['file_size'],
-        json['_id'].replace('s3n', 's3')))
-
-#-------------------------------------------------------------------------------
-
-def hdfs_parse_line(bucket_name):
-
-    import re
-
-    def line_parser(line):
-
-        components = re.compile(r"""
-
-        ^
-        (
-          [d\-]                  # directory bit
-        )
-        (?:[r\-][w\-][xs\-]){2}
-        [r\-][w\-][x\-]
-
-        [ \t]*
-
-        (?:-|[0-9]+)             # number of links. ignore.
-
-        [ \t]*
-
-        ([0-9]+)                 # size
-
-        [ \t]*
-
-        (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
-
-        [ \t]*
-
-        (                        # path
-          [^ \t]
-          [^\n]*
-        )
-
-        .*
-
-        $
-
-        """, re.VERBOSE)
-
-        m = components.match(line)
-        if not m:
-            import sys
-            sys.stderr.write("couldn't parse line: %s\n" % (line))
-            return None
-
-        typ, fsize, datetime, path = m.groups()
-
-        if typ == '-': typ = 'f'
-        if path.startswith('/'): path = path[1:]
-
-        return datetime, fsize, bucket_name, path, typ
-
-    return line_parser
-
-#-------------------------------------------------------------------------------
-
-def s3_parse_line(line):
-
-    import re
-    components = re.compile(r"""
-
-    ^
-    (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
-
-    [ \t]*
-
-    ([0-9]+)
-
-    [ \t]*
-
-    (?:
-      (?:s3://)
-      ([^/]*)
-      /
-      ([^\n]*)
-    )
-
-    .*
-
-    $
-
-    """, re.VERBOSE)
-
-    m = components.match(line)
-    if not m:
-        import sys
-        sys.stderr.write("couldn't parse line: %s\n" % (line))
-        return None
-
-    datetime, fsize, bucket_name, parsed_line = m.groups()
-    typ = 'f'
-
-    return datetime, fsize, bucket_name, parsed_line, typ
-
-#-------------------------------------------------------------------------------
-
-def ls2json_subdirs(lines, line_parser):
-
-    parsedHierarchy = None
-
-    count = 0
-    for line in lines:
-        count = count + 1
-        if count % 1000 == 0:
-            logging.info("inserting line %d" % (count))
-
-        line_tuple = line_parser(line)
-
-        if not line_tuple:
-            continue
-
-        parsedLine = {}
-
-        (
-
-            parsedLine['datetime'],
-            parsedLine['file_size'],
-            bucket_name,
-            parsedLine['path'],
-            parsedLine['type']
-
-        ) = line_tuple
-
-        if not parsedHierarchy:
-            url = "s3n://%s" % (bucket_name)
-            import hashlib
-            sha1hasher = hashlib.new('sha1')
-            sha1hasher.update(url)
-
-            parsedHierarchy = {
-                bucket_name : {
-                    "subdirs" : {},
-                    "basename" : bucket_name,
-                    "_id" : url,
-                    "type" : "d",
-                    "file_size" : 0,
-                    "uuid" : sha1hasher.hexdigest(),
-                }
-            }
-
-        parsedLine['file_size'] = int(parsedLine['file_size'])
-
-        if parsedLine['datetime'] == '1970-01-01 00:00':
-            del parsedLine['datetime']
-        else:
-            from datetime import datetime
-            parsedLine['datetime'] = int(datetime.strptime(
-                parsedLine['datetime'],
-                "%Y-%m-%d %H:%M").strftime("%s"))
-
-        parsedLine['file_size'] = int(parsedLine['file_size'])
-
-        if parsedLine['path'].endswith('/'):
-            parsedLine['path'] = parsedLine['path'][:-1]
-            parsedLine['type'] = 'd'
-
-        insert_line(parsedLine,
-                    parsedHierarchy[bucket_name],
-                    bucket_name)
-
-    if not parsedHierarchy: return []
-
-    logging.info("calculating sizes")
-    calculate_sizes(parsedHierarchy)
-
-    logging.info("converting hierarchies")
-    return each_listing_in_subdirs(parsedHierarchy)
-
-#-------------------------------------------------------------------------------
-
-if __name__ == '__main__':
-
-    from optparse import OptionParser
-    parser = OptionParser(usage = "usage: %prog [options] [s3hdfs bucket name]")
-    parser.add_option("-i", "--input", dest="infile", default = None,
-                      help="input file..")
-    parser.add_option("-o", "--output", dest="outfile", default = None,
-                      help="output file.")
-    parser.add_option("-t", "--test", dest="test", default = False,
-                      action="store_true",
-                      help="reoutput in ls format. for debugging")
-
-    (options, args) = parser.parse_args()
-
-    import sys
-    if len(args) > 1:
-        parser.print_usage()
-        sys.exit(0)
-
-    if args:
-        bucket, = args
-        ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
-                                                       hdfs_parse_line(bucket))
-    else:
-        ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
-                                                       s3_parse_line)
-
-    def open_or_die(fname, flags="r"):
-        try:
-            return open(fname, flags)
-        except IOError as (errno, strerr):
-            sys.stderr.write("Couldn't open %s: %s\n" % (fname, strerr))
-            sys.exit(0)
-
-    from sys import stdin, stdout
-    instream = open_or_die(options.infile) if options.infile else stdin
-    outstream = open_or_die(options.outfile, 'w') if options.outfile else stdout
-
-    if options.test:
-        for listing in ls_converter(instream):
-            json2ls(listing, outstream)
-    else:
-        for listing in ls_converter(instream):
-            write_listing_in_json(listing, outstream)
-
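Each line parse_ls.py emits is a self-contained JSON document ready for mongoimport, one per file or directory in the listing. As a rough illustration, with the bucket, path, timestamp, and hash values invented, an s3cmd line such as

    2012-03-04 12:34      1024   s3://mybucket/logs/part-00000

becomes a record along the lines of (key order varies)

    {"basename":"part-00000","file_size":1024,"type":"f","datetime":1330857240,"_id":"s3n://mybucket/logs/part-00000","parent_id":"s3n://mybucket/logs","uuid":"<sha1 of _id>"}

with directory records additionally carrying a tree_size summed over everything beneath them.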
--- a/data/spec/client/itemset_legacy_spec.rb
+++ /dev/null
@@ -1,55 +0,0 @@
-require 'spec_helper'
-require_relative '../../lib/vayacondios/server/legacy_switch'
-
-require 'multi_json'
-
-require_relative '../../lib/vayacondios/client/itemset'
-
-describe Vayacondios::Client::ItemSet do
-  context "after instantiation in legacy mode" do
-    itemset = Vayacondios::Client::ItemSet.new("foohost", 9999, "fooorg", "footopic", "fooid")
-    ary = ["foo", "bar", "baz"]
-
-    # testing internals here to avoid shimming up HTTP libraries.
-
-    it "generates a put request without a patch header when asked to create" do
-      Vayacondios.force_legacy_mode true
-
-      req = itemset.instance_eval{_req(:create, ary)}
-
-      req.method.should eql('PUT')
-      req.body.should eql(MultiJson.encode(ary))
-      req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
-      req.each_header.to_a.should_not include(["x_method", "PATCH"])
-    end
-
-    it "generates a put request with a patch header when asked to update" do
-      Vayacondios.force_legacy_mode true
-
-      req = itemset.instance_eval{_req(:update, ary)}
-
-      req.method.should eql('PUT')
-      req.body.should eql(MultiJson.encode(ary))
-      req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
-      req.each_header.to_a.should include(["x-method", "PATCH"])
-    end
-
-    it "generates a get request when asked to fetch" do
-      req = itemset.instance_eval{_req(:fetch)}
-
-      req.method.should eql('GET')
-      req.body.should be_nil
-      req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
-    end
-
-    it "generates a delete request when asked to remove" do
-      Vayacondios.force_legacy_mode true
-
-      req = itemset.instance_eval{_req(:remove, ary)}
-
-      req.method.should eql('DELETE')
-      req.body.should eql(MultiJson.encode(ary))
-      req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
-    end
-  end
-end
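The legacy behavior this deleted spec pinned down is the method-override trick: in legacy mode, itemset updates ride on a PUT whose X-Method header carries PATCH rather than a true PATCH verb. A minimal Net::HTTP sketch of building such a request, with the host, port, path, and payload echoing the spec's fixtures:

    require 'net/http'
    require 'multi_json'

    # Legacy-mode "update": a PUT whose X-Method header asks the old
    # server to treat the body as a PATCH.
    req = Net::HTTP::Put.new('/v1/fooorg/itemset/footopic/fooid')
    req['X-Method'] = 'PATCH'
    req.body = MultiJson.encode(%w[foo bar baz])

    res = Net::HTTP.start('foohost', 9999) { |http| http.request(req) }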