vayacondios-server 0.2.11 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -1
- data/.travis.yml +2 -0
- data/Gemfile +15 -9
- data/LICENSE.md +2 -6
- data/Procfile +1 -1
- data/README.md +656 -111
- data/Rakefile +89 -6
- data/bin/vcd +10 -0
- data/bin/vcd-server +8 -0
- data/config/database.yml +6 -0
- data/config/spec.example.yml +18 -0
- data/config/vayacondios.example.yml +15 -0
- data/config/vcd-server.rb +37 -0
- data/examples/configuration.rb +56 -0
- data/examples/event_stream.rb +19 -0
- data/examples/simple.rb +61 -0
- data/features/event.feature +319 -0
- data/features/events.feature +208 -0
- data/features/stash.feature +840 -0
- data/features/stashes.feature +492 -0
- data/features/step_definitions/stash_steps.rb +113 -0
- data/features/stream.feature +30 -0
- data/features/support/em.rb +14 -0
- data/features/support/env.rb +13 -0
- data/lib/vayacondios/configuration.rb +63 -0
- data/lib/vayacondios/server/api.rb +126 -0
- data/lib/vayacondios/server/api_options.rb +56 -0
- data/lib/vayacondios/server/configuration.rb +23 -0
- data/lib/vayacondios/server/driver.rb +71 -0
- data/lib/vayacondios/server/drivers/mongo.rb +126 -0
- data/lib/vayacondios/server/handlers/document_handler.rb +81 -0
- data/lib/vayacondios/server/handlers/event_handler.rb +31 -26
- data/lib/vayacondios/server/handlers/events_handler.rb +31 -0
- data/lib/vayacondios/server/handlers/stash_handler.rb +69 -0
- data/lib/vayacondios/server/handlers/stashes_handler.rb +49 -0
- data/lib/vayacondios/server/handlers/stream_handler.rb +39 -0
- data/lib/vayacondios/server/models/document.rb +87 -0
- data/lib/vayacondios/server/models/event.rb +198 -0
- data/lib/vayacondios/server/models/stash.rb +100 -0
- data/lib/vayacondios/server.rb +35 -0
- data/lib/vayacondios-server.rb +19 -13
- data/lib/vayacondios.rb +22 -0
- data/pom.xml +124 -4
- data/spec/configuration_spec.rb +41 -0
- data/spec/server/api_options_spec.rb +32 -0
- data/spec/server/api_spec.rb +279 -0
- data/spec/server/configuration_spec.rb +27 -0
- data/spec/server/drivers/mongo_spec.rb +107 -0
- data/spec/server/handlers/event_handler_spec.rb +62 -0
- data/spec/server/handlers/events_handler_spec.rb +51 -0
- data/spec/server/handlers/stash_handler_spec.rb +68 -0
- data/spec/server/handlers/stashes_handler_spec.rb +50 -0
- data/spec/server/handlers/stream_handler_spec.rb +5 -0
- data/spec/server/models/document_spec.rb +9 -0
- data/spec/server/models/event_spec.rb +185 -0
- data/spec/server/models/stash_spec.rb +95 -0
- data/spec/spec_helper.rb +23 -3
- data/spec/support/database_helper.rb +42 -0
- data/spec/support/log_helper.rb +19 -0
- data/spec/support/shared_context_for_events.rb +22 -0
- data/spec/support/shared_context_for_stashes.rb +24 -0
- data/spec/support/shared_examples_for_handlers.rb +32 -0
- data/src/main/java/com/infochimps/vayacondios/BaseClient.java +342 -0
- data/src/main/java/com/infochimps/vayacondios/HTTPClient.java +426 -0
- data/src/main/java/com/infochimps/vayacondios/VayacondiosClient.java +487 -65
- data/src/main/java/com/infochimps/vayacondios/test/IntegrationTest.java +3 -0
- data/src/test/java/com/infochimps/vayacondios/BaseClientTest.java +50 -0
- data/src/test/java/com/infochimps/vayacondios/HTTPClientIT.java +267 -0
- data/vayacondios-server.gemspec +9 -9
- metadata +127 -122
- checksums.yaml +0 -15
- data/.rspec +0 -2
- data/.yardopts +0 -10
- data/Guardfile +0 -41
- data/app/http_shim.rb +0 -71
- data/bin/vcd.sh +0 -27
- data/config/http_shim.rb +0 -43
- data/config/vayacondios.example.yaml +0 -7
- data/config/vayacondios.yaml +0 -7
- data/examples/java/ItemSetTest.java +0 -76
- data/lib/tasks/publish.rake +0 -23
- data/lib/tasks/spec.rake +0 -11
- data/lib/tasks/yard.rake +0 -2
- data/lib/vayacondios/client/config.rb +0 -7
- data/lib/vayacondios/client/configliere.rb +0 -38
- data/lib/vayacondios/client/cube_client.rb +0 -39
- data/lib/vayacondios/client/http_client.rb +0 -49
- data/lib/vayacondios/client/itemset.rb +0 -130
- data/lib/vayacondios/client/legacy_switch.rb +0 -43
- data/lib/vayacondios/client/notifier.rb +0 -123
- data/lib/vayacondios/client/zabbix_client.rb +0 -148
- data/lib/vayacondios/legacy_switch.rb +0 -43
- data/lib/vayacondios/server/errors/bad_request.rb +0 -6
- data/lib/vayacondios/server/errors/not_found.rb +0 -6
- data/lib/vayacondios/server/handlers/config_handler.rb +0 -32
- data/lib/vayacondios/server/handlers/itemset_handler.rb +0 -60
- data/lib/vayacondios/server/legacy_switch.rb +0 -43
- data/lib/vayacondios/server/model/config_document.rb +0 -89
- data/lib/vayacondios/server/model/document.rb +0 -25
- data/lib/vayacondios/server/model/event_document.rb +0 -94
- data/lib/vayacondios/server/model/itemset_document.rb +0 -126
- data/lib/vayacondios/server/rack/extract_methods.rb +0 -35
- data/lib/vayacondios/server/rack/jsonize.rb +0 -43
- data/lib/vayacondios/server/rack/params.rb +0 -50
- data/lib/vayacondios/server/rack/path.rb +0 -23
- data/lib/vayacondios/server/rack/path_validation.rb +0 -22
- data/lib/vayacondios/version.rb +0 -3
- data/lib/vayacondios-client.rb +0 -22
- data/scripts/hadoop_monitor/configurable.rb +0 -66
- data/scripts/hadoop_monitor/hadoop_attempt_scraper.rb +0 -45
- data/scripts/hadoop_monitor/hadoop_client.rb +0 -273
- data/scripts/hadoop_monitor/hadoop_monitor.rb +0 -101
- data/scripts/hadoop_monitor/hadoopable.rb +0 -65
- data/scripts/hadoop_monitor/machine_monitor.rb +0 -115
- data/scripts/s3_cataloger/buckets +0 -33
- data/scripts/s3_cataloger/foreach_bucket +0 -88
- data/scripts/s3_cataloger/parse_ls.py +0 -391
- data/spec/client/itemset_legacy_spec.rb +0 -55
- data/spec/client/itemset_spec.rb +0 -60
- data/spec/client/notifier_spec.rb +0 -120
- data/spec/server/config_spec.rb +0 -113
- data/spec/server/event_spec.rb +0 -103
- data/spec/server/itemset_legacy_spec.rb +0 -320
- data/spec/server/itemset_spec.rb +0 -317
- data/spec/server/rack/extract_methods_spec.rb +0 -60
- data/spec/server/rack/path_spec.rb +0 -36
- data/spec/server/rack/path_validation_spec.rb +0 -22
- data/spec/server/server_spec.rb +0 -20
- data/spec/support/mongo_cleaner.rb +0 -32
- data/src/main/java/ItemSetTest.java +0 -76
- data/src/main/java/com/infochimps/util/CurrentClass.java +0 -26
- data/src/main/java/com/infochimps/util/DebugUtil.java +0 -38
- data/src/main/java/com/infochimps/util/HttpHelper.java +0 -181
- data/src/main/java/com/infochimps/vayacondios/ItemSets.java +0 -373
- data/src/main/java/com/infochimps/vayacondios/LinkToVCD.java +0 -18
- data/src/main/java/com/infochimps/vayacondios/MemoryVCDShim.java +0 -84
- data/src/main/java/com/infochimps/vayacondios/Organization.java +0 -62
- data/src/main/java/com/infochimps/vayacondios/PathBuilder.java +0 -13
- data/src/main/java/com/infochimps/vayacondios/StandardVCDLink.java +0 -218
- data/src/main/java/com/infochimps/vayacondios/VCDIntegrationTest.java +0 -108
- data/src/test/java/com/infochimps/vayacondios/TestVayacondiosInMemory.java +0 -78
- data/vayacondios-client.gemspec +0 -25
data/scripts/hadoop_monitor/machine_monitor.rb (removed)
@@ -1,115 +0,0 @@
-#!/usr/bin/env ruby
-
-require_relative 'configure'
-require 'thread'
-require 'socket'
-require 'scanf'
-require 'json'
-require 'mongo'
-
-class Vayacondios
-
-  class StatServer
-
-    include Configurable
-
-    def initialize
-      unless get_conf.mongo_ip
-        raise "The IP address of the mongo server must be set!"
-      end
-
-      logger.info "Connecting to Mongo server at ip #{get_conf.mongo_ip}"
-      conn = Mongo::Connection.new get_conf.mongo_ip
-      logger.debug "Getting job database #{get_conf.mongo_jobs_db}"
-      @db = conn[get_conf.mongo_jobs_db]
-    end
-
-    def run
-
-      # TODO: This entire script should be replaced by calls to zabbix
-      # initiated by the main loop of the hadoop_monitor.
-
-      logger.debug "Waiting for hadoop monitor to create the event collection."
-      sleep get_conf.sleep_seconds until
-        @db.collection_names.index get_conf.mongo_job_events_collection
-
-      job_events = @db[get_conf.mongo_job_events_collection]
-
-      logger.debug "Got the event collection. Creating machine stats collection."
-      machine_stats = @db.
-        create_collection(get_conf.mongo_machine_stats_collection)
-
-      logger.debug "Querying job_events until we see an insertion."
-      # Keep querying the job_events collection until there's an
-      # event. Don't just use the cursor from .find without checking,
-      # because if hadoop_monitor inserts an event into an empty
-      # database, this cursor will no longer work, even if it's
-      # tailable. not quite sure why Mongo does it that way.
-      events = job_events.find
-      events.add_option 0x02 # tailable
-      until events.has_next?
-        sleep get_conf.sleep_seconds
-        events = job_events.find
-        events.add_option 0x02 # tailable
-      end
-
-      logger.debug "Priming main event loop. Waiting to see if the cluster is busy."
-
-      # Get up-to-date on the state of the cluster. assume quiet to start.
-      cluster_busy = self.class.next_state(events, false, get_conf.event)
-
-      # main loop
-      loop do
-
-        logger.debug "In main event loop. Waiting to see if the cluster is busy."
-
-        # Get up-to-date on the state of the cluster.
-        cluster_busy = self.class.next_state(events, cluster_busy, get_conf.event)
-
-        # Don't grab stats unless the cluster is busy
-        unless cluster_busy
-          sleep get_conf.sleep_seconds
-          next
-        end
-
-        logger.debug "Grabbing stats and pushing them into the collection."
-
-        # Grab the stats!
-        # ifstat's delay will function as our heartbeat timer.
-        is, ignore, rw = `ifstat 1 1`.split("\n").map(&:split)
-        headers, *disks = `iostat -x`.split("\n")[5..-1].map(&:split)
-        cpu, mem, swap, proc_headers, *procs = `top -b -n 1`.
-          split("\n").map(&:strip).select{|x| not x.empty?}[2..-1]
-
-        # Write the stats into the mongo collection.
-        machine_stats.insert(
-          :net => Hash[is.zip(rw.each_slice(2).map{|r,w| {:r => r, :w => w}})],
-          :disk => Hash[disks.map{|d| [d.first, Hash[headers.zip(d)]]}],
-          :cpu => self.class.split_top_stats(cpu),
-          :mem => self.class.split_top_stats(mem),
-          :swap => self.class.split_top_stats(swap))
-      end
-    end
-
-    private
-
-    def self.split_top_stats line
-      Hash[line.split(':', 2).last.split(',').map(&:strip).map do |stat|
-        stat.scanf("%f%*c%s").reverse
-      end]
-    end
-
-    def self.next_state events_cursor, current_state, event_attr_name
-      while current_event = events_cursor.next
-        current_state = case current_event[event_attr_name]
-                        when CLUSTER_BUSY then true
-                        when CLUSTER_QUIET then false
-                        else current_state
-                        end
-      end
-      current_state
-    end
-  end
-end
-
-Vayacondios::StatServer.new.run
data/scripts/s3_cataloger/buckets (removed)
@@ -1,33 +0,0 @@
-export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-bdump_and_bparse() {
-    bucket_name=$1
-    bdump $1 ; bparse $1
-}
-
-bparse_and_bload() {
-    bucket_name=$1
-    bparse $1 ; bload "$@"
-}
-
-bdump_and_bload() {
-    bucket_name=$1
-    bdump $1; bparse $1 ; bload $1
-}
-
-bdump() {
-    bucket_name=$1
-    s3cmd ls -r s3://$bucket_name/ >$bucket_name.ls
-}
-
-bparse() {
-    bucket_name=$1
-    $dir/parse_ls.py <$bucket_name.ls >$bucket_name.json
-}
-
-bload() {
-    bucket_name=$1
-    db=$2
-    collection=$3
-    mongoimport -d $db -c $collection $bucket_name.json
-}
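
The removed `buckets` helpers chain three steps: `bdump` writes a recursive `s3cmd ls` listing to `BUCKET.ls`, `bparse` converts that listing to `BUCKET.json` with `parse_ls.py`, and `bload` imports the JSON into MongoDB with `mongoimport`. A minimal usage sketch follows; the bucket, database, and collection names are made up for illustration:

    # source the helper functions, then run the dump -> parse -> load pipeline
    . ./buckets
    bdump my-bucket                  # writes my-bucket.ls via `s3cmd ls -r`
    bparse my-bucket                 # writes my-bucket.json via parse_ls.py
    bload my-bucket listings_db s3   # mongoimport -d listings_db -c s3 my-bucket.json
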
data/scripts/s3_cataloger/foreach_bucket (removed)
@@ -1,88 +0,0 @@
-#!/usr/bin/env bash
-
-export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-
-. $dir/buckets
-
-case $1 in
-    -f)
-        bucket_file=$2
-        shift 2
-        ;;
-    -h|--help)
-        cat <<EOF
-foreach_bucket [OPTIONS] COMMAND [ARGUMENTS]
-
-This script is used to do a recursive listing of an s3 bucket using
-the s3cmd and then jsonify the output. It runs the COMMAND on the
-buckets specified in a file; on standard input; or, by default, on all
-buckets that can be seen by s3cmd.
-
-OPTIONS include the following:
-
-  -f BUCKET_FILE   file containing a bucket name on each line. If
-                   this is set to '-', then buckets are read from
-                   standard input.
-
-COMMAND includes anything in the 'buckets' script. The main commands
-are the following:
-
-  bdump    dumps BUCKET to a file BUCKET.ls in the current
-           working directory
-
-  bparse   runs BUCKET.ls through a parser to jsonify it and
-           outputs the result as BUCKET.json
-
-  bload    loads BUCKET.json into a mongo database. The first
-           argument passed to this command specifies the
-           mongo database, while the second specifies the
-           collection.
-EOF
-        exit 0
-        ;;
-    -*)
-        echo "Invalid option: $1"
-        exit 1
-        ;;
-esac
-
-command=$1
-shift
-
-buckets=()
-
-## no bucket file specified read all s3 buckets
-if [[ -z $bucket_file ]]
-then
-    for bucket in `s3cmd ls | cut -d ' ' -f 4 | cut -d / -f 3`
-    do
-        buckets=("${buckets[@]}" "$bucket")
-    done
-
-## read buckets from standard input
-elif [[ $bucket_file == "-" ]]
-then
-    read bucket
-    until [[ $? -eq 1 ]]
-    do
-        buckets=("${buckets[@]}" "$bucket")
-        read bucket
-    done
-
-## read from bucket_file
-else
-    tmpIFS=$IFS
-    IFS=$'\n'
-
-    for bucket in `cat $bucket_file`
-    do
-        buckets=("${buckets[@]}" "$bucket")
-    done
-
-    IFS=$tmpIFS
-fi
-
-for bucket in "${buckets[@]}"
-do
-    ($command $bucket "$@")&
-done
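
As its help text describes, `foreach_bucket` applies one of the `buckets` helpers to every bucket s3cmd can see, to buckets named in a file, or to buckets read from standard input, launching each invocation in a background subshell. A hedged sketch of how it was invoked; the file, database, and collection names are illustrative only:

    # dump and parse every bucket visible to s3cmd
    ./foreach_bucket bdump_and_bparse

    # load the buckets named in buckets.txt into database "listings_db", collection "s3"
    ./foreach_bucket -f buckets.txt bload listings_db s3

    # read bucket names from standard input
    echo my-bucket | ./foreach_bucket -f - bdump
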
data/scripts/s3_cataloger/parse_ls.py (removed)
@@ -1,391 +0,0 @@
-#!/usr/bin/env python
-
-import logging
-import sys
-
-# crank this down to info for progress messages. can also use
-# "filename=" for that kind of thing. The only reason this is stderr
-# is to allow for output redirection.
-logging.basicConfig(stream=sys.stderr, level=logging.ERROR)
-
-#-------------------------------------------------------------------------------
-
-def calculate_sizes(parsedHierarchies):
-    """
-    @param parsedHierarchies dictionary mapping filenames to
-                             parsedHierarchies. This is in the same
-                             format as the 'subdirs' component of a
-                             parsedHierarchy.
-    """
-
-    from operator import add
-    return reduce(
-        add,
-        (
-            calculate_size(parsedHierarchies[name])
-            for name in parsedHierarchies.keys()))
-
-
-def calculate_size(parsedHierarchy):
-    """
-    @param parsedHierarchy dictionary in the same format as the one
-                           operated on by insert_line
-    """
-
-    if 'subdirs' in parsedHierarchy:
-        parsedHierarchy['tree_size'] = calculate_sizes(parsedHierarchy['subdirs'])
-    elif parsedHierarchy['type'] == 'd':
-        parsedHierarchy['tree_size'] = 0
-
-    if 'tree_size' in parsedHierarchy:
-        return parsedHierarchy['tree_size']
-    else:
-        return parsedHierarchy['file_size']
-
-#-------------------------------------------------------------------------------
-
-from sys import stdout
-def write_listing_in_json(listing, writer = stdout):
-    writer.write('{"basename":"%s"' % listing['basename'])
-
-    from operator import add
-    writer.write(reduce(add, (',"%s":%s' % (key,
-                                            '"%s"' % listing[key]
-                                            if isinstance(listing[key],str)
-                                            else listing[key])
-                              for key in listing.keys() if key != 'subdirs')))
-
-    writer.write('}\n')
-
-#-------------------------------------------------------------------------------
-
-def each_listing_in_hierarchy(parsedHierarchy):
-    """
-    @param parsedHierarchy dictionary mapping filenames to
-                           parsedHierarchies. This is in the same
-                           format as the 'subdirs' component of a
-                           parsedHierarchy.
-
-    @return one record for every file listing. Every parsedHierarchy
-    will have its 'subdirs' key deleted and will consequently be flat.
-    """
-
-    if 'subdirs' in parsedHierarchy:
-        subdirs = parsedHierarchy['subdirs']
-        del parsedHierarchy['subdirs']
-        return [parsedHierarchy] + each_listing_in_subdirs(subdirs)
-    else:
-        return [parsedHierarchy]
-
-def each_listing_in_subdirs(parsedHierarchies):
-    keys = parsedHierarchies.keys()
-    keys.sort()
-    from operator import add
-
-    return reduce(add,
-                  [each_listing_in_hierarchy(parsedHierarchies[f])
-                   for f in keys])
-
-#-------------------------------------------------------------------------------
-
-def insert_line(parsedLine,
-                parsedHierarchy,
-                bucket_name,
-                prefix='/',
-                s3hdfs = False):
-    """
-    @param parsedHierarchy A parsed hierarchy is a dictionary that
-                           contains the size, date, type, path, and
-                           subdirs of a file. It has two special
-                           properties: the basename contains no /
-                           characters, and the "subdirs" points to a
-                           dictionary that maps names to
-                           parsedHierarchies underneath this one.
-    """
-
-    def insert_subdir(parsedHierarchy, subdir, bucket_name, prefix):
-        if 'subdirs' not in parsedHierarchy:
-            parsedHierarchy['subdirs'] = {}
-        if subdir not in parsedHierarchy['subdirs']:
-            parsedHierarchy['subdirs'][subdir] = {}
-            parsedHierarchy['subdirs'][subdir]['basename'] = subdir
-            parsedHierarchy['subdirs'][subdir]['file_size'] = 0
-            parsedHierarchy['subdirs'][subdir]['type'] = 'd'
-
-            prot = 's3' if s3hdfs else 's3n'
-
-            parent_url = (parsedHierarchy['_id'] if '_id' in parsedHierarchy
-                          else '%s://%s/' % (prot, bucket_name))
-
-            parsedHierarchy['subdirs'][subdir]['parent_id'] = parent_url
-
-
-            url = '%s://%s%s%s' % (prot, bucket_name, prefix, subdir)
-            parsedHierarchy['subdirs'][subdir]['_id'] = url
-
-            import hashlib
-            sha1hasher = hashlib.new('sha1')
-            sha1hasher.update(url)
-
-            parsedHierarchy['subdirs'][subdir]['uuid'] = (
-                sha1hasher.hexdigest().lower())
-
-    path = parsedLine['path']
-    # recursively insert rest of path after /
-    if path.find('/') != -1:
-        base,rest = path.split('/',1)
-
-        insert_subdir(parsedHierarchy, base, bucket_name, prefix)
-
-        parsedLine['path'] = rest
-        insert_line(parsedLine,
-                    parsedHierarchy['subdirs'][base],
-                    bucket_name,
-                    prefix + base + '/')
-
-    # insert one file or directory into "subdirs"
-    else:
-        insert_subdir(parsedHierarchy, path, bucket_name, prefix)
-
-        # This will also overwrite the default 'type':'d' from insert_subdir
-        for k in parsedLine.keys():
-            parsedHierarchy['subdirs'][path][k] = parsedLine[k]
-
-        parsedHierarchy['subdirs'][path]['basename'] = \
-            parsedHierarchy['subdirs'][path]['path']
-        del parsedHierarchy['subdirs'][path]['path']
-
-#-------------------------------------------------------------------------------
-
-def json2ls(json, writer, prefix='/'):
-    """
-    sanity check. writes json back out to the command line in ls form
-    """
-
-    from datetime import datetime
-    d =(datetime.fromtimestamp(json['datetime']).strftime("%Y-%m-%d %H:%M")
-        if 'datetime' in json else '1970-01-01 00:00')
-
-    writer.write("%s %9d %s\n" % (
-        d,
-        json['file_size'],
-        json['_id'].replace('s3n', 's3')))
-
-#-------------------------------------------------------------------------------
-
-def hdfs_parse_line(bucket_name):
-
-    import re
-
-    def line_parser(line):
-
-        components = re.compile(r"""
-
-        ^
-        (
-          [d\-]                 # directory bit
-        )
-        (?:[r\-][w\-][xs\-]){2}
-        [r\-][w\-][x\-]
-
-        [ \t]*
-
-        (?:-|[0-9]+)            # number of links. ignore.
-
-        [ \t]*
-
-        ([0-9]+)                # size
-
-        [ \t]*
-
-        (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
-
-        [ \t]*
-
-        (                       # path
-          [^ \t]
-          [^\n]*
-        )
-
-        .*
-
-        $
-
-        """, re.VERBOSE)
-
-        m = components.match(line)
-        if not m:
-            import sys
-            sys.stderr.write("couldn't parse line: %s\n" % (line))
-            return None
-
-        typ, fsize, datetime, path = m.groups()
-
-        if typ == '-': typ = 'f'
-        if path.startswith('/'): path = path[1:]
-
-        return datetime, fsize, bucket_name, path, typ
-
-    return line_parser
-
-#-------------------------------------------------------------------------------
-
-def s3_parse_line(line):
-
-    import re
-    components = re.compile(r"""
-
-    ^
-    (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
-
-    [ \t]*
-
-    ([0-9]+)
-
-    [ \t]*
-
-    (?:
-      (?:s3://)
-      ([^/]*)
-      /
-      ([^\n]*)
-    )
-
-    .*
-
-    $
-
-    """, re.VERBOSE)
-
-    m = components.match(line)
-    if not m:
-        import sys
-        sys.stderr.write("couldn't parse line: %s\n" % (line))
-        return None
-
-    datetime, fsize, bucket_name, parsed_line = m.groups()
-    typ = 'f'
-
-    return datetime, fsize, bucket_name, parsed_line, typ
-
-#-------------------------------------------------------------------------------
-
-def ls2json_subdirs(lines, line_parser):
-
-    parsedHierarchy = None
-
-    count = 0
-    for line in lines:
-        count = count + 1
-        if count % 1000 == 0:
-            logging.info("inserting line %d" % (count))
-
-        line_tuple = line_parser(line)
-
-        if not line_tuple:
-            continue
-
-        parsedLine = {}
-
-        (
-
-            parsedLine['datetime'],
-            parsedLine['file_size'],
-            bucket_name,
-            parsedLine['path'],
-            parsedLine['type']
-
-        ) = line_tuple
-
-        if not parsedHierarchy:
-            url = "s3n://%s" % (bucket_name)
-            import hashlib
-            sha1hasher = hashlib.new('sha1')
-            sha1hasher.update(url)
-
-            parsedHierarchy = {
-                bucket_name : {
-                    "subdirs" : {},
-                    "basename" : bucket_name,
-                    "_id" : url,
-                    "type" : "d",
-                    "file_size" : 0,
-                    "uuid" : sha1hasher.hexdigest(),
-                }
-            }
-
-        parsedLine['file_size'] = int(parsedLine['file_size'])
-
-        if parsedLine['datetime'] == '1970-01-01 00:00':
-            del parsedLine['datetime']
-        else:
-            from datetime import datetime
-            parsedLine['datetime'] = int(datetime.strptime(
-                parsedLine['datetime'],
-                "%Y-%m-%d %H:%M").strftime("%s"))
-
-        parsedLine['file_size'] = int(parsedLine['file_size'])
-
-        if parsedLine['path'].endswith('/'):
-            parsedLine['path'] = parsedLine['path'][:-1]
-            parsedLine['type'] = 'd'
-
-        insert_line(parsedLine,
-                    parsedHierarchy[bucket_name],
-                    bucket_name)
-
-    if not parsedHierarchy: return []
-
-    logging.info("calculating sizes")
-    calculate_sizes(parsedHierarchy)
-
-    logging.info("converting hierarchies")
-    return each_listing_in_subdirs(parsedHierarchy)
-
-#-------------------------------------------------------------------------------
-
-if __name__ == '__main__':
-
-    from optparse import OptionParser
-    parser = OptionParser(usage = "usage: %prog [options] [s3hdfs bucket name]")
-    parser.add_option("-i", "--input", dest="infile", default = None,
-                      help="input file..")
-    parser.add_option("-o", "--output", dest="outfile", default = None,
-                      help="output file.")
-    parser.add_option("-t", "--test", dest="test", default = False,
-                      action="store_true",
-                      help="reoutput in ls format. for debugging")
-
-    (options, args) = parser.parse_args()
-
-    import sys
-    if len(args) > 1:
-        parser.print_usage()
-        sys.exit(0)
-
-    if args:
-        bucket, = args
-        ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
-                                                       hdfs_parse_line(bucket))
-    else:
-        ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
-                                                       s3_parse_line)
-
-    def open_or_die(fname, flags="r"):
-        try:
-            return open(fname, flags)
-        except IOError as (errno, strerr):
-            sys.stderr.write("Couldn't open %s: %s\n" % (fname, strerr))
-            sys.exit(0)
-
-    from sys import stdin, stdout
-    instream = open_or_die(options.infile) if options.infile else stdin
-    outstream = open_or_die(options.outfile, 'w') if options.outfile else stdout
-
-    if options.test:
-        for listing in ls_converter(instream):
-            json2ls(listing, outstream)
-    else:
-        for listing in ls_converter(instream):
-            write_listing_in_json(listing, outstream)
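
parse_ls.py reads `s3cmd ls -r` output (or an HDFS-style listing when a bucket name is passed as an argument), rebuilds the directory hierarchy, sums a `tree_size` for each directory, and emits one JSON document per path. A rough sketch of the flow, with a made-up bucket and key; the exact field order and values of the output are illustrative, not canonical:

    # one line of `s3cmd ls -r s3://my-bucket/` input looks like:
    #   2013-05-01 12:34     1048576   s3://my-bucket/logs/2013/app.log
    ./parse_ls.py < my-bucket.ls > my-bucket.json
    # each output line is a JSON document along the lines of:
    #   {"basename":"app.log","file_size":1048576,"type":"f","datetime":<epoch seconds>,
    #    "parent_id":"s3n://my-bucket/logs/2013","_id":"s3n://my-bucket/logs/2013/app.log","uuid":"..."}
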
data/spec/client/itemset_legacy_spec.rb (removed)
@@ -1,55 +0,0 @@
-require 'spec_helper'
-require_relative '../../lib/vayacondios/server/legacy_switch'
-
-require 'multi_json'
-
-require_relative '../../lib/vayacondios/client/itemset'
-
-describe Vayacondios::Client::ItemSet do
-  context "after instantiation in legacy mode" do
-    itemset = Vayacondios::Client::ItemSet.new("foohost", 9999, "fooorg", "footopic", "fooid")
-    ary = ["foo", "bar", "baz"]
-
-    # testing internals here to avoid shimming up HTTP libraries.
-
-    it "generates a put request without a patch header when asked to create" do
-      Vayacondios.force_legacy_mode true
-
-      req = itemset.instance_eval{_req(:create, ary)}
-
-      req.method.should eql('PUT')
-      req.body.should eql(MultiJson.encode(ary))
-      req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
-      req.each_header.to_a.should_not include(["x_method", "PATCH"])
-    end
-
-    it "generates a put request with a patch header when asked to update" do
-      Vayacondios.force_legacy_mode true
-
-      req = itemset.instance_eval{_req(:update, ary)}
-
-      req.method.should eql('PUT')
-      req.body.should eql(MultiJson.encode(ary))
-      req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
-      req.each_header.to_a.should include(["x-method", "PATCH"])
-    end
-
-    it "generates a get request when asked to fetch" do
-      req = itemset.instance_eval{_req(:fetch)}
-
-      req.method.should eql('GET')
-      req.body.should be_nil
-      req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
-    end
-
-    it "generates a delete request when asked to remove" do
-      Vayacondios.force_legacy_mode true
-
-      req = itemset.instance_eval{_req(:remove, ary)}
-
-      req.method.should eql('DELETE')
-      req.body.should eql(MultiJson.encode(ary))
-      req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
-    end
-  end
-end
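
The removed legacy spec pins down the wire protocol the old item-set client spoke: create is a plain PUT of a JSON array to `/v1/ORG/itemset/TOPIC/ID`, update is the same PUT plus the PATCH override header the spec checks for, fetch is a GET, and remove is a DELETE carrying the items to drop. Expressed as hedged curl equivalents; the host, port, and names are the spec's fixtures, not a real deployment:

    # create: replace the whole item set
    curl -X PUT    http://foohost:9999/v1/fooorg/itemset/footopic/fooid -d '["foo","bar","baz"]'
    # update: PUT with the method-override header
    curl -X PUT    -H 'X-Method: PATCH' http://foohost:9999/v1/fooorg/itemset/footopic/fooid -d '["foo","bar","baz"]'
    # fetch the current item set
    curl -X GET    http://foohost:9999/v1/fooorg/itemset/footopic/fooid
    # remove the listed items
    curl -X DELETE http://foohost:9999/v1/fooorg/itemset/footopic/fooid -d '["foo","bar","baz"]'
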