vayacondios-server 0.2.11 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. data/.gitignore +3 -1
  2. data/.travis.yml +2 -0
  3. data/Gemfile +15 -9
  4. data/LICENSE.md +2 -6
  5. data/Procfile +1 -1
  6. data/README.md +656 -111
  7. data/Rakefile +89 -6
  8. data/bin/vcd +10 -0
  9. data/bin/vcd-server +8 -0
  10. data/config/database.yml +6 -0
  11. data/config/spec.example.yml +18 -0
  12. data/config/vayacondios.example.yml +15 -0
  13. data/config/vcd-server.rb +37 -0
  14. data/examples/configuration.rb +56 -0
  15. data/examples/event_stream.rb +19 -0
  16. data/examples/simple.rb +61 -0
  17. data/features/event.feature +319 -0
  18. data/features/events.feature +208 -0
  19. data/features/stash.feature +840 -0
  20. data/features/stashes.feature +492 -0
  21. data/features/step_definitions/stash_steps.rb +113 -0
  22. data/features/stream.feature +30 -0
  23. data/features/support/em.rb +14 -0
  24. data/features/support/env.rb +13 -0
  25. data/lib/vayacondios/configuration.rb +63 -0
  26. data/lib/vayacondios/server/api.rb +126 -0
  27. data/lib/vayacondios/server/api_options.rb +56 -0
  28. data/lib/vayacondios/server/configuration.rb +23 -0
  29. data/lib/vayacondios/server/driver.rb +71 -0
  30. data/lib/vayacondios/server/drivers/mongo.rb +126 -0
  31. data/lib/vayacondios/server/handlers/document_handler.rb +81 -0
  32. data/lib/vayacondios/server/handlers/event_handler.rb +31 -26
  33. data/lib/vayacondios/server/handlers/events_handler.rb +31 -0
  34. data/lib/vayacondios/server/handlers/stash_handler.rb +69 -0
  35. data/lib/vayacondios/server/handlers/stashes_handler.rb +49 -0
  36. data/lib/vayacondios/server/handlers/stream_handler.rb +39 -0
  37. data/lib/vayacondios/server/models/document.rb +87 -0
  38. data/lib/vayacondios/server/models/event.rb +198 -0
  39. data/lib/vayacondios/server/models/stash.rb +100 -0
  40. data/lib/vayacondios/server.rb +35 -0
  41. data/lib/vayacondios-server.rb +19 -13
  42. data/lib/vayacondios.rb +22 -0
  43. data/pom.xml +124 -4
  44. data/spec/configuration_spec.rb +41 -0
  45. data/spec/server/api_options_spec.rb +32 -0
  46. data/spec/server/api_spec.rb +279 -0
  47. data/spec/server/configuration_spec.rb +27 -0
  48. data/spec/server/drivers/mongo_spec.rb +107 -0
  49. data/spec/server/handlers/event_handler_spec.rb +62 -0
  50. data/spec/server/handlers/events_handler_spec.rb +51 -0
  51. data/spec/server/handlers/stash_handler_spec.rb +68 -0
  52. data/spec/server/handlers/stashes_handler_spec.rb +50 -0
  53. data/spec/server/handlers/stream_handler_spec.rb +5 -0
  54. data/spec/server/models/document_spec.rb +9 -0
  55. data/spec/server/models/event_spec.rb +185 -0
  56. data/spec/server/models/stash_spec.rb +95 -0
  57. data/spec/spec_helper.rb +23 -3
  58. data/spec/support/database_helper.rb +42 -0
  59. data/spec/support/log_helper.rb +19 -0
  60. data/spec/support/shared_context_for_events.rb +22 -0
  61. data/spec/support/shared_context_for_stashes.rb +24 -0
  62. data/spec/support/shared_examples_for_handlers.rb +32 -0
  63. data/src/main/java/com/infochimps/vayacondios/BaseClient.java +342 -0
  64. data/src/main/java/com/infochimps/vayacondios/HTTPClient.java +426 -0
  65. data/src/main/java/com/infochimps/vayacondios/VayacondiosClient.java +487 -65
  66. data/src/main/java/com/infochimps/vayacondios/test/IntegrationTest.java +3 -0
  67. data/src/test/java/com/infochimps/vayacondios/BaseClientTest.java +50 -0
  68. data/src/test/java/com/infochimps/vayacondios/HTTPClientIT.java +267 -0
  69. data/vayacondios-server.gemspec +9 -9
  70. metadata +127 -122
  71. checksums.yaml +0 -15
  72. data/.rspec +0 -2
  73. data/.yardopts +0 -10
  74. data/Guardfile +0 -41
  75. data/app/http_shim.rb +0 -71
  76. data/bin/vcd.sh +0 -27
  77. data/config/http_shim.rb +0 -43
  78. data/config/vayacondios.example.yaml +0 -7
  79. data/config/vayacondios.yaml +0 -7
  80. data/examples/java/ItemSetTest.java +0 -76
  81. data/lib/tasks/publish.rake +0 -23
  82. data/lib/tasks/spec.rake +0 -11
  83. data/lib/tasks/yard.rake +0 -2
  84. data/lib/vayacondios/client/config.rb +0 -7
  85. data/lib/vayacondios/client/configliere.rb +0 -38
  86. data/lib/vayacondios/client/cube_client.rb +0 -39
  87. data/lib/vayacondios/client/http_client.rb +0 -49
  88. data/lib/vayacondios/client/itemset.rb +0 -130
  89. data/lib/vayacondios/client/legacy_switch.rb +0 -43
  90. data/lib/vayacondios/client/notifier.rb +0 -123
  91. data/lib/vayacondios/client/zabbix_client.rb +0 -148
  92. data/lib/vayacondios/legacy_switch.rb +0 -43
  93. data/lib/vayacondios/server/errors/bad_request.rb +0 -6
  94. data/lib/vayacondios/server/errors/not_found.rb +0 -6
  95. data/lib/vayacondios/server/handlers/config_handler.rb +0 -32
  96. data/lib/vayacondios/server/handlers/itemset_handler.rb +0 -60
  97. data/lib/vayacondios/server/legacy_switch.rb +0 -43
  98. data/lib/vayacondios/server/model/config_document.rb +0 -89
  99. data/lib/vayacondios/server/model/document.rb +0 -25
  100. data/lib/vayacondios/server/model/event_document.rb +0 -94
  101. data/lib/vayacondios/server/model/itemset_document.rb +0 -126
  102. data/lib/vayacondios/server/rack/extract_methods.rb +0 -35
  103. data/lib/vayacondios/server/rack/jsonize.rb +0 -43
  104. data/lib/vayacondios/server/rack/params.rb +0 -50
  105. data/lib/vayacondios/server/rack/path.rb +0 -23
  106. data/lib/vayacondios/server/rack/path_validation.rb +0 -22
  107. data/lib/vayacondios/version.rb +0 -3
  108. data/lib/vayacondios-client.rb +0 -22
  109. data/scripts/hadoop_monitor/configurable.rb +0 -66
  110. data/scripts/hadoop_monitor/hadoop_attempt_scraper.rb +0 -45
  111. data/scripts/hadoop_monitor/hadoop_client.rb +0 -273
  112. data/scripts/hadoop_monitor/hadoop_monitor.rb +0 -101
  113. data/scripts/hadoop_monitor/hadoopable.rb +0 -65
  114. data/scripts/hadoop_monitor/machine_monitor.rb +0 -115
  115. data/scripts/s3_cataloger/buckets +0 -33
  116. data/scripts/s3_cataloger/foreach_bucket +0 -88
  117. data/scripts/s3_cataloger/parse_ls.py +0 -391
  118. data/spec/client/itemset_legacy_spec.rb +0 -55
  119. data/spec/client/itemset_spec.rb +0 -60
  120. data/spec/client/notifier_spec.rb +0 -120
  121. data/spec/server/config_spec.rb +0 -113
  122. data/spec/server/event_spec.rb +0 -103
  123. data/spec/server/itemset_legacy_spec.rb +0 -320
  124. data/spec/server/itemset_spec.rb +0 -317
  125. data/spec/server/rack/extract_methods_spec.rb +0 -60
  126. data/spec/server/rack/path_spec.rb +0 -36
  127. data/spec/server/rack/path_validation_spec.rb +0 -22
  128. data/spec/server/server_spec.rb +0 -20
  129. data/spec/support/mongo_cleaner.rb +0 -32
  130. data/src/main/java/ItemSetTest.java +0 -76
  131. data/src/main/java/com/infochimps/util/CurrentClass.java +0 -26
  132. data/src/main/java/com/infochimps/util/DebugUtil.java +0 -38
  133. data/src/main/java/com/infochimps/util/HttpHelper.java +0 -181
  134. data/src/main/java/com/infochimps/vayacondios/ItemSets.java +0 -373
  135. data/src/main/java/com/infochimps/vayacondios/LinkToVCD.java +0 -18
  136. data/src/main/java/com/infochimps/vayacondios/MemoryVCDShim.java +0 -84
  137. data/src/main/java/com/infochimps/vayacondios/Organization.java +0 -62
  138. data/src/main/java/com/infochimps/vayacondios/PathBuilder.java +0 -13
  139. data/src/main/java/com/infochimps/vayacondios/StandardVCDLink.java +0 -218
  140. data/src/main/java/com/infochimps/vayacondios/VCDIntegrationTest.java +0 -108
  141. data/src/test/java/com/infochimps/vayacondios/TestVayacondiosInMemory.java +0 -78
  142. data/vayacondios-client.gemspec +0 -25
@@ -1,115 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative 'configure'
4
- require 'thread'
5
- require 'socket'
6
- require 'scanf'
7
- require 'json'
8
- require 'mongo'
9
-
10
- class Vayacondios
11
-
12
- class StatServer
13
-
14
- include Configurable
15
-
16
- def initialize
17
- unless get_conf.mongo_ip
18
- raise "The IP address of the mongo server must be set!"
19
- end
20
-
21
- logger.info "Connecting to Mongo server at ip #{get_conf.mongo_ip}"
22
- conn = Mongo::Connection.new get_conf.mongo_ip
23
- logger.debug "Getting job database #{get_conf.mongo_jobs_db}"
24
- @db = conn[get_conf.mongo_jobs_db]
25
- end
26
-
27
- def run
28
-
29
- # TODO: This entire script should be replaced by calls to zabbix
30
- # initiated by the main loop of the hadoop_monitor.
31
-
32
- logger.debug "Waiting for hadoop monitor to create the event collection."
33
- sleep get_conf.sleep_seconds until
34
- @db.collection_names.index get_conf.mongo_job_events_collection
35
-
36
- job_events = @db[get_conf.mongo_job_events_collection]
37
-
38
- logger.debug "Got the event collection. Creating machine stats collection."
39
- machine_stats = @db.
40
- create_collection(get_conf.mongo_machine_stats_collection)
41
-
42
- logger.debug "Querying job_events until we see an insertion."
43
- # Keep querying the job_events collection until there's an
44
- # event. Don't just use the cursor from .find without checking,
45
- # because if hadoop_monitor inserts an event into an empty
46
- # database, this cursor will no longer work, even if it's
47
- # tailable. not quite sure why Mongo does it that way.
48
- events = job_events.find
49
- events.add_option 0x02 # tailable
50
- until events.has_next?
51
- sleep get_conf.sleep_seconds
52
- events = job_events.find
53
- events.add_option 0x02 # tailable
54
- end
55
-
56
- logger.debug "Priming main event loop. Waiting to see if the cluster is busy."
57
-
58
- # Get up-to-date on the state of the cluster. assume quiet to start.
59
- cluster_busy = self.class.next_state(events, false, get_conf.event)
60
-
61
- # main loop
62
- loop do
63
-
64
- logger.debug "In main event loop. Waiting to see if the cluster is busy."
65
-
66
- # Get up-to-date on the state of the cluster.
67
- cluster_busy = self.class.next_state(events, cluster_busy, get_conf.event)
68
-
69
- # Don't grab stats unless the cluster is busy
70
- unless cluster_busy
71
- sleep get_conf.sleep_seconds
72
- next
73
- end
74
-
75
- logger.debug "Grabbing stats and pushing them into the collection."
76
-
77
- # Grab the stats!
78
- # ifstat's delay will function as our heartbeat timer.
79
- is, ignore, rw = `ifstat 1 1`.split("\n").map(&:split)
80
- headers, *disks = `iostat -x`.split("\n")[5..-1].map(&:split)
81
- cpu, mem, swap, proc_headers, *procs = `top -b -n 1`.
82
- split("\n").map(&:strip).select{|x| not x.empty?}[2..-1]
83
-
84
- # Write the stats into the mongo collection.
85
- machine_stats.insert(
86
- :net => Hash[is.zip(rw.each_slice(2).map{|r,w| {:r => r, :w => w}})],
87
- :disk => Hash[disks.map{|d| [d.first, Hash[headers.zip(d)]]}],
88
- :cpu => self.class.split_top_stats(cpu),
89
- :mem => self.class.split_top_stats(mem),
90
- :swap => self.class.split_top_stats(swap))
91
- end
92
- end
93
-
94
- private
95
-
96
- def self.split_top_stats line
97
- Hash[line.split(':', 2).last.split(',').map(&:strip).map do |stat|
98
- stat.scanf("%f%*c%s").reverse
99
- end]
100
- end
101
-
102
- def self.next_state events_cursor, current_state, event_attr_name
103
- while current_event = events_cursor.next
104
- current_state = case current_event[event_attr_name]
105
- when CLUSTER_BUSY then true
106
- when CLUSTER_QUIET then false
107
- else current_state
108
- end
109
- end
110
- current_state
111
- end
112
- end
113
- end
114
-
115
- Vayacondios::StatServer.new.run
@@ -1,33 +0,0 @@
1
- export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
2
-
3
- bdump_and_bparse() {
4
- bucket_name=$1
5
- bdump $1 ; bparse $1
6
- }
7
-
8
- bparse_and_bload() {
9
- bucket_name=$1
10
- bparse $1 ; bload "$@"
11
- }
12
-
13
- bdump_and_bload() {
14
- bucket_name=$1
15
- bdump $1; bparse $1 ; bload $1
16
- }
17
-
18
- bdump() {
19
- bucket_name=$1
20
- s3cmd ls -r s3://$bucket_name/ >$bucket_name.ls
21
- }
22
-
23
- bparse() {
24
- bucket_name=$1
25
- $dir/parse_ls.py <$bucket_name.ls >$bucket_name.json
26
- }
27
-
28
- bload() {
29
- bucket_name=$1
30
- db=$2
31
- collection=$3
32
- mongoimport -d $db -c $collection $bucket_name.json
33
- }
@@ -1,88 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4
-
5
- . $dir/buckets
6
-
7
- case $1 in
8
- -f)
9
- bucket_file=$2
10
- shift 2
11
- ;;
12
- -h|--help)
13
- cat <<EOF
14
- foreach_bucket [OPTIONS] COMMAND [ARGUMENTS]
15
-
16
- This script is used to do a recursive listing of an s3 bucket using
17
- the s3cmd and then jsonify the output. It runs the COMMAND on the
18
- buckets specified in a file; on standard input; or, by default, on all
19
- buckets that can be seen by s3cmd.
20
-
21
- OPTIONS include the following:
22
-
23
- -f BUCKET_FILE file containing a bucket name on each line. If
24
- this is set to '-', then buckets are read from
25
- standard input.
26
-
27
- COMMAND includes anything in the 'buckets' script. The main commands
28
- are the following:
29
-
30
- bdump dumps BUCKET to a file BUCKET.ls in the current
31
- working directory
32
-
33
- bparse runs BUCKET.ls through a parser to jsonify it and
34
- outputs the result as BUCKET.json
35
-
36
- bload loads BUCKET.json into a mongo database. The first
37
- argument passed to this command specifies the
38
- mongo database, while the second specifies the
39
- collection.
40
- EOF
41
- exit 0
42
- ;;
43
- -*)
44
- echo "Invalid option: $1"
45
- exit 1
46
- ;;
47
- esac
48
-
49
- command=$1
50
- shift
51
-
52
- buckets=()
53
-
54
- ## no bucket file specified read all s3 buckets
55
- if [[ -z $bucket_file ]]
56
- then
57
- for bucket in `s3cmd ls | cut -d ' ' -f 4 | cut -d / -f 3`
58
- do
59
- buckets=("${buckets[@]}" "$bucket")
60
- done
61
-
62
- ## read buckets from standard input
63
- elif [[ $bucket_file == "-" ]]
64
- then
65
- read bucket
66
- until [[ $? -eq 1 ]]
67
- do
68
- buckets=("${buckets[@]}" "$bucket")
69
- read bucket
70
- done
71
-
72
- ## read from bucket_file
73
- else
74
- tmpIFS=$IFS
75
- IFS=$'\n'
76
-
77
- for bucket in `cat $bucket_file`
78
- do
79
- buckets=("${buckets[@]}" "$bucket")
80
- done
81
-
82
- IFS=$tmpIFS
83
- fi
84
-
85
- for bucket in "${buckets[@]}"
86
- do
87
- ($command $bucket "$@")&
88
- done
@@ -1,391 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- import logging
4
- import sys
5
-
6
- # crank this down to info for progress messages. can also use
7
- # "filename=" for that kind of thing. The only reason this is stderr
8
- # is to allow for output redirection.
9
- logging.basicConfig(stream=sys.stderr, level=logging.ERROR)
10
-
11
- #-------------------------------------------------------------------------------
12
-
13
- def calculate_sizes(parsedHierarchies):
14
- """
15
- @param parsedHierarchies dictionary mapping filenames to
16
- parsedHierarchies. This is in the same
17
- format as the 'subdirs' component of a
18
- parsedHierarchy.
19
- """
20
-
21
- from operator import add
22
- return reduce(
23
- add,
24
- (
25
- calculate_size(parsedHierarchies[name])
26
- for name in parsedHierarchies.keys()))
27
-
28
-
29
- def calculate_size(parsedHierarchy):
30
- """
31
- @param parsedHierarchy dictionary in the same format as the one
32
- operated on by insert_line
33
- """
34
-
35
- if 'subdirs' in parsedHierarchy:
36
- parsedHierarchy['tree_size'] = calculate_sizes(parsedHierarchy['subdirs'])
37
- elif parsedHierarchy['type'] == 'd':
38
- parsedHierarchy['tree_size'] = 0
39
-
40
- if 'tree_size' in parsedHierarchy:
41
- return parsedHierarchy['tree_size']
42
- else:
43
- return parsedHierarchy['file_size']
44
-
45
- #-------------------------------------------------------------------------------
46
-
47
- from sys import stdout
48
- def write_listing_in_json(listing, writer = stdout):
49
- writer.write('{"basename":"%s"' % listing['basename'])
50
-
51
- from operator import add
52
- writer.write(reduce(add, (',"%s":%s' % (key,
53
- '"%s"' % listing[key]
54
- if isinstance(listing[key],str)
55
- else listing[key])
56
- for key in listing.keys() if key != 'subdirs')))
57
-
58
- writer.write('}\n')
59
-
60
- #-------------------------------------------------------------------------------
61
-
62
- def each_listing_in_hierarchy(parsedHierarchy):
63
- """
64
- @param parsedHierarchy dictionary mapping filenames to
65
- parsedHierarchies. This is in the same
66
- format as the 'subdirs' component of a
67
- parsedHierarchy.
68
-
69
- @return one record for every file listing. Every parsedHierarchy
70
- will have its 'subdirs' key deleted and will consequently be flat.
71
- """
72
-
73
- if 'subdirs' in parsedHierarchy:
74
- subdirs = parsedHierarchy['subdirs']
75
- del parsedHierarchy['subdirs']
76
- return [parsedHierarchy] + each_listing_in_subdirs(subdirs)
77
- else:
78
- return [parsedHierarchy]
79
-
80
- def each_listing_in_subdirs(parsedHierarchies):
81
- keys = parsedHierarchies.keys()
82
- keys.sort()
83
- from operator import add
84
-
85
- return reduce(add,
86
- [each_listing_in_hierarchy(parsedHierarchies[f])
87
- for f in keys])
88
-
89
- #-------------------------------------------------------------------------------
90
-
91
- def insert_line(parsedLine,
92
- parsedHierarchy,
93
- bucket_name,
94
- prefix='/',
95
- s3hdfs = False):
96
- """
97
- @param parsedHierarchy A parsed hierarchy is a dictionary that
98
- contains the size, date, type, path, and
99
- subdirs of a file. It has two special
100
- properties: the basename contains no /
101
- characters, and the "subdirs" points to a
102
- dictionary that maps names to
103
- parsedHierarchies underneath this one.
104
- """
105
-
106
- def insert_subdir(parsedHierarchy, subdir, bucket_name, prefix):
107
- if 'subdirs' not in parsedHierarchy:
108
- parsedHierarchy['subdirs'] = {}
109
- if subdir not in parsedHierarchy['subdirs']:
110
- parsedHierarchy['subdirs'][subdir] = {}
111
- parsedHierarchy['subdirs'][subdir]['basename'] = subdir
112
- parsedHierarchy['subdirs'][subdir]['file_size'] = 0
113
- parsedHierarchy['subdirs'][subdir]['type'] = 'd'
114
-
115
- prot = 's3' if s3hdfs else 's3n'
116
-
117
- parent_url = (parsedHierarchy['_id'] if '_id' in parsedHierarchy
118
- else '%s://%s/' % (prot, bucket_name))
119
-
120
- parsedHierarchy['subdirs'][subdir]['parent_id'] = parent_url
121
-
122
-
123
- url = '%s://%s%s%s' % (prot, bucket_name, prefix, subdir)
124
- parsedHierarchy['subdirs'][subdir]['_id'] = url
125
-
126
- import hashlib
127
- sha1hasher = hashlib.new('sha1')
128
- sha1hasher.update(url)
129
-
130
- parsedHierarchy['subdirs'][subdir]['uuid'] = (
131
- sha1hasher.hexdigest().lower())
132
-
133
- path = parsedLine['path']
134
- # recursively insert rest of path after /
135
- if path.find('/') != -1:
136
- base,rest = path.split('/',1)
137
-
138
- insert_subdir(parsedHierarchy, base, bucket_name, prefix)
139
-
140
- parsedLine['path'] = rest
141
- insert_line(parsedLine,
142
- parsedHierarchy['subdirs'][base],
143
- bucket_name,
144
- prefix + base + '/')
145
-
146
- # insert one file or directory into "subdirs"
147
- else:
148
- insert_subdir(parsedHierarchy, path, bucket_name, prefix)
149
-
150
- # This will also overwrite the default 'type':'d' from insert_subdir
151
- for k in parsedLine.keys():
152
- parsedHierarchy['subdirs'][path][k] = parsedLine[k]
153
-
154
- parsedHierarchy['subdirs'][path]['basename'] = \
155
- parsedHierarchy['subdirs'][path]['path']
156
- del parsedHierarchy['subdirs'][path]['path']
157
-
158
- #-------------------------------------------------------------------------------
159
-
160
- def json2ls(json, writer, prefix='/'):
161
- """
162
- sanity check. writes json back out to the command line in ls form
163
- """
164
-
165
- from datetime import datetime
166
- d =(datetime.fromtimestamp(json['datetime']).strftime("%Y-%m-%d %H:%M")
167
- if 'datetime' in json else '1970-01-01 00:00')
168
-
169
- writer.write("%s %9d %s\n" % (
170
- d,
171
- json['file_size'],
172
- json['_id'].replace('s3n', 's3')))
173
-
174
- #-------------------------------------------------------------------------------
175
-
176
- def hdfs_parse_line(bucket_name):
177
-
178
- import re
179
-
180
- def line_parser(line):
181
-
182
- components = re.compile(r"""
183
-
184
- ^
185
- (
186
- [d\-] # directory bit
187
- )
188
- (?:[r\-][w\-][xs\-]){2}
189
- [r\-][w\-][x\-]
190
-
191
- [ \t]*
192
-
193
- (?:-|[0-9]+) # number of links. ignore.
194
-
195
- [ \t]*
196
-
197
- ([0-9]+) # size
198
-
199
- [ \t]*
200
-
201
- (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
202
-
203
- [ \t]*
204
-
205
- ( # path
206
- [^ \t]
207
- [^\n]*
208
- )
209
-
210
- .*
211
-
212
- $
213
-
214
- """, re.VERBOSE)
215
-
216
- m = components.match(line)
217
- if not m:
218
- import sys
219
- sys.stderr.write("couldn't parse line: %s\n" % (line))
220
- return None
221
-
222
- typ, fsize, datetime, path = m.groups()
223
-
224
- if typ == '-': typ = 'f'
225
- if path.startswith('/'): path = path[1:]
226
-
227
- return datetime, fsize, bucket_name, path, typ
228
-
229
- return line_parser
230
-
231
- #-------------------------------------------------------------------------------
232
-
233
- def s3_parse_line(line):
234
-
235
- import re
236
- components = re.compile(r"""
237
-
238
- ^
239
- (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
240
-
241
- [ \t]*
242
-
243
- ([0-9]+)
244
-
245
- [ \t]*
246
-
247
- (?:
248
- (?:s3://)
249
- ([^/]*)
250
- /
251
- ([^\n]*)
252
- )
253
-
254
- .*
255
-
256
- $
257
-
258
- """, re.VERBOSE)
259
-
260
- m = components.match(line)
261
- if not m:
262
- import sys
263
- sys.stderr.write("couldn't parse line: %s\n" % (line))
264
- return None
265
-
266
- datetime, fsize, bucket_name, parsed_line = m.groups()
267
- typ = 'f'
268
-
269
- return datetime, fsize, bucket_name, parsed_line, typ
270
-
271
- #-------------------------------------------------------------------------------
272
-
273
- def ls2json_subdirs(lines, line_parser):
274
-
275
- parsedHierarchy = None
276
-
277
- count = 0
278
- for line in lines:
279
- count = count + 1
280
- if count % 1000 == 0:
281
- logging.info("inserting line %d" % (count))
282
-
283
- line_tuple = line_parser(line)
284
-
285
- if not line_tuple:
286
- continue
287
-
288
- parsedLine = {}
289
-
290
- (
291
-
292
- parsedLine['datetime'],
293
- parsedLine['file_size'],
294
- bucket_name,
295
- parsedLine['path'],
296
- parsedLine['type']
297
-
298
- ) = line_tuple
299
-
300
- if not parsedHierarchy:
301
- url = "s3n://%s" % (bucket_name)
302
- import hashlib
303
- sha1hasher = hashlib.new('sha1')
304
- sha1hasher.update(url)
305
-
306
- parsedHierarchy = {
307
- bucket_name : {
308
- "subdirs" : {},
309
- "basename" : bucket_name,
310
- "_id" : url,
311
- "type" : "d",
312
- "file_size" : 0,
313
- "uuid" : sha1hasher.hexdigest(),
314
- }
315
- }
316
-
317
- parsedLine['file_size'] = int(parsedLine['file_size'])
318
-
319
- if parsedLine['datetime'] == '1970-01-01 00:00':
320
- del parsedLine['datetime']
321
- else:
322
- from datetime import datetime
323
- parsedLine['datetime'] = int(datetime.strptime(
324
- parsedLine['datetime'],
325
- "%Y-%m-%d %H:%M").strftime("%s"))
326
-
327
- parsedLine['file_size'] = int(parsedLine['file_size'])
328
-
329
- if parsedLine['path'].endswith('/'):
330
- parsedLine['path'] = parsedLine['path'][:-1]
331
- parsedLine['type'] = 'd'
332
-
333
- insert_line(parsedLine,
334
- parsedHierarchy[bucket_name],
335
- bucket_name)
336
-
337
- if not parsedHierarchy: return []
338
-
339
- logging.info("calculating sizes")
340
- calculate_sizes(parsedHierarchy)
341
-
342
- logging.info("converting hierarchies")
343
- return each_listing_in_subdirs(parsedHierarchy)
344
-
345
- #-------------------------------------------------------------------------------
346
-
347
- if __name__ == '__main__':
348
-
349
- from optparse import OptionParser
350
- parser = OptionParser(usage = "usage: %prog [options] [s3hdfs bucket name]")
351
- parser.add_option("-i", "--input", dest="infile", default = None,
352
- help="input file..")
353
- parser.add_option("-o", "--output", dest="outfile", default = None,
354
- help="output file.")
355
- parser.add_option("-t", "--test", dest="test", default = False,
356
- action="store_true",
357
- help="reoutput in ls format. for debugging")
358
-
359
- (options, args) = parser.parse_args()
360
-
361
- import sys
362
- if len(args) > 1:
363
- parser.print_usage()
364
- sys.exit(0)
365
-
366
- if args:
367
- bucket, = args
368
- ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
369
- hdfs_parse_line(bucket))
370
- else:
371
- ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
372
- s3_parse_line)
373
-
374
- def open_or_die(fname, flags="r"):
375
- try:
376
- return open(fname, flags)
377
- except IOError as (errno, strerr):
378
- sys.stderr.write("Couldn't open %s: %s\n" % (fname, strerr))
379
- sys.exit(0)
380
-
381
- from sys import stdin, stdout
382
- instream = open_or_die(options.infile) if options.infile else stdin
383
- outstream = open_or_die(options.outfile, 'w') if options.outfile else stdout
384
-
385
- if options.test:
386
- for listing in ls_converter(instream):
387
- json2ls(listing, outstream)
388
- else:
389
- for listing in ls_converter(instream):
390
- write_listing_in_json(listing, outstream)
391
-
@@ -1,55 +0,0 @@
1
- require 'spec_helper'
2
- require_relative '../../lib/vayacondios/server/legacy_switch'
3
-
4
- require 'multi_json'
5
-
6
- require_relative '../../lib/vayacondios/client/itemset'
7
-
8
- describe Vayacondios::Client::ItemSet do
9
- context "after instantiation in legacy mode" do
10
- itemset = Vayacondios::Client::ItemSet.new("foohost", 9999, "fooorg", "footopic", "fooid")
11
- ary = ["foo", "bar", "baz"]
12
-
13
- # testing internals here to avoid shimming up HTTP libraries.
14
-
15
- it "generates a put request without a patch header when asked to create" do
16
- Vayacondios.force_legacy_mode true
17
-
18
- req = itemset.instance_eval{_req(:create, ary)}
19
-
20
- req.method.should eql('PUT')
21
- req.body.should eql(MultiJson.encode(ary))
22
- req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
23
- req.each_header.to_a.should_not include(["x_method", "PATCH"])
24
- end
25
-
26
- it "generates a put request with a patch header when asked to update" do
27
- Vayacondios.force_legacy_mode true
28
-
29
- req = itemset.instance_eval{_req(:update, ary)}
30
-
31
- req.method.should eql('PUT')
32
- req.body.should eql(MultiJson.encode(ary))
33
- req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
34
- req.each_header.to_a.should include(["x-method", "PATCH"])
35
- end
36
-
37
- it "generates a get request when asked to fetch" do
38
- req = itemset.instance_eval{_req(:fetch)}
39
-
40
- req.method.should eql('GET')
41
- req.body.should be_nil
42
- req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
43
- end
44
-
45
- it "generates a delete request when asked to remove" do
46
- Vayacondios.force_legacy_mode true
47
-
48
- req = itemset.instance_eval{_req(:remove, ary)}
49
-
50
- req.method.should eql('DELETE')
51
- req.body.should eql(MultiJson.encode(ary))
52
- req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
53
- end
54
- end
55
- end