vayacondios-server 0.2.11 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. data/.gitignore +3 -1
  2. data/.travis.yml +2 -0
  3. data/Gemfile +15 -9
  4. data/LICENSE.md +2 -6
  5. data/Procfile +1 -1
  6. data/README.md +656 -111
  7. data/Rakefile +89 -6
  8. data/bin/vcd +10 -0
  9. data/bin/vcd-server +8 -0
  10. data/config/database.yml +6 -0
  11. data/config/spec.example.yml +18 -0
  12. data/config/vayacondios.example.yml +15 -0
  13. data/config/vcd-server.rb +37 -0
  14. data/examples/configuration.rb +56 -0
  15. data/examples/event_stream.rb +19 -0
  16. data/examples/simple.rb +61 -0
  17. data/features/event.feature +319 -0
  18. data/features/events.feature +208 -0
  19. data/features/stash.feature +840 -0
  20. data/features/stashes.feature +492 -0
  21. data/features/step_definitions/stash_steps.rb +113 -0
  22. data/features/stream.feature +30 -0
  23. data/features/support/em.rb +14 -0
  24. data/features/support/env.rb +13 -0
  25. data/lib/vayacondios/configuration.rb +63 -0
  26. data/lib/vayacondios/server/api.rb +126 -0
  27. data/lib/vayacondios/server/api_options.rb +56 -0
  28. data/lib/vayacondios/server/configuration.rb +23 -0
  29. data/lib/vayacondios/server/driver.rb +71 -0
  30. data/lib/vayacondios/server/drivers/mongo.rb +126 -0
  31. data/lib/vayacondios/server/handlers/document_handler.rb +81 -0
  32. data/lib/vayacondios/server/handlers/event_handler.rb +31 -26
  33. data/lib/vayacondios/server/handlers/events_handler.rb +31 -0
  34. data/lib/vayacondios/server/handlers/stash_handler.rb +69 -0
  35. data/lib/vayacondios/server/handlers/stashes_handler.rb +49 -0
  36. data/lib/vayacondios/server/handlers/stream_handler.rb +39 -0
  37. data/lib/vayacondios/server/models/document.rb +87 -0
  38. data/lib/vayacondios/server/models/event.rb +198 -0
  39. data/lib/vayacondios/server/models/stash.rb +100 -0
  40. data/lib/vayacondios/server.rb +35 -0
  41. data/lib/vayacondios-server.rb +19 -13
  42. data/lib/vayacondios.rb +22 -0
  43. data/pom.xml +124 -4
  44. data/spec/configuration_spec.rb +41 -0
  45. data/spec/server/api_options_spec.rb +32 -0
  46. data/spec/server/api_spec.rb +279 -0
  47. data/spec/server/configuration_spec.rb +27 -0
  48. data/spec/server/drivers/mongo_spec.rb +107 -0
  49. data/spec/server/handlers/event_handler_spec.rb +62 -0
  50. data/spec/server/handlers/events_handler_spec.rb +51 -0
  51. data/spec/server/handlers/stash_handler_spec.rb +68 -0
  52. data/spec/server/handlers/stashes_handler_spec.rb +50 -0
  53. data/spec/server/handlers/stream_handler_spec.rb +5 -0
  54. data/spec/server/models/document_spec.rb +9 -0
  55. data/spec/server/models/event_spec.rb +185 -0
  56. data/spec/server/models/stash_spec.rb +95 -0
  57. data/spec/spec_helper.rb +23 -3
  58. data/spec/support/database_helper.rb +42 -0
  59. data/spec/support/log_helper.rb +19 -0
  60. data/spec/support/shared_context_for_events.rb +22 -0
  61. data/spec/support/shared_context_for_stashes.rb +24 -0
  62. data/spec/support/shared_examples_for_handlers.rb +32 -0
  63. data/src/main/java/com/infochimps/vayacondios/BaseClient.java +342 -0
  64. data/src/main/java/com/infochimps/vayacondios/HTTPClient.java +426 -0
  65. data/src/main/java/com/infochimps/vayacondios/VayacondiosClient.java +487 -65
  66. data/src/main/java/com/infochimps/vayacondios/test/IntegrationTest.java +3 -0
  67. data/src/test/java/com/infochimps/vayacondios/BaseClientTest.java +50 -0
  68. data/src/test/java/com/infochimps/vayacondios/HTTPClientIT.java +267 -0
  69. data/vayacondios-server.gemspec +9 -9
  70. metadata +127 -122
  71. checksums.yaml +0 -15
  72. data/.rspec +0 -2
  73. data/.yardopts +0 -10
  74. data/Guardfile +0 -41
  75. data/app/http_shim.rb +0 -71
  76. data/bin/vcd.sh +0 -27
  77. data/config/http_shim.rb +0 -43
  78. data/config/vayacondios.example.yaml +0 -7
  79. data/config/vayacondios.yaml +0 -7
  80. data/examples/java/ItemSetTest.java +0 -76
  81. data/lib/tasks/publish.rake +0 -23
  82. data/lib/tasks/spec.rake +0 -11
  83. data/lib/tasks/yard.rake +0 -2
  84. data/lib/vayacondios/client/config.rb +0 -7
  85. data/lib/vayacondios/client/configliere.rb +0 -38
  86. data/lib/vayacondios/client/cube_client.rb +0 -39
  87. data/lib/vayacondios/client/http_client.rb +0 -49
  88. data/lib/vayacondios/client/itemset.rb +0 -130
  89. data/lib/vayacondios/client/legacy_switch.rb +0 -43
  90. data/lib/vayacondios/client/notifier.rb +0 -123
  91. data/lib/vayacondios/client/zabbix_client.rb +0 -148
  92. data/lib/vayacondios/legacy_switch.rb +0 -43
  93. data/lib/vayacondios/server/errors/bad_request.rb +0 -6
  94. data/lib/vayacondios/server/errors/not_found.rb +0 -6
  95. data/lib/vayacondios/server/handlers/config_handler.rb +0 -32
  96. data/lib/vayacondios/server/handlers/itemset_handler.rb +0 -60
  97. data/lib/vayacondios/server/legacy_switch.rb +0 -43
  98. data/lib/vayacondios/server/model/config_document.rb +0 -89
  99. data/lib/vayacondios/server/model/document.rb +0 -25
  100. data/lib/vayacondios/server/model/event_document.rb +0 -94
  101. data/lib/vayacondios/server/model/itemset_document.rb +0 -126
  102. data/lib/vayacondios/server/rack/extract_methods.rb +0 -35
  103. data/lib/vayacondios/server/rack/jsonize.rb +0 -43
  104. data/lib/vayacondios/server/rack/params.rb +0 -50
  105. data/lib/vayacondios/server/rack/path.rb +0 -23
  106. data/lib/vayacondios/server/rack/path_validation.rb +0 -22
  107. data/lib/vayacondios/version.rb +0 -3
  108. data/lib/vayacondios-client.rb +0 -22
  109. data/scripts/hadoop_monitor/configurable.rb +0 -66
  110. data/scripts/hadoop_monitor/hadoop_attempt_scraper.rb +0 -45
  111. data/scripts/hadoop_monitor/hadoop_client.rb +0 -273
  112. data/scripts/hadoop_monitor/hadoop_monitor.rb +0 -101
  113. data/scripts/hadoop_monitor/hadoopable.rb +0 -65
  114. data/scripts/hadoop_monitor/machine_monitor.rb +0 -115
  115. data/scripts/s3_cataloger/buckets +0 -33
  116. data/scripts/s3_cataloger/foreach_bucket +0 -88
  117. data/scripts/s3_cataloger/parse_ls.py +0 -391
  118. data/spec/client/itemset_legacy_spec.rb +0 -55
  119. data/spec/client/itemset_spec.rb +0 -60
  120. data/spec/client/notifier_spec.rb +0 -120
  121. data/spec/server/config_spec.rb +0 -113
  122. data/spec/server/event_spec.rb +0 -103
  123. data/spec/server/itemset_legacy_spec.rb +0 -320
  124. data/spec/server/itemset_spec.rb +0 -317
  125. data/spec/server/rack/extract_methods_spec.rb +0 -60
  126. data/spec/server/rack/path_spec.rb +0 -36
  127. data/spec/server/rack/path_validation_spec.rb +0 -22
  128. data/spec/server/server_spec.rb +0 -20
  129. data/spec/support/mongo_cleaner.rb +0 -32
  130. data/src/main/java/ItemSetTest.java +0 -76
  131. data/src/main/java/com/infochimps/util/CurrentClass.java +0 -26
  132. data/src/main/java/com/infochimps/util/DebugUtil.java +0 -38
  133. data/src/main/java/com/infochimps/util/HttpHelper.java +0 -181
  134. data/src/main/java/com/infochimps/vayacondios/ItemSets.java +0 -373
  135. data/src/main/java/com/infochimps/vayacondios/LinkToVCD.java +0 -18
  136. data/src/main/java/com/infochimps/vayacondios/MemoryVCDShim.java +0 -84
  137. data/src/main/java/com/infochimps/vayacondios/Organization.java +0 -62
  138. data/src/main/java/com/infochimps/vayacondios/PathBuilder.java +0 -13
  139. data/src/main/java/com/infochimps/vayacondios/StandardVCDLink.java +0 -218
  140. data/src/main/java/com/infochimps/vayacondios/VCDIntegrationTest.java +0 -108
  141. data/src/test/java/com/infochimps/vayacondios/TestVayacondiosInMemory.java +0 -78
  142. data/vayacondios-client.gemspec +0 -25
@@ -1,115 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require_relative 'configure'
4
- require 'thread'
5
- require 'socket'
6
- require 'scanf'
7
- require 'json'
8
- require 'mongo'
9
-
10
- class Vayacondios
11
-
12
- class StatServer
13
-
14
- include Configurable
15
-
16
- def initialize
17
- unless get_conf.mongo_ip
18
- raise "The IP address of the mongo server must be set!"
19
- end
20
-
21
- logger.info "Connecting to Mongo server at ip #{get_conf.mongo_ip}"
22
- conn = Mongo::Connection.new get_conf.mongo_ip
23
- logger.debug "Getting job database #{get_conf.mongo_jobs_db}"
24
- @db = conn[get_conf.mongo_jobs_db]
25
- end
26
-
27
- def run
28
-
29
- # TODO: This entire script should be replaced by calls to zabbix
30
- # initiated by the main loop of the hadoop_monitor.
31
-
32
- logger.debug "Waiting for hadoop monitor to create the event collection."
33
- sleep get_conf.sleep_seconds until
34
- @db.collection_names.index get_conf.mongo_job_events_collection
35
-
36
- job_events = @db[get_conf.mongo_job_events_collection]
37
-
38
- logger.debug "Got the event collection. Creating machine stats collection."
39
- machine_stats = @db.
40
- create_collection(get_conf.mongo_machine_stats_collection)
41
-
42
- logger.debug "Querying job_events until we see an insertion."
43
- # Keep querying the job_events collection until there's an
44
- # event. Don't just use the cursor from .find without checking,
45
- # because if hadoop_monitor inserts an event into an empty
46
- # database, this cursor will no longer work, even if it's
47
- # tailable. not quite sure why Mongo does it that way.
48
- events = job_events.find
49
- events.add_option 0x02 # tailable
50
- until events.has_next?
51
- sleep get_conf.sleep_seconds
52
- events = job_events.find
53
- events.add_option 0x02 # tailable
54
- end
55
-
56
- logger.debug "Priming main event loop. Waiting to see if the cluster is busy."
57
-
58
- # Get up-to-date on the state of the cluster. assume quiet to start.
59
- cluster_busy = self.class.next_state(events, false, get_conf.event)
60
-
61
- # main loop
62
- loop do
63
-
64
- logger.debug "In main event loop. Waiting to see if the cluster is busy."
65
-
66
- # Get up-to-date on the state of the cluster.
67
- cluster_busy = self.class.next_state(events, cluster_busy, get_conf.event)
68
-
69
- # Don't grab stats unless the cluster is busy
70
- unless cluster_busy
71
- sleep get_conf.sleep_seconds
72
- next
73
- end
74
-
75
- logger.debug "Grabbing stats and pushing them into the collection."
76
-
77
- # Grab the stats!
78
- # ifstat's delay will function as our heartbeat timer.
79
- is, ignore, rw = `ifstat 1 1`.split("\n").map(&:split)
80
- headers, *disks = `iostat -x`.split("\n")[5..-1].map(&:split)
81
- cpu, mem, swap, proc_headers, *procs = `top -b -n 1`.
82
- split("\n").map(&:strip).select{|x| not x.empty?}[2..-1]
83
-
84
- # Write the stats into the mongo collection.
85
- machine_stats.insert(
86
- :net => Hash[is.zip(rw.each_slice(2).map{|r,w| {:r => r, :w => w}})],
87
- :disk => Hash[disks.map{|d| [d.first, Hash[headers.zip(d)]]}],
88
- :cpu => self.class.split_top_stats(cpu),
89
- :mem => self.class.split_top_stats(mem),
90
- :swap => self.class.split_top_stats(swap))
91
- end
92
- end
93
-
94
- private
95
-
96
- def self.split_top_stats line
97
- Hash[line.split(':', 2).last.split(',').map(&:strip).map do |stat|
98
- stat.scanf("%f%*c%s").reverse
99
- end]
100
- end
101
-
102
- def self.next_state events_cursor, current_state, event_attr_name
103
- while current_event = events_cursor.next
104
- current_state = case current_event[event_attr_name]
105
- when CLUSTER_BUSY then true
106
- when CLUSTER_QUIET then false
107
- else current_state
108
- end
109
- end
110
- current_state
111
- end
112
- end
113
- end
114
-
115
- Vayacondios::StatServer.new.run
@@ -1,33 +0,0 @@
1
- export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
2
-
3
- bdump_and_bparse() {
4
- bucket_name=$1
5
- bdump $1 ; bparse $1
6
- }
7
-
8
- bparse_and_bload() {
9
- bucket_name=$1
10
- bparse $1 ; bload "$@"
11
- }
12
-
13
- bdump_and_bload() {
14
- bucket_name=$1
15
- bdump $1; bparse $1 ; bload $1
16
- }
17
-
18
- bdump() {
19
- bucket_name=$1
20
- s3cmd ls -r s3://$bucket_name/ >$bucket_name.ls
21
- }
22
-
23
- bparse() {
24
- bucket_name=$1
25
- $dir/parse_ls.py <$bucket_name.ls >$bucket_name.json
26
- }
27
-
28
- bload() {
29
- bucket_name=$1
30
- db=$2
31
- collection=$3
32
- mongoimport -d $db -c $collection $bucket_name.json
33
- }
@@ -1,88 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- export dir="$( cd -P "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4
-
5
- . $dir/buckets
6
-
7
- case $1 in
8
- -f)
9
- bucket_file=$2
10
- shift 2
11
- ;;
12
- -h|--help)
13
- cat <<EOF
14
- foreach_bucket [OPTIONS] COMMAND [ARGUMENTS]
15
-
16
- This script is used to do a recursive listing of an s3 bucket using
17
- the s3cmd and then jsonify the output. It runs the COMMAND on the
18
- buckets specified in a file; on standard input; or, by default, on all
19
- buckets that can be seen by s3cmd.
20
-
21
- OPTIONS include the following:
22
-
23
- -f BUCKET_FILE file containing a bucket name on each line. If
24
- this is set to '-', then buckets are read from
25
- standard input.
26
-
27
- COMMAND includes anything in the 'buckets' script. The main commands
28
- are the following:
29
-
30
- bdump dumps BUCKET to a file BUCKET.ls in the current
31
- working directory
32
-
33
- bparse runs BUCKET.ls through a parser to jsonify it and
34
- outputs the result as BUCKET.json
35
-
36
- bload loads BUCKET.json into a mongo database. The first
37
- argument passed to this command specifies the
38
- mongo database, while the second specifies the
39
- collection.
40
- EOF
41
- exit 0
42
- ;;
43
- -*)
44
- echo "Invalid option: $1"
45
- exit 1
46
- ;;
47
- esac
48
-
49
- command=$1
50
- shift
51
-
52
- buckets=()
53
-
54
- ## no bucket file specified read all s3 buckets
55
- if [[ -z $bucket_file ]]
56
- then
57
- for bucket in `s3cmd ls | cut -d ' ' -f 4 | cut -d / -f 3`
58
- do
59
- buckets=("${buckets[@]}" "$bucket")
60
- done
61
-
62
- ## read buckets from standard input
63
- elif [[ $bucket_file == "-" ]]
64
- then
65
- read bucket
66
- until [[ $? -eq 1 ]]
67
- do
68
- buckets=("${buckets[@]}" "$bucket")
69
- read bucket
70
- done
71
-
72
- ## read from bucket_file
73
- else
74
- tmpIFS=$IFS
75
- IFS=$'\n'
76
-
77
- for bucket in `cat $bucket_file`
78
- do
79
- buckets=("${buckets[@]}" "$bucket")
80
- done
81
-
82
- IFS=$tmpIFS
83
- fi
84
-
85
- for bucket in "${buckets[@]}"
86
- do
87
- ($command $bucket "$@")&
88
- done
@@ -1,391 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- import logging
4
- import sys
5
-
6
- # crank this down to info for progress messages. can also use
7
- # "filename=" for that kind of thing. The only reason this is stderr
8
- # is to allow for output redirection.
9
- logging.basicConfig(stream=sys.stderr, level=logging.ERROR)
10
-
11
- #-------------------------------------------------------------------------------
12
-
13
- def calculate_sizes(parsedHierarchies):
14
- """
15
- @param parsedHierarchies dictionary mapping filenames to
16
- parsedHierarchies. This is in the same
17
- format as the 'subdirs' component of a
18
- parsedHierarchy.
19
- """
20
-
21
- from operator import add
22
- return reduce(
23
- add,
24
- (
25
- calculate_size(parsedHierarchies[name])
26
- for name in parsedHierarchies.keys()))
27
-
28
-
29
- def calculate_size(parsedHierarchy):
30
- """
31
- @param parsedHierarchy dictionary in the same format as the one
32
- operated on by insert_line
33
- """
34
-
35
- if 'subdirs' in parsedHierarchy:
36
- parsedHierarchy['tree_size'] = calculate_sizes(parsedHierarchy['subdirs'])
37
- elif parsedHierarchy['type'] == 'd':
38
- parsedHierarchy['tree_size'] = 0
39
-
40
- if 'tree_size' in parsedHierarchy:
41
- return parsedHierarchy['tree_size']
42
- else:
43
- return parsedHierarchy['file_size']
44
-
45
- #-------------------------------------------------------------------------------
46
-
47
- from sys import stdout
48
- def write_listing_in_json(listing, writer = stdout):
49
- writer.write('{"basename":"%s"' % listing['basename'])
50
-
51
- from operator import add
52
- writer.write(reduce(add, (',"%s":%s' % (key,
53
- '"%s"' % listing[key]
54
- if isinstance(listing[key],str)
55
- else listing[key])
56
- for key in listing.keys() if key != 'subdirs')))
57
-
58
- writer.write('}\n')
59
-
60
- #-------------------------------------------------------------------------------
61
-
62
- def each_listing_in_hierarchy(parsedHierarchy):
63
- """
64
- @param parsedHierarchy dictionary mapping filenames to
65
- parsedHierarchies. This is in the same
66
- format as the 'subdirs' component of a
67
- parsedHierarchy.
68
-
69
- @return one record for every file listing. Every parsedHierarchy
70
- will have its 'subdirs' key deleted and will consequently be flat.
71
- """
72
-
73
- if 'subdirs' in parsedHierarchy:
74
- subdirs = parsedHierarchy['subdirs']
75
- del parsedHierarchy['subdirs']
76
- return [parsedHierarchy] + each_listing_in_subdirs(subdirs)
77
- else:
78
- return [parsedHierarchy]
79
-
80
- def each_listing_in_subdirs(parsedHierarchies):
81
- keys = parsedHierarchies.keys()
82
- keys.sort()
83
- from operator import add
84
-
85
- return reduce(add,
86
- [each_listing_in_hierarchy(parsedHierarchies[f])
87
- for f in keys])
88
-
89
- #-------------------------------------------------------------------------------
90
-
91
- def insert_line(parsedLine,
92
- parsedHierarchy,
93
- bucket_name,
94
- prefix='/',
95
- s3hdfs = False):
96
- """
97
- @param parsedHierarchy A parsed hierarchy is a dictionary that
98
- contains the size, date, type, path, and
99
- subdirs of a file. It has two special
100
- properties: the basename contains no /
101
- characters, and the "subdirs" points to a
102
- dictionary that maps names to
103
- parsedHierarchies underneath this one.
104
- """
105
-
106
- def insert_subdir(parsedHierarchy, subdir, bucket_name, prefix):
107
- if 'subdirs' not in parsedHierarchy:
108
- parsedHierarchy['subdirs'] = {}
109
- if subdir not in parsedHierarchy['subdirs']:
110
- parsedHierarchy['subdirs'][subdir] = {}
111
- parsedHierarchy['subdirs'][subdir]['basename'] = subdir
112
- parsedHierarchy['subdirs'][subdir]['file_size'] = 0
113
- parsedHierarchy['subdirs'][subdir]['type'] = 'd'
114
-
115
- prot = 's3' if s3hdfs else 's3n'
116
-
117
- parent_url = (parsedHierarchy['_id'] if '_id' in parsedHierarchy
118
- else '%s://%s/' % (prot, bucket_name))
119
-
120
- parsedHierarchy['subdirs'][subdir]['parent_id'] = parent_url
121
-
122
-
123
- url = '%s://%s%s%s' % (prot, bucket_name, prefix, subdir)
124
- parsedHierarchy['subdirs'][subdir]['_id'] = url
125
-
126
- import hashlib
127
- sha1hasher = hashlib.new('sha1')
128
- sha1hasher.update(url)
129
-
130
- parsedHierarchy['subdirs'][subdir]['uuid'] = (
131
- sha1hasher.hexdigest().lower())
132
-
133
- path = parsedLine['path']
134
- # recursively insert rest of path after /
135
- if path.find('/') != -1:
136
- base,rest = path.split('/',1)
137
-
138
- insert_subdir(parsedHierarchy, base, bucket_name, prefix)
139
-
140
- parsedLine['path'] = rest
141
- insert_line(parsedLine,
142
- parsedHierarchy['subdirs'][base],
143
- bucket_name,
144
- prefix + base + '/')
145
-
146
- # insert one file or directory into "subdirs"
147
- else:
148
- insert_subdir(parsedHierarchy, path, bucket_name, prefix)
149
-
150
- # This will also overwrite the default 'type':'d' from insert_subdir
151
- for k in parsedLine.keys():
152
- parsedHierarchy['subdirs'][path][k] = parsedLine[k]
153
-
154
- parsedHierarchy['subdirs'][path]['basename'] = \
155
- parsedHierarchy['subdirs'][path]['path']
156
- del parsedHierarchy['subdirs'][path]['path']
157
-
158
- #-------------------------------------------------------------------------------
159
-
160
- def json2ls(json, writer, prefix='/'):
161
- """
162
- sanity check. writes json back out to the command line in ls form
163
- """
164
-
165
- from datetime import datetime
166
- d =(datetime.fromtimestamp(json['datetime']).strftime("%Y-%m-%d %H:%M")
167
- if 'datetime' in json else '1970-01-01 00:00')
168
-
169
- writer.write("%s %9d %s\n" % (
170
- d,
171
- json['file_size'],
172
- json['_id'].replace('s3n', 's3')))
173
-
174
- #-------------------------------------------------------------------------------
175
-
176
- def hdfs_parse_line(bucket_name):
177
-
178
- import re
179
-
180
- def line_parser(line):
181
-
182
- components = re.compile(r"""
183
-
184
- ^
185
- (
186
- [d\-] # directory bit
187
- )
188
- (?:[r\-][w\-][xs\-]){2}
189
- [r\-][w\-][x\-]
190
-
191
- [ \t]*
192
-
193
- (?:-|[0-9]+) # number of links. ignore.
194
-
195
- [ \t]*
196
-
197
- ([0-9]+) # size
198
-
199
- [ \t]*
200
-
201
- (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
202
-
203
- [ \t]*
204
-
205
- ( # path
206
- [^ \t]
207
- [^\n]*
208
- )
209
-
210
- .*
211
-
212
- $
213
-
214
- """, re.VERBOSE)
215
-
216
- m = components.match(line)
217
- if not m:
218
- import sys
219
- sys.stderr.write("couldn't parse line: %s\n" % (line))
220
- return None
221
-
222
- typ, fsize, datetime, path = m.groups()
223
-
224
- if typ == '-': typ = 'f'
225
- if path.startswith('/'): path = path[1:]
226
-
227
- return datetime, fsize, bucket_name, path, typ
228
-
229
- return line_parser
230
-
231
- #-------------------------------------------------------------------------------
232
-
233
- def s3_parse_line(line):
234
-
235
- import re
236
- components = re.compile(r"""
237
-
238
- ^
239
- (\d\d\d\d-\d\d-\d\d[ ]\d\d:\d\d)
240
-
241
- [ \t]*
242
-
243
- ([0-9]+)
244
-
245
- [ \t]*
246
-
247
- (?:
248
- (?:s3://)
249
- ([^/]*)
250
- /
251
- ([^\n]*)
252
- )
253
-
254
- .*
255
-
256
- $
257
-
258
- """, re.VERBOSE)
259
-
260
- m = components.match(line)
261
- if not m:
262
- import sys
263
- sys.stderr.write("couldn't parse line: %s\n" % (line))
264
- return None
265
-
266
- datetime, fsize, bucket_name, parsed_line = m.groups()
267
- typ = 'f'
268
-
269
- return datetime, fsize, bucket_name, parsed_line, typ
270
-
271
- #-------------------------------------------------------------------------------
272
-
273
- def ls2json_subdirs(lines, line_parser):
274
-
275
- parsedHierarchy = None
276
-
277
- count = 0
278
- for line in lines:
279
- count = count + 1
280
- if count % 1000 == 0:
281
- logging.info("inserting line %d" % (count))
282
-
283
- line_tuple = line_parser(line)
284
-
285
- if not line_tuple:
286
- continue
287
-
288
- parsedLine = {}
289
-
290
- (
291
-
292
- parsedLine['datetime'],
293
- parsedLine['file_size'],
294
- bucket_name,
295
- parsedLine['path'],
296
- parsedLine['type']
297
-
298
- ) = line_tuple
299
-
300
- if not parsedHierarchy:
301
- url = "s3n://%s" % (bucket_name)
302
- import hashlib
303
- sha1hasher = hashlib.new('sha1')
304
- sha1hasher.update(url)
305
-
306
- parsedHierarchy = {
307
- bucket_name : {
308
- "subdirs" : {},
309
- "basename" : bucket_name,
310
- "_id" : url,
311
- "type" : "d",
312
- "file_size" : 0,
313
- "uuid" : sha1hasher.hexdigest(),
314
- }
315
- }
316
-
317
- parsedLine['file_size'] = int(parsedLine['file_size'])
318
-
319
- if parsedLine['datetime'] == '1970-01-01 00:00':
320
- del parsedLine['datetime']
321
- else:
322
- from datetime import datetime
323
- parsedLine['datetime'] = int(datetime.strptime(
324
- parsedLine['datetime'],
325
- "%Y-%m-%d %H:%M").strftime("%s"))
326
-
327
- parsedLine['file_size'] = int(parsedLine['file_size'])
328
-
329
- if parsedLine['path'].endswith('/'):
330
- parsedLine['path'] = parsedLine['path'][:-1]
331
- parsedLine['type'] = 'd'
332
-
333
- insert_line(parsedLine,
334
- parsedHierarchy[bucket_name],
335
- bucket_name)
336
-
337
- if not parsedHierarchy: return []
338
-
339
- logging.info("calculating sizes")
340
- calculate_sizes(parsedHierarchy)
341
-
342
- logging.info("converting hierarchies")
343
- return each_listing_in_subdirs(parsedHierarchy)
344
-
345
- #-------------------------------------------------------------------------------
346
-
347
- if __name__ == '__main__':
348
-
349
- from optparse import OptionParser
350
- parser = OptionParser(usage = "usage: %prog [options] [s3hdfs bucket name]")
351
- parser.add_option("-i", "--input", dest="infile", default = None,
352
- help="input file..")
353
- parser.add_option("-o", "--output", dest="outfile", default = None,
354
- help="output file.")
355
- parser.add_option("-t", "--test", dest="test", default = False,
356
- action="store_true",
357
- help="reoutput in ls format. for debugging")
358
-
359
- (options, args) = parser.parse_args()
360
-
361
- import sys
362
- if len(args) > 1:
363
- parser.print_usage()
364
- sys.exit(0)
365
-
366
- if args:
367
- bucket, = args
368
- ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
369
- hdfs_parse_line(bucket))
370
- else:
371
- ls_converter = lambda istream: ls2json_subdirs(istream.readlines(),
372
- s3_parse_line)
373
-
374
- def open_or_die(fname, flags="r"):
375
- try:
376
- return open(fname, flags)
377
- except IOError as (errno, strerr):
378
- sys.stderr.write("Couldn't open %s: %s\n" % (fname, strerr))
379
- sys.exit(0)
380
-
381
- from sys import stdin, stdout
382
- instream = open_or_die(options.infile) if options.infile else stdin
383
- outstream = open_or_die(options.outfile, 'w') if options.outfile else stdout
384
-
385
- if options.test:
386
- for listing in ls_converter(instream):
387
- json2ls(listing, outstream)
388
- else:
389
- for listing in ls_converter(instream):
390
- write_listing_in_json(listing, outstream)
391
-
@@ -1,55 +0,0 @@
1
- require 'spec_helper'
2
- require_relative '../../lib/vayacondios/server/legacy_switch'
3
-
4
- require 'multi_json'
5
-
6
- require_relative '../../lib/vayacondios/client/itemset'
7
-
8
- describe Vayacondios::Client::ItemSet do
9
- context "after instantiation in legacy mode" do
10
- itemset = Vayacondios::Client::ItemSet.new("foohost", 9999, "fooorg", "footopic", "fooid")
11
- ary = ["foo", "bar", "baz"]
12
-
13
- # testing internals here to avoid shimming up HTTP libraries.
14
-
15
- it "generates a put request without a patch header when asked to create" do
16
- Vayacondios.force_legacy_mode true
17
-
18
- req = itemset.instance_eval{_req(:create, ary)}
19
-
20
- req.method.should eql('PUT')
21
- req.body.should eql(MultiJson.encode(ary))
22
- req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
23
- req.each_header.to_a.should_not include(["x_method", "PATCH"])
24
- end
25
-
26
- it "generates a put request with a patch header when asked to update" do
27
- Vayacondios.force_legacy_mode true
28
-
29
- req = itemset.instance_eval{_req(:update, ary)}
30
-
31
- req.method.should eql('PUT')
32
- req.body.should eql(MultiJson.encode(ary))
33
- req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
34
- req.each_header.to_a.should include(["x-method", "PATCH"])
35
- end
36
-
37
- it "generates a get request when asked to fetch" do
38
- req = itemset.instance_eval{_req(:fetch)}
39
-
40
- req.method.should eql('GET')
41
- req.body.should be_nil
42
- req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
43
- end
44
-
45
- it "generates a delete request when asked to remove" do
46
- Vayacondios.force_legacy_mode true
47
-
48
- req = itemset.instance_eval{_req(:remove, ary)}
49
-
50
- req.method.should eql('DELETE')
51
- req.body.should eql(MultiJson.encode(ary))
52
- req.path.should eql('/v1/fooorg/itemset/footopic/fooid')
53
- end
54
- end
55
- end