wonderdog 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/.gitignore +2 -0
  2. data/.idea/encodings.xml +5 -0
  3. data/.idea/misc.xml +5 -0
  4. data/.idea/modules.xml +9 -0
  5. data/.idea/scopes/scope_settings.xml +5 -0
  6. data/.idea/vcs.xml +7 -0
  7. data/.idea/wonderdog.iml +41 -0
  8. data/Gemfile +1 -1
  9. data/bin/estool +22 -1
  10. data/bin/squirrel.rb +108 -0
  11. data/lib/wonderdog.rb +3 -0
  12. data/lib/wonderdog/hadoop_invocation_override.rb +4 -1
  13. data/lib/wonderdog/version.rb +1 -1
  14. data/pom.xml +1 -1
  15. data/spec/spec_helper.rb +1 -1
  16. data/spec/wonderdog/hadoop_invocation_override_spec.rb +1 -1
  17. data/squirrel/all_facets.rb +95 -0
  18. data/squirrel/change_es_index_settings.rb +19 -0
  19. data/squirrel/clear_es_caches.rb +30 -0
  20. data/squirrel/esbackup.rb +184 -0
  21. data/squirrel/esbackup_stripped.rb +153 -0
  22. data/squirrel/fields.sh +5 -0
  23. data/squirrel/getFields.rb +19 -0
  24. data/squirrel/replay.rb +219 -0
  25. data/squirrel/squirrel.rb +95 -0
  26. data/squirrel/warmer_interface.rb +59 -0
  27. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +2 -2
  28. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +14 -2
  29. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +20 -5
  30. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +55 -26
  31. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +59 -22
  32. data/test/cardinality.rb +43 -0
  33. data/test/change_es_index_settings.rb +19 -0
  34. data/test/clear_es_caches.rb +30 -0
  35. data/test/config/mapping.yml +327 -0
  36. data/test/config/mappings.yml +328 -0
  37. data/test/count_check.txt +0 -0
  38. data/test/esbackup_stripped.rb +153 -0
  39. data/test/mapping.yml +327 -0
  40. data/test/medium_slow_queries +41 -0
  41. data/test/queries.txt +0 -0
  42. data/test/quick_test_slow_queries +4 -0
  43. data/test/run_pry.rb +3 -0
  44. data/test/some_slow_queries +53 -0
  45. data/test/warmer_interface.rb +64 -0
  46. data/test/warmindices.rb +65 -0
  47. data/wonderdog.gemspec +1 -1
  48. metadata +40 -7
File without changes
@@ -0,0 +1,153 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Simple script to dump elasticsearch indexes as raw JSON
4
+
5
require 'fileutils'
require 'pathname'
require 'socket'
require 'zlib'

require 'multi_json'
require 'tire'
10
+
11
+ class ESBackup
12
+
13
+ def initialize(output_dir, options = {})
14
+ Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
15
+ @output_dir = output_dir || ''
16
+ @index = options[:index]
17
+ @batch_size = options[:batch_size].to_i
18
+ @mapping_file = options[:mappings]
19
+ if options[:query].nil?
20
+ @query = nil
21
+ else
22
+ @query = MultiJson.load(options[:query]) rescue nil
23
+ end
24
+ if options[:dump_file].nil?
25
+ @dump_file = @index
26
+ else
27
+ @dump_file = options[:dump_file]
28
+ end
29
+ end
30
+
31
+ def dump_mapping
32
+ index = Tire::Index.new @index
33
+ File.open(@mapping_file, 'w'){ |f| f.puts index.mapping.to_json }
34
+ end
35
+
36
+ def fullpath dir
37
+ basedir = dir.start_with?('/') ? dir : File.join(Dir.pwd, dir)
38
+ FileUtils.mkdir_p(basedir)
39
+ basedir
40
+ end
41
+
42
+ def gz_output
43
+ File.join(fullpath(@output_dir), @index + '.gz')
44
+ end
45
+
46
+ def create_scanner
47
+ scan_opts = { size: @batch_size }
48
+ additional_query = @query
49
+ Tire::Search::Scan.new(@index, scan_opts) do
50
+ # This is fucking stupid; why people have to be cute and make everything DSL only
51
+ # I'll never understand, but the person who wrote this gem has forced us to ONLY be able to
52
+ # ask queries in this manner.
53
+ query do
54
+ additional_query.each_pair do |key, vals|
55
+ case vals
56
+ # Assuming here that you are only asking for one field at a time...this is getting hacky fast
57
+ when Hash then self.send(key.to_sym, *vals.to_a.flatten)
58
+ when Array then self.send(key.to_sym, *vals)
59
+ end
60
+ end
61
+ end if additional_query
62
+ end
63
+ end
64
+
65
+ def run
66
+ dump_mapping if @mapping_file
67
+ gz = Zlib::GzipWriter.open gz_output
68
+ count = 0
69
+ create_scanner.each do |document|
70
+ document.each do |record|
71
+ json_doc = record.to_hash.except(:type, :_index, :_explanation, :_score, :_version, :highlight, :sort).to_json
72
+ gz.puts json_doc
73
+ count += 1
74
+ end
75
+ end
76
+ gz.close
77
+ puts "#{@index} backup complete. #{count} records written"
78
+ end
79
+ end
80
+
81
+ class ESRestore
82
+
83
+ def initialize(input, options = {})
84
+ Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
85
+ @index = options[:index]
86
+ @batch_size = options[:batch_size].to_i
87
+ @gz_input = Zlib::GzipReader.open(input)
88
+ @mapping_file = options[:mappings]
89
+ end
90
+
91
+ def create_index
92
+ index = Tire::Index.new @index
93
+ options = @mapping_file ? { mappings: MultiJson.load(File.read(@mapping_file)) } : {}
94
+ index.create(options) unless index.exists?
95
+ index
96
+ end
97
+
98
+ def run
99
+ reindex = create_index
100
+ count, documents = 0, []
101
+ @gz_input.each_line do |json|
102
+ documents << MultiJson.load(json)
103
+ count += 1
104
+ if count % @batch_size == 0
105
+ reindex.bulk_create documents
106
+ puts "#{count} records loaded"
107
+ documents.clear
108
+ end
109
+ end
110
+ @gz_input.close()
111
+ reindex.bulk_create documents if not documents.empty?
112
+ puts "#{@index} restore complete with #{count} records loaded"
113
+ end
114
+ end
115
+
116
+ class ESDup
117
+
118
+ def initialize(input, options = {})
119
+ Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
120
+ @index = options[:index]
121
+ @batch_size = options[:batch_size].to_i
122
+ @gz_input = Zlib::GzipReader.open(input)
123
+ @mapping_file = options[:mappings]
124
+ end
125
+
126
+ def create_index
127
+ index = Tire::Index.new @index
128
+ options = @mapping_file ? { mappings: MultiJson.load(File.read(@mapping_file)) } : {}
129
+ index.create(options) unless index.exists?
130
+ index
131
+ end
132
+
133
+ def run
134
+ reindex = create_index
135
+ count, documents = 0, []
136
+ @gz_input.each_line do |json|
137
+ line = MultiJson.load(json)
138
+ line.delete("_id")
139
+ line.delete("id")
140
+ documents << line
141
+ count += 1
142
+ if count % @batch_size == 0
143
+ reindex.bulk_create documents
144
+ puts "#{count} records loaded"
145
+ documents.clear
146
+ end
147
+ end
148
+ @gz_input.close()
149
+ reindex.bulk_create documents if not documents.empty?
150
+ puts "#{@index} restore complete with #{count} records loaded"
151
+ end
152
+ end
153
+
@@ -0,0 +1,327 @@
1
+ ad_tag_count:
2
+ _all:
3
+ enabled: false
4
+ _routing:
5
+ path: ad_tag_id
6
+ dynamic: false
7
+ properties:
8
+ _state:
9
+ type: string
10
+ index: "no"
11
+ store: "yes"
12
+ cnt:
13
+ type: integer
14
+ ad_tag_id:
15
+ type: integer
16
+ creative_id:
17
+ type: integer
18
+ pl_test:
19
+ index: not_analyzed
20
+ type: string
21
+ metric:
22
+ index: not_analyzed
23
+ type: string
24
+ tb_h:
25
+ type: date
26
+ feature:
27
+ index: not_analyzed
28
+ type: string
29
+ base_feature:
30
+ index: not_analyzed
31
+ type: string
32
+ seconds:
33
+ type: integer
34
+ flight_count:
35
+ _all:
36
+ enabled: false
37
+ _routing:
38
+ path: flight_id
39
+ dynamic: false
40
+ properties:
41
+ _state:
42
+ type: string
43
+ index: "no"
44
+ store: "yes"
45
+ cnt:
46
+ type: integer
47
+ flight_id:
48
+ type: integer
49
+ pl_test:
50
+ index: not_analyzed
51
+ type: string
52
+ metric:
53
+ index: not_analyzed
54
+ type: string
55
+ tb_h:
56
+ type: date
57
+ feature:
58
+ index: not_analyzed
59
+ type: string
60
+ base_feature:
61
+ index: not_analyzed
62
+ type: string
63
+ seconds:
64
+ type: integer
65
+ metric_feature:
66
+ index: not_analyzed
67
+ type: string
68
+ creative_count:
69
+ _all:
70
+ enabled: false
71
+ _routing:
72
+ path: creative_id
73
+ dynamic: false
74
+ properties:
75
+ _state:
76
+ type: string
77
+ index: "no"
78
+ store: "yes"
79
+ cnt:
80
+ type: integer
81
+ creative_id:
82
+ type: integer
83
+ pl_test:
84
+ index: not_analyzed
85
+ type: string
86
+ metric:
87
+ index: not_analyzed
88
+ type: string
89
+ tb_h:
90
+ type: date
91
+ feature:
92
+ index: not_analyzed
93
+ type: string
94
+ base_feature:
95
+ index: not_analyzed
96
+ type: string
97
+ seconds:
98
+ type: integer
99
+ item_count:
100
+ _all:
101
+ enabled: false
102
+ _routing:
103
+ path: flight_id
104
+ dynamic: false
105
+ properties:
106
+ _state:
107
+ type: string
108
+ index: "no"
109
+ store: "yes"
110
+ cnt:
111
+ type: integer
112
+ item_id:
113
+ type: integer
114
+ pl_test:
115
+ index: not_analyzed
116
+ type: string
117
+ flight_id:
118
+ type: integer
119
+ metric:
120
+ index: not_analyzed
121
+ type: string
122
+ tb_h:
123
+ type: date
124
+ feature:
125
+ index: not_analyzed
126
+ type: string
127
+ base_feature:
128
+ index: not_analyzed
129
+ type: string
130
+ signal_value_count:
131
+ _all:
132
+ enabled: false
133
+ _routing:
134
+ path: flight_id
135
+ dynamic: false
136
+ properties:
137
+ _state:
138
+ type: string
139
+ index: "no"
140
+ store: "yes"
141
+ cnt:
142
+ type: integer
143
+ signal_value_id:
144
+ index: not_analyzed
145
+ type: string
146
+ flight_id:
147
+ type: integer
148
+ pl_test:
149
+ index: not_analyzed
150
+ type: string
151
+ metric:
152
+ index: not_analyzed
153
+ type: string
154
+ tb_h:
155
+ type: date
156
+ feature:
157
+ index: not_analyzed
158
+ type: string
159
+ base_feature:
160
+ index: not_analyzed
161
+ type: string
162
+ placement_count:
163
+ _all:
164
+ enabled: false
165
+ _routing:
166
+ path: flight_id
167
+ dynamic: false
168
+ properties:
169
+ _state:
170
+ type: string
171
+ index: "no"
172
+ store: "yes"
173
+ cnt:
174
+ type: integer
175
+ ad_tag_id:
176
+ type: integer
177
+ flight_id:
178
+ type: integer
179
+ ext_pl_id:
180
+ index: not_analyzed
181
+ type: string
182
+ creative_id:
183
+ type: integer
184
+ metric:
185
+ index: not_analyzed
186
+ type: string
187
+ tb_h:
188
+ index: not_analyzed
189
+ type: date
190
+ feature:
191
+ index: not_analyzed
192
+ type: string
193
+ detail:
194
+ index: not_analyzed
195
+ type: string
196
+ pl_composite:
197
+ index: not_analyzed
198
+ type: string
199
+ site_count:
200
+ _all:
201
+ enabled: false
202
+ _routing:
203
+ path: flight_id
204
+ dynamic: false
205
+ properties:
206
+ _state:
207
+ type: string
208
+ index: "no"
209
+ store: "yes"
210
+ cnt:
211
+ type: integer
212
+ ad_tag_id:
213
+ type: integer
214
+ flight_id:
215
+ type: integer
216
+ ext_site_id:
217
+ index: not_analyzed
218
+ type: string
219
+ creative_id:
220
+ type: integer
221
+ pl_test:
222
+ index: not_analyzed
223
+ type: string
224
+ metric:
225
+ index: not_analyzed
226
+ type: string
227
+ tb_h:
228
+ type: date
229
+ feature:
230
+ index: not_analyzed
231
+ type: string
232
+ site_composite:
233
+ index: not_analyzed
234
+ type: string
235
+ interaction_count:
236
+ _all:
237
+ enabled: false
238
+ _routing:
239
+ path: flight_id
240
+ dynamic: false
241
+ properties:
242
+ _state:
243
+ type: string
244
+ index: "no"
245
+ store: "yes"
246
+ cnt:
247
+ type: integer
248
+ ad_tag_id:
249
+ type: integer
250
+ pl_test:
251
+ index: not_analyzed
252
+ type: string
253
+ flight_id:
254
+ type: integer
255
+ creative_id:
256
+ type: integer
257
+ tb_h:
258
+ type: date
259
+ feature:
260
+ index: not_analyzed
261
+ type: string
262
+ base_feature:
263
+ index: not_analyzed
264
+ type: string
265
+ detail:
266
+ index: not_analyzed
267
+ type: string
268
+ datax:
269
+ index: not_analyzed
270
+ type: string
271
+ browser_count:
272
+ _all:
273
+ enabled: false
274
+ _routing:
275
+ path: flight_id
276
+ dynamic: false
277
+ properties:
278
+ _state:
279
+ type: string
280
+ index: "no"
281
+ store: "yes"
282
+ cnt:
283
+ type: integer
284
+ flight_id:
285
+ type: integer
286
+ pl_test:
287
+ index: not_analyzed
288
+ type: string
289
+ browser_ua:
290
+ index: not_analyzed
291
+ type: string
292
+ tb_h:
293
+ type: date
294
+ feature:
295
+ index: not_analyzed
296
+ type: string
297
+ parse_exception:
298
+ dynamic: false
299
+ _all:
300
+ enabled: false
301
+ properties:
302
+ _state:
303
+ type: string
304
+ index: "no"
305
+ store: "yes"
306
+ ad_tag_id:
307
+ type: integer
308
+ cnt:
309
+ type: integer
310
+ error_code:
311
+ type: string
312
+ index: not_analyzed
313
+ tb_h:
314
+ type: date
315
+ mountweasel:
316
+ dynamic: false
317
+ _all:
318
+ enabled: false
319
+ properties:
320
+ _state:
321
+ type: string
322
+ index: "no"
323
+ store: "yes"
324
+ tb_h:
325
+ type: date
326
+ cnt:
327
+ type: integer