wonderdog 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. data/.gitignore +2 -0
  2. data/.idea/encodings.xml +5 -0
  3. data/.idea/misc.xml +5 -0
  4. data/.idea/modules.xml +9 -0
  5. data/.idea/scopes/scope_settings.xml +5 -0
  6. data/.idea/vcs.xml +7 -0
  7. data/.idea/wonderdog.iml +41 -0
  8. data/Gemfile +1 -1
  9. data/bin/estool +22 -1
  10. data/bin/squirrel.rb +108 -0
  11. data/lib/wonderdog.rb +3 -0
  12. data/lib/wonderdog/hadoop_invocation_override.rb +4 -1
  13. data/lib/wonderdog/version.rb +1 -1
  14. data/pom.xml +1 -1
  15. data/spec/spec_helper.rb +1 -1
  16. data/spec/wonderdog/hadoop_invocation_override_spec.rb +1 -1
  17. data/squirrel/all_facets.rb +95 -0
  18. data/squirrel/change_es_index_settings.rb +19 -0
  19. data/squirrel/clear_es_caches.rb +30 -0
  20. data/squirrel/esbackup.rb +184 -0
  21. data/squirrel/esbackup_stripped.rb +153 -0
  22. data/squirrel/fields.sh +5 -0
  23. data/squirrel/getFields.rb +19 -0
  24. data/squirrel/replay.rb +219 -0
  25. data/squirrel/squirrel.rb +95 -0
  26. data/squirrel/warmer_interface.rb +59 -0
  27. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +2 -2
  28. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +14 -2
  29. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +20 -5
  30. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +55 -26
  31. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +59 -22
  32. data/test/cardinality.rb +43 -0
  33. data/test/change_es_index_settings.rb +19 -0
  34. data/test/clear_es_caches.rb +30 -0
  35. data/test/config/mapping.yml +327 -0
  36. data/test/config/mappings.yml +328 -0
  37. data/test/count_check.txt +0 -0
  38. data/test/esbackup_stripped.rb +153 -0
  39. data/test/mapping.yml +327 -0
  40. data/test/medium_slow_queries +41 -0
  41. data/test/queries.txt +0 -0
  42. data/test/quick_test_slow_queries +4 -0
  43. data/test/run_pry.rb +3 -0
  44. data/test/some_slow_queries +53 -0
  45. data/test/warmer_interface.rb +64 -0
  46. data/test/warmindices.rb +65 -0
  47. data/wonderdog.gemspec +1 -1
  48. metadata +40 -7
File without changes
@@ -0,0 +1,153 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Simple script to dump elasticsearch indexes as raw JSON
4
+
5
+ require 'tire'
6
+ require 'zlib'
7
+ require 'socket'
8
+ require 'pathname'
9
+ require 'multi_json'
10
+
11
require 'fileutils' # ESBackup#fullpath calls FileUtils.mkdir_p

# Dumps the documents of one elasticsearch index, one JSON document per line,
# into a gzip file. Optionally also writes the index mapping to a separate file.
class ESBackup
  # Keys removed from every record before it is written to the dump.
  EXCLUDED_KEYS = [:type, :_index, :_explanation, :_score, :_version, :highlight, :sort].freeze

  # @param output_dir [String, nil] directory that receives the dump; nil means
  #   the current working directory
  # @param options [Hash] recognised keys:
  #   :host, :port  - used to build the elasticsearch URL
  #   :index        - name of the index to dump
  #   :batch_size   - scan page size (converted with to_i)
  #   :mappings     - path of the file the index mapping is written to (optional)
  #   :query        - JSON string describing an additional query (optional)
  #   :dump_file    - base name of the gzip file; defaults to the index name
  def initialize(output_dir, options = {})
    Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
    @output_dir   = output_dir || ''
    @index        = options[:index]
    @batch_size   = options[:batch_size].to_i
    @mapping_file = options[:mappings]
    @query        = parse_query(options[:query])
    @dump_file    = options[:dump_file].nil? ? @index : options[:dump_file]
  end

  # Writes the mapping of the index, as JSON, to the configured mapping file.
  def dump_mapping
    index = Tire::Index.new @index
    File.open(@mapping_file, 'w') { |f| f.puts index.mapping.to_json }
  end

  # Resolves +dir+ against the current working directory unless it already
  # starts with '/', creates it if necessary and returns the resulting path.
  #
  # @param dir [String]
  # @return [String]
  def fullpath(dir)
    basedir = dir.start_with?('/') ? dir : File.join(Dir.pwd, dir)
    FileUtils.mkdir_p(basedir)
    basedir
  end

  # Path of the gzip dump. The file is named after the :dump_file option,
  # which falls back to the index name when the option was not given.
  #
  # @return [String]
  def gz_output
    File.join(fullpath(@output_dir), "#{@dump_file}.gz")
  end

  # Builds the Tire scan over the index, applying the optional additional query.
  #
  # The Tire gem only exposes queries through its block DSL, so each top-level
  # key of the parsed query is dispatched as a DSL method call. Only one field
  # per query type is supported.
  def create_scanner
    scan_opts = { size: @batch_size }
    # Local copy: inside the DSL blocks `self` is the DSL object (the calls
    # below are dispatched on it), so instance variables of this ESBackup are
    # presumably not reachable there.
    additional_query = @query
    Tire::Search::Scan.new(@index, scan_opts) do
      query do
        additional_query.each_pair do |key, vals|
          # public_send (not send): the method name comes from the query
          # string, so private methods such as Kernel#system must stay out
          # of reach.
          case vals
          when Hash  then public_send(key.to_sym, *vals.to_a.flatten)
          when Array then public_send(key.to_sym, *vals)
          end
        end
      end if additional_query
    end
  end

  # Runs the backup: writes the mapping (when a mapping file was configured),
  # then streams every scanned record into the gzip dump and prints a summary.
  def run
    dump_mapping if @mapping_file
    count = 0
    # Block form closes the gzip stream even when scanning raises.
    Zlib::GzipWriter.open(gz_output) do |gz|
      create_scanner.each do |document|
        document.each do |record|
          stripped = record.to_hash.reject { |key, _| EXCLUDED_KEYS.include?(key) }
          gz.puts stripped.to_json
          count += 1
        end
      end
    end
    puts "#{@index} backup complete. #{count} records written"
  end

  private

  # Parses the :query option. An unparseable query is still ignored (as
  # before), but a warning is printed so an unfiltered dump is not mistaken
  # for a filtered one.
  #
  # @param raw [String, nil]
  # @return [Object, nil] the parsed query, or nil when absent or invalid
  def parse_query(raw)
    return nil if raw.nil?

    MultiJson.load(raw)
  rescue StandardError => e
    warn "ESBackup: ignoring unparseable :query option (#{e.message}); dumping without a query filter"
    nil
  end
end
80
+
81
# Loads a gzip dump (one JSON document per line) into an elasticsearch index,
# creating the index first when it does not exist yet.
class ESRestore
  # Batch size used when the :batch_size option is missing or zero.
  DEFAULT_BATCH_SIZE = 1000

  # @param input [String] path of the gzip dump to read
  # @param options [Hash] recognised keys:
  #   :host, :port  - used to build the elasticsearch URL
  #   :index        - name of the target index
  #   :batch_size   - number of documents per bulk request
  #   :mappings     - path of a JSON mapping file used when creating the index (optional)
  def initialize(input, options = {})
    Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
    @index = options[:index]
    # nil.to_i is 0, which used to make `count % @batch_size` raise
    # ZeroDivisionError on the first record. abs keeps the old behaviour for
    # negative sizes, where the modulo test acted on the magnitude.
    requested     = options[:batch_size].to_i
    @batch_size   = requested.zero? ? DEFAULT_BATCH_SIZE : requested.abs
    @gz_input     = Zlib::GzipReader.open(input)
    @mapping_file = options[:mappings]
  end

  # Returns the target index, creating it (with the mapping file's contents,
  # when one was configured) if it does not exist.
  def create_index
    index = Tire::Index.new @index
    options = @mapping_file ? { mappings: MultiJson.load(File.read(@mapping_file)) } : {}
    index.create(options) unless index.exists?
    index
  end

  # Reads the dump line by line and bulk-creates the documents in batches of
  # @batch_size. Prints progress after every full batch and a summary at the
  # end. The input stream is closed even when loading fails.
  def run
    reindex = create_index
    count = 0
    @gz_input.each_line.each_slice(@batch_size) do |lines|
      reindex.bulk_create lines.map { |json| MultiJson.load(json) }
      count += lines.size
      # Progress is reported for full batches only; the trailing partial
      # batch is covered by the summary line below.
      puts "#{count} records loaded" if lines.size == @batch_size
    end
    puts "#{@index} restore complete with #{count} records loaded"
  ensure
    @gz_input.close unless @gz_input.closed?
  end
end
115
+
116
# Loads a gzip dump (one JSON document per line) into an elasticsearch index
# after removing the "_id" and "id" fields from every document, creating the
# index first when it does not exist yet.
class ESDup
  # Batch size used when the :batch_size option is missing or zero.
  DEFAULT_BATCH_SIZE = 1000

  # @param input [String] path of the gzip dump to read
  # @param options [Hash] recognised keys:
  #   :host, :port  - used to build the elasticsearch URL
  #   :index        - name of the target index
  #   :batch_size   - number of documents per bulk request
  #   :mappings     - path of a JSON mapping file used when creating the index (optional)
  def initialize(input, options = {})
    Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
    @index = options[:index]
    # nil.to_i is 0, which used to make `count % @batch_size` raise
    # ZeroDivisionError on the first record. abs keeps the old behaviour for
    # negative sizes, where the modulo test acted on the magnitude.
    requested     = options[:batch_size].to_i
    @batch_size   = requested.zero? ? DEFAULT_BATCH_SIZE : requested.abs
    @gz_input     = Zlib::GzipReader.open(input)
    @mapping_file = options[:mappings]
  end

  # Returns the target index, creating it (with the mapping file's contents,
  # when one was configured) if it does not exist.
  def create_index
    index = Tire::Index.new @index
    options = @mapping_file ? { mappings: MultiJson.load(File.read(@mapping_file)) } : {}
    index.create(options) unless index.exists?
    index
  end

  # Reads the dump line by line, strips the "_id" and "id" fields from each
  # document and bulk-creates the documents in batches of @batch_size. Prints
  # progress after every full batch and a summary at the end. The input
  # stream is closed even when loading fails.
  def run
    reindex = create_index
    count = 0
    @gz_input.each_line.each_slice(@batch_size) do |lines|
      documents = lines.map do |json|
        MultiJson.load(json).tap do |doc|
          doc.delete("_id")
          doc.delete("id")
        end
      end
      reindex.bulk_create documents
      count += lines.size
      # Progress is reported for full batches only; the trailing partial
      # batch is covered by the summary line below.
      puts "#{count} records loaded" if lines.size == @batch_size
    end
    puts "#{@index} restore complete with #{count} records loaded"
  ensure
    @gz_input.close unless @gz_input.closed?
  end
end
153
+
@@ -0,0 +1,327 @@
1
+ ad_tag_count:
2
+ _all:
3
+ enabled: false
4
+ _routing:
5
+ path: ad_tag_id
6
+ dynamic: false
7
+ properties:
8
+ _state:
9
+ type: string
10
+ index: "no"
11
+ store: "yes"
12
+ cnt:
13
+ type: integer
14
+ ad_tag_id:
15
+ type: integer
16
+ creative_id:
17
+ type: integer
18
+ pl_test:
19
+ index: not_analyzed
20
+ type: string
21
+ metric:
22
+ index: not_analyzed
23
+ type: string
24
+ tb_h:
25
+ type: date
26
+ feature:
27
+ index: not_analyzed
28
+ type: string
29
+ base_feature:
30
+ index: not_analyzed
31
+ type: string
32
+ seconds:
33
+ type: integer
34
+ flight_count:
35
+ _all:
36
+ enabled: false
37
+ _routing:
38
+ path: flight_id
39
+ dynamic: false
40
+ properties:
41
+ _state:
42
+ type: string
43
+ index: "no"
44
+ store: "yes"
45
+ cnt:
46
+ type: integer
47
+ flight_id:
48
+ type: integer
49
+ pl_test:
50
+ index: not_analyzed
51
+ type: string
52
+ metric:
53
+ index: not_analyzed
54
+ type: string
55
+ tb_h:
56
+ type: date
57
+ feature:
58
+ index: not_analyzed
59
+ type: string
60
+ base_feature:
61
+ index: not_analyzed
62
+ type: string
63
+ seconds:
64
+ type: integer
65
+ metric_feature:
66
+ index: not_analyzed
67
+ type: string
68
+ creative_count:
69
+ _all:
70
+ enabled: false
71
+ _routing:
72
+ path: creative_id
73
+ dynamic: false
74
+ properties:
75
+ _state:
76
+ type: string
77
+ index: "no"
78
+ store: "yes"
79
+ cnt:
80
+ type: integer
81
+ creative_id:
82
+ type: integer
83
+ pl_test:
84
+ index: not_analyzed
85
+ type: string
86
+ metric:
87
+ index: not_analyzed
88
+ type: string
89
+ tb_h:
90
+ type: date
91
+ feature:
92
+ index: not_analyzed
93
+ type: string
94
+ base_feature:
95
+ index: not_analyzed
96
+ type: string
97
+ seconds:
98
+ type: integer
99
+ item_count:
100
+ _all:
101
+ enabled: false
102
+ _routing:
103
+ path: flight_id
104
+ dynamic: false
105
+ properties:
106
+ _state:
107
+ type: string
108
+ index: "no"
109
+ store: "yes"
110
+ cnt:
111
+ type: integer
112
+ item_id:
113
+ type: integer
114
+ pl_test:
115
+ index: not_analyzed
116
+ type: string
117
+ flight_id:
118
+ type: integer
119
+ metric:
120
+ index: not_analyzed
121
+ type: string
122
+ tb_h:
123
+ type: date
124
+ feature:
125
+ index: not_analyzed
126
+ type: string
127
+ base_feature:
128
+ index: not_analyzed
129
+ type: string
130
+ signal_value_count:
131
+ _all:
132
+ enabled: false
133
+ _routing:
134
+ path: flight_id
135
+ dynamic: false
136
+ properties:
137
+ _state:
138
+ type: string
139
+ index: "no"
140
+ store: "yes"
141
+ cnt:
142
+ type: integer
143
+ signal_value_id:
144
+ index: not_analyzed
145
+ type: string
146
+ flight_id:
147
+ type: integer
148
+ pl_test:
149
+ index: not_analyzed
150
+ type: string
151
+ metric:
152
+ index: not_analyzed
153
+ type: string
154
+ tb_h:
155
+ type: date
156
+ feature:
157
+ index: not_analyzed
158
+ type: string
159
+ base_feature:
160
+ index: not_analyzed
161
+ type: string
162
+ placement_count:
163
+ _all:
164
+ enabled: false
165
+ _routing:
166
+ path: flight_id
167
+ dynamic: false
168
+ properties:
169
+ _state:
170
+ type: string
171
+ index: "no"
172
+ store: "yes"
173
+ cnt:
174
+ type: integer
175
+ ad_tag_id:
176
+ type: integer
177
+ flight_id:
178
+ type: integer
179
+ ext_pl_id:
180
+ index: not_analyzed
181
+ type: string
182
+ creative_id:
183
+ type: integer
184
+ metric:
185
+ index: not_analyzed
186
+ type: string
187
+ tb_h:
188
+ index: not_analyzed
189
+ type: date
190
+ feature:
191
+ index: not_analyzed
192
+ type: string
193
+ detail:
194
+ index: not_analyzed
195
+ type: string
196
+ pl_composite:
197
+ index: not_analyzed
198
+ type: string
199
+ site_count:
200
+ _all:
201
+ enabled: false
202
+ _routing:
203
+ path: flight_id
204
+ dynamic: false
205
+ properties:
206
+ _state:
207
+ type: string
208
+ index: "no"
209
+ store: "yes"
210
+ cnt:
211
+ type: integer
212
+ ad_tag_id:
213
+ type: integer
214
+ flight_id:
215
+ type: integer
216
+ ext_site_id:
217
+ index: not_analyzed
218
+ type: string
219
+ creative_id:
220
+ type: integer
221
+ pl_test:
222
+ index: not_analyzed
223
+ type: string
224
+ metric:
225
+ index: not_analyzed
226
+ type: string
227
+ tb_h:
228
+ type: date
229
+ feature:
230
+ index: not_analyzed
231
+ type: string
232
+ site_composite:
233
+ index: not_analyzed
234
+ type: string
235
+ interaction_count:
236
+ _all:
237
+ enabled: false
238
+ _routing:
239
+ path: flight_id
240
+ dynamic: false
241
+ properties:
242
+ _state:
243
+ type: string
244
+ index: "no"
245
+ store: "yes"
246
+ cnt:
247
+ type: integer
248
+ ad_tag_id:
249
+ type: integer
250
+ pl_test:
251
+ index: not_analyzed
252
+ type: string
253
+ flight_id:
254
+ type: integer
255
+ creative_id:
256
+ type: integer
257
+ tb_h:
258
+ type: date
259
+ feature:
260
+ index: not_analyzed
261
+ type: string
262
+ base_feature:
263
+ index: not_analyzed
264
+ type: string
265
+ detail:
266
+ index: not_analyzed
267
+ type: string
268
+ datax:
269
+ index: not_analyzed
270
+ type: string
271
+ browser_count:
272
+ _all:
273
+ enabled: false
274
+ _routing:
275
+ path: flight_id
276
+ dynamic: false
277
+ properties:
278
+ _state:
279
+ type: string
280
+ index: "no"
281
+ store: "yes"
282
+ cnt:
283
+ type: integer
284
+ flight_id:
285
+ type: integer
286
+ pl_test:
287
+ index: not_analyzed
288
+ type: string
289
+ browser_ua:
290
+ index: not_analyzed
291
+ type: string
292
+ tb_h:
293
+ type: date
294
+ feature:
295
+ index: not_analyzed
296
+ type: string
297
+ parse_exception:
298
+ dynamic: false
299
+ _all:
300
+ enabled: false
301
+ properties:
302
+ _state:
303
+ type: string
304
+ index: "no"
305
+ store: "yes"
306
+ ad_tag_id:
307
+ type: integer
308
+ cnt:
309
+ type: integer
310
+ error_code:
311
+ type: string
312
+ index: not_analyzed
313
+ tb_h:
314
+ type: date
315
+ mountweasel:
316
+ dynamic: false
317
+ _all:
318
+ enabled: false
319
+ properties:
320
+ _state:
321
+ type: string
322
+ index: "no"
323
+ store: "yes"
324
+ tb_h:
325
+ type: date
326
+ cnt:
327
+ type: integer