wonderdog 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. data/.gitignore +2 -0
  2. data/.idea/encodings.xml +5 -0
  3. data/.idea/misc.xml +5 -0
  4. data/.idea/modules.xml +9 -0
  5. data/.idea/scopes/scope_settings.xml +5 -0
  6. data/.idea/vcs.xml +7 -0
  7. data/.idea/wonderdog.iml +41 -0
  8. data/Gemfile +1 -1
  9. data/bin/estool +22 -1
  10. data/bin/squirrel.rb +108 -0
  11. data/lib/wonderdog.rb +3 -0
  12. data/lib/wonderdog/hadoop_invocation_override.rb +4 -1
  13. data/lib/wonderdog/version.rb +1 -1
  14. data/pom.xml +1 -1
  15. data/spec/spec_helper.rb +1 -1
  16. data/spec/wonderdog/hadoop_invocation_override_spec.rb +1 -1
  17. data/squirrel/all_facets.rb +95 -0
  18. data/squirrel/change_es_index_settings.rb +19 -0
  19. data/squirrel/clear_es_caches.rb +30 -0
  20. data/squirrel/esbackup.rb +184 -0
  21. data/squirrel/esbackup_stripped.rb +153 -0
  22. data/squirrel/fields.sh +5 -0
  23. data/squirrel/getFields.rb +19 -0
  24. data/squirrel/replay.rb +219 -0
  25. data/squirrel/squirrel.rb +95 -0
  26. data/squirrel/warmer_interface.rb +59 -0
  27. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +2 -2
  28. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +14 -2
  29. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +20 -5
  30. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +55 -26
  31. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +59 -22
  32. data/test/cardinality.rb +43 -0
  33. data/test/change_es_index_settings.rb +19 -0
  34. data/test/clear_es_caches.rb +30 -0
  35. data/test/config/mapping.yml +327 -0
  36. data/test/config/mappings.yml +328 -0
  37. data/test/count_check.txt +0 -0
  38. data/test/esbackup_stripped.rb +153 -0
  39. data/test/mapping.yml +327 -0
  40. data/test/medium_slow_queries +41 -0
  41. data/test/queries.txt +0 -0
  42. data/test/quick_test_slow_queries +4 -0
  43. data/test/run_pry.rb +3 -0
  44. data/test/some_slow_queries +53 -0
  45. data/test/warmer_interface.rb +64 -0
  46. data/test/warmindices.rb +65 -0
  47. data/wonderdog.gemspec +1 -1
  48. metadata +40 -7
@@ -0,0 +1,43 @@
1
+ require 'rubygems'
2
+ require 'configliere'
3
+ require 'json'
4
+ require 'multi_json'
5
+
6
+ #Settings.use :commandline
7
+ #Settings.use :config_block
8
+ #Settings.define :dump
9
+ #Settings.define :field
10
+ #Settings.resolve!
11
+
12
+
13
# Tallies, for every top-level key found in a dump file of JSON records
# (one record per line), how many times each distinct value occurs.
class Cardinality
  # Hash of field name => { value => occurrence count }; empty until
  # get_value_counts has been called.
  attr_accessor :fields

  # @param dump [String] path of the file to read, one JSON document per line
  def initialize(dump)
    @dump = dump
    @fields = {}
  end

  # Reads the dump line by line, counts every value of every top-level key
  # into +fields+, and prints the resulting tallies with +inspect+.
  #
  # @return [Cardinality] self, so the call can be chained (e.g. +.fields+)
  def get_value_counts
    # File.foreach closes the file when iteration ends, even on an exception.
    File.foreach(@dump) do |line|
      # A blank (often trailing) line is not a JSON document; skip it.
      next if line.strip.empty?

      record = MultiJson.load(line)
      record.each_pair do |field, value|
        # The inner hash defaults to 0 so unseen values can be incremented.
        counts = (@fields[field] ||= Hash.new(0))
        counts[value] += 1
      end
    end
    puts @fields.inspect
    self
  end

  # Prints one line per field seen so far with its number of distinct values.
  def output
    @fields.each_pair do |field, counts|
      puts "#{field} has #{counts.size} values"
    end
  end
end
39
+
40
+ #card_ob = Cardinality.new("/home/missy/GitProjects/wonderdog/test/flight_count_20130405").get_value_counts
41
+ #puts card_ob.fields.inspect
42
+
43
+
@@ -0,0 +1,19 @@
1
require 'json'
require 'shellwords'

# Changes settings of one index by issuing an HTTP PUT to the index's
# /_settings endpoint through the curl command-line tool, one request per
# setting.
class ChangeESIndexSettings
  # @param options [Hash] accepts :host, :port and :index (used to build the
  #   request URL) and :settings_and_values (setting name => value pairs to
  #   apply when #run is called)
  def initialize(options = {})
    @host = options[:host]
    @port = options[:port]
    @index = options[:index]
    @settings_and_values = options[:settings_and_values]
  end

  # Sends a single setting to the index.
  #
  # The value is always transmitted as a JSON string. The request body is
  # produced by JSON.generate and both URL and body are shell-escaped, so
  # quotes or other shell metacharacters in the inputs can neither corrupt the
  # JSON nor alter the command line.
  #
  # @param setting [#to_s] name of the setting
  # @param value [#to_s] new value of the setting
  # @return [String] whatever curl wrote to standard output
  def change_setting(setting, value)
    puts "changing setting #{setting} to value #{value}"
    url = "http://#{@host}:#{@port}/#{@index}/_settings?pretty=true"
    body = JSON.generate({ setting.to_s => value.to_s })
    `curl -s -XPUT #{Shellwords.escape(url)} -d #{Shellwords.escape(body)}`
  end

  # Applies every configured setting in turn; does nothing when no
  # :settings_and_values option was supplied.
  def run
    (@settings_and_values || {}).each do |setting, value|
      change_setting(setting, value)
    end
  end
end
@@ -0,0 +1,30 @@
1
# Clears caches on the server at host:port by POSTing, via curl, to its
# /_all/_cache/clear endpoint. Which caches are cleared is chosen with the
# :type option (all, filter or fielddata).
class ClearESCaches
  # @param options [Hash] accepts :type (which cache to clear), :host and :port
  def initialize(options={})
    @host = options[:host]
    @port = options[:port]
    @to_clear = options[:type]
  end

  # Requests clearing with field_data=true and filter=true.
  def clear_all
    post_cache_clear(true, true)
  end

  # Requests clearing with field_data=false and filter=true.
  def clear_filter_cache
    post_cache_clear(false, true)
  end

  # Requests clearing with field_data=true and filter=false.
  def clear_fielddata
    post_cache_clear(true, false)
  end

  # Prints the requested type, then runs the matching clear method; aborts the
  # process with a message when the type is not one of the known ones.
  def run
    puts @to_clear
    command = @to_clear.to_sym
    handler = {
      :all       => :clear_all,
      :filter    => :clear_filter_cache,
      :fielddata => :clear_fielddata
    }[command]
    abort "#{command} not recognized" if handler.nil?
    send(handler)
  end

  private

  # Builds and runs the curl command shared by the clear_* methods; the bloom
  # flag is always sent as true. Returns curl's standard output.
  def post_cache_clear(field_data, filter)
    query = "field_data=#{field_data}&filter=#{filter}&bloom=true"
    `curl -s -XPOST 'http://#{@host}:#{@port}/_all/_cache/clear?#{query}' ; echo`
  end
end
@@ -0,0 +1,327 @@
1
+ ad_tag_count:
2
+ _all:
3
+ enabled: false
4
+ _routing:
5
+ path: ad_tag_id
6
+ dynamic: false
7
+ properties:
8
+ _state:
9
+ type: string
10
+ index: "no"
11
+ store: "yes"
12
+ cnt:
13
+ type: integer
14
+ ad_tag_id:
15
+ type: integer
16
+ creative_id:
17
+ type: integer
18
+ pl_test:
19
+ index: not_analyzed
20
+ type: string
21
+ metric:
22
+ index: not_analyzed
23
+ type: string
24
+ tb_h:
25
+ type: date
26
+ feature:
27
+ index: not_analyzed
28
+ type: string
29
+ base_feature:
30
+ index: not_analyzed
31
+ type: string
32
+ seconds:
33
+ type: integer
34
+ flight_count:
35
+ _all:
36
+ enabled: false
37
+ _routing:
38
+ path: flight_id
39
+ dynamic: false
40
+ properties:
41
+ _state:
42
+ type: string
43
+ index: "no"
44
+ store: "yes"
45
+ cnt:
46
+ type: integer
47
+ flight_id:
48
+ type: integer
49
+ pl_test:
50
+ index: not_analyzed
51
+ type: string
52
+ metric:
53
+ index: not_analyzed
54
+ type: string
55
+ tb_h:
56
+ type: date
57
+ feature:
58
+ index: not_analyzed
59
+ type: string
60
+ base_feature:
61
+ index: not_analyzed
62
+ type: string
63
+ seconds:
64
+ type: integer
65
+ metric_feature:
66
+ index: not_analyzed
67
+ type: string
68
+ creative_count:
69
+ _all:
70
+ enabled: false
71
+ _routing:
72
+ path: creative_id
73
+ dynamic: false
74
+ properties:
75
+ _state:
76
+ type: string
77
+ index: "no"
78
+ store: "yes"
79
+ cnt:
80
+ type: integer
81
+ creative_id:
82
+ type: integer
83
+ pl_test:
84
+ index: not_analyzed
85
+ type: string
86
+ metric:
87
+ index: not_analyzed
88
+ type: string
89
+ tb_h:
90
+ type: date
91
+ feature:
92
+ index: not_analyzed
93
+ type: string
94
+ base_feature:
95
+ index: not_analyzed
96
+ type: string
97
+ seconds:
98
+ type: integer
99
+ item_count:
100
+ _all:
101
+ enabled: false
102
+ _routing:
103
+ path: flight_id
104
+ dynamic: false
105
+ properties:
106
+ _state:
107
+ type: string
108
+ index: "no"
109
+ store: "yes"
110
+ cnt:
111
+ type: integer
112
+ item_id:
113
+ type: integer
114
+ pl_test:
115
+ index: not_analyzed
116
+ type: string
117
+ flight_id:
118
+ type: integer
119
+ metric:
120
+ index: not_analyzed
121
+ type: string
122
+ tb_h:
123
+ type: date
124
+ feature:
125
+ index: not_analyzed
126
+ type: string
127
+ base_feature:
128
+ index: not_analyzed
129
+ type: string
130
+ signal_value_count:
131
+ _all:
132
+ enabled: false
133
+ _routing:
134
+ path: flight_id
135
+ dynamic: false
136
+ properties:
137
+ _state:
138
+ type: string
139
+ index: "no"
140
+ store: "yes"
141
+ cnt:
142
+ type: integer
143
+ signal_value_id:
144
+ index: not_analyzed
145
+ type: string
146
+ flight_id:
147
+ type: integer
148
+ pl_test:
149
+ index: not_analyzed
150
+ type: string
151
+ metric:
152
+ index: not_analyzed
153
+ type: string
154
+ tb_h:
155
+ type: date
156
+ feature:
157
+ index: not_analyzed
158
+ type: string
159
+ base_feature:
160
+ index: not_analyzed
161
+ type: string
162
+ placement_count:
163
+ _all:
164
+ enabled: false
165
+ _routing:
166
+ path: flight_id
167
+ dynamic: false
168
+ properties:
169
+ _state:
170
+ type: string
171
+ index: "no"
172
+ store: "yes"
173
+ cnt:
174
+ type: integer
175
+ ad_tag_id:
176
+ type: integer
177
+ flight_id:
178
+ type: integer
179
+ ext_pl_id:
180
+ index: not_analyzed
181
+ type: string
182
+ creative_id:
183
+ type: integer
184
+ metric:
185
+ index: not_analyzed
186
+ type: string
187
+ tb_h:
188
+ index: not_analyzed
189
+ type: date
190
+ feature:
191
+ index: not_analyzed
192
+ type: string
193
+ detail:
194
+ index: not_analyzed
195
+ type: string
196
+ pl_composite:
197
+ index: not_analyzed
198
+ type: string
199
+ site_count:
200
+ _all:
201
+ enabled: false
202
+ _routing:
203
+ path: flight_id
204
+ dynamic: false
205
+ properties:
206
+ _state:
207
+ type: string
208
+ index: "no"
209
+ store: "yes"
210
+ cnt:
211
+ type: integer
212
+ ad_tag_id:
213
+ type: integer
214
+ flight_id:
215
+ type: integer
216
+ ext_site_id:
217
+ index: not_analyzed
218
+ type: string
219
+ creative_id:
220
+ type: integer
221
+ pl_test:
222
+ index: not_analyzed
223
+ type: string
224
+ metric:
225
+ index: not_analyzed
226
+ type: string
227
+ tb_h:
228
+ type: date
229
+ feature:
230
+ index: not_analyzed
231
+ type: string
232
+ site_composite:
233
+ index: not_analyzed
234
+ type: string
235
+ interaction_count:
236
+ _all:
237
+ enabled: false
238
+ _routing:
239
+ path: flight_id
240
+ dynamic: false
241
+ properties:
242
+ _state:
243
+ type: string
244
+ index: "no"
245
+ store: "yes"
246
+ cnt:
247
+ type: integer
248
+ ad_tag_id:
249
+ type: integer
250
+ pl_test:
251
+ index: not_analyzed
252
+ type: string
253
+ flight_id:
254
+ type: integer
255
+ creative_id:
256
+ type: integer
257
+ tb_h:
258
+ type: date
259
+ feature:
260
+ index: not_analyzed
261
+ type: string
262
+ base_feature:
263
+ index: not_analyzed
264
+ type: string
265
+ detail:
266
+ index: not_analyzed
267
+ type: string
268
+ datax:
269
+ index: not_analyzed
270
+ type: string
271
+ browser_count:
272
+ _all:
273
+ enabled: false
274
+ _routing:
275
+ path: flight_id
276
+ dynamic: false
277
+ properties:
278
+ _state:
279
+ type: string
280
+ index: "no"
281
+ store: "yes"
282
+ cnt:
283
+ type: integer
284
+ flight_id:
285
+ type: integer
286
+ pl_test:
287
+ index: not_analyzed
288
+ type: string
289
+ browser_ua:
290
+ index: not_analyzed
291
+ type: string
292
+ tb_h:
293
+ type: date
294
+ feature:
295
+ index: not_analyzed
296
+ type: string
297
+ parse_exception:
298
+ dynamic: false
299
+ _all:
300
+ enabled: false
301
+ properties:
302
+ _state:
303
+ type: string
304
+ index: "no"
305
+ store: "yes"
306
+ ad_tag_id:
307
+ type: integer
308
+ cnt:
309
+ type: integer
310
+ error_code:
311
+ type: string
312
+ index: not_analyzed
313
+ tb_h:
314
+ type: date
315
+ mountweasel:
316
+ dynamic: false
317
+ _all:
318
+ enabled: false
319
+ properties:
320
+ _state:
321
+ type: string
322
+ index: "no"
323
+ store: "yes"
324
+ tb_h:
325
+ type: date
326
+ cnt:
327
+ type: integer
@@ -0,0 +1,328 @@
1
+ ---
2
+ ad_tag_count:
3
+ _all:
4
+ enabled: false
5
+ _routing:
6
+ path: ad_tag_id
7
+ dynamic: false
8
+ properties:
9
+ _state:
10
+ type: string
11
+ index: "no"
12
+ store: "yes"
13
+ cnt:
14
+ type: integer
15
+ ad_tag_id:
16
+ type: integer
17
+ creative_id:
18
+ type: integer
19
+ pl_test:
20
+ index: not_analyzed
21
+ type: string
22
+ metric:
23
+ index: not_analyzed
24
+ type: string
25
+ tb_h:
26
+ type: date
27
+ feature:
28
+ index: not_analyzed
29
+ type: string
30
+ base_feature:
31
+ index: not_analyzed
32
+ type: string
33
+ seconds:
34
+ type: integer
35
+ flight_count:
36
+ _all:
37
+ enabled: false
38
+ _routing:
39
+ path: flight_id
40
+ dynamic: false
41
+ properties:
42
+ _state:
43
+ type: string
44
+ index: "no"
45
+ store: "yes"
46
+ cnt:
47
+ type: integer
48
+ flight_id:
49
+ type: integer
50
+ pl_test:
51
+ index: not_analyzed
52
+ type: string
53
+ metric:
54
+ index: not_analyzed
55
+ type: string
56
+ tb_h:
57
+ type: date
58
+ feature:
59
+ index: not_analyzed
60
+ type: string
61
+ base_feature:
62
+ index: not_analyzed
63
+ type: string
64
+ seconds:
65
+ type: integer
66
+ metric_feature:
67
+ index: not_analyzed
68
+ type: string
69
+ creative_count:
70
+ _all:
71
+ enabled: false
72
+ _routing:
73
+ path: creative_id
74
+ dynamic: false
75
+ properties:
76
+ _state:
77
+ type: string
78
+ index: "no"
79
+ store: "yes"
80
+ cnt:
81
+ type: integer
82
+ creative_id:
83
+ type: integer
84
+ pl_test:
85
+ index: not_analyzed
86
+ type: string
87
+ metric:
88
+ index: not_analyzed
89
+ type: string
90
+ tb_h:
91
+ type: date
92
+ feature:
93
+ index: not_analyzed
94
+ type: string
95
+ base_feature:
96
+ index: not_analyzed
97
+ type: string
98
+ seconds:
99
+ type: integer
100
+ item_count:
101
+ _all:
102
+ enabled: false
103
+ _routing:
104
+ path: flight_id
105
+ dynamic: false
106
+ properties:
107
+ _state:
108
+ type: string
109
+ index: "no"
110
+ store: "yes"
111
+ cnt:
112
+ type: integer
113
+ item_id:
114
+ type: integer
115
+ pl_test:
116
+ index: not_analyzed
117
+ type: string
118
+ flight_id:
119
+ type: integer
120
+ metric:
121
+ index: not_analyzed
122
+ type: string
123
+ tb_h:
124
+ type: date
125
+ feature:
126
+ index: not_analyzed
127
+ type: string
128
+ base_feature:
129
+ index: not_analyzed
130
+ type: string
131
+ signal_value_count:
132
+ _all:
133
+ enabled: false
134
+ _routing:
135
+ path: flight_id
136
+ dynamic: false
137
+ properties:
138
+ _state:
139
+ type: string
140
+ index: "no"
141
+ store: "yes"
142
+ cnt:
143
+ type: integer
144
+ signal_value_id:
145
+ index: not_analyzed
146
+ type: string
147
+ flight_id:
148
+ type: integer
149
+ pl_test:
150
+ index: not_analyzed
151
+ type: string
152
+ metric:
153
+ index: not_analyzed
154
+ type: string
155
+ tb_h:
156
+ type: date
157
+ feature:
158
+ index: not_analyzed
159
+ type: string
160
+ base_feature:
161
+ index: not_analyzed
162
+ type: string
163
+ placement_count:
164
+ _all:
165
+ enabled: false
166
+ _routing:
167
+ path: flight_id
168
+ dynamic: false
169
+ properties:
170
+ _state:
171
+ type: string
172
+ index: "no"
173
+ store: "yes"
174
+ cnt:
175
+ type: integer
176
+ ad_tag_id:
177
+ type: integer
178
+ flight_id:
179
+ type: integer
180
+ ext_pl_id:
181
+ index: not_analyzed
182
+ type: string
183
+ creative_id:
184
+ type: integer
185
+ metric:
186
+ index: not_analyzed
187
+ type: string
188
+ tb_h:
189
+ index: not_analyzed
190
+ type: date
191
+ feature:
192
+ index: not_analyzed
193
+ type: string
194
+ detail:
195
+ index: not_analyzed
196
+ type: string
197
+ pl_composite:
198
+ index: not_analyzed
199
+ type: string
200
+ site_count:
201
+ _all:
202
+ enabled: false
203
+ _routing:
204
+ path: flight_id
205
+ dynamic: false
206
+ properties:
207
+ _state:
208
+ type: string
209
+ index: "no"
210
+ store: "yes"
211
+ cnt:
212
+ type: integer
213
+ ad_tag_id:
214
+ type: integer
215
+ flight_id:
216
+ type: integer
217
+ ext_site_id:
218
+ index: not_analyzed
219
+ type: string
220
+ creative_id:
221
+ type: integer
222
+ pl_test:
223
+ index: not_analyzed
224
+ type: string
225
+ metric:
226
+ index: not_analyzed
227
+ type: string
228
+ tb_h:
229
+ type: date
230
+ feature:
231
+ index: not_analyzed
232
+ type: string
233
+ site_composite:
234
+ index: not_analyzed
235
+ type: string
236
+ interaction_count:
237
+ _all:
238
+ enabled: false
239
+ _routing:
240
+ path: flight_id
241
+ dynamic: false
242
+ properties:
243
+ _state:
244
+ type: string
245
+ index: "no"
246
+ store: "yes"
247
+ cnt:
248
+ type: integer
249
+ ad_tag_id:
250
+ type: integer
251
+ pl_test:
252
+ index: not_analyzed
253
+ type: string
254
+ flight_id:
255
+ type: integer
256
+ creative_id:
257
+ type: integer
258
+ tb_h:
259
+ type: date
260
+ feature:
261
+ index: not_analyzed
262
+ type: string
263
+ base_feature:
264
+ index: not_analyzed
265
+ type: string
266
+ detail:
267
+ index: not_analyzed
268
+ type: string
269
+ datax:
270
+ index: not_analyzed
271
+ type: string
272
+ browser_count:
273
+ _all:
274
+ enabled: false
275
+ _routing:
276
+ path: flight_id
277
+ dynamic: false
278
+ properties:
279
+ _state:
280
+ type: string
281
+ index: "no"
282
+ store: "yes"
283
+ cnt:
284
+ type: integer
285
+ flight_id:
286
+ type: integer
287
+ pl_test:
288
+ index: not_analyzed
289
+ type: string
290
+ browser_ua:
291
+ index: not_analyzed
292
+ type: string
293
+ tb_h:
294
+ type: date
295
+ feature:
296
+ index: not_analyzed
297
+ type: string
298
+ parse_exception:
299
+ dynamic: false
300
+ _all:
301
+ enabled: false
302
+ properties:
303
+ _state:
304
+ type: string
305
+ index: "no"
306
+ store: "yes"
307
+ ad_tag_id:
308
+ type: integer
309
+ cnt:
310
+ type: integer
311
+ error_code:
312
+ type: string
313
+ index: not_analyzed
314
+ tb_h:
315
+ type: date
316
+ mountweasel:
317
+ dynamic: false
318
+ _all:
319
+ enabled: false
320
+ properties:
321
+ _state:
322
+ type: string
323
+ index: "no"
324
+ store: "yes"
325
+ tb_h:
326
+ type: date
327
+ cnt:
328
+ type: integer