wonderdog 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/.gitignore +2 -0
  2. data/.idea/encodings.xml +5 -0
  3. data/.idea/misc.xml +5 -0
  4. data/.idea/modules.xml +9 -0
  5. data/.idea/scopes/scope_settings.xml +5 -0
  6. data/.idea/vcs.xml +7 -0
  7. data/.idea/wonderdog.iml +41 -0
  8. data/Gemfile +1 -1
  9. data/bin/estool +22 -1
  10. data/bin/squirrel.rb +108 -0
  11. data/lib/wonderdog.rb +3 -0
  12. data/lib/wonderdog/hadoop_invocation_override.rb +4 -1
  13. data/lib/wonderdog/version.rb +1 -1
  14. data/pom.xml +1 -1
  15. data/spec/spec_helper.rb +1 -1
  16. data/spec/wonderdog/hadoop_invocation_override_spec.rb +1 -1
  17. data/squirrel/all_facets.rb +95 -0
  18. data/squirrel/change_es_index_settings.rb +19 -0
  19. data/squirrel/clear_es_caches.rb +30 -0
  20. data/squirrel/esbackup.rb +184 -0
  21. data/squirrel/esbackup_stripped.rb +153 -0
  22. data/squirrel/fields.sh +5 -0
  23. data/squirrel/getFields.rb +19 -0
  24. data/squirrel/replay.rb +219 -0
  25. data/squirrel/squirrel.rb +95 -0
  26. data/squirrel/warmer_interface.rb +59 -0
  27. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +2 -2
  28. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +14 -2
  29. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +20 -5
  30. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +55 -26
  31. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +59 -22
  32. data/test/cardinality.rb +43 -0
  33. data/test/change_es_index_settings.rb +19 -0
  34. data/test/clear_es_caches.rb +30 -0
  35. data/test/config/mapping.yml +327 -0
  36. data/test/config/mappings.yml +328 -0
  37. data/test/count_check.txt +0 -0
  38. data/test/esbackup_stripped.rb +153 -0
  39. data/test/mapping.yml +327 -0
  40. data/test/medium_slow_queries +41 -0
  41. data/test/queries.txt +0 -0
  42. data/test/quick_test_slow_queries +4 -0
  43. data/test/run_pry.rb +3 -0
  44. data/test/some_slow_queries +53 -0
  45. data/test/warmer_interface.rb +64 -0
  46. data/test/warmindices.rb +65 -0
  47. data/wonderdog.gemspec +1 -1
  48. metadata +40 -7
@@ -0,0 +1,43 @@
1
+ require 'rubygems'
2
+ require 'configliere'
3
+ require 'json'
4
+ require 'multi_json'
5
+
6
+ #Settings.use :commandline
7
+ #Settings.use :config_block
8
+ #Settings.define :dump
9
+ #Settings.define :field
10
+ #Settings.resolve!
11
+
12
+
13
+ class Cardinality
14
+ attr_accessor :fields
15
+
16
+ def initialize(dump)
17
+ @dump = dump
18
+ @fields = {}
19
+ end
20
+
21
+ def get_value_counts
22
+ File.open(@dump).each do |line|
23
+ record = MultiJson.load(line)
24
+ record.keys.each do |field|
25
+ @fields[field] ||= Hash.new(0)
26
+ @fields[field][record[field]] ||= Hash.new(0)
27
+ @fields[field][record[field]] += 1
28
+ end
29
+ end
30
+ puts @fields.inspect
31
+ end
32
+
33
+ def output
34
+ @field.keys.each do |field|
35
+ puts "#{field} has #{@fields[field].keys.size} values"
36
+ end
37
+ end
38
+ end
39
+
40
+ #card_ob = Cardinality.new("/home/missy/GitProjects/wonderdog/test/flight_count_20130405").get_value_counts
41
+ #puts card_ob.fields.inspect
42
+
43
+
@@ -0,0 +1,19 @@
1
require 'json'
require 'shellwords'

# Pushes index settings to the `_settings` endpoint of an index by shelling
# out to curl, one PUT request per setting.
class ChangeESIndexSettings
  # @param options [Hash] recognised keys: :host, :port, :index and
  #   :settings_and_values (a setting-name => value mapping)
  def initialize(options = {})
    @host = options[:host]
    @port = options[:port]
    @index = options[:index]
    # Default to an empty mapping so #run is a no-op when nothing was supplied.
    @settings_and_values = options[:settings_and_values] || {}
  end

  # Sends a single setting to the index and returns curl's output.
  #
  # @param setting [#to_s] name of the setting
  # @param value [#to_s] new value; always sent as a JSON string
  # @return [String] whatever curl wrote to stdout
  def change_setting(setting, value)
    puts "changing setting #{setting} to value #{value}"
    url = "http://#{@host}:#{@port}/#{@index}/_settings?pretty=true"
    # Generate the JSON body and shell-escape both arguments so quotes or
    # other special characters in the inputs cannot break the command line.
    payload = JSON.generate({ setting.to_s => value.to_s })
    `curl -s -XPUT #{Shellwords.escape(url)} -d #{Shellwords.escape(payload)}`
  end

  # Applies every configured setting in turn.
  def run
    @settings_and_values.each do |setting, value|
      change_setting(setting, value)
    end
  end
end
@@ -0,0 +1,30 @@
1
# Clears caches across all indices by POSTing to the `_all/_cache/clear`
# endpoint through curl.
class ClearESCaches
  # @param options [Hash] recognised keys: :type (all, filter or fielddata),
  #   :host and :port
  def initialize(options={})
    @to_clear = options[:type]
    @host = options[:host]
    @port = options[:port]
  end

  # Clears both the field data and the filter caches.
  def clear_all
    clear_caches(true, true)
  end

  # Clears the filter cache only.
  def clear_filter_cache
    clear_caches(false, true)
  end

  # Clears the field data cache only.
  def clear_fielddata
    clear_caches(true, false)
  end

  # Prints the requested cache type and dispatches to the matching clear
  # method; aborts the process when the type is missing or unknown.
  def run
    puts @to_clear
    # Compare as strings so a missing (nil) type reaches the abort branch
    # instead of raising NoMethodError on nil.
    case cache_type = @to_clear.to_s
    when 'all' then clear_all
    when 'filter' then clear_filter_cache
    when 'fielddata' then clear_fielddata
    else abort "#{cache_type} not recognized"
    end
  end

  private

  # Issues the cache-clear request with the given flags (bloom is always
  # true) and returns curl's output followed by a newline from `echo`.
  def clear_caches(field_data, filter)
    `curl -s -XPOST 'http://#{@host}:#{@port}/_all/_cache/clear?field_data=#{field_data}&filter=#{filter}&bloom=true' ; echo`
  end
end
@@ -0,0 +1,327 @@
1
+ ad_tag_count:
2
+ _all:
3
+ enabled: false
4
+ _routing:
5
+ path: ad_tag_id
6
+ dynamic: false
7
+ properties:
8
+ _state:
9
+ type: string
10
+ index: "no"
11
+ store: "yes"
12
+ cnt:
13
+ type: integer
14
+ ad_tag_id:
15
+ type: integer
16
+ creative_id:
17
+ type: integer
18
+ pl_test:
19
+ index: not_analyzed
20
+ type: string
21
+ metric:
22
+ index: not_analyzed
23
+ type: string
24
+ tb_h:
25
+ type: date
26
+ feature:
27
+ index: not_analyzed
28
+ type: string
29
+ base_feature:
30
+ index: not_analyzed
31
+ type: string
32
+ seconds:
33
+ type: integer
34
+ flight_count:
35
+ _all:
36
+ enabled: false
37
+ _routing:
38
+ path: flight_id
39
+ dynamic: false
40
+ properties:
41
+ _state:
42
+ type: string
43
+ index: "no"
44
+ store: "yes"
45
+ cnt:
46
+ type: integer
47
+ flight_id:
48
+ type: integer
49
+ pl_test:
50
+ index: not_analyzed
51
+ type: string
52
+ metric:
53
+ index: not_analyzed
54
+ type: string
55
+ tb_h:
56
+ type: date
57
+ feature:
58
+ index: not_analyzed
59
+ type: string
60
+ base_feature:
61
+ index: not_analyzed
62
+ type: string
63
+ seconds:
64
+ type: integer
65
+ metric_feature:
66
+ index: not_analyzed
67
+ type: string
68
+ creative_count:
69
+ _all:
70
+ enabled: false
71
+ _routing:
72
+ path: creative_id
73
+ dynamic: false
74
+ properties:
75
+ _state:
76
+ type: string
77
+ index: "no"
78
+ store: "yes"
79
+ cnt:
80
+ type: integer
81
+ creative_id:
82
+ type: integer
83
+ pl_test:
84
+ index: not_analyzed
85
+ type: string
86
+ metric:
87
+ index: not_analyzed
88
+ type: string
89
+ tb_h:
90
+ type: date
91
+ feature:
92
+ index: not_analyzed
93
+ type: string
94
+ base_feature:
95
+ index: not_analyzed
96
+ type: string
97
+ seconds:
98
+ type: integer
99
+ item_count:
100
+ _all:
101
+ enabled: false
102
+ _routing:
103
+ path: flight_id
104
+ dynamic: false
105
+ properties:
106
+ _state:
107
+ type: string
108
+ index: "no"
109
+ store: "yes"
110
+ cnt:
111
+ type: integer
112
+ item_id:
113
+ type: integer
114
+ pl_test:
115
+ index: not_analyzed
116
+ type: string
117
+ flight_id:
118
+ type: integer
119
+ metric:
120
+ index: not_analyzed
121
+ type: string
122
+ tb_h:
123
+ type: date
124
+ feature:
125
+ index: not_analyzed
126
+ type: string
127
+ base_feature:
128
+ index: not_analyzed
129
+ type: string
130
+ signal_value_count:
131
+ _all:
132
+ enabled: false
133
+ _routing:
134
+ path: flight_id
135
+ dynamic: false
136
+ properties:
137
+ _state:
138
+ type: string
139
+ index: "no"
140
+ store: "yes"
141
+ cnt:
142
+ type: integer
143
+ signal_value_id:
144
+ index: not_analyzed
145
+ type: string
146
+ flight_id:
147
+ type: integer
148
+ pl_test:
149
+ index: not_analyzed
150
+ type: string
151
+ metric:
152
+ index: not_analyzed
153
+ type: string
154
+ tb_h:
155
+ type: date
156
+ feature:
157
+ index: not_analyzed
158
+ type: string
159
+ base_feature:
160
+ index: not_analyzed
161
+ type: string
162
+ placement_count:
163
+ _all:
164
+ enabled: false
165
+ _routing:
166
+ path: flight_id
167
+ dynamic: false
168
+ properties:
169
+ _state:
170
+ type: string
171
+ index: "no"
172
+ store: "yes"
173
+ cnt:
174
+ type: integer
175
+ ad_tag_id:
176
+ type: integer
177
+ flight_id:
178
+ type: integer
179
+ ext_pl_id:
180
+ index: not_analyzed
181
+ type: string
182
+ creative_id:
183
+ type: integer
184
+ metric:
185
+ index: not_analyzed
186
+ type: string
187
+ tb_h:
188
+ index: not_analyzed
189
+ type: date
190
+ feature:
191
+ index: not_analyzed
192
+ type: string
193
+ detail:
194
+ index: not_analyzed
195
+ type: string
196
+ pl_composite:
197
+ index: not_analyzed
198
+ type: string
199
+ site_count:
200
+ _all:
201
+ enabled: false
202
+ _routing:
203
+ path: flight_id
204
+ dynamic: false
205
+ properties:
206
+ _state:
207
+ type: string
208
+ index: "no"
209
+ store: "yes"
210
+ cnt:
211
+ type: integer
212
+ ad_tag_id:
213
+ type: integer
214
+ flight_id:
215
+ type: integer
216
+ ext_site_id:
217
+ index: not_analyzed
218
+ type: string
219
+ creative_id:
220
+ type: integer
221
+ pl_test:
222
+ index: not_analyzed
223
+ type: string
224
+ metric:
225
+ index: not_analyzed
226
+ type: string
227
+ tb_h:
228
+ type: date
229
+ feature:
230
+ index: not_analyzed
231
+ type: string
232
+ site_composite:
233
+ index: not_analyzed
234
+ type: string
235
+ interaction_count:
236
+ _all:
237
+ enabled: false
238
+ _routing:
239
+ path: flight_id
240
+ dynamic: false
241
+ properties:
242
+ _state:
243
+ type: string
244
+ index: "no"
245
+ store: "yes"
246
+ cnt:
247
+ type: integer
248
+ ad_tag_id:
249
+ type: integer
250
+ pl_test:
251
+ index: not_analyzed
252
+ type: string
253
+ flight_id:
254
+ type: integer
255
+ creative_id:
256
+ type: integer
257
+ tb_h:
258
+ type: date
259
+ feature:
260
+ index: not_analyzed
261
+ type: string
262
+ base_feature:
263
+ index: not_analyzed
264
+ type: string
265
+ detail:
266
+ index: not_analyzed
267
+ type: string
268
+ datax:
269
+ index: not_analyzed
270
+ type: string
271
+ browser_count:
272
+ _all:
273
+ enabled: false
274
+ _routing:
275
+ path: flight_id
276
+ dynamic: false
277
+ properties:
278
+ _state:
279
+ type: string
280
+ index: "no"
281
+ store: "yes"
282
+ cnt:
283
+ type: integer
284
+ flight_id:
285
+ type: integer
286
+ pl_test:
287
+ index: not_analyzed
288
+ type: string
289
+ browser_ua:
290
+ index: not_analyzed
291
+ type: string
292
+ tb_h:
293
+ type: date
294
+ feature:
295
+ index: not_analyzed
296
+ type: string
297
+ parse_exception:
298
+ dynamic: false
299
+ _all:
300
+ enabled: false
301
+ properties:
302
+ _state:
303
+ type: string
304
+ index: "no"
305
+ store: "yes"
306
+ ad_tag_id:
307
+ type: integer
308
+ cnt:
309
+ type: integer
310
+ error_code:
311
+ type: string
312
+ index: not_analyzed
313
+ tb_h:
314
+ type: date
315
+ mountweasel:
316
+ dynamic: false
317
+ _all:
318
+ enabled: false
319
+ properties:
320
+ _state:
321
+ type: string
322
+ index: "no"
323
+ store: "yes"
324
+ tb_h:
325
+ type: date
326
+ cnt:
327
+ type: integer
@@ -0,0 +1,328 @@
1
+ ---
2
+ ad_tag_count:
3
+ _all:
4
+ enabled: false
5
+ _routing:
6
+ path: ad_tag_id
7
+ dynamic: false
8
+ properties:
9
+ _state:
10
+ type: string
11
+ index: "no"
12
+ store: "yes"
13
+ cnt:
14
+ type: integer
15
+ ad_tag_id:
16
+ type: integer
17
+ creative_id:
18
+ type: integer
19
+ pl_test:
20
+ index: not_analyzed
21
+ type: string
22
+ metric:
23
+ index: not_analyzed
24
+ type: string
25
+ tb_h:
26
+ type: date
27
+ feature:
28
+ index: not_analyzed
29
+ type: string
30
+ base_feature:
31
+ index: not_analyzed
32
+ type: string
33
+ seconds:
34
+ type: integer
35
+ flight_count:
36
+ _all:
37
+ enabled: false
38
+ _routing:
39
+ path: flight_id
40
+ dynamic: false
41
+ properties:
42
+ _state:
43
+ type: string
44
+ index: "no"
45
+ store: "yes"
46
+ cnt:
47
+ type: integer
48
+ flight_id:
49
+ type: integer
50
+ pl_test:
51
+ index: not_analyzed
52
+ type: string
53
+ metric:
54
+ index: not_analyzed
55
+ type: string
56
+ tb_h:
57
+ type: date
58
+ feature:
59
+ index: not_analyzed
60
+ type: string
61
+ base_feature:
62
+ index: not_analyzed
63
+ type: string
64
+ seconds:
65
+ type: integer
66
+ metric_feature:
67
+ index: not_analyzed
68
+ type: string
69
+ creative_count:
70
+ _all:
71
+ enabled: false
72
+ _routing:
73
+ path: creative_id
74
+ dynamic: false
75
+ properties:
76
+ _state:
77
+ type: string
78
+ index: "no"
79
+ store: "yes"
80
+ cnt:
81
+ type: integer
82
+ creative_id:
83
+ type: integer
84
+ pl_test:
85
+ index: not_analyzed
86
+ type: string
87
+ metric:
88
+ index: not_analyzed
89
+ type: string
90
+ tb_h:
91
+ type: date
92
+ feature:
93
+ index: not_analyzed
94
+ type: string
95
+ base_feature:
96
+ index: not_analyzed
97
+ type: string
98
+ seconds:
99
+ type: integer
100
+ item_count:
101
+ _all:
102
+ enabled: false
103
+ _routing:
104
+ path: flight_id
105
+ dynamic: false
106
+ properties:
107
+ _state:
108
+ type: string
109
+ index: "no"
110
+ store: "yes"
111
+ cnt:
112
+ type: integer
113
+ item_id:
114
+ type: integer
115
+ pl_test:
116
+ index: not_analyzed
117
+ type: string
118
+ flight_id:
119
+ type: integer
120
+ metric:
121
+ index: not_analyzed
122
+ type: string
123
+ tb_h:
124
+ type: date
125
+ feature:
126
+ index: not_analyzed
127
+ type: string
128
+ base_feature:
129
+ index: not_analyzed
130
+ type: string
131
+ signal_value_count:
132
+ _all:
133
+ enabled: false
134
+ _routing:
135
+ path: flight_id
136
+ dynamic: false
137
+ properties:
138
+ _state:
139
+ type: string
140
+ index: "no"
141
+ store: "yes"
142
+ cnt:
143
+ type: integer
144
+ signal_value_id:
145
+ index: not_analyzed
146
+ type: string
147
+ flight_id:
148
+ type: integer
149
+ pl_test:
150
+ index: not_analyzed
151
+ type: string
152
+ metric:
153
+ index: not_analyzed
154
+ type: string
155
+ tb_h:
156
+ type: date
157
+ feature:
158
+ index: not_analyzed
159
+ type: string
160
+ base_feature:
161
+ index: not_analyzed
162
+ type: string
163
+ placement_count:
164
+ _all:
165
+ enabled: false
166
+ _routing:
167
+ path: flight_id
168
+ dynamic: false
169
+ properties:
170
+ _state:
171
+ type: string
172
+ index: "no"
173
+ store: "yes"
174
+ cnt:
175
+ type: integer
176
+ ad_tag_id:
177
+ type: integer
178
+ flight_id:
179
+ type: integer
180
+ ext_pl_id:
181
+ index: not_analyzed
182
+ type: string
183
+ creative_id:
184
+ type: integer
185
+ metric:
186
+ index: not_analyzed
187
+ type: string
188
+ tb_h:
189
+ index: not_analyzed
190
+ type: date
191
+ feature:
192
+ index: not_analyzed
193
+ type: string
194
+ detail:
195
+ index: not_analyzed
196
+ type: string
197
+ pl_composite:
198
+ index: not_analyzed
199
+ type: string
200
+ site_count:
201
+ _all:
202
+ enabled: false
203
+ _routing:
204
+ path: flight_id
205
+ dynamic: false
206
+ properties:
207
+ _state:
208
+ type: string
209
+ index: "no"
210
+ store: "yes"
211
+ cnt:
212
+ type: integer
213
+ ad_tag_id:
214
+ type: integer
215
+ flight_id:
216
+ type: integer
217
+ ext_site_id:
218
+ index: not_analyzed
219
+ type: string
220
+ creative_id:
221
+ type: integer
222
+ pl_test:
223
+ index: not_analyzed
224
+ type: string
225
+ metric:
226
+ index: not_analyzed
227
+ type: string
228
+ tb_h:
229
+ type: date
230
+ feature:
231
+ index: not_analyzed
232
+ type: string
233
+ site_composite:
234
+ index: not_analyzed
235
+ type: string
236
+ interaction_count:
237
+ _all:
238
+ enabled: false
239
+ _routing:
240
+ path: flight_id
241
+ dynamic: false
242
+ properties:
243
+ _state:
244
+ type: string
245
+ index: "no"
246
+ store: "yes"
247
+ cnt:
248
+ type: integer
249
+ ad_tag_id:
250
+ type: integer
251
+ pl_test:
252
+ index: not_analyzed
253
+ type: string
254
+ flight_id:
255
+ type: integer
256
+ creative_id:
257
+ type: integer
258
+ tb_h:
259
+ type: date
260
+ feature:
261
+ index: not_analyzed
262
+ type: string
263
+ base_feature:
264
+ index: not_analyzed
265
+ type: string
266
+ detail:
267
+ index: not_analyzed
268
+ type: string
269
+ datax:
270
+ index: not_analyzed
271
+ type: string
272
+ browser_count:
273
+ _all:
274
+ enabled: false
275
+ _routing:
276
+ path: flight_id
277
+ dynamic: false
278
+ properties:
279
+ _state:
280
+ type: string
281
+ index: "no"
282
+ store: "yes"
283
+ cnt:
284
+ type: integer
285
+ flight_id:
286
+ type: integer
287
+ pl_test:
288
+ index: not_analyzed
289
+ type: string
290
+ browser_ua:
291
+ index: not_analyzed
292
+ type: string
293
+ tb_h:
294
+ type: date
295
+ feature:
296
+ index: not_analyzed
297
+ type: string
298
+ parse_exception:
299
+ dynamic: false
300
+ _all:
301
+ enabled: false
302
+ properties:
303
+ _state:
304
+ type: string
305
+ index: "no"
306
+ store: "yes"
307
+ ad_tag_id:
308
+ type: integer
309
+ cnt:
310
+ type: integer
311
+ error_code:
312
+ type: string
313
+ index: not_analyzed
314
+ tb_h:
315
+ type: date
316
+ mountweasel:
317
+ dynamic: false
318
+ _all:
319
+ enabled: false
320
+ properties:
321
+ _state:
322
+ type: string
323
+ index: "no"
324
+ store: "yes"
325
+ tb_h:
326
+ type: date
327
+ cnt:
328
+ type: integer