stream_stats 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 34bd7a101e01a96006a3df7971532d0b7b55166d
4
+ data.tar.gz: c8f925a680d994cdaf03d1aec8768d69bbf93de4
5
+ SHA512:
6
+ metadata.gz: 25aebe5f66b5f287ea8ddc150c34ae8a7156c74d338f9bcac2fc25892da6b0d610a9df111c74eedcf03805e1f554323bab7017ef1d29f8f7c29730dc9fb57fe6
7
+ data.tar.gz: 4ab1046ef06e40bd4cac0ca4b2c75cd99e192aa212a28042a5eeaaf8c21dcc32a917be3b12052918de86d461b1a7b00fe802af4423d54e14ed7f24fe81c0939a
data/README.rdoc ADDED
@@ -0,0 +1,43 @@
1
+ = StreamStats
2
+
3
+ Extract statistics from long streams of data with minimal space usage and guaranteed precision.
4
+
5
+ == Install
6
+
7
+ Add to Gemfile or gem install and require
8
+
9
+ gem 'stream_stats'
10
+ require 'stream_stats'
11
+
12
+ == Usage
13
+
14
+ Create stream and add values
15
+
16
+ stream = StreamStats::Stream.new(0.001, [0.50, 0.90])
17
+
18
+ Parameters:
19
+ precision level
20
+ array of quantiles for guaranteed precision
21
+
22
+ The above example guarantees that the 50th and 90th percentile results are accurate to within +/- 0.001.
23
+
24
+ Populate stream with samples:
25
+
26
+ (0..20).each do |i| stream << i end
27
+
28
+ Get stream result whenever desired:
29
+
30
+ count - count of stream entries
31
+ quantile - query value at quantile
32
+ percentile - query value at percentile
33
+ min - query min value
34
+ max - query max value
35
+ mean - query mean
36
+ stddev - query standard deviation of stream entries
37
+ sum - query sum of stream entries
38
+ squared_sum - query squared sum of stream entries
39
+
40
+ === Credit
41
+
42
+ Complete credit goes to Armon Dadgar. Algorithm code copied directly out of statsite[https://github.com/armon/statsite].
43
+
@@ -0,0 +1,402 @@
1
+ /*
2
+ Source: https://github.com/armon/statsite/blob/master/src/cm_quantile.c
3
+ Copyright (c) 2012, Armon Dadgar
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+ * Redistributions in binary form must reproduce the above copyright
11
+ notice, this list of conditions and the following disclaimer in the
12
+ documentation and/or other materials provided with the distribution.
13
+ * Neither the name of the organization nor the
14
+ names of its contributors may be used to endorse or promote products
15
+ derived from this software without specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20
+ DISCLAIMED. IN NO EVENT SHALL ARMON DADGAR BE LIABLE FOR ANY
21
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ */
28
+
29
+ /**
30
+ * This module implements the Cormode-Muthukrishnan algorithm
31
+ * for computation of biased quantiles over data streams from
32
+ * "Effective Computation of Biased Quantiles over Data Streams"
33
+ *
34
+ */
35
+ #include <stdint.h>
36
+ #include <iso646.h>
37
+ #include <stdlib.h>
38
+ #include <string.h>
39
+ #include <math.h>
40
+ #include <limits.h>
41
+ #include <stdio.h>
42
+ #include "heap.h"
43
+ #include "cm_quantile.h"
44
+
45
+ /* Static declarations */
46
+ static void cm_add_to_buffer(cm_quantile *cm, double value);
47
+ static double cm_insert_point_value(cm_quantile *cm);
48
+ static void cm_reset_insert_cursor(cm_quantile *cm);
49
+ static int cm_cursor_increment(cm_quantile *cm);
50
+ static void cm_insert_sample(cm_quantile *cm, cm_sample *position, cm_sample *new);
51
+ static void cm_append_sample(cm_quantile *cm, cm_sample *new);
52
+ static void cm_insert(cm_quantile *cm);
53
+ static void cm_compress(cm_quantile *cm);
54
+ static uint64_t cm_threshold(cm_quantile *cm, uint64_t rank);
55
+
56
/**
 * Heap comparison callback that interprets both keys as pointers to double.
 * @arg key1 Pointer to the first double
 * @arg key2 Pointer to the second double
 * @return -1 if *key1 < *key2, 0 if the two values are equal, 1 otherwise
 * (which includes the case where either value is NaN).
 */
static int compare_double_keys(void *key1, void *key2) {
    // The keys are only read, so go through const-qualified pointers
    const double val1 = *(const double *)key1;
    const double val2 = *(const double *)key2;

    if (val1 < val2) return -1;
    if (val1 == val2) return 0;
    return 1;
}
70
+
71
+ /**
72
+ * Initializes the CM quantile struct
73
+ * @arg eps The maximum error for the quantiles
74
+ * @arg quantiles A sorted array of double quantile values, must be on (0, 1)
75
+ * @arg num_quants The number of entries in the quantiles array
76
+ * @arg cm_quantile The cm_quantile struct to initialize
77
+ * @return 0 on success.
78
+ */
79
+ int init_cm_quantile(double eps, double *quantiles, uint32_t num_quants, cm_quantile *cm) {
80
+ // Verify the sanity of epsilon
81
+ if (eps <= 0 or eps >= 0.5) return -1;
82
+
83
+ // Verify the quantiles
84
+ if (!num_quants) return -1;
85
+ for (int i=0; i < num_quants; i++) {
86
+ double val = quantiles[i];
87
+ if (val <= 0 or val >= 1) return -1;
88
+ }
89
+
90
+ // Check that we have a non-null cm
91
+ if (!cm) return -1;
92
+
93
+ // Initialize
94
+ cm->eps = eps;
95
+ cm->num_samples = 0;
96
+ cm->num_values = 0;
97
+ cm->samples = NULL;
98
+ cm->end = NULL;
99
+
100
+ // Copy the quantiles
101
+ cm->quantiles = malloc(num_quants * sizeof(double));
102
+ memcpy(cm->quantiles, quantiles, num_quants * sizeof(double));
103
+ cm->num_quantiles = num_quants;
104
+
105
+ // Initialize the buffers
106
+ heap *heaps = malloc(2*sizeof(heap));
107
+ cm->bufLess = heaps;
108
+ cm->bufMore = heaps+1;
109
+ heap_create(cm->bufLess, 0, compare_double_keys);
110
+ heap_create(cm->bufMore, 0, compare_double_keys);
111
+
112
+ // Setup the cursors
113
+ cm->insert.curs = NULL;
114
+ cm->compress.curs = NULL;
115
+
116
+ return 0;
117
+ }
118
+
119
/*
 * heap_foreach callback: releases the sample stored as the
 * entry's value. The key is not used.
 */
static void free_buffer_sample(void *key, void *val) {
    (void)key;
    free(val);
}
126
+
127
+ /**
128
+ * Destroy the CM quantile struct.
129
+ * @arg cm_quantile The cm_quantile to destroy
130
+ * @return 0 on success.
131
+ */
132
+ int destroy_cm_quantile(cm_quantile *cm) {
133
+ // Free the quantiles
134
+ free(cm->quantiles);
135
+
136
+ // Destroy everything in the buffer
137
+ heap_foreach(cm->bufLess, free_buffer_sample);
138
+ heap_destroy(cm->bufLess);
139
+ heap_foreach(cm->bufMore, free_buffer_sample);
140
+ heap_destroy(cm->bufMore);
141
+
142
+ // Free the lower address, since they are allocated to be adjacent
143
+ free((cm->bufLess < cm->bufMore) ? cm->bufLess : cm->bufMore);
144
+
145
+ // Iterate through the linked list, free all
146
+ cm_sample *next;
147
+ cm_sample *current = cm->samples;
148
+ while (current) {
149
+ next = current->next;
150
+ free(current);
151
+ current = next;
152
+ }
153
+
154
+ return 0;
155
+ }
156
+
157
+ /**
158
+ * Adds a new sample to the struct
159
+ * @arg cm_quantile The cm_quantile to add to
160
+ * @arg sample The new sample value
161
+ * @return 0 on success.
162
+ */
163
+ int cm_add_sample(cm_quantile *cm, double sample) {
164
+ cm_add_to_buffer(cm, sample);
165
+ cm_insert(cm);
166
+ cm_compress(cm);
167
+ return 0;
168
+ }
169
+
170
+ /**
171
+ * Forces the internal buffers to be flushed,
172
+ * this allows query to have maximum accuracy.
173
+ * @arg cm_quantile The cm_quantile to add to
174
+ * @return 0 on success.
175
+ */
176
+ int cm_flush(cm_quantile *cm) {
177
+ int rounds = 0;
178
+ while (heap_size(cm->bufLess) or heap_size(cm->bufMore)) {
179
+ if (heap_size(cm->bufMore) == 0) cm_reset_insert_cursor(cm);
180
+ cm_insert(cm);
181
+ cm_compress(cm);
182
+ rounds++;
183
+ }
184
+ return 0;
185
+ }
186
+
187
+ /**
188
+ * Queries for a quantile value
189
+ * @arg cm_quantile The cm_quantile to query
190
+ * @arg quantile The quantile to query
191
+ * @return The value on success or 0.
192
+ */
193
+ double cm_query(cm_quantile *cm, double quantile) {
194
+ uint64_t rank = ceil(quantile * cm->num_values);
195
+ uint64_t min_rank=0;
196
+ uint64_t max_rank;
197
+ uint64_t threshold = ceil(cm_threshold(cm, rank) / 2.);
198
+
199
+ cm_sample *prev = cm->samples;
200
+ cm_sample *current = cm->samples;
201
+ while (current) {
202
+ max_rank = min_rank + current->width + current->delta;
203
+ if (max_rank > rank + threshold) {
204
+ break;
205
+ }
206
+ min_rank += current->width;
207
+ prev = current;
208
+ current = current->next;
209
+ }
210
+ return (prev) ? prev->value : 0;
211
+ }
212
+
213
+ /**
214
+ * Adds a new sample to the buffer
215
+ */
216
+ static void cm_add_to_buffer(cm_quantile *cm, double value) {
217
+ // Allocate a new sample
218
+ cm_sample *s = calloc(1, sizeof(cm_sample));
219
+ s->value = value;
220
+
221
+ /*
222
+ * Check the cursor value.
223
+ * Only use bufLess if we have at least a single value.
224
+ */
225
+ if (cm->num_values && value < cm_insert_point_value(cm)) {
226
+ heap_insert(cm->bufLess, &s->value, s);
227
+ } else {
228
+ heap_insert(cm->bufMore, &s->value, s);
229
+ }
230
+ }
231
+
232
+ // Returns the value under the insertion cursor or 0
233
+ static double cm_insert_point_value(cm_quantile *cm) {
234
+ return (cm->insert.curs) ? cm->insert.curs->value : 0;
235
+ }
236
+
237
+ // Resets the insert cursor
238
+ static void cm_reset_insert_cursor(cm_quantile *cm) {
239
+ // Swap the buffers, reset the cursor
240
+ heap *tmp = cm->bufLess;
241
+ cm->bufLess = cm->bufMore;
242
+ cm->bufMore = tmp;
243
+ cm->insert.curs = NULL;
244
+ }
245
+
246
+ // Computes the number of items to process in one iteration
247
+ static int cm_cursor_increment(cm_quantile *cm) {
248
+ return ceil(cm->num_samples * cm->eps);
249
+ }
250
+
251
+ /* Inserts a new sample before the position sample */
252
+ static void cm_insert_sample(cm_quantile *cm, cm_sample *position, cm_sample *new) {
253
+ // Inserting at the head
254
+ if (!position->prev) {
255
+ position->prev = new;
256
+ cm->samples = new;
257
+ new->next = position;
258
+ } else {
259
+ cm_sample *prev = position->prev;
260
+ prev->next = new;
261
+ position->prev = new;
262
+ new->prev = prev;
263
+ new->next = position;
264
+ }
265
+ }
266
+
267
+ /* Inserts a new sample at the end */
268
+ static void cm_append_sample(cm_quantile *cm, cm_sample *new) {
269
+ new->prev = cm->end;
270
+ cm->end->next = new;
271
+ cm->end = new;
272
+ }
273
+
274
+ /**
275
+ * Incrementally processes inserts by moving
276
+ * data from the buffer to the samples using a cursor
277
+ */
278
+ static void cm_insert(cm_quantile *cm) {
279
+ // Check if this is the first element
280
+ cm_sample *samp;
281
+ if (!cm->samples) {
282
+ if (!heap_delmin(cm->bufMore, NULL, (void**)&samp)) return;
283
+ samp->width = 1;
284
+ samp->delta = 0;
285
+ cm->samples = samp;
286
+ cm->end = samp;
287
+ cm->num_values++;
288
+ cm->num_samples++;
289
+ cm->insert.curs = samp;
290
+ return;
291
+ }
292
+
293
+ // Check if we need to initialize the cursor
294
+ if (!cm->insert.curs) {
295
+ cm->insert.curs = cm->samples;
296
+ }
297
+
298
+ // Handle adding values in the middle
299
+ int incr_size = cm_cursor_increment(cm);
300
+ double *val;
301
+ for (int i=0; i < incr_size and cm->insert.curs; i++) {
302
+ while (heap_min(cm->bufMore, (void**)&val, NULL) && *val <= cm_insert_point_value(cm)) {
303
+ heap_delmin(cm->bufMore, NULL, (void**)&samp);
304
+ samp->width = 1;
305
+ samp->delta = cm->insert.curs->width + cm->insert.curs->delta - 1;
306
+ cm_insert_sample(cm, cm->insert.curs, samp);
307
+ cm->num_values++;
308
+ cm->num_samples++;
309
+
310
+ // Check if we need to update the compress cursor
311
+ if (cm->compress.curs && cm->compress.curs->value >= samp->value) {
312
+ cm->compress.min_rank++;
313
+ }
314
+ }
315
+ // Increment the cursor
316
+ cm->insert.curs = cm->insert.curs->next;
317
+ }
318
+
319
+ // Handle adding values at the end
320
+ if (cm->insert.curs == NULL) {
321
+ while (heap_min(cm->bufMore, (void**)&val, NULL) && *val > cm->end->value) {
322
+ heap_delmin(cm->bufMore, NULL, (void**)&samp);
323
+ samp->width = 1;
324
+ samp->delta = 0;
325
+ cm_append_sample(cm, samp);
326
+ cm->num_values++;
327
+ cm->num_samples++;
328
+ }
329
+
330
+ // Reset the cursor
331
+ cm_reset_insert_cursor(cm);
332
+ }
333
+ }
334
+
335
+ /* Incrementally processes compression by using a cursor */
336
+ static void cm_compress(cm_quantile *cm) {
337
+ // Bail early if there is nothing to really compress..
338
+ if (cm->num_samples < 3) return;
339
+
340
+ // Check if we need to initialize the cursor
341
+ if (!cm->compress.curs) {
342
+ cm->compress.curs = cm->end->prev;
343
+ cm->compress.min_rank = cm->num_values - 1 - cm->compress.curs->width;
344
+ cm->compress.curs = cm->compress.curs->prev;
345
+ }
346
+
347
+ int incr_size = cm_cursor_increment(cm);
348
+ cm_sample *next, *prev;
349
+ uint64_t threshold;
350
+ uint64_t max_rank, test_val;
351
+ for (int i=0; i < incr_size and cm->compress.curs != cm->samples; i++) {
352
+ next = cm->compress.curs->next;
353
+ max_rank = cm->compress.min_rank + cm->compress.curs->width + cm->compress.curs->delta;
354
+ cm->compress.min_rank -= cm->compress.curs->width;
355
+
356
+ threshold = cm_threshold(cm, max_rank);
357
+ test_val = cm->compress.curs->width + next->width + next->delta;
358
+ if (test_val <= threshold) {
359
+ // Make sure we don't stomp the insertion cursor
360
+ if (cm->insert.curs == cm->compress.curs) {
361
+ cm->insert.curs = next;
362
+ }
363
+
364
+ // Combine the widths
365
+ next->width += cm->compress.curs->width;
366
+
367
+ // Remove the tuple
368
+ prev = cm->compress.curs->prev;
369
+ prev->next = next;
370
+ next->prev = prev;
371
+ free(cm->compress.curs);
372
+ cm->compress.curs = prev;
373
+
374
+ // Reduce the sample count
375
+ cm->num_samples--;
376
+ } else {
377
+ cm->compress.curs = cm->compress.curs->prev;
378
+ }
379
+ }
380
+
381
+ // Reset the cursor if we hit the start
382
+ if (cm->compress.curs == cm->samples) cm->compress.curs = NULL;
383
+ }
384
+
385
+ /* Computes the minimum threshold value */
386
+ static uint64_t cm_threshold(cm_quantile *cm, uint64_t rank) {
387
+ uint64_t min_val = LLONG_MAX;
388
+
389
+ uint64_t quant_min;
390
+ double quant;
391
+ for (int i=0; i < cm->num_quantiles; i++) {
392
+ quant = cm->quantiles[i];
393
+ if (rank >= quant * cm->num_values) {
394
+ quant_min = 2 * cm->eps * rank / quant;
395
+ } else {
396
+ quant_min = 2 * cm->eps * (cm->num_values - rank) / (1 - quant);
397
+ }
398
+ if (quant_min < min_val) min_val = quant_min;
399
+ }
400
+
401
+ return min_val;
402
+ }