stream_stats 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 34bd7a101e01a96006a3df7971532d0b7b55166d
4
+ data.tar.gz: c8f925a680d994cdaf03d1aec8768d69bbf93de4
5
+ SHA512:
6
+ metadata.gz: 25aebe5f66b5f287ea8ddc150c34ae8a7156c74d338f9bcac2fc25892da6b0d610a9df111c74eedcf03805e1f554323bab7017ef1d29f8f7c29730dc9fb57fe6
7
+ data.tar.gz: 4ab1046ef06e40bd4cac0ca4b2c75cd99e192aa212a28042a5eeaaf8c21dcc32a917be3b12052918de86d461b1a7b00fe802af4423d54e14ed7f24fe81c0939a
data/README.rdoc ADDED
@@ -0,0 +1,43 @@
1
+ = StreamStats
2
+
3
+ Extract statistics from long streams of data with minimal space usage and guaranteed precision.
4
+
5
+ == Install
6
+
7
+ Add the gem to your Gemfile (or install it with <tt>gem install stream_stats</tt>), then require it:
8
+
9
+ gem 'stream_stats'
10
+ require 'stream_stats'
11
+
12
+ == Usage
13
+
14
+ Create stream and add values
15
+
16
+ stream = StreamStats::Stream.new(0.001, [0.50, 0.90])
17
+
18
+ Parameters:
19
+ precision level
20
+ array of quantiles for guaranteed precision
21
+
22
+ The above example guarantees that the 50th and 90th percentile results are accurate to +/- 0.001.
23
+
24
+ Populate stream with samples:
25
+
26
+ (0..20).each do |i| stream << i end
27
+
28
+ Get stream result whenever desired:
29
+
30
+ count - count of stream entries
31
+ quantile - query value at quantile
32
+ percentile - query value at percentile
33
+ min - query min value
34
+ max - query max value
35
+ mean - query mean
36
+ stddev - query standard deviation of stream entries
37
+ sum - query sum of stream entries
38
+ squared_sum - query squared sum of stream entries
39
+
40
+ === Credit
41
+
42
+ Complete credit goes to Armon Dadgar. The algorithm code was copied directly from {statsite}[https://github.com/armon/statsite].
43
+
@@ -0,0 +1,402 @@
1
+ /*
2
+ Source: https://github.com/armon/statsite/blob/master/src/cm_quantile.c
3
+ Copyright (c) 2012, Armon Dadgar
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+ * Redistributions in binary form must reproduce the above copyright
11
+ notice, this list of conditions and the following disclaimer in the
12
+ documentation and/or other materials provided with the distribution.
13
+ * Neither the name of the organization nor the
14
+ names of its contributors may be used to endorse or promote products
15
+ derived from this software without specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20
+ DISCLAIMED. IN NO EVENT SHALL ARMON DADGAR BE LIABLE FOR ANY
21
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ */
28
+
29
+ /**
30
+ * This module implements the Cormode-Muthukrishnan algorithm
31
+ * for computation of biased quantiles over data streams from
32
+ * "Effective Computation of Biased Quantiles over Data Streams"
33
+ *
34
+ */
35
+ #include <stdint.h>
36
+ #include <iso646.h>
37
+ #include <stdlib.h>
38
+ #include <string.h>
39
+ #include <math.h>
40
+ #include <limits.h>
41
+ #include <stdio.h>
42
+ #include "heap.h"
43
+ #include "cm_quantile.h"
44
+
45
/* Forward declarations of the file-local helpers defined further down */

// Wraps a value in a newly allocated sample and queues it in one of the two heap buffers
static void cm_add_to_buffer(cm_quantile *cm, double value);

// Value of the sample under the insert cursor, or 0 when the cursor is unset
static double cm_insert_point_value(cm_quantile *cm);

// Swaps bufLess and bufMore and clears the insert cursor
static void cm_reset_insert_cursor(cm_quantile *cm);

// Number of cursor steps one insert/compress call may take
static int cm_cursor_increment(cm_quantile *cm);

// Links `new` into the sample list directly before `position`
static void cm_insert_sample(cm_quantile *cm, cm_sample *position, cm_sample *new);

// Links `new` into the sample list after the current end
static void cm_append_sample(cm_quantile *cm, cm_sample *new);

// Moves buffered values from bufMore into the sample list
static void cm_insert(cm_quantile *cm);

// Merges neighbouring samples while the threshold allows it
static void cm_compress(cm_quantile *cm);

// Smallest per-quantile error bound at the given rank
static uint64_t cm_threshold(cm_quantile *cm, uint64_t rank);
55
+
56
/*
 * Heap comparator: reads both keys as pointers to double and orders
 * them numerically.
 * Returns -1 when the first value is smaller, 0 when the two values
 * compare equal, and 1 in every other case.
 */
static int compare_double_keys(register void* key1, register void* key2) {
    const double lhs = *(double*)key1;
    const double rhs = *(double*)key2;

    if (lhs < rhs) return -1;
    return (lhs == rhs) ? 0 : 1;
}
70
+
71
+ /**
72
+ * Initializes the CM quantile struct
73
+ * @arg eps The maximum error for the quantiles
74
+ * @arg quantiles A sorted array of double quantile values, must be on (0, 1)
75
+ * @arg num_quants The number of entries in the quantiles array
76
+ * @arg cm_quantile The cm_quantile struct to initialize
77
+ * @return 0 on success.
78
+ */
79
+ int init_cm_quantile(double eps, double *quantiles, uint32_t num_quants, cm_quantile *cm) {
80
+ // Verify the sanity of epsilon
81
+ if (eps <= 0 or eps >= 0.5) return -1;
82
+
83
+ // Verify the quantiles
84
+ if (!num_quants) return -1;
85
+ for (int i=0; i < num_quants; i++) {
86
+ double val = quantiles[i];
87
+ if (val <= 0 or val >= 1) return -1;
88
+ }
89
+
90
+ // Check that we have a non-null cm
91
+ if (!cm) return -1;
92
+
93
+ // Initialize
94
+ cm->eps = eps;
95
+ cm->num_samples = 0;
96
+ cm->num_values = 0;
97
+ cm->samples = NULL;
98
+ cm->end = NULL;
99
+
100
+ // Copy the quantiles
101
+ cm->quantiles = malloc(num_quants * sizeof(double));
102
+ memcpy(cm->quantiles, quantiles, num_quants * sizeof(double));
103
+ cm->num_quantiles = num_quants;
104
+
105
+ // Initialize the buffers
106
+ heap *heaps = malloc(2*sizeof(heap));
107
+ cm->bufLess = heaps;
108
+ cm->bufMore = heaps+1;
109
+ heap_create(cm->bufLess, 0, compare_double_keys);
110
+ heap_create(cm->bufMore, 0, compare_double_keys);
111
+
112
+ // Setup the cursors
113
+ cm->insert.curs = NULL;
114
+ cm->compress.curs = NULL;
115
+
116
+ return 0;
117
+ }
118
+
119
/*
 * heap_foreach callback used during teardown: releases the sample
 * stored as the entry's value. The key is not freed separately.
 */
static void free_buffer_sample(void *key, void *val) {
    (void)key;
    free(val);
}
126
+
127
+ /**
128
+ * Destroy the CM quantile struct.
129
+ * @arg cm_quantile The cm_quantile to destroy
130
+ * @return 0 on success.
131
+ */
132
+ int destroy_cm_quantile(cm_quantile *cm) {
133
+ // Free the quantiles
134
+ free(cm->quantiles);
135
+
136
+ // Destroy everything in the buffer
137
+ heap_foreach(cm->bufLess, free_buffer_sample);
138
+ heap_destroy(cm->bufLess);
139
+ heap_foreach(cm->bufMore, free_buffer_sample);
140
+ heap_destroy(cm->bufMore);
141
+
142
+ // Free the lower address, since they are allocated to be adjacent
143
+ free((cm->bufLess < cm->bufMore) ? cm->bufLess : cm->bufMore);
144
+
145
+ // Iterate through the linked list, free all
146
+ cm_sample *next;
147
+ cm_sample *current = cm->samples;
148
+ while (current) {
149
+ next = current->next;
150
+ free(current);
151
+ current = next;
152
+ }
153
+
154
+ return 0;
155
+ }
156
+
157
+ /**
158
+ * Adds a new sample to the struct
159
+ * @arg cm_quantile The cm_quantile to add to
160
+ * @arg sample The new sample value
161
+ * @return 0 on success.
162
+ */
163
+ int cm_add_sample(cm_quantile *cm, double sample) {
164
+ cm_add_to_buffer(cm, sample);
165
+ cm_insert(cm);
166
+ cm_compress(cm);
167
+ return 0;
168
+ }
169
+
170
+ /**
171
+ * Forces the internal buffers to be flushed,
172
+ * this allows query to have maximum accuracy.
173
+ * @arg cm_quantile The cm_quantile to add to
174
+ * @return 0 on success.
175
+ */
176
+ int cm_flush(cm_quantile *cm) {
177
+ int rounds = 0;
178
+ while (heap_size(cm->bufLess) or heap_size(cm->bufMore)) {
179
+ if (heap_size(cm->bufMore) == 0) cm_reset_insert_cursor(cm);
180
+ cm_insert(cm);
181
+ cm_compress(cm);
182
+ rounds++;
183
+ }
184
+ return 0;
185
+ }
186
+
187
+ /**
188
+ * Queries for a quantile value
189
+ * @arg cm_quantile The cm_quantile to query
190
+ * @arg quantile The quantile to query
191
+ * @return The value on success or 0.
192
+ */
193
+ double cm_query(cm_quantile *cm, double quantile) {
194
+ uint64_t rank = ceil(quantile * cm->num_values);
195
+ uint64_t min_rank=0;
196
+ uint64_t max_rank;
197
+ uint64_t threshold = ceil(cm_threshold(cm, rank) / 2.);
198
+
199
+ cm_sample *prev = cm->samples;
200
+ cm_sample *current = cm->samples;
201
+ while (current) {
202
+ max_rank = min_rank + current->width + current->delta;
203
+ if (max_rank > rank + threshold) {
204
+ break;
205
+ }
206
+ min_rank += current->width;
207
+ prev = current;
208
+ current = current->next;
209
+ }
210
+ return (prev) ? prev->value : 0;
211
+ }
212
+
213
+ /**
214
+ * Adds a new sample to the buffer
215
+ */
216
+ static void cm_add_to_buffer(cm_quantile *cm, double value) {
217
+ // Allocate a new sample
218
+ cm_sample *s = calloc(1, sizeof(cm_sample));
219
+ s->value = value;
220
+
221
+ /*
222
+ * Check the cursor value.
223
+ * Only use bufLess if we have at least a single value.
224
+ */
225
+ if (cm->num_values && value < cm_insert_point_value(cm)) {
226
+ heap_insert(cm->bufLess, &s->value, s);
227
+ } else {
228
+ heap_insert(cm->bufMore, &s->value, s);
229
+ }
230
+ }
231
+
232
+ // Returns the value under the insertion cursor or 0
233
+ static double cm_insert_point_value(cm_quantile *cm) {
234
+ return (cm->insert.curs) ? cm->insert.curs->value : 0;
235
+ }
236
+
237
+ // Resets the insert cursor
238
+ static void cm_reset_insert_cursor(cm_quantile *cm) {
239
+ // Swap the buffers, reset the cursor
240
+ heap *tmp = cm->bufLess;
241
+ cm->bufLess = cm->bufMore;
242
+ cm->bufMore = tmp;
243
+ cm->insert.curs = NULL;
244
+ }
245
+
246
+ // Computes the number of items to process in one iteration
247
+ static int cm_cursor_increment(cm_quantile *cm) {
248
+ return ceil(cm->num_samples * cm->eps);
249
+ }
250
+
251
+ /* Inserts a new sample before the position sample */
252
+ static void cm_insert_sample(cm_quantile *cm, cm_sample *position, cm_sample *new) {
253
+ // Inserting at the head
254
+ if (!position->prev) {
255
+ position->prev = new;
256
+ cm->samples = new;
257
+ new->next = position;
258
+ } else {
259
+ cm_sample *prev = position->prev;
260
+ prev->next = new;
261
+ position->prev = new;
262
+ new->prev = prev;
263
+ new->next = position;
264
+ }
265
+ }
266
+
267
+ /* Inserts a new sample at the end */
268
+ static void cm_append_sample(cm_quantile *cm, cm_sample *new) {
269
+ new->prev = cm->end;
270
+ cm->end->next = new;
271
+ cm->end = new;
272
+ }
273
+
274
+ /**
275
+ * Incrementally processes inserts by moving
276
+ * data from the buffer to the samples using a cursor
277
+ */
278
+ static void cm_insert(cm_quantile *cm) {
279
+ // Check if this is the first element
280
+ cm_sample *samp;
281
+ if (!cm->samples) {
282
+ if (!heap_delmin(cm->bufMore, NULL, (void**)&samp)) return;
283
+ samp->width = 1;
284
+ samp->delta = 0;
285
+ cm->samples = samp;
286
+ cm->end = samp;
287
+ cm->num_values++;
288
+ cm->num_samples++;
289
+ cm->insert.curs = samp;
290
+ return;
291
+ }
292
+
293
+ // Check if we need to initialize the cursor
294
+ if (!cm->insert.curs) {
295
+ cm->insert.curs = cm->samples;
296
+ }
297
+
298
+ // Handle adding values in the middle
299
+ int incr_size = cm_cursor_increment(cm);
300
+ double *val;
301
+ for (int i=0; i < incr_size and cm->insert.curs; i++) {
302
+ while (heap_min(cm->bufMore, (void**)&val, NULL) && *val <= cm_insert_point_value(cm)) {
303
+ heap_delmin(cm->bufMore, NULL, (void**)&samp);
304
+ samp->width = 1;
305
+ samp->delta = cm->insert.curs->width + cm->insert.curs->delta - 1;
306
+ cm_insert_sample(cm, cm->insert.curs, samp);
307
+ cm->num_values++;
308
+ cm->num_samples++;
309
+
310
+ // Check if we need to update the compress cursor
311
+ if (cm->compress.curs && cm->compress.curs->value >= samp->value) {
312
+ cm->compress.min_rank++;
313
+ }
314
+ }
315
+ // Increment the cursor
316
+ cm->insert.curs = cm->insert.curs->next;
317
+ }
318
+
319
+ // Handle adding values at the end
320
+ if (cm->insert.curs == NULL) {
321
+ while (heap_min(cm->bufMore, (void**)&val, NULL) && *val > cm->end->value) {
322
+ heap_delmin(cm->bufMore, NULL, (void**)&samp);
323
+ samp->width = 1;
324
+ samp->delta = 0;
325
+ cm_append_sample(cm, samp);
326
+ cm->num_values++;
327
+ cm->num_samples++;
328
+ }
329
+
330
+ // Reset the cursor
331
+ cm_reset_insert_cursor(cm);
332
+ }
333
+ }
334
+
335
+ /* Incrementally processes compression by using a cursor */
336
+ static void cm_compress(cm_quantile *cm) {
337
+ // Bail early if there is nothing to really compress..
338
+ if (cm->num_samples < 3) return;
339
+
340
+ // Check if we need to initialize the cursor
341
+ if (!cm->compress.curs) {
342
+ cm->compress.curs = cm->end->prev;
343
+ cm->compress.min_rank = cm->num_values - 1 - cm->compress.curs->width;
344
+ cm->compress.curs = cm->compress.curs->prev;
345
+ }
346
+
347
+ int incr_size = cm_cursor_increment(cm);
348
+ cm_sample *next, *prev;
349
+ uint64_t threshold;
350
+ uint64_t max_rank, test_val;
351
+ for (int i=0; i < incr_size and cm->compress.curs != cm->samples; i++) {
352
+ next = cm->compress.curs->next;
353
+ max_rank = cm->compress.min_rank + cm->compress.curs->width + cm->compress.curs->delta;
354
+ cm->compress.min_rank -= cm->compress.curs->width;
355
+
356
+ threshold = cm_threshold(cm, max_rank);
357
+ test_val = cm->compress.curs->width + next->width + next->delta;
358
+ if (test_val <= threshold) {
359
+ // Make sure we don't stomp the insertion cursor
360
+ if (cm->insert.curs == cm->compress.curs) {
361
+ cm->insert.curs = next;
362
+ }
363
+
364
+ // Combine the widths
365
+ next->width += cm->compress.curs->width;
366
+
367
+ // Remove the tuple
368
+ prev = cm->compress.curs->prev;
369
+ prev->next = next;
370
+ next->prev = prev;
371
+ free(cm->compress.curs);
372
+ cm->compress.curs = prev;
373
+
374
+ // Reduce the sample count
375
+ cm->num_samples--;
376
+ } else {
377
+ cm->compress.curs = cm->compress.curs->prev;
378
+ }
379
+ }
380
+
381
+ // Reset the cursor if we hit the start
382
+ if (cm->compress.curs == cm->samples) cm->compress.curs = NULL;
383
+ }
384
+
385
+ /* Computes the minimum threshold value */
386
+ static uint64_t cm_threshold(cm_quantile *cm, uint64_t rank) {
387
+ uint64_t min_val = LLONG_MAX;
388
+
389
+ uint64_t quant_min;
390
+ double quant;
391
+ for (int i=0; i < cm->num_quantiles; i++) {
392
+ quant = cm->quantiles[i];
393
+ if (rank >= quant * cm->num_values) {
394
+ quant_min = 2 * cm->eps * rank / quant;
395
+ } else {
396
+ quant_min = 2 * cm->eps * (cm->num_values - rank) / (1 - quant);
397
+ }
398
+ if (quant_min < min_val) min_val = quant_min;
399
+ }
400
+
401
+ return min_val;
402
+ }