stream_stats 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +43 -0
- data/ext/stream_stats/cm_quantile.c +402 -0
- data/ext/stream_stats/cm_quantile.h +88 -0
- data/ext/stream_stats/extconf.rb +3 -0
- data/ext/stream_stats/heap.c +407 -0
- data/ext/stream_stats/heap.h +85 -0
- data/ext/stream_stats/stream_stats.c +135 -0
- data/ext/stream_stats/timer.c +165 -0
- data/ext/stream_stats/timer.h +96 -0
- data/lib/stream_stats.rb +6 -0
- data/lib/stream_stats/stream.rb +17 -0
- data/lib/stream_stats/version.rb +4 -0
- data/stream_stats.gemspec +15 -0
- metadata +57 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 34bd7a101e01a96006a3df7971532d0b7b55166d
|
4
|
+
data.tar.gz: c8f925a680d994cdaf03d1aec8768d69bbf93de4
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 25aebe5f66b5f287ea8ddc150c34ae8a7156c74d338f9bcac2fc25892da6b0d610a9df111c74eedcf03805e1f554323bab7017ef1d29f8f7c29730dc9fb57fe6
|
7
|
+
data.tar.gz: 4ab1046ef06e40bd4cac0ca4b2c75cd99e192aa212a28042a5eeaaf8c21dcc32a917be3b12052918de86d461b1a7b00fe802af4423d54e14ed7f24fe81c0939a
|
data/README.rdoc
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
= StreamStats
|
2
|
+
|
3
|
+
Extract statistics from long streams of data with minimal space usage and guaranteed precision.
|
4
|
+
|
5
|
+
== Install
|
6
|
+
|
7
|
+
Add to Gemfile or gem install and require
|
8
|
+
|
9
|
+
gem 'stream_stats'
|
10
|
+
require 'stream_stats'
|
11
|
+
|
12
|
+
== Usage
|
13
|
+
|
14
|
+
Create stream and add values
|
15
|
+
|
16
|
+
stream = StreamStats::Stream.new(0.001, [0.50, 0.90])
|
17
|
+
|
18
|
+
Parameters:
|
19
|
+
precision level
|
20
|
+
array of quantiles for guaranteed precision
|
21
|
+
|
22
|
+
The above example guarantees that the 50th and 90th percentile results are accurate to within +/- 0.001.
|
23
|
+
|
24
|
+
Populate stream with samples:
|
25
|
+
|
26
|
+
(0..20).each do |i| stream << i end
|
27
|
+
|
28
|
+
Get stream result whenever desired:
|
29
|
+
|
30
|
+
count - count of stream entries
|
31
|
+
quantile - query value at quantile
|
32
|
+
percentile - query value at percentile
|
33
|
+
min - query min value
|
34
|
+
max - query max value
|
35
|
+
mean - query mean
|
36
|
+
stddev - query standard deviation of stream entries
|
37
|
+
sum - query sum of stream entries
|
38
|
+
squared_sum - query squared sum of stream entries
|
39
|
+
|
40
|
+
=== Credit
|
41
|
+
|
42
|
+
Complete credit goes to Armon Dadgar. The algorithm code is copied directly from {statsite}[https://github.com/armon/statsite].
|
43
|
+
|
@@ -0,0 +1,402 @@
|
|
1
|
+
/*
|
2
|
+
Source: https://github.com/armon/statsite/blob/master/src/cm_quantile.c
|
3
|
+
Copyright (c) 2012, Armon Dadgar
|
4
|
+
All rights reserved.
|
5
|
+
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
8
|
+
* Redistributions of source code must retain the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
10
|
+
* Redistributions in binary form must reproduce the above copyright
|
11
|
+
notice, this list of conditions and the following disclaimer in the
|
12
|
+
documentation and/or other materials provided with the distribution.
|
13
|
+
* Neither the name of the organization nor the
|
14
|
+
names of its contributors may be used to endorse or promote products
|
15
|
+
derived from this software without specific prior written permission.
|
16
|
+
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
18
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
19
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
20
|
+
DISCLAIMED. IN NO EVENT SHALL ARMON DADGAR BE LIABLE FOR ANY
|
21
|
+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
22
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
23
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
24
|
+
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
25
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
26
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
27
|
+
*/
|
28
|
+
|
29
|
+
/**
|
30
|
+
* This module implements the Cormode-Muthukrishnan algorithm
|
31
|
+
* for computation of biased quantiles over data streams from
|
32
|
+
* "Effective Computation of Biased Quantiles over Data Streams"
|
33
|
+
*
|
34
|
+
*/
|
35
|
+
#include <stdint.h>
|
36
|
+
#include <iso646.h>
|
37
|
+
#include <stdlib.h>
|
38
|
+
#include <string.h>
|
39
|
+
#include <math.h>
|
40
|
+
#include <limits.h>
|
41
|
+
#include <stdio.h>
|
42
|
+
#include "heap.h"
|
43
|
+
#include "cm_quantile.h"
|
44
|
+
|
45
|
+
/* Static declarations */
|
46
|
+
static void cm_add_to_buffer(cm_quantile *cm, double value);
|
47
|
+
static double cm_insert_point_value(cm_quantile *cm);
|
48
|
+
static void cm_reset_insert_cursor(cm_quantile *cm);
|
49
|
+
static int cm_cursor_increment(cm_quantile *cm);
|
50
|
+
static void cm_insert_sample(cm_quantile *cm, cm_sample *position, cm_sample *new);
|
51
|
+
static void cm_append_sample(cm_quantile *cm, cm_sample *new);
|
52
|
+
static void cm_insert(cm_quantile *cm);
|
53
|
+
static void cm_compress(cm_quantile *cm);
|
54
|
+
static uint64_t cm_threshold(cm_quantile *cm, uint64_t rank);
|
55
|
+
|
56
|
+
// Heap ordering callback: both keys are read as pointers to double.
// Yields -1 when *key1 is smaller, 0 when the two are equal, and 1 in
// every remaining case.
static int compare_double_keys(register void* key1, register void* key2) {
    const double lhs = *(const double *)key1;
    const double rhs = *(const double *)key2;

    if (lhs < rhs) return -1;
    return (lhs == rhs) ? 0 : 1;
}
|
70
|
+
|
71
|
+
/**
|
72
|
+
* Initializes the CM quantile struct
|
73
|
+
* @arg eps The maximum error for the quantiles
|
74
|
+
* @arg quantiles A sorted array of double quantile values, must be on (0, 1)
|
75
|
+
* @arg num_quants The number of entries in the quantiles array
|
76
|
+
* @arg cm_quantile The cm_quantile struct to initialize
|
77
|
+
* @return 0 on success.
|
78
|
+
*/
|
79
|
+
int init_cm_quantile(double eps, double *quantiles, uint32_t num_quants, cm_quantile *cm) {
|
80
|
+
// Verify the sanity of epsilon
|
81
|
+
if (eps <= 0 or eps >= 0.5) return -1;
|
82
|
+
|
83
|
+
// Verify the quantiles
|
84
|
+
if (!num_quants) return -1;
|
85
|
+
for (int i=0; i < num_quants; i++) {
|
86
|
+
double val = quantiles[i];
|
87
|
+
if (val <= 0 or val >= 1) return -1;
|
88
|
+
}
|
89
|
+
|
90
|
+
// Check that we have a non-null cm
|
91
|
+
if (!cm) return -1;
|
92
|
+
|
93
|
+
// Initialize
|
94
|
+
cm->eps = eps;
|
95
|
+
cm->num_samples = 0;
|
96
|
+
cm->num_values = 0;
|
97
|
+
cm->samples = NULL;
|
98
|
+
cm->end = NULL;
|
99
|
+
|
100
|
+
// Copy the quantiles
|
101
|
+
cm->quantiles = malloc(num_quants * sizeof(double));
|
102
|
+
memcpy(cm->quantiles, quantiles, num_quants * sizeof(double));
|
103
|
+
cm->num_quantiles = num_quants;
|
104
|
+
|
105
|
+
// Initialize the buffers
|
106
|
+
heap *heaps = malloc(2*sizeof(heap));
|
107
|
+
cm->bufLess = heaps;
|
108
|
+
cm->bufMore = heaps+1;
|
109
|
+
heap_create(cm->bufLess, 0, compare_double_keys);
|
110
|
+
heap_create(cm->bufMore, 0, compare_double_keys);
|
111
|
+
|
112
|
+
// Setup the cursors
|
113
|
+
cm->insert.curs = NULL;
|
114
|
+
cm->compress.curs = NULL;
|
115
|
+
|
116
|
+
return 0;
|
117
|
+
}
|
118
|
+
|
119
|
+
/*
 * heap_foreach callback used during teardown: releases the
 * sample stored as the entry's value. The key is not freed
 * separately (cm_add_to_buffer passes the address of the
 * sample's own value field as the key).
 */
static void free_buffer_sample(void *key, void *val) {
    (void)key;
    free(val);
}
|
126
|
+
|
127
|
+
/**
|
128
|
+
* Destroy the CM quantile struct.
|
129
|
+
* @arg cm_quantile The cm_quantile to destroy
|
130
|
+
* @return 0 on success.
|
131
|
+
*/
|
132
|
+
int destroy_cm_quantile(cm_quantile *cm) {
|
133
|
+
// Free the quantiles
|
134
|
+
free(cm->quantiles);
|
135
|
+
|
136
|
+
// Destroy everything in the buffer
|
137
|
+
heap_foreach(cm->bufLess, free_buffer_sample);
|
138
|
+
heap_destroy(cm->bufLess);
|
139
|
+
heap_foreach(cm->bufMore, free_buffer_sample);
|
140
|
+
heap_destroy(cm->bufMore);
|
141
|
+
|
142
|
+
// Free the lower address, since they are allocated to be adjacent
|
143
|
+
free((cm->bufLess < cm->bufMore) ? cm->bufLess : cm->bufMore);
|
144
|
+
|
145
|
+
// Iterate through the linked list, free all
|
146
|
+
cm_sample *next;
|
147
|
+
cm_sample *current = cm->samples;
|
148
|
+
while (current) {
|
149
|
+
next = current->next;
|
150
|
+
free(current);
|
151
|
+
current = next;
|
152
|
+
}
|
153
|
+
|
154
|
+
return 0;
|
155
|
+
}
|
156
|
+
|
157
|
+
/**
|
158
|
+
* Adds a new sample to the struct
|
159
|
+
* @arg cm_quantile The cm_quantile to add to
|
160
|
+
* @arg sample The new sample value
|
161
|
+
* @return 0 on success.
|
162
|
+
*/
|
163
|
+
int cm_add_sample(cm_quantile *cm, double sample) {
|
164
|
+
cm_add_to_buffer(cm, sample);
|
165
|
+
cm_insert(cm);
|
166
|
+
cm_compress(cm);
|
167
|
+
return 0;
|
168
|
+
}
|
169
|
+
|
170
|
+
/**
|
171
|
+
* Forces the internal buffers to be flushed,
|
172
|
+
* this allows query to have maximum accuracy.
|
173
|
+
* @arg cm_quantile The cm_quantile to add to
|
174
|
+
* @return 0 on success.
|
175
|
+
*/
|
176
|
+
int cm_flush(cm_quantile *cm) {
|
177
|
+
int rounds = 0;
|
178
|
+
while (heap_size(cm->bufLess) or heap_size(cm->bufMore)) {
|
179
|
+
if (heap_size(cm->bufMore) == 0) cm_reset_insert_cursor(cm);
|
180
|
+
cm_insert(cm);
|
181
|
+
cm_compress(cm);
|
182
|
+
rounds++;
|
183
|
+
}
|
184
|
+
return 0;
|
185
|
+
}
|
186
|
+
|
187
|
+
/**
|
188
|
+
* Queries for a quantile value
|
189
|
+
* @arg cm_quantile The cm_quantile to query
|
190
|
+
* @arg quantile The quantile to query
|
191
|
+
* @return The value on success or 0.
|
192
|
+
*/
|
193
|
+
double cm_query(cm_quantile *cm, double quantile) {
|
194
|
+
uint64_t rank = ceil(quantile * cm->num_values);
|
195
|
+
uint64_t min_rank=0;
|
196
|
+
uint64_t max_rank;
|
197
|
+
uint64_t threshold = ceil(cm_threshold(cm, rank) / 2.);
|
198
|
+
|
199
|
+
cm_sample *prev = cm->samples;
|
200
|
+
cm_sample *current = cm->samples;
|
201
|
+
while (current) {
|
202
|
+
max_rank = min_rank + current->width + current->delta;
|
203
|
+
if (max_rank > rank + threshold) {
|
204
|
+
break;
|
205
|
+
}
|
206
|
+
min_rank += current->width;
|
207
|
+
prev = current;
|
208
|
+
current = current->next;
|
209
|
+
}
|
210
|
+
return (prev) ? prev->value : 0;
|
211
|
+
}
|
212
|
+
|
213
|
+
/**
|
214
|
+
* Adds a new sample to the buffer
|
215
|
+
*/
|
216
|
+
static void cm_add_to_buffer(cm_quantile *cm, double value) {
|
217
|
+
// Allocate a new sample
|
218
|
+
cm_sample *s = calloc(1, sizeof(cm_sample));
|
219
|
+
s->value = value;
|
220
|
+
|
221
|
+
/*
|
222
|
+
* Check the cursor value.
|
223
|
+
* Only use bufLess if we have at least a single value.
|
224
|
+
*/
|
225
|
+
if (cm->num_values && value < cm_insert_point_value(cm)) {
|
226
|
+
heap_insert(cm->bufLess, &s->value, s);
|
227
|
+
} else {
|
228
|
+
heap_insert(cm->bufMore, &s->value, s);
|
229
|
+
}
|
230
|
+
}
|
231
|
+
|
232
|
+
// Returns the value under the insertion cursor or 0
|
233
|
+
static double cm_insert_point_value(cm_quantile *cm) {
|
234
|
+
return (cm->insert.curs) ? cm->insert.curs->value : 0;
|
235
|
+
}
|
236
|
+
|
237
|
+
// Resets the insert cursor
|
238
|
+
static void cm_reset_insert_cursor(cm_quantile *cm) {
|
239
|
+
// Swap the buffers, reset the cursor
|
240
|
+
heap *tmp = cm->bufLess;
|
241
|
+
cm->bufLess = cm->bufMore;
|
242
|
+
cm->bufMore = tmp;
|
243
|
+
cm->insert.curs = NULL;
|
244
|
+
}
|
245
|
+
|
246
|
+
// Computes the number of items to process in one iteration
|
247
|
+
static int cm_cursor_increment(cm_quantile *cm) {
|
248
|
+
return ceil(cm->num_samples * cm->eps);
|
249
|
+
}
|
250
|
+
|
251
|
+
/* Inserts a new sample before the position sample */
|
252
|
+
static void cm_insert_sample(cm_quantile *cm, cm_sample *position, cm_sample *new) {
|
253
|
+
// Inserting at the head
|
254
|
+
if (!position->prev) {
|
255
|
+
position->prev = new;
|
256
|
+
cm->samples = new;
|
257
|
+
new->next = position;
|
258
|
+
} else {
|
259
|
+
cm_sample *prev = position->prev;
|
260
|
+
prev->next = new;
|
261
|
+
position->prev = new;
|
262
|
+
new->prev = prev;
|
263
|
+
new->next = position;
|
264
|
+
}
|
265
|
+
}
|
266
|
+
|
267
|
+
/* Inserts a new sample at the end */
|
268
|
+
static void cm_append_sample(cm_quantile *cm, cm_sample *new) {
|
269
|
+
new->prev = cm->end;
|
270
|
+
cm->end->next = new;
|
271
|
+
cm->end = new;
|
272
|
+
}
|
273
|
+
|
274
|
+
/**
|
275
|
+
* Incrementally processes inserts by moving
|
276
|
+
* data from the buffer to the samples using a cursor
|
277
|
+
*/
|
278
|
+
static void cm_insert(cm_quantile *cm) {
|
279
|
+
// Check if this is the first element
|
280
|
+
cm_sample *samp;
|
281
|
+
if (!cm->samples) {
|
282
|
+
if (!heap_delmin(cm->bufMore, NULL, (void**)&samp)) return;
|
283
|
+
samp->width = 1;
|
284
|
+
samp->delta = 0;
|
285
|
+
cm->samples = samp;
|
286
|
+
cm->end = samp;
|
287
|
+
cm->num_values++;
|
288
|
+
cm->num_samples++;
|
289
|
+
cm->insert.curs = samp;
|
290
|
+
return;
|
291
|
+
}
|
292
|
+
|
293
|
+
// Check if we need to initialize the cursor
|
294
|
+
if (!cm->insert.curs) {
|
295
|
+
cm->insert.curs = cm->samples;
|
296
|
+
}
|
297
|
+
|
298
|
+
// Handle adding values in the middle
|
299
|
+
int incr_size = cm_cursor_increment(cm);
|
300
|
+
double *val;
|
301
|
+
for (int i=0; i < incr_size and cm->insert.curs; i++) {
|
302
|
+
while (heap_min(cm->bufMore, (void**)&val, NULL) && *val <= cm_insert_point_value(cm)) {
|
303
|
+
heap_delmin(cm->bufMore, NULL, (void**)&samp);
|
304
|
+
samp->width = 1;
|
305
|
+
samp->delta = cm->insert.curs->width + cm->insert.curs->delta - 1;
|
306
|
+
cm_insert_sample(cm, cm->insert.curs, samp);
|
307
|
+
cm->num_values++;
|
308
|
+
cm->num_samples++;
|
309
|
+
|
310
|
+
// Check if we need to update the compress cursor
|
311
|
+
if (cm->compress.curs && cm->compress.curs->value >= samp->value) {
|
312
|
+
cm->compress.min_rank++;
|
313
|
+
}
|
314
|
+
}
|
315
|
+
// Increment the cursor
|
316
|
+
cm->insert.curs = cm->insert.curs->next;
|
317
|
+
}
|
318
|
+
|
319
|
+
// Handle adding values at the end
|
320
|
+
if (cm->insert.curs == NULL) {
|
321
|
+
while (heap_min(cm->bufMore, (void**)&val, NULL) && *val > cm->end->value) {
|
322
|
+
heap_delmin(cm->bufMore, NULL, (void**)&samp);
|
323
|
+
samp->width = 1;
|
324
|
+
samp->delta = 0;
|
325
|
+
cm_append_sample(cm, samp);
|
326
|
+
cm->num_values++;
|
327
|
+
cm->num_samples++;
|
328
|
+
}
|
329
|
+
|
330
|
+
// Reset the cursor
|
331
|
+
cm_reset_insert_cursor(cm);
|
332
|
+
}
|
333
|
+
}
|
334
|
+
|
335
|
+
/* Incrementally processes compression by using a cursor */
|
336
|
+
static void cm_compress(cm_quantile *cm) {
|
337
|
+
// Bail early if there is nothing to really compress..
|
338
|
+
if (cm->num_samples < 3) return;
|
339
|
+
|
340
|
+
// Check if we need to initialize the cursor
|
341
|
+
if (!cm->compress.curs) {
|
342
|
+
cm->compress.curs = cm->end->prev;
|
343
|
+
cm->compress.min_rank = cm->num_values - 1 - cm->compress.curs->width;
|
344
|
+
cm->compress.curs = cm->compress.curs->prev;
|
345
|
+
}
|
346
|
+
|
347
|
+
int incr_size = cm_cursor_increment(cm);
|
348
|
+
cm_sample *next, *prev;
|
349
|
+
uint64_t threshold;
|
350
|
+
uint64_t max_rank, test_val;
|
351
|
+
for (int i=0; i < incr_size and cm->compress.curs != cm->samples; i++) {
|
352
|
+
next = cm->compress.curs->next;
|
353
|
+
max_rank = cm->compress.min_rank + cm->compress.curs->width + cm->compress.curs->delta;
|
354
|
+
cm->compress.min_rank -= cm->compress.curs->width;
|
355
|
+
|
356
|
+
threshold = cm_threshold(cm, max_rank);
|
357
|
+
test_val = cm->compress.curs->width + next->width + next->delta;
|
358
|
+
if (test_val <= threshold) {
|
359
|
+
// Make sure we don't stomp the insertion cursor
|
360
|
+
if (cm->insert.curs == cm->compress.curs) {
|
361
|
+
cm->insert.curs = next;
|
362
|
+
}
|
363
|
+
|
364
|
+
// Combine the widths
|
365
|
+
next->width += cm->compress.curs->width;
|
366
|
+
|
367
|
+
// Remove the tuple
|
368
|
+
prev = cm->compress.curs->prev;
|
369
|
+
prev->next = next;
|
370
|
+
next->prev = prev;
|
371
|
+
free(cm->compress.curs);
|
372
|
+
cm->compress.curs = prev;
|
373
|
+
|
374
|
+
// Reduce the sample count
|
375
|
+
cm->num_samples--;
|
376
|
+
} else {
|
377
|
+
cm->compress.curs = cm->compress.curs->prev;
|
378
|
+
}
|
379
|
+
}
|
380
|
+
|
381
|
+
// Reset the cursor if we hit the start
|
382
|
+
if (cm->compress.curs == cm->samples) cm->compress.curs = NULL;
|
383
|
+
}
|
384
|
+
|
385
|
+
/* Computes the minimum threshold value */
|
386
|
+
static uint64_t cm_threshold(cm_quantile *cm, uint64_t rank) {
|
387
|
+
uint64_t min_val = LLONG_MAX;
|
388
|
+
|
389
|
+
uint64_t quant_min;
|
390
|
+
double quant;
|
391
|
+
for (int i=0; i < cm->num_quantiles; i++) {
|
392
|
+
quant = cm->quantiles[i];
|
393
|
+
if (rank >= quant * cm->num_values) {
|
394
|
+
quant_min = 2 * cm->eps * rank / quant;
|
395
|
+
} else {
|
396
|
+
quant_min = 2 * cm->eps * (cm->num_values - rank) / (1 - quant);
|
397
|
+
}
|
398
|
+
if (quant_min < min_val) min_val = quant_min;
|
399
|
+
}
|
400
|
+
|
401
|
+
return min_val;
|
402
|
+
}
|