stream_stats 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.rdoc +43 -0
- data/ext/stream_stats/cm_quantile.c +402 -0
- data/ext/stream_stats/cm_quantile.h +88 -0
- data/ext/stream_stats/extconf.rb +3 -0
- data/ext/stream_stats/heap.c +407 -0
- data/ext/stream_stats/heap.h +85 -0
- data/ext/stream_stats/stream_stats.c +135 -0
- data/ext/stream_stats/timer.c +165 -0
- data/ext/stream_stats/timer.h +96 -0
- data/lib/stream_stats.rb +6 -0
- data/lib/stream_stats/stream.rb +17 -0
- data/lib/stream_stats/version.rb +4 -0
- data/stream_stats.gemspec +15 -0
- metadata +57 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
/**
|
2
|
+
* This module implements the Cormode-Muthukrishnan algorithm
|
3
|
+
* for computation of biased quantiles over data streams from
|
4
|
+
* "Effective Computation of Biased Quantiles over Data Streams"
|
5
|
+
*
|
6
|
+
*/
|
7
|
+
#ifndef CM_QUANTILE_H
|
8
|
+
#define CM_QUANTILE_H
|
9
|
+
#include <stdint.h>
|
10
|
+
#include "heap.h"
|
11
|
+
|
12
|
+
typedef struct cm_sample {
|
13
|
+
double value; // The sampled value
|
14
|
+
uint64_t width; // The number of ranks represented
|
15
|
+
uint64_t delta; // Delta between min/max rank
|
16
|
+
struct cm_sample *next;
|
17
|
+
struct cm_sample *prev;
|
18
|
+
} cm_sample;
|
19
|
+
|
20
|
+
struct cm_insert_cursor {
|
21
|
+
cm_sample *curs;
|
22
|
+
};
|
23
|
+
|
24
|
+
struct cm_compress_cursor {
|
25
|
+
cm_sample *curs;
|
26
|
+
uint64_t min_rank;
|
27
|
+
};
|
28
|
+
|
29
|
+
typedef struct {
|
30
|
+
double eps; // Desired epsilon
|
31
|
+
|
32
|
+
double *quantiles; // Queryable quantiles, sorted array
|
33
|
+
uint32_t num_quantiles; // Number of quantiles
|
34
|
+
|
35
|
+
uint64_t num_samples; // Number of samples
|
36
|
+
uint64_t num_values; // Number of values added
|
37
|
+
|
38
|
+
cm_sample *samples; // Sorted linked list of samples
|
39
|
+
cm_sample *end; // Pointer to the end of the samples
|
40
|
+
heap *bufLess, *bufMore;// Sample buffer
|
41
|
+
|
42
|
+
struct cm_insert_cursor insert; // Insertion cursor
|
43
|
+
struct cm_compress_cursor compress; // Compression cursor
|
44
|
+
} cm_quantile;
|
45
|
+
|
46
|
+
|
47
|
+
/**
|
48
|
+
* Initializes the CM quantile struct
|
49
|
+
* @arg eps The maximum error for the quantiles
|
50
|
+
* @arg quantiles A sorted array of double quantile values, must be on (0, 1)
|
51
|
+
* @arg num_quants The number of entries in the quantiles array
|
52
|
+
* @arg cm The cm_quantile struct to initialize
|
53
|
+
* @return 0 on success.
|
54
|
+
*/
|
55
|
+
int init_cm_quantile(double eps, double *quantiles, uint32_t num_quants, cm_quantile *cm);
|
56
|
+
|
57
|
+
/**
|
58
|
+
* Destroy the CM quantile struct.
|
59
|
+
* @arg cm The cm_quantile to destroy
|
60
|
+
* @return 0 on success.
|
61
|
+
*/
|
62
|
+
int destroy_cm_quantile(cm_quantile *cm);
|
63
|
+
|
64
|
+
/**
|
65
|
+
* Adds a new sample to the struct
|
66
|
+
* @arg cm The cm_quantile to add to
|
67
|
+
* @arg sample The new sample value
|
68
|
+
* @return 0 on success.
|
69
|
+
*/
|
70
|
+
int cm_add_sample(cm_quantile *cm, double sample);
|
71
|
+
|
72
|
+
/**
|
73
|
+
* Queries for a quantile value
|
74
|
+
* @arg cm The cm_quantile to query
|
75
|
+
* @arg quantile The quantile to query
|
76
|
+
* @return The value on success or 0.
|
77
|
+
*/
|
78
|
+
double cm_query(cm_quantile *cm, double quantile);
|
79
|
+
|
80
|
+
/**
|
81
|
+
* Forces the internal buffers to be flushed,
|
82
|
+
* this allows query to have maximum accuracy.
|
83
|
+
* @arg cm The cm_quantile to flush
|
84
|
+
* @return 0 on success.
|
85
|
+
*/
|
86
|
+
int cm_flush(cm_quantile *cm);
|
87
|
+
|
88
|
+
#endif
|
@@ -0,0 +1,407 @@
|
|
1
|
+
/*
|
2
|
+
Source: https://github.com/armon/statsite/blob/master/src/heap.c
|
3
|
+
Copyright (c) 2012, Armon Dadgar
|
4
|
+
All rights reserved.
|
5
|
+
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
8
|
+
* Redistributions of source code must retain the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
10
|
+
* Redistributions in binary form must reproduce the above copyright
|
11
|
+
notice, this list of conditions and the following disclaimer in the
|
12
|
+
documentation and/or other materials provided with the distribution.
|
13
|
+
* Neither the name of the organization nor the
|
14
|
+
names of its contributors may be used to endorse or promote products
|
15
|
+
derived from this software without specific prior written permission.
|
16
|
+
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
18
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
19
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
20
|
+
DISCLAIMED. IN NO EVENT SHALL ARMON DADGAR BE LIABLE FOR ANY
|
21
|
+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
22
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
23
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
24
|
+
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
25
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
26
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
27
|
+
*/
|
28
|
+
|
29
|
+
/**
|
30
|
+
* This file defines the methods declared in heap.h
|
31
|
+
* These are used to create and manipulate a heap
|
32
|
+
* data structure.
|
33
|
+
*/
|
34
|
+
|
35
|
+
#include <unistd.h>
|
36
|
+
#include <sys/mman.h>
|
37
|
+
#include <assert.h>
|
38
|
+
#include <strings.h>
|
39
|
+
#include <string.h>
|
40
|
+
#include <stdio.h>
|
41
|
+
#include <stdlib.h>
|
42
|
+
#include "heap.h"
|
43
|
+
|
44
|
+
// Helpful macros
//
// Index arithmetic for a binary heap stored in a flat array. Every macro
// argument is parenthesized so that compound expressions (e.g. `a | b`,
// `n >> 1`) expand with the intended precedence.
#define LEFT_CHILD(i)   (((i) << 1) + 1)
#define RIGHT_CHILD(i)  (((i) << 1) + 2)
#define PARENT_ENTRY(i) (((i) - 1) >> 1)

// Exchanges the key and value of two entries given as pointers.
// Wrapped in do { } while (0) so that `SWAP_ENTRIES(a, b);` behaves as a
// single statement, including inside an unbraced if/else.
#define SWAP_ENTRIES(parent,child) do { \
    void* swap_tmp_ = (parent)->key; \
    (parent)->key = (child)->key; \
    (child)->key = swap_tmp_; \
    swap_tmp_ = (parent)->value; \
    (parent)->value = (child)->value; \
    (child)->value = swap_tmp_; \
} while (0)

// Address of the entry at position `index` of `table`
#define GET_ENTRY(index,table) ((heap_entry*)((table) + (index)))
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
|
62
|
+
/**
|
63
|
+
* Stores the number of heap_entry structures
|
64
|
+
* we can fit into a single page of memory.
|
65
|
+
*
|
66
|
+
* This is determined by the page size, so we
|
67
|
+
* need to determine this at run time.
|
68
|
+
*/
|
69
|
+
static int ENTRIES_PER_PAGE = 0; // stays 0 until heap_create() computes it on first use
|
70
|
+
|
71
|
+
/**
|
72
|
+
* Stores the number of bytes in a single
|
73
|
+
* page of memory.
|
74
|
+
*/
|
75
|
+
static int PAGE_SIZE = 0; // 0 doubles as the "not yet initialised" marker checked by heap_create()
|
76
|
+
|
77
|
+
// Helper function to map a number of pages into memory
|
78
|
+
// Returns NULL on error, otherwise returns a pointer to the
|
79
|
+
// first page.
|
80
|
+
static void* map_in_pages(int page_count) {
|
81
|
+
// Check everything
|
82
|
+
assert(page_count > 0);
|
83
|
+
|
84
|
+
// Call malloc to get the pages
|
85
|
+
void* addr = malloc(page_count*PAGE_SIZE);
|
86
|
+
|
87
|
+
if (!addr)
|
88
|
+
return NULL;
|
89
|
+
else {
|
90
|
+
// Clear the memory
|
91
|
+
bzero(addr,page_count*PAGE_SIZE);
|
92
|
+
|
93
|
+
// Return the address
|
94
|
+
return addr;
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
|
99
|
+
// Helper function that releases a table of pages.
// `page_count` is only sanity-checked; the allocation is freed as a whole.
static void map_out_pages(void* addr, int page_count) {
    // Both arguments must describe a live allocation
    assert(addr != NULL);
    assert(page_count > 0);

    // The pages live on the C heap, so free() hands them back
    free(addr);
}
|
108
|
+
|
109
|
+
|
110
|
+
// Default comparison function: both keys are read as pointers to signed ints.
// Returns -1, 0 or 1 when *key1 is less than, equal to, or greater than *key2.
int compare_int_keys(register void* key1, register void* key2) {
    const int lhs = *(int*)key1;
    const int rhs = *(int*)key2;

    // (a > b) - (a < b) evaluates to exactly -1, 0 or 1
    return (lhs > rhs) - (lhs < rhs);
}
|
124
|
+
|
125
|
+
|
126
|
+
// Creates a new heap
|
127
|
+
void heap_create(heap* h, int initial_size, int (*comp_func)(void*,void*)) {
|
128
|
+
// Check if we need to setup our globals
|
129
|
+
if (PAGE_SIZE == 0) {
|
130
|
+
// Get the page size
|
131
|
+
PAGE_SIZE = getpagesize();
|
132
|
+
|
133
|
+
// Calculate the max entries
|
134
|
+
ENTRIES_PER_PAGE = PAGE_SIZE / sizeof(heap_entry);
|
135
|
+
}
|
136
|
+
|
137
|
+
// Check that initial size is greater than 0, else set it to ENTRIES_PER_PAGE
|
138
|
+
if (initial_size <= 0)
|
139
|
+
initial_size = ENTRIES_PER_PAGE;
|
140
|
+
|
141
|
+
// If the comp_func is null, treat the keys as signed ints
|
142
|
+
if (comp_func == NULL)
|
143
|
+
comp_func = compare_int_keys;
|
144
|
+
|
145
|
+
|
146
|
+
// Store the compare function
|
147
|
+
h->compare_func = comp_func;
|
148
|
+
|
149
|
+
// Set active entries to 0
|
150
|
+
h->active_entries = 0;
|
151
|
+
|
152
|
+
// Determine how many pages of entries we need
|
153
|
+
h->allocated_pages = initial_size / ENTRIES_PER_PAGE + ((initial_size % ENTRIES_PER_PAGE > 0) ? 1 : 0);
|
154
|
+
h->minimum_pages = h->allocated_pages;
|
155
|
+
|
156
|
+
// Allocate the table
|
157
|
+
h->table = (void*)map_in_pages(h->allocated_pages);
|
158
|
+
}
|
159
|
+
|
160
|
+
|
161
|
+
// Cleanup a heap
|
162
|
+
void heap_destroy(heap* h) {
|
163
|
+
// Check that h is not null
|
164
|
+
assert(h != NULL);
|
165
|
+
|
166
|
+
// Map out the table
|
167
|
+
map_out_pages(h->table, h->allocated_pages);
|
168
|
+
|
169
|
+
// Clear everything
|
170
|
+
h->active_entries = 0;
|
171
|
+
h->allocated_pages = 0;
|
172
|
+
h->table = NULL;
|
173
|
+
}
|
174
|
+
|
175
|
+
|
176
|
+
// Gets the size of the heap
|
177
|
+
int heap_size(heap* h) {
|
178
|
+
// Return the active entries
|
179
|
+
return h->active_entries;
|
180
|
+
}
|
181
|
+
|
182
|
+
|
183
|
+
// Gets the minimum element
|
184
|
+
int heap_min(heap* h, void** key, void** value) {
|
185
|
+
// Check the number of elements, abort if 0
|
186
|
+
if (h->active_entries == 0)
|
187
|
+
return 0;
|
188
|
+
|
189
|
+
// Get the 0th element
|
190
|
+
heap_entry* root = GET_ENTRY(0, h->table);
|
191
|
+
|
192
|
+
// Set the key and value
|
193
|
+
if (key) *key = root->key;
|
194
|
+
if (value) *value = root->value;
|
195
|
+
|
196
|
+
// Success
|
197
|
+
return 1;
|
198
|
+
}
|
199
|
+
|
200
|
+
|
201
|
+
// Insert a new element
|
202
|
+
void heap_insert(heap *h, void* key, void* value) {
|
203
|
+
// Check if this heap is not destoyed
|
204
|
+
assert(h->table != NULL);
|
205
|
+
|
206
|
+
// Check if we have room
|
207
|
+
int max_entries = h->allocated_pages * ENTRIES_PER_PAGE;
|
208
|
+
if (h->active_entries + 1 > max_entries) {
|
209
|
+
// Get the new number of entries we need
|
210
|
+
int new_size = h->allocated_pages * 2;
|
211
|
+
|
212
|
+
// Map in a new table
|
213
|
+
heap_entry* new_table = map_in_pages(new_size);
|
214
|
+
|
215
|
+
// Copy the old entries, copy the entire pages
|
216
|
+
memcpy(new_table, h->table, h->allocated_pages*PAGE_SIZE);
|
217
|
+
|
218
|
+
// Cleanup the old table
|
219
|
+
map_out_pages(h->table, h->allocated_pages);
|
220
|
+
|
221
|
+
// Switch to the new table
|
222
|
+
h->table = new_table;
|
223
|
+
h->allocated_pages = new_size;
|
224
|
+
}
|
225
|
+
|
226
|
+
// Store the comparison function
|
227
|
+
int (*cmp_func)(void*,void*) = h->compare_func;
|
228
|
+
|
229
|
+
// Store the table address
|
230
|
+
heap_entry* table = h->table;
|
231
|
+
|
232
|
+
// Get the current index
|
233
|
+
int current_index = h->active_entries;
|
234
|
+
heap_entry* current = GET_ENTRY(current_index, table);
|
235
|
+
|
236
|
+
// Loop variables
|
237
|
+
int parent_index;
|
238
|
+
heap_entry *parent;
|
239
|
+
|
240
|
+
// While we can, keep swapping with our parent
|
241
|
+
while (current_index > 0) {
|
242
|
+
// Get the parent index
|
243
|
+
parent_index = PARENT_ENTRY(current_index);
|
244
|
+
|
245
|
+
// Get the parent entry
|
246
|
+
parent = GET_ENTRY(parent_index, table);
|
247
|
+
|
248
|
+
// Compare the keys, and swap if we need to
|
249
|
+
if (cmp_func(key, parent->key) < 0) {
|
250
|
+
// Move the parent down
|
251
|
+
current->key = parent->key;
|
252
|
+
current->value = parent->value;
|
253
|
+
|
254
|
+
// Move our reference
|
255
|
+
current_index = parent_index;
|
256
|
+
current = parent;
|
257
|
+
|
258
|
+
// We are done swapping
|
259
|
+
} else
|
260
|
+
break;
|
261
|
+
}
|
262
|
+
|
263
|
+
// Insert at the current index
|
264
|
+
current->key = key;
|
265
|
+
current->value = value;
|
266
|
+
|
267
|
+
// Increase the number of active entries
|
268
|
+
h->active_entries++;
|
269
|
+
}
|
270
|
+
|
271
|
+
|
272
|
+
// Deletes the minimum entry in the heap
|
273
|
+
int heap_delmin(heap* h, void** key, void** value) {
|
274
|
+
// Check there is a minimum
|
275
|
+
if (h->active_entries == 0)
|
276
|
+
return 0;
|
277
|
+
|
278
|
+
// Load in the map table
|
279
|
+
heap_entry* table = h->table;
|
280
|
+
|
281
|
+
// Get the root element
|
282
|
+
int current_index = 0;
|
283
|
+
heap_entry* current = GET_ENTRY(current_index, table);
|
284
|
+
|
285
|
+
// Store the outputs
|
286
|
+
if (key) *key = current->key;
|
287
|
+
if (value) *value = current->value;
|
288
|
+
|
289
|
+
// Reduce the number of active entries
|
290
|
+
h->active_entries--;
|
291
|
+
|
292
|
+
// Get the active entries
|
293
|
+
int entries = h->active_entries;
|
294
|
+
|
295
|
+
// If there are any other nodes, we may need to move them up
|
296
|
+
if (h->active_entries > 0) {
|
297
|
+
// Move the last element to the root
|
298
|
+
heap_entry* last = GET_ENTRY(entries,table);
|
299
|
+
current->key = last->key;
|
300
|
+
current->value = last->value;
|
301
|
+
|
302
|
+
// Loop variables
|
303
|
+
heap_entry* left_child;
|
304
|
+
heap_entry* right_child;
|
305
|
+
|
306
|
+
// Load the comparison function
|
307
|
+
int (*cmp_func)(void*,void*) = h->compare_func;
|
308
|
+
|
309
|
+
// Store the left index
|
310
|
+
int left_child_index;
|
311
|
+
|
312
|
+
while (left_child_index = LEFT_CHILD(current_index), left_child_index < entries) {
|
313
|
+
// Load the left child
|
314
|
+
left_child = GET_ENTRY(left_child_index, table);
|
315
|
+
|
316
|
+
// We have a left + right child
|
317
|
+
if (left_child_index+1 < entries) {
|
318
|
+
// Load the right child
|
319
|
+
right_child = GET_ENTRY((left_child_index+1), table);
|
320
|
+
|
321
|
+
// Find the smaller child
|
322
|
+
if (cmp_func(left_child->key, right_child->key) <= 0) {
|
323
|
+
|
324
|
+
// Swap with the left if it is smaller
|
325
|
+
if (cmp_func(current->key, left_child->key) == 1) {
|
326
|
+
SWAP_ENTRIES(current,left_child);
|
327
|
+
current_index = left_child_index;
|
328
|
+
current = left_child;
|
329
|
+
|
330
|
+
// Otherwise, the current is smaller
|
331
|
+
} else
|
332
|
+
break;
|
333
|
+
|
334
|
+
// Right child is smaller
|
335
|
+
} else {
|
336
|
+
|
337
|
+
// Swap with the right if it is smaller
|
338
|
+
if (cmp_func(current->key, right_child->key) == 1) {
|
339
|
+
SWAP_ENTRIES(current,right_child);
|
340
|
+
current_index = left_child_index+1;
|
341
|
+
current = right_child;
|
342
|
+
|
343
|
+
// Current is smaller
|
344
|
+
} else
|
345
|
+
break;
|
346
|
+
|
347
|
+
}
|
348
|
+
|
349
|
+
|
350
|
+
// We only have a left child, only do something if the left is smaller
|
351
|
+
} else if (cmp_func(current->key, left_child->key) == 1) {
|
352
|
+
SWAP_ENTRIES(current,left_child);
|
353
|
+
current_index = left_child_index;
|
354
|
+
current = left_child;
|
355
|
+
|
356
|
+
// Done otherwise
|
357
|
+
} else
|
358
|
+
break;
|
359
|
+
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
// Check if we should release a page of memory
|
364
|
+
int used_pages = entries / ENTRIES_PER_PAGE + ((entries % ENTRIES_PER_PAGE > 0) ? 1 : 0);
|
365
|
+
|
366
|
+
// Allow one empty page, but not two
|
367
|
+
if (h->allocated_pages / 2 > used_pages + 1 && h->allocated_pages / 2 >= h->minimum_pages) {
|
368
|
+
// Get the new number of entries we need
|
369
|
+
int new_size = h->allocated_pages / 2;
|
370
|
+
|
371
|
+
// Map in a new table
|
372
|
+
heap_entry* new_table = map_in_pages(new_size);
|
373
|
+
|
374
|
+
// Copy the old entries, copy the entire pages
|
375
|
+
memcpy(new_table, h->table, used_pages*PAGE_SIZE);
|
376
|
+
|
377
|
+
// Cleanup the old table
|
378
|
+
map_out_pages(h->table, h->allocated_pages);
|
379
|
+
|
380
|
+
// Switch to the new table
|
381
|
+
h->table = new_table;
|
382
|
+
h->allocated_pages = new_size;
|
383
|
+
}
|
384
|
+
|
385
|
+
// Success
|
386
|
+
return 1;
|
387
|
+
}
|
388
|
+
|
389
|
+
|
390
|
+
// Allows a user to iterate over all entries, e.g. to free() the memory
|
391
|
+
void heap_foreach(heap* h, void (*func)(void*,void*)) {
|
392
|
+
// Store the current index and max index
|
393
|
+
int index = 0;
|
394
|
+
int entries = h->active_entries;
|
395
|
+
|
396
|
+
heap_entry* entry;
|
397
|
+
heap_entry* table = h->table;
|
398
|
+
|
399
|
+
for (;index<entries;index++) {
|
400
|
+
// Get the entry
|
401
|
+
entry = GET_ENTRY(index,table);
|
402
|
+
|
403
|
+
// Call the user function
|
404
|
+
func(entry->key, entry->value);
|
405
|
+
}
|
406
|
+
}
|
407
|
+
|