stream_stats 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,88 @@
1
+ /**
2
+ * This module implements the Cormode-Muthukrishnan algorithm
3
+ * for computation of biased quantiles over data streams from
4
+ * "Effective Computation of Biased Quantiles over Data Streams"
5
+ *
6
+ */
7
+ #ifndef CM_QUANTILE_H
8
+ #define CM_QUANTILE_H
9
+ #include <stdint.h>
10
+ #include "heap.h"
11
+
12
+ typedef struct cm_sample { // One node of the linked list of samples
13
+ double value; // The sampled value
14
+ uint64_t width; // The number of ranks represented
15
+ uint64_t delta; // Delta between min/max rank
16
+ struct cm_sample *next; // Next node in the sample list
17
+ struct cm_sample *prev; // Previous node in the sample list
18
+ } cm_sample;
19
+
20
+ struct cm_insert_cursor { // State of the insertion cursor (stored in cm_quantile.insert)
21
+ cm_sample *curs; // Sample the cursor points at; presumably where insertion resumes - TODO confirm in the implementation
22
+ };
23
+
24
+ struct cm_compress_cursor { // State of the compression cursor (stored in cm_quantile.compress)
25
+ cm_sample *curs; // Sample the cursor points at
26
+ uint64_t min_rank; // NOTE(review): presumably the minimum rank associated with curs - confirm in the implementation
27
+ };
28
+
29
+ typedef struct { // Top-level state for one biased-quantile estimator
29
+ double eps; // Desired epsilon (maximum error)
30
+
31
+ double *quantiles; // Queryable quantiles, sorted array
32
+ uint32_t num_quantiles; // Number of entries in the quantiles array
33
+
34
+ uint64_t num_samples; // Number of samples
35
+ uint64_t num_values; // Number of values added
36
+
37
+ cm_sample *samples; // Sorted linked list of samples
38
+ cm_sample *end; // Pointer to the end of the samples
39
+ heap *bufLess, *bufMore;// Sample buffers (two heaps)
40
+
41
+ struct cm_insert_cursor insert; // Insertion cursor
42
+ struct cm_compress_cursor compress; // Compression cursor
43
+ } cm_quantile;
45
+
46
+
47
+ /**
48
+ * Initializes the CM quantile struct
49
+ * @arg eps The maximum error for the quantiles
50
+ * @arg quantiles A sorted array of double quantile values, each must be in (0, 1)
51
+ * @arg num_quants The number of entries in the quantiles array
52
+ * @arg cm The cm_quantile struct to initialize
53
+ * @return 0 on success.
54
+ */
55
+ int init_cm_quantile(double eps, double *quantiles, uint32_t num_quants, cm_quantile *cm);
56
+
57
+ /**
58
+ * Destroys the CM quantile struct.
59
+ * @arg cm The cm_quantile to destroy
60
+ * @return 0 on success.
61
+ */
62
+ int destroy_cm_quantile(cm_quantile *cm);
63
+
64
+ /**
65
+ * Adds a new sample to the struct
66
+ * @arg cm The cm_quantile to add to
67
+ * @arg sample The new sample value
68
+ * @return 0 on success.
69
+ */
70
+ int cm_add_sample(cm_quantile *cm, double sample);
71
+
72
+ /**
73
+ * Queries for a quantile value
74
+ * @arg cm The cm_quantile to query
75
+ * @arg quantile The quantile to query
76
+ * @return The value on success or 0.
77
+ */
78
+ double cm_query(cm_quantile *cm, double quantile);
79
+
80
+ /**
81
+ * Forces the internal buffers to be flushed,
82
+ * this allows query to have maximum accuracy.
83
+ * @arg cm The cm_quantile to flush
84
+ * @return 0 on success.
85
+ */
86
+ int cm_flush(cm_quantile *cm);
87
+
88
+ #endif
@@ -0,0 +1,3 @@
1
+ require 'mkmf' # Ruby's standard Makefile generator for C extensions
2
+
3
+ create_makefile('stream_stats/stream_stats') # Write the Makefile for the 'stream_stats/stream_stats' extension target
@@ -0,0 +1,407 @@
1
+ /*
2
+ Source: https://github.com/armon/statsite/blob/master/src/heap.c
3
+ Copyright (c) 2012, Armon Dadgar
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+ * Redistributions in binary form must reproduce the above copyright
11
+ notice, this list of conditions and the following disclaimer in the
12
+ documentation and/or other materials provided with the distribution.
13
+ * Neither the name of the organization nor the
14
+ names of its contributors may be used to endorse or promote products
15
+ derived from this software without specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20
+ DISCLAIMED. IN NO EVENT SHALL ARMON DADGAR BE LIABLE FOR ANY
21
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
+ */
28
+
29
+ /**
30
+ * This file defines the methods declared in heap.h
31
+ * These are used to create and manipulate a heap
32
+ * data structure.
33
+ */
34
+
35
+ #include <unistd.h>
36
+ #include <sys/mman.h>
37
+ #include <assert.h>
38
+ #include <strings.h>
39
+ #include <string.h>
40
+ #include <stdio.h>
41
+ #include <stdlib.h>
42
+ #include "heap.h"
43
+
44
// Helpful macros
//
// The heap is stored as an implicit binary tree in a flat array of
// heap_entry: the children of slot i are slots 2i+1 and 2i+2, and the
// parent of slot i is slot (i-1)/2.
//
// Every macro argument is parenthesized so that compound arguments
// (e.g. `cond ? a : b`) expand with the intended grouping.
#define LEFT_CHILD(i)   (((i) << 1) + 1)
#define RIGHT_CHILD(i)  (((i) << 1) + 2)
#define PARENT_ENTRY(i) (((i) - 1) >> 1)

// Exchanges the key and value pointers of the two entries that `parent`
// and `child` point to. Wrapped in do { } while (0) so the macro acts as
// a single statement, including in an unbraced `if (...) ...; else ...`.
#define SWAP_ENTRIES(parent,child) do { \
    void* temp = (parent)->key; \
    (parent)->key = (child)->key; \
    (child)->key = temp; \
    temp = (parent)->value; \
    (parent)->value = (child)->value; \
    (child)->value = temp; \
} while (0)

// Pointer to the index-th heap_entry of `table`. The cast is applied to
// the table pointer before the offset is added, so the arithmetic is
// always done in units of heap_entry regardless of the pointer type
// the caller passes in.
#define GET_ENTRY(index,table) (((heap_entry*)(table)) + (index))
58
+
59
+
60
+
61
+
62
+ /**
63
+ * Stores the number of heap_entry structures
64
+ * we can fit into a single page of memory.
65
+ *
66
+ * This is determined by the page size, so we
67
+ * need to determine this at run time.
68
+ */
69
+ static int ENTRIES_PER_PAGE = 0; // Stays 0 until the first heap_create() call computes it
70
+
71
+ /**
72
+ * Stores the number of bytes in a single
73
+ * page of memory.
74
+ */
75
+ static int PAGE_SIZE = 0; // 0 means "not initialized yet"; heap_create() fills it in from getpagesize()
76
+
77
+ // Helper function to map a number of pages into memory
78
+ // Returns NULL on error, otherwise returns a pointer to the
79
+ // first page.
80
+ static void* map_in_pages(int page_count) {
81
+ // Check everything
82
+ assert(page_count > 0);
83
+
84
+ // Call malloc to get the pages
85
+ void* addr = malloc(page_count*PAGE_SIZE);
86
+
87
+ if (!addr)
88
+ return NULL;
89
+ else {
90
+ // Clear the memory
91
+ bzero(addr,page_count*PAGE_SIZE);
92
+
93
+ // Return the address
94
+ return addr;
95
+ }
96
+ }
97
+
98
+
99
// Helper function that releases a block of pages.
// addr must be non-NULL and page_count must be positive (both asserted).
// page_count is only validated; free() does not need the size.
static void map_out_pages(void* addr, int page_count) {
    // Validate the arguments
    assert(addr != NULL);
    assert(page_count > 0);

    // Hand the block back to the C allocator
    free(addr);
}
108
+
109
+
110
// Comparison function that treats both keys as pointers to signed ints.
// Returns -1 if *key1 < *key2, 0 if they are equal, and 1 otherwise.
int compare_int_keys(register void* key1, register void* key2) {
    // Read both keys through int pointers
    const int lhs = *((int*)key1);
    const int rhs = *((int*)key2);

    // Each comparison yields 0 or 1, so the difference is -1, 0 or 1
    return (lhs > rhs) - (lhs < rhs);
}
124
+
125
+
126
+ // Creates a new heap
127
+ void heap_create(heap* h, int initial_size, int (*comp_func)(void*,void*)) {
128
+ // Check if we need to setup our globals
129
+ if (PAGE_SIZE == 0) {
130
+ // Get the page size
131
+ PAGE_SIZE = getpagesize();
132
+
133
+ // Calculate the max entries
134
+ ENTRIES_PER_PAGE = PAGE_SIZE / sizeof(heap_entry);
135
+ }
136
+
137
+ // Check that initial size is greater than 0, else set it to ENTRIES_PER_PAGE
138
+ if (initial_size <= 0)
139
+ initial_size = ENTRIES_PER_PAGE;
140
+
141
+ // If the comp_func is null, treat the keys as signed ints
142
+ if (comp_func == NULL)
143
+ comp_func = compare_int_keys;
144
+
145
+
146
+ // Store the compare function
147
+ h->compare_func = comp_func;
148
+
149
+ // Set active entries to 0
150
+ h->active_entries = 0;
151
+
152
+ // Determine how many pages of entries we need
153
+ h->allocated_pages = initial_size / ENTRIES_PER_PAGE + ((initial_size % ENTRIES_PER_PAGE > 0) ? 1 : 0);
154
+ h->minimum_pages = h->allocated_pages;
155
+
156
+ // Allocate the table
157
+ h->table = (void*)map_in_pages(h->allocated_pages);
158
+ }
159
+
160
+
161
+ // Cleanup a heap
162
+ void heap_destroy(heap* h) {
163
+ // Check that h is not null
164
+ assert(h != NULL);
165
+
166
+ // Map out the table
167
+ map_out_pages(h->table, h->allocated_pages);
168
+
169
+ // Clear everything
170
+ h->active_entries = 0;
171
+ h->allocated_pages = 0;
172
+ h->table = NULL;
173
+ }
174
+
175
+
176
+ // Gets the size of the heap
177
+ int heap_size(heap* h) {
178
+ // Return the active entries
179
+ return h->active_entries;
180
+ }
181
+
182
+
183
+ // Gets the minimum element
184
+ int heap_min(heap* h, void** key, void** value) {
185
+ // Check the number of elements, abort if 0
186
+ if (h->active_entries == 0)
187
+ return 0;
188
+
189
+ // Get the 0th element
190
+ heap_entry* root = GET_ENTRY(0, h->table);
191
+
192
+ // Set the key and value
193
+ if (key) *key = root->key;
194
+ if (value) *value = root->value;
195
+
196
+ // Success
197
+ return 1;
198
+ }
199
+
200
+
201
+ // Insert a new element
202
+ void heap_insert(heap *h, void* key, void* value) {
203
+ // Check if this heap is not destoyed
204
+ assert(h->table != NULL);
205
+
206
+ // Check if we have room
207
+ int max_entries = h->allocated_pages * ENTRIES_PER_PAGE;
208
+ if (h->active_entries + 1 > max_entries) {
209
+ // Get the new number of entries we need
210
+ int new_size = h->allocated_pages * 2;
211
+
212
+ // Map in a new table
213
+ heap_entry* new_table = map_in_pages(new_size);
214
+
215
+ // Copy the old entries, copy the entire pages
216
+ memcpy(new_table, h->table, h->allocated_pages*PAGE_SIZE);
217
+
218
+ // Cleanup the old table
219
+ map_out_pages(h->table, h->allocated_pages);
220
+
221
+ // Switch to the new table
222
+ h->table = new_table;
223
+ h->allocated_pages = new_size;
224
+ }
225
+
226
+ // Store the comparison function
227
+ int (*cmp_func)(void*,void*) = h->compare_func;
228
+
229
+ // Store the table address
230
+ heap_entry* table = h->table;
231
+
232
+ // Get the current index
233
+ int current_index = h->active_entries;
234
+ heap_entry* current = GET_ENTRY(current_index, table);
235
+
236
+ // Loop variables
237
+ int parent_index;
238
+ heap_entry *parent;
239
+
240
+ // While we can, keep swapping with our parent
241
+ while (current_index > 0) {
242
+ // Get the parent index
243
+ parent_index = PARENT_ENTRY(current_index);
244
+
245
+ // Get the parent entry
246
+ parent = GET_ENTRY(parent_index, table);
247
+
248
+ // Compare the keys, and swap if we need to
249
+ if (cmp_func(key, parent->key) < 0) {
250
+ // Move the parent down
251
+ current->key = parent->key;
252
+ current->value = parent->value;
253
+
254
+ // Move our reference
255
+ current_index = parent_index;
256
+ current = parent;
257
+
258
+ // We are done swapping
259
+ } else
260
+ break;
261
+ }
262
+
263
+ // Insert at the current index
264
+ current->key = key;
265
+ current->value = value;
266
+
267
+ // Increase the number of active entries
268
+ h->active_entries++;
269
+ }
270
+
271
+
272
+ // Deletes the minimum entry in the heap
273
+ int heap_delmin(heap* h, void** key, void** value) {
274
+ // Check there is a minimum
275
+ if (h->active_entries == 0)
276
+ return 0;
277
+
278
+ // Load in the map table
279
+ heap_entry* table = h->table;
280
+
281
+ // Get the root element
282
+ int current_index = 0;
283
+ heap_entry* current = GET_ENTRY(current_index, table);
284
+
285
+ // Store the outputs
286
+ if (key) *key = current->key;
287
+ if (value) *value = current->value;
288
+
289
+ // Reduce the number of active entries
290
+ h->active_entries--;
291
+
292
+ // Get the active entries
293
+ int entries = h->active_entries;
294
+
295
+ // If there are any other nodes, we may need to move them up
296
+ if (h->active_entries > 0) {
297
+ // Move the last element to the root
298
+ heap_entry* last = GET_ENTRY(entries,table);
299
+ current->key = last->key;
300
+ current->value = last->value;
301
+
302
+ // Loop variables
303
+ heap_entry* left_child;
304
+ heap_entry* right_child;
305
+
306
+ // Load the comparison function
307
+ int (*cmp_func)(void*,void*) = h->compare_func;
308
+
309
+ // Store the left index
310
+ int left_child_index;
311
+
312
+ while (left_child_index = LEFT_CHILD(current_index), left_child_index < entries) {
313
+ // Load the left child
314
+ left_child = GET_ENTRY(left_child_index, table);
315
+
316
+ // We have a left + right child
317
+ if (left_child_index+1 < entries) {
318
+ // Load the right child
319
+ right_child = GET_ENTRY((left_child_index+1), table);
320
+
321
+ // Find the smaller child
322
+ if (cmp_func(left_child->key, right_child->key) <= 0) {
323
+
324
+ // Swap with the left if it is smaller
325
+ if (cmp_func(current->key, left_child->key) == 1) {
326
+ SWAP_ENTRIES(current,left_child);
327
+ current_index = left_child_index;
328
+ current = left_child;
329
+
330
+ // Otherwise, the current is smaller
331
+ } else
332
+ break;
333
+
334
+ // Right child is smaller
335
+ } else {
336
+
337
+ // Swap with the right if it is smaller
338
+ if (cmp_func(current->key, right_child->key) == 1) {
339
+ SWAP_ENTRIES(current,right_child);
340
+ current_index = left_child_index+1;
341
+ current = right_child;
342
+
343
+ // Current is smaller
344
+ } else
345
+ break;
346
+
347
+ }
348
+
349
+
350
+ // We only have a left child, only do something if the left is smaller
351
+ } else if (cmp_func(current->key, left_child->key) == 1) {
352
+ SWAP_ENTRIES(current,left_child);
353
+ current_index = left_child_index;
354
+ current = left_child;
355
+
356
+ // Done otherwise
357
+ } else
358
+ break;
359
+
360
+ }
361
+ }
362
+
363
+ // Check if we should release a page of memory
364
+ int used_pages = entries / ENTRIES_PER_PAGE + ((entries % ENTRIES_PER_PAGE > 0) ? 1 : 0);
365
+
366
+ // Allow one empty page, but not two
367
+ if (h->allocated_pages / 2 > used_pages + 1 && h->allocated_pages / 2 >= h->minimum_pages) {
368
+ // Get the new number of entries we need
369
+ int new_size = h->allocated_pages / 2;
370
+
371
+ // Map in a new table
372
+ heap_entry* new_table = map_in_pages(new_size);
373
+
374
+ // Copy the old entries, copy the entire pages
375
+ memcpy(new_table, h->table, used_pages*PAGE_SIZE);
376
+
377
+ // Cleanup the old table
378
+ map_out_pages(h->table, h->allocated_pages);
379
+
380
+ // Switch to the new table
381
+ h->table = new_table;
382
+ h->allocated_pages = new_size;
383
+ }
384
+
385
+ // Success
386
+ return 1;
387
+ }
388
+
389
+
390
+ // Allows a user to iterate over all entries, e.g. to free() the memory
391
+ void heap_foreach(heap* h, void (*func)(void*,void*)) {
392
+ // Store the current index and max index
393
+ int index = 0;
394
+ int entries = h->active_entries;
395
+
396
+ heap_entry* entry;
397
+ heap_entry* table = h->table;
398
+
399
+ for (;index<entries;index++) {
400
+ // Get the entry
401
+ entry = GET_ENTRY(index,table);
402
+
403
+ // Call the user function
404
+ func(entry->key, entry->value);
405
+ }
406
+ }
407
+