wordtriez 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/changes +21 -0
- data/copying +18 -0
- data/ext/common.h +8 -0
- data/ext/extconf.rb +32 -0
- data/ext/hat-trie/ahtable.c +550 -0
- data/ext/hat-trie/ahtable.h +93 -0
- data/ext/hat-trie/common.h +19 -0
- data/ext/hat-trie/hat-trie.c +771 -0
- data/ext/hat-trie/hat-trie.h +86 -0
- data/ext/hat-trie/misc.c +46 -0
- data/ext/hat-trie/misc.h +22 -0
- data/ext/hat-trie/murmurhash3.c +77 -0
- data/ext/hat-trie/murmurhash3.h +12 -0
- data/ext/hat-trie/pstdint.h +800 -0
- data/ext/hat-trie/text.c +174 -0
- data/ext/hat-trie/text.h +22 -0
- data/ext/triez.cc +313 -0
- data/lib/wordtriez.rb +65 -0
- data/readme.md +223 -0
- data/test/triez_test.rb +225 -0
- metadata +67 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
/*
|
2
|
+
* This file is part of hat-trie
|
3
|
+
*
|
4
|
+
* Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
|
5
|
+
*
|
6
|
+
*
|
7
|
+
* This is an implementation of the HAT-trie data structure described in,
|
8
|
+
*
|
9
|
+
* Askitis, N., & Sinha, R. (2007). HAT-trie: a cache-conscious trie-based data
|
10
|
+
* structure for strings. Proceedings of the thirtieth Australasian conference on
|
11
|
+
* Computer science-Volume 62 (pp. 97–105). Australian Computer Society, Inc.
|
12
|
+
*
|
13
|
+
* The HAT-trie is in essence a hybrid data structure, combining tries and hash
|
14
|
+
* tables in a clever way to try to get the best of both worlds.
|
15
|
+
*
|
16
|
+
*/
|
17
|
+
|
18
|
+
#ifndef HATTRIE_HATTRIE_H
|
19
|
+
#define HATTRIE_HATTRIE_H
|
20
|
+
|
21
|
+
#ifdef __cplusplus
|
22
|
+
extern "C" {
|
23
|
+
#endif
|
24
|
+
|
25
|
+
#include "common.h"
|
26
|
+
#include <stdlib.h>
|
27
|
+
#include <stdbool.h>
|
28
|
+
|
29
|
+
typedef struct hattrie_t_ hattrie_t;
|
30
|
+
|
31
|
+
hattrie_t* hattrie_create (void); //< Create an empty hat-trie.
|
32
|
+
void hattrie_free (hattrie_t*); //< Free all memory used by a trie.
|
33
|
+
hattrie_t* hattrie_dup (const hattrie_t*); //< Duplicate an existing trie.
|
34
|
+
void hattrie_clear (hattrie_t*); //< Remove all entries.
|
35
|
+
|
36
|
+
/** number of inserted keys
|
37
|
+
*/
|
38
|
+
size_t hattrie_size (hattrie_t*);
|
39
|
+
|
40
|
+
/** Find the given key in the trie, inserting it if it does not exist, and
|
41
|
+
* returning a pointer to its value.
|
42
|
+
*
|
43
|
+
* This pointer is not guaranteed to be valid after additional calls to
|
44
|
+
* hattrie_get, hattrie_del, hattrie_clear, or other functions that modify the
|
45
|
+
* trie.
|
46
|
+
*/
|
47
|
+
value_t* hattrie_get (hattrie_t*, const char* key, size_t len);
|
48
|
+
|
49
|
+
/** Find a given key in the table, returning a NULL pointer if it does not
|
50
|
+
* exist. */
|
51
|
+
value_t* hattrie_tryget (hattrie_t*, const char* key, size_t len);
|
52
|
+
|
53
|
+
/** hattrie_walk callback signature */
|
54
|
+
typedef int (*hattrie_walk_cb)(const char* key, size_t len, value_t* val, void* user_data);
|
55
|
+
|
56
|
+
/** hattrie_walk callback return values; they control whether the walk stops or continues */
|
57
|
+
#define hattrie_walk_stop 0
|
58
|
+
#define hattrie_walk_continue 1
|
59
|
+
|
60
|
+
/** Find stored keys which are prefixes of key, and invoke callback for every found key and val.
|
61
|
+
* The invocation order is: short key to long key.
|
62
|
+
*/
|
63
|
+
void hattrie_walk (hattrie_t*, const char* key, size_t len, void* user_data, hattrie_walk_cb);
|
64
|
+
|
65
|
+
/** Delete a given key from trie. Returns 0 if successful or -1 if not found.
|
66
|
+
*/
|
67
|
+
int hattrie_del(hattrie_t* T, const char* key, size_t len);
|
68
|
+
|
69
|
+
typedef struct hattrie_iter_t_ hattrie_iter_t;
|
70
|
+
|
71
|
+
hattrie_iter_t* hattrie_iter_begin (const hattrie_t*, bool sorted);
|
72
|
+
void hattrie_iter_next (hattrie_iter_t*);
|
73
|
+
bool hattrie_iter_finished (hattrie_iter_t*);
|
74
|
+
void hattrie_iter_free (hattrie_iter_t*);
|
75
|
+
const char* hattrie_iter_key (hattrie_iter_t*, size_t* len);
|
76
|
+
value_t* hattrie_iter_val (hattrie_iter_t*);
|
77
|
+
|
78
|
+
/** Note the hattrie_iter_key() for prefixed search gets the suffix instead of the whole key
|
79
|
+
*/
|
80
|
+
hattrie_iter_t* hattrie_iter_with_prefix(const hattrie_t*, bool sorted, const char* prefix, size_t prefix_len);
|
81
|
+
|
82
|
+
#ifdef __cplusplus
|
83
|
+
}
|
84
|
+
#endif
|
85
|
+
|
86
|
+
#endif
|
data/ext/hat-trie/misc.c
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
/*
|
2
|
+
* This file is part of hat-trie.
|
3
|
+
*
|
4
|
+
* Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
|
5
|
+
*
|
6
|
+
*/
|
7
|
+
|
8
|
+
#include "misc.h"
|
9
|
+
#include <stdlib.h>
|
10
|
+
|
11
|
+
|
12
|
+
/*
 * Allocate n bytes with malloc().
 *
 * Returns the pointer obtained from malloc(). If malloc() yields NULL for a
 * non-zero request, a message is written to stderr and the process exits
 * with EXIT_FAILURE. A NULL result for a zero-byte request is passed through
 * to the caller unchanged.
 */
void* malloc_or_die(size_t n)
{
    void* mem = malloc(n);

    /* Success, or a zero-sized request where NULL is an acceptable answer. */
    if (mem != NULL || n == 0) {
        return mem;
    }

    fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
    exit(EXIT_FAILURE);
}
|
21
|
+
|
22
|
+
|
23
|
+
/*
 * Resize the allocation at ptr to n bytes with realloc().
 *
 * Returns the pointer obtained from realloc(). If realloc() yields NULL for
 * a non-zero request, a message is written to stderr and the process exits
 * with EXIT_FAILURE. A NULL result for a zero-byte request is passed through
 * to the caller unchanged.
 */
void* realloc_or_die(void* ptr, size_t n)
{
    void* resized = realloc(ptr, n);
    const int out_of_memory = (resized == NULL) && (n != 0);

    if (out_of_memory) {
        fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
        exit(EXIT_FAILURE);
    }

    return resized;
}
|
32
|
+
|
33
|
+
|
34
|
+
/*
 * Open the file at path with the given fopen() mode string.
 *
 * Returns the stream obtained from fopen(). If fopen() yields NULL, a
 * message naming the path and mode is written to stderr and the process
 * exits with EXIT_FAILURE, so the returned stream is never NULL.
 */
FILE* fopen_or_die(const char* path, const char* mode)
{
    FILE* stream = fopen(path, mode);

    if (stream != NULL) {
        return stream;
    }

    fprintf(stderr, "Cannot open file %s with mode %s.\n", path, mode);
    exit(EXIT_FAILURE);
}
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
|
data/ext/hat-trie/misc.h
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
/*
|
2
|
+
* This file is part of hat-trie.
|
3
|
+
*
|
4
|
+
* Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
|
5
|
+
*
|
6
|
+
* misc :
|
7
|
+
* miscellaneous functions.
|
8
|
+
*
|
9
|
+
*/
|
10
|
+
|
11
|
+
#ifndef LINESET_MISC_H
|
12
|
+
#define LINESET_MISC_H
|
13
|
+
|
14
|
+
#include <stdio.h>
|
15
|
+
|
16
|
+
void* malloc_or_die(size_t);
|
17
|
+
void* realloc_or_die(void*, size_t);
|
18
|
+
FILE* fopen_or_die(const char*, const char*);
|
19
|
+
|
20
|
+
#endif
|
21
|
+
|
22
|
+
|
@@ -0,0 +1,77 @@
|
|
1
|
+
/* This is MurmurHash3. The original C++ code was placed in the public domain
|
2
|
+
* by its author, Austin Appleby. */
|
3
|
+
|
4
|
+
#include "murmurhash3.h"
#include <string.h>
|
5
|
+
|
6
|
+
/*
 * Finalization mix of MurmurHash3 (32-bit): a fixed sequence of
 * xor-shift and multiply steps applied to the accumulated state.
 */
static inline uint32_t fmix(uint32_t h)
{
    h ^= h >> 16;
    h *= 0x85ebca6b;
    h ^= h >> 13;
    h *= 0xc2b2ae35;
    h ^= h >> 16;

    return h;
}


/*
 * Rotate x left by r bits. r must be in the range 1..31: a shift by 0 or 32
 * would make one of the two shifts undefined. hash() only passes 13 and 15.
 */
static inline uint32_t rotl32(uint32_t x, int8_t r)
{
    return (x << r) | (x >> (32 - r));
}


/*
 * Compute a 32-bit MurmurHash3 of the len_ bytes starting at data, using a
 * fixed built-in initial state.
 *
 * data may have any alignment. Each 4-byte block is copied into a local
 * uint32_t with memcpy() rather than read through a casted pointer, which
 * avoids misaligned loads and strict-aliasing violations while keeping the
 * native byte order (so the result matches a direct 32-bit load).
 *
 * The length is handled as size_t throughout, so inputs larger than INT_MAX
 * are processed in full; the low 32 bits of the length are mixed into the
 * result, which is identical to the previous behaviour for every length that
 * fits in an int.
 *
 * Returns the 32-bit hash value.
 */
uint32_t hash(const char* data, size_t len_)
{
    const size_t nblocks = len_ / 4;

    /* Initial state; takes the place of MurmurHash3's seed argument. */
    uint32_t h1 = 0xc062fb4a;

    const uint32_t c1 = 0xcc9e2d51;
    const uint32_t c2 = 0x1b873593;

    //----------
    // body

    size_t i;
    for (i = 0; i < nblocks; i++)
    {
        uint32_t k1;
        memcpy(&k1, data + i * 4, sizeof k1);

        k1 *= c1;
        k1 = rotl32(k1, 15);
        k1 *= c2;

        h1 ^= k1;
        h1 = rotl32(h1, 13);
        h1 = h1*5+0xe6546b64;
    }

    //----------
    // tail

    const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

    uint32_t k1 = 0;

    /* The 0..3 trailing bytes are gathered by deliberate fallthrough. */
    switch(len_ & 3)
    {
        case 3: k1 ^= (uint32_t) tail[2] << 16;
                /* fallthrough */
        case 2: k1 ^= (uint32_t) tail[1] << 8;
                /* fallthrough */
        case 1: k1 ^= tail[0];
                k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
                break;
        default:
                break;
    }

    //----------
    // finalization

    h1 ^= (uint32_t) len_;

    h1 = fmix(h1);

    return h1;
}
|
77
|
+
|