wordtriez 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/changes +21 -0
- data/copying +18 -0
- data/ext/common.h +8 -0
- data/ext/extconf.rb +32 -0
- data/ext/hat-trie/ahtable.c +550 -0
- data/ext/hat-trie/ahtable.h +93 -0
- data/ext/hat-trie/common.h +19 -0
- data/ext/hat-trie/hat-trie.c +771 -0
- data/ext/hat-trie/hat-trie.h +86 -0
- data/ext/hat-trie/misc.c +46 -0
- data/ext/hat-trie/misc.h +22 -0
- data/ext/hat-trie/murmurhash3.c +77 -0
- data/ext/hat-trie/murmurhash3.h +12 -0
- data/ext/hat-trie/pstdint.h +800 -0
- data/ext/hat-trie/text.c +174 -0
- data/ext/hat-trie/text.h +22 -0
- data/ext/triez.cc +313 -0
- data/lib/wordtriez.rb +65 -0
- data/readme.md +223 -0
- data/test/triez_test.rb +225 -0
- metadata +67 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
/*
|
2
|
+
* This file is part of hat-trie
|
3
|
+
*
|
4
|
+
* Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
|
5
|
+
*
|
6
|
+
*
|
7
|
+
* This is an implementation of the HAT-trie data structure described in,
|
8
|
+
*
|
9
|
+
* Askitis, N., & Sinha, R. (2007). HAT-trie: a cache-conscious trie-based data
|
10
|
+
* structure for strings. Proceedings of the thirtieth Australasian conference on
|
11
|
+
* Computer science-Volume 62 (pp. 97–105). Australian Computer Society, Inc.
|
12
|
+
*
|
13
|
+
* The HAT-trie is in essence a hybrid data structure, combining tries and hash
|
14
|
+
* tables in a clever way to try to get the best of both worlds.
|
15
|
+
*
|
16
|
+
*/
|
17
|
+
|
18
|
+
#ifndef HATTRIE_HATTRIE_H
|
19
|
+
#define HATTRIE_HATTRIE_H
|
20
|
+
|
21
|
+
#ifdef __cplusplus
|
22
|
+
extern "C" {
|
23
|
+
#endif
|
24
|
+
|
25
|
+
#include "common.h"
|
26
|
+
#include <stdlib.h>
|
27
|
+
#include <stdbool.h>
|
28
|
+
|
29
|
+
typedef struct hattrie_t_ hattrie_t;
|
30
|
+
|
31
|
+
hattrie_t* hattrie_create (void); //< Create an empty hat-trie.
|
32
|
+
void hattrie_free (hattrie_t*); //< Free all memory used by a trie.
|
33
|
+
hattrie_t* hattrie_dup (const hattrie_t*); //< Duplicate an existing trie.
|
34
|
+
void hattrie_clear (hattrie_t*); //< Remove all entries.
|
35
|
+
|
36
|
+
/** number of inserted keys
|
37
|
+
*/
|
38
|
+
size_t hattrie_size (hattrie_t*);
|
39
|
+
|
40
|
+
/** Find the given key in the trie, inserting it if it does not exist, and
|
41
|
+
* returning a pointer to its value.
|
42
|
+
*
|
43
|
+
* This pointer is not guaranteed to be valid after additional calls to
|
44
|
+
* hattrie_get, hattrie_del, hattrie_clear, or other functions that modify the
|
45
|
+
* trie.
|
46
|
+
*/
|
47
|
+
value_t* hattrie_get (hattrie_t*, const char* key, size_t len);
|
48
|
+
|
49
|
+
/** Find a given key in the table, returning a NULL pointer if it does not
|
50
|
+
* exist. */
|
51
|
+
value_t* hattrie_tryget (hattrie_t*, const char* key, size_t len);
|
52
|
+
|
53
|
+
/** hattrie_walk callback signature */
|
54
|
+
typedef int (*hattrie_walk_cb)(const char* key, size_t len, value_t* val, void* user_data);
|
55
|
+
|
56
|
+
/** hattrie_walk callback return values; they control whether the walk stops or continues */
|
57
|
+
#define hattrie_walk_stop 0
|
58
|
+
#define hattrie_walk_continue 1
|
59
|
+
|
60
|
+
/** Find stored keys which are prefixes of key, and invoke callback for every found key and val.
|
61
|
+
* The invocation order is: short key to long key.
|
62
|
+
*/
|
63
|
+
void hattrie_walk (hattrie_t*, const char* key, size_t len, void* user_data, hattrie_walk_cb);
|
64
|
+
|
65
|
+
/** Delete a given key from trie. Returns 0 if successful or -1 if not found.
|
66
|
+
*/
|
67
|
+
int hattrie_del(hattrie_t* T, const char* key, size_t len);
|
68
|
+
|
69
|
+
typedef struct hattrie_iter_t_ hattrie_iter_t;
|
70
|
+
|
71
|
+
hattrie_iter_t* hattrie_iter_begin (const hattrie_t*, bool sorted);
|
72
|
+
void hattrie_iter_next (hattrie_iter_t*);
|
73
|
+
bool hattrie_iter_finished (hattrie_iter_t*);
|
74
|
+
void hattrie_iter_free (hattrie_iter_t*);
|
75
|
+
const char* hattrie_iter_key (hattrie_iter_t*, size_t* len);
|
76
|
+
value_t* hattrie_iter_val (hattrie_iter_t*);
|
77
|
+
|
78
|
+
/** Note the hattrie_iter_key() for prefixed search gets the suffix instead of the whole key
|
79
|
+
*/
|
80
|
+
hattrie_iter_t* hattrie_iter_with_prefix(const hattrie_t*, bool sorted, const char* prefix, size_t prefix_len);
|
81
|
+
|
82
|
+
#ifdef __cplusplus
|
83
|
+
}
|
84
|
+
#endif
|
85
|
+
|
86
|
+
#endif
|
data/ext/hat-trie/misc.c
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
/*
|
2
|
+
* This file is part of hat-trie.
|
3
|
+
*
|
4
|
+
* Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
|
5
|
+
*
|
6
|
+
*/
|
7
|
+
|
8
|
+
#include "misc.h"
|
9
|
+
#include <stdlib.h>
|
10
|
+
|
11
|
+
|
12
|
+
/**
 * Allocate n bytes with malloc, terminating the process on failure.
 *
 * A NULL result is only treated as a failure when n is non-zero; in that
 * case a message is written to stderr and the process exits with
 * EXIT_FAILURE.
 *
 * @param n  number of bytes requested
 * @return   whatever malloc returned (may be NULL when n is 0)
 */
void* malloc_or_die(size_t n)
{
    void* mem = malloc(n);

    /* Success, or a zero-sized request for which NULL is acceptable. */
    if (mem != NULL || n == 0) {
        return mem;
    }

    fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
    exit(EXIT_FAILURE);
}
|
21
|
+
|
22
|
+
|
23
|
+
/**
 * Resize the allocation at ptr to n bytes with realloc, terminating the
 * process on failure.
 *
 * A NULL result is only treated as a failure when n is non-zero; in that
 * case a message is written to stderr and the process exits with
 * EXIT_FAILURE.
 *
 * @param ptr  pointer handed straight to realloc
 * @param n    requested size in bytes
 * @return     whatever realloc returned (may be NULL when n is 0)
 */
void* realloc_or_die(void* ptr, size_t n)
{
    void* resized = realloc(ptr, n);

    /* Success, or a zero-sized request for which NULL is acceptable. */
    if (resized != NULL || n == 0) {
        return resized;
    }

    fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
    exit(EXIT_FAILURE);
}
|
32
|
+
|
33
|
+
|
34
|
+
/**
 * Open a file with fopen, terminating the process on failure.
 *
 * If fopen returns NULL, a message naming the path and mode is written to
 * stderr and the process exits with EXIT_FAILURE.
 *
 * @param path  file name passed to fopen
 * @param mode  mode string passed to fopen
 * @return      the open stream; never NULL
 */
FILE* fopen_or_die(const char* path, const char* mode)
{
    FILE* stream = fopen(path, mode);

    if (stream != NULL) {
        return stream;
    }

    fprintf(stderr, "Cannot open file %s with mode %s.\n", path, mode);
    exit(EXIT_FAILURE);
}
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
|
data/ext/hat-trie/misc.h
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
/*
|
2
|
+
* This file is part of hat-trie.
|
3
|
+
*
|
4
|
+
* Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
|
5
|
+
*
|
6
|
+
* misc :
|
7
|
+
* miscellaneous functions.
|
8
|
+
*
|
9
|
+
*/
|
10
|
+
|
11
|
+
#ifndef LINESET_MISC_H
|
12
|
+
#define LINESET_MISC_H
|
13
|
+
|
14
|
+
#include <stdio.h>
|
15
|
+
|
16
|
+
void* malloc_or_die(size_t);
|
17
|
+
void* realloc_or_die(void*, size_t);
|
18
|
+
FILE* fopen_or_die(const char*, const char*);
|
19
|
+
|
20
|
+
#endif
|
21
|
+
|
22
|
+
|
@@ -0,0 +1,77 @@
|
|
1
|
+
/* This is MurmurHash3. The original C++ code was placed in the public domain
|
2
|
+
* by its author, Austin Appleby. */
|
3
|
+
|
4
|
+
#include "murmurhash3.h"
#include <string.h>
|
5
|
+
|
6
|
+
/**
 * Finalization mix applied to the accumulated hash state: three
 * xor-with-right-shift steps (16, 13, 16 bits) interleaved with two
 * multiplications by fixed 32-bit constants.
 *
 * @param h  hash state to mix
 * @return   mixed 32-bit value
 */
static inline uint32_t fmix(uint32_t h)
{
    uint32_t mixed = h;

    mixed = (mixed ^ (mixed >> 16)) * 0x85ebca6b;
    mixed = (mixed ^ (mixed >> 13)) * 0xc2b2ae35;

    return mixed ^ (mixed >> 16);
}
|
16
|
+
|
17
|
+
|
18
|
+
/**
 * Rotate the 32-bit value x left by r bit positions.
 *
 * The rotation count is reduced modulo 32 before shifting, so r == 0 (or
 * any other count outside 1..31) no longer performs a shift by the full
 * width of the type, which is undefined behavior in C. Counts in 1..31
 * produce exactly the same result as before.
 *
 * @param x  value to rotate
 * @param r  number of bit positions to rotate left by
 * @return   x rotated left by (r mod 32) bits
 */
static inline uint32_t rotl32(uint32_t x, int8_t r)
{
    const unsigned int left = (unsigned int) r & 31u;
    /* Masking keeps the complementary shift at 0 when left is 0. */
    const unsigned int right = (32u - left) & 31u;

    return (x << left) | (x >> right);
}
|
22
|
+
|
23
|
+
|
24
|
+
/**
 * MurmurHash3 (32-bit) of the first len_ bytes of data, using the fixed
 * seed 0xc062fb4a.
 *
 * Each 4-byte block is loaded with memcpy instead of by dereferencing a
 * casted uint32_t pointer, so data may have any alignment. All length and
 * index arithmetic is done in size_t, so the length is never narrowed to
 * int. Blocks are still read in host byte order and the low 32 bits of the
 * length are still mixed in, so results for inputs the previous code
 * handled correctly are unchanged.
 *
 * @param data  bytes to hash
 * @param len_  number of bytes to hash
 * @return      32-bit hash value
 */
uint32_t hash(const char* data, size_t len_)
{
    const size_t nblocks = len_ / 4;

    uint32_t h1 = 0xc062fb4a;

    const uint32_t c1 = 0xcc9e2d51;
    const uint32_t c2 = 0x1b873593;

    //----------
    // body

    for (size_t i = 0; i < nblocks; i++) {
        uint32_t k1;
        /* Alignment- and aliasing-safe load of one host-order block. */
        memcpy(&k1, data + i * 4, sizeof k1);

        k1 *= c1;
        k1 = rotl32(k1, 15);
        k1 *= c2;

        h1 ^= k1;
        h1 = rotl32(h1, 13);
        h1 = h1 * 5 + 0xe6546b64;
    }

    //----------
    // tail

    const uint8_t* tail = (const uint8_t*) (data + nblocks * 4);

    uint32_t k1 = 0;

    switch (len_ & 3) {
        case 3:
            k1 ^= (uint32_t) tail[2] << 16;
            /* fallthrough */
        case 2:
            k1 ^= (uint32_t) tail[1] << 8;
            /* fallthrough */
        case 1:
            k1 ^= tail[0];
            k1 *= c1;
            k1 = rotl32(k1, 15);
            k1 *= c2;
            h1 ^= k1;
            break;
        default:
            /* Length is a multiple of 4: no tail bytes to mix in. */
            break;
    }

    //----------
    // finalization

    h1 ^= (uint32_t) len_;

    return fmix(h1);
}
|
77
|
+
|