whistlepig 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
/* A Bison parser, made by GNU Bison 2.4.1. */
|
3
|
+
|
4
|
+
/* Skeleton interface for Bison's Yacc-like parsers in C
|
5
|
+
|
6
|
+
Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
|
7
|
+
Free Software Foundation, Inc.
|
8
|
+
|
9
|
+
This program is free software: you can redistribute it and/or modify
|
10
|
+
it under the terms of the GNU General Public License as published by
|
11
|
+
the Free Software Foundation, either version 3 of the License, or
|
12
|
+
(at your option) any later version.
|
13
|
+
|
14
|
+
This program is distributed in the hope that it will be useful,
|
15
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
GNU General Public License for more details.
|
18
|
+
|
19
|
+
You should have received a copy of the GNU General Public License
|
20
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
21
|
+
|
22
|
+
/* As a special exception, you may create a larger work that contains
|
23
|
+
part or all of the Bison parser skeleton and distribute that work
|
24
|
+
under terms of your choice, so long as that work isn't itself a
|
25
|
+
parser generator using the skeleton or a modified version thereof
|
26
|
+
as a parser skeleton. Alternatively, if you modify or redistribute
|
27
|
+
the parser skeleton itself, you may (at your option) remove this
|
28
|
+
special exception, which will cause the skeleton and the resulting
|
29
|
+
Bison output files to be licensed under the GNU General Public
|
30
|
+
License without this special exception.
|
31
|
+
|
32
|
+
This special exception was added by the Free Software Foundation in
|
33
|
+
version 2.2 of Bison. */
|
34
|
+
|
35
|
+
|
36
|
+
/* Tokens. */
|
37
|
+
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
   /* Token codes. Declared as an enum (rather than macros) so that GDB and
      other debuggers can show them by name.  */
   enum yytokentype
   {
     WORD = 258,
     OR   = 259
   };
#endif
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
|
50
|
+
typedef union YYSTYPE
|
51
|
+
{
|
52
|
+
|
53
|
+
/* Line 1676 of yacc.c */
|
54
|
+
#line 30 "query-parser.y"
|
55
|
+
|
56
|
+
wp_query* query;
|
57
|
+
char* string;
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
/* Line 1676 of yacc.c */
|
62
|
+
#line 63 "query-parser.tab.h"
|
63
|
+
} YYSTYPE;
|
64
|
+
# define YYSTYPE_IS_TRIVIAL 1
|
65
|
+
# define yystype YYSTYPE /* obsolescent; will be withdrawn */
|
66
|
+
# define YYSTYPE_IS_DECLARED 1
|
67
|
+
#endif
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
#if ! defined YYLTYPE && ! defined YYLTYPE_IS_DECLARED
/* Source span record: a first and a last line/column pair.  */
typedef struct YYLTYPE
{
  int first_line;
  int first_column;
  int last_line;
  int last_column;
} YYLTYPE;
# define yyltype YYLTYPE /* obsolescent; will be withdrawn */
# define YYLTYPE_IS_DECLARED 1
# define YYLTYPE_IS_TRIVIAL 1
#endif
|
83
|
+
|
84
|
+
|
85
|
+
|
@@ -0,0 +1,194 @@
|
|
1
|
+
#include "query.h"
|
2
|
+
|
3
|
+
static wp_query* wp_query_new() {
|
4
|
+
wp_query* ret = malloc(sizeof(wp_query));
|
5
|
+
ret->type = 0; // error
|
6
|
+
ret->field = ret->word = NULL;
|
7
|
+
ret->num_children = 0;
|
8
|
+
ret->children = ret->next = ret->last = NULL;
|
9
|
+
ret->search_data = NULL;
|
10
|
+
|
11
|
+
return ret;
|
12
|
+
}
|
13
|
+
|
14
|
+
// private: copy a NUL-terminated string into freshly malloc'd memory
// (sigh... strdup is not in c99). The caller owns the result and must
// free() it. Returns NULL if the allocation fails.
static char* strdup(const char* old) {
  size_t len = strlen(old) + 1; // include the terminating NUL
  char* copy = malloc(len);
  if(copy == NULL) return NULL; // memcpy() into NULL would be undefined behavior
  return memcpy(copy, old, len);
}
|
19
|
+
|
20
|
+
// public: deep clone of a query, but dropping all search state.
//
// The clone gets its own copies of field and word and its own copies of
// every descendant; search_data is NULL in every cloned node. Returns NULL
// (after releasing whatever was already cloned) if an allocation fails.
wp_query* wp_query_clone(wp_query* other) {
  wp_query* ret = malloc(sizeof *ret);
  if(ret == NULL) return NULL;

  ret->type = other->type;
  ret->num_children = other->num_children;
  ret->search_data = NULL;
  // NOTE(review): segment_idx is treated as search state and reset here; it
  // was previously left uninitialized. Confirm index.c does not expect a copy.
  ret->segment_idx = 0;
  ret->children = ret->next = ret->last = NULL; // children are linked below

  ret->field = other->field ? strdup(other->field) : NULL;
  ret->word = other->word ? strdup(other->word) : NULL;
  if((other->field && !ret->field) || (other->word && !ret->word)) {
    wp_query_free(ret); // a string copy failed
    return NULL;
  }

  for(wp_query* child = other->children; child != NULL; child = child->next) {
    wp_query* clone = wp_query_clone(child);
    if(clone == NULL) {
      wp_query_free(ret); // releases the children cloned so far as well
      return NULL;
    }

    if(ret->last == NULL) ret->children = clone;
    else ret->last->next = clone;
    ret->last = clone;
  }

  return ret;
}
|
44
|
+
|
45
|
+
wp_query* wp_query_new_term(const char* field, const char* word) {
|
46
|
+
wp_query* ret = wp_query_new();
|
47
|
+
ret->type = WP_QUERY_TERM;
|
48
|
+
ret->field = field;
|
49
|
+
ret->word = word;
|
50
|
+
return ret;
|
51
|
+
}
|
52
|
+
|
53
|
+
wp_query* wp_query_new_label(const char* label) {
|
54
|
+
wp_query* ret = wp_query_new();
|
55
|
+
ret->type = WP_QUERY_LABEL;
|
56
|
+
ret->word = label;
|
57
|
+
ret->field = NULL;
|
58
|
+
return ret;
|
59
|
+
}
|
60
|
+
|
61
|
+
wp_query* wp_query_new_conjunction() {
|
62
|
+
wp_query* ret = wp_query_new();
|
63
|
+
ret->type = WP_QUERY_CONJ;
|
64
|
+
return ret;
|
65
|
+
}
|
66
|
+
|
67
|
+
wp_query* wp_query_new_disjunction() {
|
68
|
+
wp_query* ret = wp_query_new();
|
69
|
+
ret->type = WP_QUERY_DISJ;
|
70
|
+
return ret;
|
71
|
+
}
|
72
|
+
|
73
|
+
wp_query* wp_query_new_phrase() {
|
74
|
+
wp_query* ret = wp_query_new();
|
75
|
+
ret->type = WP_QUERY_PHRASE;
|
76
|
+
return ret;
|
77
|
+
}
|
78
|
+
|
79
|
+
wp_query* wp_query_new_negation() {
|
80
|
+
wp_query* ret = wp_query_new();
|
81
|
+
ret->type = WP_QUERY_NEG;
|
82
|
+
return ret;
|
83
|
+
}
|
84
|
+
|
85
|
+
wp_query* wp_query_new_empty() {
|
86
|
+
wp_query* ret = wp_query_new();
|
87
|
+
ret->type = WP_QUERY_EMPTY;
|
88
|
+
return ret;
|
89
|
+
}
|
90
|
+
|
91
|
+
// public: add query node b as the last child of a, and return the resulting
// root.
//
// EMPTY nodes never end up in the tree: if a is EMPTY it is freed and b is
// returned; otherwise, if b is EMPTY, it is freed and a is returned.
wp_query* wp_query_add(wp_query* a, wp_query* b) {
  if(a->type == WP_QUERY_EMPTY) {
    wp_query_free(a);
    return b;
  }

  if(b->type == WP_QUERY_EMPTY) {
    wp_query_free(b);
    return a;
  }

  // append b to the tail of a's child list
  wp_query* tail = a->last;
  if(tail != NULL) tail->next = b;
  else a->children = b;
  a->last = b;
  a->num_children++;

  return a;
}
|
110
|
+
|
111
|
+
// public: free a query node, its field and word strings, and all of its
// descendants. Passing NULL is a no-op, like free(). search_data is not
// released here.
void wp_query_free(wp_query* q) {
  if(q == NULL) return;

  // free(NULL) is a no-op, so no guards are needed; the casts drop the const
  // qualifier of the members.
  free((void*)q->field);
  free((void*)q->word);

  wp_query* child = q->children;
  while(child != NULL) {
    wp_query* next = child->next; // read the link before the node goes away
    wp_query_free(child);
    child = next;
  }

  free(q);
}
|
121
|
+
|
122
|
+
// private: write " <child>" for each child of q into buf, which has room for
// n bytes including the terminating NUL. Stops at the first child for which
// there is no room for the separating space. Returns the number of characters
// stored, not counting the NUL; nothing is written when n is 0.
static int subquery_to_s(wp_query* q, size_t n, char* buf) {
  size_t used = 0;

  for(wp_query* child = q->children; child != NULL; child = child->next) {
    if(n - used < 2) break; // need one byte for the space and one for the NUL
    buf[used++] = ' ';
    buf[used] = '\0';
    used += (size_t)wp_query_to_s(child, n - used, buf + used);
  }

  return (int)used;
}

// private: convert the return value of snprintf(buf, n, ...) into the number
// of characters that actually ended up in buf (n must be > 0).
static int leaf_length(char* buf, size_t n, int wanted) {
  if(wanted < 0) { // output error: leave an empty string behind
    buf[0] = '\0';
    return 0;
  }
  // on truncation snprintf reports the full length but stores only n - 1 chars
  return (size_t)wanted < n ? wanted : (int)(n - 1);
}

// public: build a string representation of a query in buf.
//
// n is the size of buf in bytes, including room for the terminating NUL; at
// most n - 1 characters are stored and, when n > 0, buf is always
// NUL-terminated. Output that does not fit is dropped. Returns the number of
// characters stored, not counting the NUL (0 for an EMPTY query or when n is 0).
int wp_query_to_s(wp_query* q, size_t n, char* buf) {
  if(n == 0) return 0; // no room even for the NUL: leave buf untouched
  buf[0] = '\0';

  if(q->type == WP_QUERY_EMPTY) return 0;
  if(q->type == WP_QUERY_TERM)
    return leaf_length(buf, n, snprintf(buf, n, "%s:\"%s\"", q->field, q->word));
  if(q->type == WP_QUERY_LABEL)
    return leaf_length(buf, n, snprintf(buf, n, "~%s", q->word));

  const char* opener;
  switch(q->type) {
    case WP_QUERY_CONJ:   opener = "(AND";    break;
    case WP_QUERY_DISJ:   opener = "(OR";     break;
    case WP_QUERY_PHRASE: opener = "(PHRASE"; break;
    case WP_QUERY_NEG:    opener = "(NOT";    break;
    default:              opener = "";        break; // other types: children and ")" only
  }

  // the opener is written whole or not at all
  size_t used = 0;
  size_t opener_len = strlen(opener);
  if(opener_len < n) {
    memcpy(buf, opener, opener_len + 1);
    used = opener_len;
  }

  used += (size_t)subquery_to_s(q, n - used, buf + used);

  if(n - used >= 2) { // room for ")" plus the NUL
    buf[used++] = ')';
    buf[used] = '\0';
  }

  return (int)used;
}
|
189
|
+
|
190
|
+
// private: give every TERM node at or below q its own copy of field,
// releasing whatever field string the node held before. `field` is only read.
static void copy_field_to_terms(wp_query* q, const char* field) {
  if(q->type == WP_QUERY_TERM) {
    char* copy = strdup(field); // copy first, in case q->field is field itself
    free((void*)q->field);
    q->field = copy;
    return;
  }

  for(wp_query* child = q->children; child != NULL; child = child->next)
    copy_field_to_terms(child, field);
}

// private: set all children fields to a particular value. Returns q.
//
// If q itself is a TERM, it stores the `field` pointer directly (so
// wp_query_free() will later free() it) and its previous field string is
// released. Otherwise every TERM below q receives its own copy and `field`
// itself is only read; it remains the caller's to release.
wp_query* wp_query_set_all_child_fields(wp_query* q, const char* field) {
  if(q->type == WP_QUERY_TERM) {
    if(q->field != field) free((void*)q->field); // don't leak the old value
    q->field = field;
  }
  else {
    for(wp_query* child = q->children; child != NULL; child = child->next)
      copy_field_to_terms(child, field);
  }

  return q;
}
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#ifndef WP_QUERY_H_
|
2
|
+
#define WP_QUERY_H_
|
3
|
+
|
4
|
+
// whistlepig query
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// a query. typically built up by the parser, but you can also build it
|
8
|
+
// programmatically yourself if you like.
|
9
|
+
//
|
10
|
+
// note that queries contain segment-specific search state in them. see
|
11
|
+
// search.c for details.
|
12
|
+
|
13
|
+
#include <stdint.h>
|
14
|
+
#include <stdlib.h>
|
15
|
+
#include "segment.h"
|
16
|
+
|
17
|
+
// node types, stored in wp_query.type (query.c uses 0 for "error")
#define WP_QUERY_TERM   1  // leaf: a word within a field
#define WP_QUERY_CONJ   2  // printed as (AND ...)
#define WP_QUERY_DISJ   3  // printed as (OR ...)
#define WP_QUERY_PHRASE 4  // printed as (PHRASE ...)
#define WP_QUERY_NEG    5  // printed as (NOT ...)
#define WP_QUERY_LABEL  6  // leaf: a label, printed as ~label
#define WP_QUERY_EMPTY  7  // placeholder; wp_query_add() discards these
|
24
|
+
|
25
|
+
// a node in the query tree
|
26
|
+
typedef struct wp_query {
  uint8_t type;               // one of the WP_QUERY_* constants
  const char* field;          // field name, or NULL; free()d by wp_query_free()
  const char* word;           // term word or label text, or NULL; free()d by wp_query_free()

  uint16_t num_children;      // count of nodes in the children list
  struct wp_query* children;  // first child, or NULL
  struct wp_query* next;      // next sibling in the parent's child list, or NULL
  struct wp_query* last;      // last child, so appending needs no list walk

  uint16_t segment_idx;       // used to continue queries across segments (see index.c)
  void* search_data;          // whatever state we need for actually doing searches
} wp_query;
|
39
|
+
|
40
|
+
// API methods
|
41
|
+
|
42
|
+
// public: make a query node with a term
|
43
|
+
wp_query* wp_query_new_term(const char* field, const char* word);
|
44
|
+
|
45
|
+
// public: make a query node with a label
|
46
|
+
wp_query* wp_query_new_label(const char* label);
|
47
|
+
|
48
|
+
// public: make a query conjuction node
|
49
|
+
wp_query* wp_query_new_conjunction();
|
50
|
+
|
51
|
+
// public: make a query disjunction node
|
52
|
+
wp_query* wp_query_new_disjunction();
|
53
|
+
|
54
|
+
// public: make a query phrase node
|
55
|
+
wp_query* wp_query_new_phrase();
|
56
|
+
|
57
|
+
// public: make a query negation node
|
58
|
+
wp_query* wp_query_new_negation();
|
59
|
+
|
60
|
+
// public: make an empty query node.
|
61
|
+
wp_query* wp_query_new_empty();
|
62
|
+
|
63
|
+
// public: deep clone of a query, but dropping all search state.
|
64
|
+
wp_query* wp_query_clone(wp_query* other);
|
65
|
+
|
66
|
+
// public: add a query node as a child of another
|
67
|
+
wp_query* wp_query_add(wp_query* a, wp_query* b);
|
68
|
+
|
69
|
+
// private: set all children fields to a particular value
|
70
|
+
wp_query* wp_query_set_all_child_fields(wp_query* q, const char* field);
|
71
|
+
|
72
|
+
// public: free a query
|
73
|
+
void wp_query_free(wp_query* q);
|
74
|
+
|
75
|
+
// public: build a string representation of a query by writing at most n chars to buf
|
76
|
+
int wp_query_to_s(wp_query* q, size_t n, char* buf);
|
77
|
+
|
78
|
+
#endif
|