whistlepig 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +86 -0
- data/ext/whistlepig/defaults.h +28 -0
- data/ext/whistlepig/entry.c +181 -0
- data/ext/whistlepig/entry.h +66 -0
- data/ext/whistlepig/error.c +24 -0
- data/ext/whistlepig/error.h +94 -0
- data/ext/whistlepig/extconf.rb +6 -0
- data/ext/whistlepig/index.c +294 -0
- data/ext/whistlepig/index.h +88 -0
- data/ext/whistlepig/khash.h +316 -0
- data/ext/whistlepig/mmap-obj.c +76 -0
- data/ext/whistlepig/mmap-obj.h +52 -0
- data/ext/whistlepig/query-parser.c +37 -0
- data/ext/whistlepig/query-parser.h +25 -0
- data/ext/whistlepig/query-parser.lex.c +2249 -0
- data/ext/whistlepig/query-parser.lex.h +359 -0
- data/ext/whistlepig/query-parser.tab.c +1757 -0
- data/ext/whistlepig/query-parser.tab.h +85 -0
- data/ext/whistlepig/query.c +194 -0
- data/ext/whistlepig/query.h +78 -0
- data/ext/whistlepig/search.c +746 -0
- data/ext/whistlepig/search.h +76 -0
- data/ext/whistlepig/segment.c +615 -0
- data/ext/whistlepig/segment.h +137 -0
- data/ext/whistlepig/stringmap.c +278 -0
- data/ext/whistlepig/stringmap.h +82 -0
- data/ext/whistlepig/stringpool.c +44 -0
- data/ext/whistlepig/stringpool.h +58 -0
- data/ext/whistlepig/termhash.c +294 -0
- data/ext/whistlepig/termhash.h +79 -0
- data/ext/whistlepig/tokenizer.lex.c +2263 -0
- data/ext/whistlepig/tokenizer.lex.h +360 -0
- data/ext/whistlepig/whistlepig.h +15 -0
- data/ext/whistlepig/whistlepigc.c +537 -0
- data/lib/whistlepig.rb +119 -0
- metadata +103 -0
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
/* A Bison parser, made by GNU Bison 2.4.1. */
|
3
|
+
|
4
|
+
/* Skeleton interface for Bison's Yacc-like parsers in C
|
5
|
+
|
6
|
+
Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
|
7
|
+
Free Software Foundation, Inc.
|
8
|
+
|
9
|
+
This program is free software: you can redistribute it and/or modify
|
10
|
+
it under the terms of the GNU General Public License as published by
|
11
|
+
the Free Software Foundation, either version 3 of the License, or
|
12
|
+
(at your option) any later version.
|
13
|
+
|
14
|
+
This program is distributed in the hope that it will be useful,
|
15
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
GNU General Public License for more details.
|
18
|
+
|
19
|
+
You should have received a copy of the GNU General Public License
|
20
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
21
|
+
|
22
|
+
/* As a special exception, you may create a larger work that contains
|
23
|
+
part or all of the Bison parser skeleton and distribute that work
|
24
|
+
under terms of your choice, so long as that work isn't itself a
|
25
|
+
parser generator using the skeleton or a modified version thereof
|
26
|
+
as a parser skeleton. Alternatively, if you modify or redistribute
|
27
|
+
the parser skeleton itself, you may (at your option) remove this
|
28
|
+
special exception, which will cause the skeleton and the resulting
|
29
|
+
Bison output files to be licensed under the GNU General Public
|
30
|
+
License without this special exception.
|
31
|
+
|
32
|
+
This special exception was added by the Free Software Foundation in
|
33
|
+
version 2.2 of Bison. */
|
34
|
+
|
35
|
+
|
36
|
+
/* Tokens. */
|
37
|
+
#ifndef YYTOKENTYPE
|
38
|
+
# define YYTOKENTYPE
|
39
|
+
/* Put the tokens into the symbol table, so that GDB and other debuggers
|
40
|
+
know about them. */
|
41
|
+
enum yytokentype { /* token codes declared for the query grammar; part of Bison-generated output */
|
42
|
+
WORD = 258, /* NOTE(review): presumably a bare query word -- confirm in query-parser.y */
|
43
|
+
OR = 259 /* NOTE(review): presumably the OR operator -- confirm in query-parser.y */
|
44
|
+
};
|
45
|
+
#endif
|
46
|
+
|
47
|
+
|
48
|
+
|
49
|
+
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
|
50
|
+
typedef union YYSTYPE /* value type of the generated parser: holds either a query node pointer or a string pointer */
|
51
|
+
{
|
52
|
+
|
53
|
+
/* Line 1676 of yacc.c */
|
54
|
+
#line 30 "query-parser.y"
|
55
|
+
|
56
|
+
wp_query* query; /* a query tree node (type declared in query.h) */
|
57
|
+
char* string; /* NOTE(review): presumably the text of a WORD token -- confirm in query-parser.y */
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
/* Line 1676 of yacc.c */
|
62
|
+
#line 63 "query-parser.tab.h"
|
63
|
+
} YYSTYPE;
|
64
|
+
# define YYSTYPE_IS_TRIVIAL 1
|
65
|
+
# define yystype YYSTYPE /* obsolescent; will be withdrawn */
|
66
|
+
# define YYSTYPE_IS_DECLARED 1
|
67
|
+
#endif
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
#if ! defined YYLTYPE && ! defined YYLTYPE_IS_DECLARED
|
72
|
+
typedef struct YYLTYPE /* a source span expressed as first/last line and column numbers */
|
73
|
+
{
|
74
|
+
int first_line; /* line on which the span starts */
|
75
|
+
int first_column; /* column at which the span starts */
|
76
|
+
int last_line; /* line on which the span ends */
|
77
|
+
int last_column; /* column at which the span ends */
|
78
|
+
} YYLTYPE;
|
79
|
+
# define yyltype YYLTYPE /* obsolescent; will be withdrawn */
|
80
|
+
# define YYLTYPE_IS_DECLARED 1
|
81
|
+
# define YYLTYPE_IS_TRIVIAL 1
|
82
|
+
#endif
|
83
|
+
|
84
|
+
|
85
|
+
|
@@ -0,0 +1,194 @@
|
|
1
|
+
#include "query.h"
|
2
|
+
|
3
|
+
static wp_query* wp_query_new() {
|
4
|
+
wp_query* ret = malloc(sizeof(wp_query));
|
5
|
+
ret->type = 0; // error
|
6
|
+
ret->field = ret->word = NULL;
|
7
|
+
ret->num_children = 0;
|
8
|
+
ret->children = ret->next = ret->last = NULL;
|
9
|
+
ret->search_data = NULL;
|
10
|
+
|
11
|
+
return ret;
|
12
|
+
}
|
13
|
+
|
14
|
+
// private: heap-allocated copy of a NUL-terminated string (sigh... not in c99).
//
// Returns a buffer the caller must free(), or NULL if malloc() fails.
static char* strdup(const char* old) {
  size_t len = strlen(old) + 1; // include the terminating NUL
  char* copy = malloc(len);
  if(copy == NULL) return NULL; // never hand NULL to memcpy
  return memcpy(copy, old, len);
}
|
19
|
+
|
20
|
+
// public: deep clone of a query, but dropping all search state.
//
// The clone gets its own copies of field and word and a recursively cloned
// child list; search_data is NULL and segment_idx is 0 in every cloned node.
// Returns NULL if `other` is NULL or if any allocation fails, in which case
// everything allocated so far is released again.
wp_query* wp_query_clone(wp_query* other) {
  if(other == NULL) return NULL;

  wp_query* ret = malloc(sizeof *ret);
  if(ret == NULL) return NULL;

  // put the node into a state wp_query_free() can handle before anything can fail
  ret->type = other->type;
  ret->num_children = other->num_children;
  ret->field = NULL;
  ret->word = NULL;
  ret->children = ret->next = ret->last = NULL; // children are appended below
  ret->segment_idx = 0;
  ret->search_data = NULL;

  if(other->field != NULL) {
    ret->field = strdup(other->field);
    if(ret->field == NULL) goto fail;
  }

  if(other->word != NULL) {
    ret->word = strdup(other->word);
    if(ret->word == NULL) goto fail;
  }

  for(wp_query* child = other->children; child != NULL; child = child->next) {
    wp_query* clone = wp_query_clone(child);
    if(clone == NULL) goto fail;

    if(ret->last == NULL) ret->children = clone;
    else ret->last->next = clone;
    ret->last = clone;
  }

  return ret;

fail:
  wp_query_free(ret); // releases the strings and any children cloned so far
  return NULL;
}
|
44
|
+
|
45
|
+
wp_query* wp_query_new_term(const char* field, const char* word) {
|
46
|
+
wp_query* ret = wp_query_new();
|
47
|
+
ret->type = WP_QUERY_TERM;
|
48
|
+
ret->field = field;
|
49
|
+
ret->word = word;
|
50
|
+
return ret;
|
51
|
+
}
|
52
|
+
|
53
|
+
wp_query* wp_query_new_label(const char* label) {
|
54
|
+
wp_query* ret = wp_query_new();
|
55
|
+
ret->type = WP_QUERY_LABEL;
|
56
|
+
ret->word = label;
|
57
|
+
ret->field = NULL;
|
58
|
+
return ret;
|
59
|
+
}
|
60
|
+
|
61
|
+
wp_query* wp_query_new_conjunction() {
|
62
|
+
wp_query* ret = wp_query_new();
|
63
|
+
ret->type = WP_QUERY_CONJ;
|
64
|
+
return ret;
|
65
|
+
}
|
66
|
+
|
67
|
+
wp_query* wp_query_new_disjunction() {
|
68
|
+
wp_query* ret = wp_query_new();
|
69
|
+
ret->type = WP_QUERY_DISJ;
|
70
|
+
return ret;
|
71
|
+
}
|
72
|
+
|
73
|
+
wp_query* wp_query_new_phrase() {
|
74
|
+
wp_query* ret = wp_query_new();
|
75
|
+
ret->type = WP_QUERY_PHRASE;
|
76
|
+
return ret;
|
77
|
+
}
|
78
|
+
|
79
|
+
wp_query* wp_query_new_negation() {
|
80
|
+
wp_query* ret = wp_query_new();
|
81
|
+
ret->type = WP_QUERY_NEG;
|
82
|
+
return ret;
|
83
|
+
}
|
84
|
+
|
85
|
+
wp_query* wp_query_new_empty() {
|
86
|
+
wp_query* ret = wp_query_new();
|
87
|
+
ret->type = WP_QUERY_EMPTY;
|
88
|
+
return ret;
|
89
|
+
}
|
90
|
+
|
91
|
+
// public: add query node b as the last child of a, and return the resulting
// query.
//
// Empty nodes are never linked in: if a is empty it is freed and b is
// returned; otherwise, if b is empty it is freed and a is returned. In the
// remaining case b is appended to a's child list and a is returned.
wp_query* wp_query_add(wp_query* a, wp_query* b) {
  if(a->type == WP_QUERY_EMPTY) {
    wp_query_free(a);
    return b;
  }

  if(b->type == WP_QUERY_EMPTY) {
    wp_query_free(b);
    return a;
  }

  wp_query* tail = a->last;
  a->num_children++;
  if(tail == NULL) a->children = b; // first child
  else tail->next = b;              // link after the current tail
  a->last = b;

  return a;
}
|
110
|
+
|
111
|
+
// public: free a query.
//
// Releases the node, its field and word strings, and (recursively) every node
// in its child list. Passing NULL is a no-op. search_data is not touched here,
// so whatever it points at has to be released separately.
void wp_query_free(wp_query* q) {
  if(q == NULL) return;

  // the node owns these strings; free(NULL) is harmless, so no guards needed
  free((void*)q->field);
  free((void*)q->word);

  wp_query* child = q->children;
  while(child != NULL) {
    wp_query* next = child->next; // read the link before the node goes away
    wp_query_free(child);
    child = next;
  }

  free(q);
}
|
121
|
+
|
122
|
+
// private: write the fixed string `s` at buf only if all of it, plus the
// terminating NUL, fits into n bytes. Returns the number of characters
// written (not counting the NUL); returns 0 and leaves an empty string in buf
// (when n > 0) if it does not fit.
static size_t append_if_fits(char* buf, size_t n, const char* s) {
  if(n == 0) return 0;

  int len = snprintf(buf, n, "%s", s);
  if(len < 0 || (size_t)len >= n) {
    buf[0] = '\0'; // discard the truncated piece
    return 0;
  }
  return (size_t)len;
}

// private: snprintf() reports the length it wanted to write; turn that into
// the number of characters that actually ended up in a buffer of n > 0 bytes.
static size_t chars_written(int wanted, size_t n, char* buf) {
  if(wanted < 0) { // output error: leave an empty string behind
    buf[0] = '\0';
    return 0;
  }
  if((size_t)wanted >= n) return n - 1; // truncated; the last byte holds the NUL
  return (size_t)wanted;
}

// private: write " <child>" for every child of q into buf (at most n bytes
// including the terminating NUL). Stops as soon as there is no room left for
// a separator. Returns the number of characters written, not counting the NUL.
static int subquery_to_s(wp_query* q, size_t n, char* buf) {
  size_t used = 0;

  for(wp_query* child = q->children; child != NULL; child = child->next) {
    size_t sep = append_if_fits(buf + used, n - used, " ");
    if(sep == 0) break; // can't add a space
    used += sep;
    used += (size_t)wp_query_to_s(child, n - used, buf + used);
  }

  return (int)used;
}

// public: build a string representation of a query by writing at most n
// bytes (including the terminating NUL) to buf.
//
// Terms print as field:"word", labels as ~word, and the compound types as
// "(AND ...)", "(OR ...)", "(PHRASE ...)" or "(NOT ...)" with each child
// preceded by a space. An empty query prints as the empty string. Output that
// does not fit is dropped; buf is always NUL-terminated when n > 0.
//
// Returns the number of characters stored in buf, not counting the NUL. This
// is always less than n when n > 0, and 0 (with buf untouched) when n is 0.
int wp_query_to_s(wp_query* q, size_t n, char* buf) {
  if(n == 0) return 0; // not even room for the terminator
  buf[0] = '\0';

  const char* opener;
  switch(q->type) {
    case WP_QUERY_EMPTY:
      return 0;
    case WP_QUERY_TERM:
      return (int)chars_written(snprintf(buf, n, "%s:\"%s\"", q->field, q->word), n, buf);
    case WP_QUERY_LABEL:
      return (int)chars_written(snprintf(buf, n, "~%s", q->word), n, buf);
    case WP_QUERY_CONJ:
      opener = "(AND";
      break;
    case WP_QUERY_DISJ:
      opener = "(OR";
      break;
    case WP_QUERY_PHRASE:
      opener = "(PHRASE";
      break;
    case WP_QUERY_NEG:
      opener = "(NOT";
      break;
    default:
      opener = ""; // unrecognised type: children and ")" are still printed
      break;
  }

  // used < n holds throughout, so every helper below sees at least one byte
  size_t used = append_if_fits(buf, n, opener);
  used += (size_t)subquery_to_s(q, n - used, buf + used);
  used += append_if_fits(buf + used, n - used, ")");

  return (int)used;
}
|
189
|
+
|
190
|
+
// private: give every term node at or below q its own heap copy of `field`,
// releasing whatever field string the node held before. `field` itself is
// only read. A node whose copy cannot be allocated keeps its previous field.
static void wp_query_copy_field_to_terms(wp_query* q, const char* field) {
  if(q->type == WP_QUERY_TERM) {
    char* copy = strdup(field);
    if(copy != NULL) {
      free((void*)q->field); // the node owns its field (see wp_query_free)
      q->field = copy;
    }
    return;
  }

  for(wp_query* child = q->children; child != NULL; child = child->next) {
    wp_query_copy_field_to_terms(child, field);
  }
}

// private: set all children fields to a particular value. Returns q.
//
// If q is itself a term node, it stores the `field` pointer directly and
// frees its previous field, so `field` must be a heap-allocated string that
// the caller gives up. Otherwise every term node below q receives its own
// copy of `field`, and the `field` pointer itself is not stored anywhere.
//
// NOTE(review): in the non-term case the `field` argument is neither stored
// nor freed here (same as before this change); whether the caller releases it
// needs to be confirmed at the call sites.
wp_query* wp_query_set_all_child_fields(wp_query* q, const char* field) {
  if(q->type == WP_QUERY_TERM) {
    if(q->field != field) free((void*)q->field); // drop the field being replaced
    q->field = field;
  }
  else {
    for(wp_query* child = q->children; child != NULL; child = child->next) {
      wp_query_copy_field_to_terms(child, field);
    }
  }

  return q;
}
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#ifndef WP_QUERY_H_
|
2
|
+
#define WP_QUERY_H_
|
3
|
+
|
4
|
+
// whistlepig query
|
5
|
+
// (c) 2011 William Morgan. See COPYING for license terms.
|
6
|
+
//
|
7
|
+
// a query. typically built up by the parser, but you can also build it
|
8
|
+
// programmatically yourself if you like.
|
9
|
+
//
|
10
|
+
// note that queries contain segment-specific search state in them. see
|
11
|
+
// search.c for details.
|
12
|
+
|
13
|
+
#include <stdint.h>
|
14
|
+
#include <stdlib.h>
|
15
|
+
#include "segment.h"
|
16
|
+
|
17
|
+
#define WP_QUERY_TERM 1
|
18
|
+
#define WP_QUERY_CONJ 2
|
19
|
+
#define WP_QUERY_DISJ 3
|
20
|
+
#define WP_QUERY_PHRASE 4
|
21
|
+
#define WP_QUERY_NEG 5
|
22
|
+
#define WP_QUERY_LABEL 6
|
23
|
+
#define WP_QUERY_EMPTY 7
|
24
|
+
|
25
|
+
// a node in the query tree
|
26
|
+
typedef struct wp_query { // one node of a query tree; children form a singly linked list
|
27
|
+
uint8_t type; // one of the WP_QUERY_* constants above; wp_query_new() in query.c starts nodes at 0, which it marks as "error"
|
28
|
+
const char* field; // field name of a term node, NULL otherwise; passed to free() by wp_query_free()
|
29
|
+
const char* word; // term text, or the label name for label nodes; passed to free() by wp_query_free()
|
30
|
+
|
31
|
+
uint16_t num_children; // incremented by wp_query_add() each time a child is appended
|
32
|
+
struct wp_query* children; // first child, or NULL if there are none
|
33
|
+
struct wp_query* next; // next sibling in the parent's child list, or NULL
|
34
|
+
struct wp_query* last; // last child, kept so wp_query_add() can append without walking the list
|
35
|
+
|
36
|
+
uint16_t segment_idx; // used to continue queries across segments (see index.c)
|
37
|
+
void* search_data; // whatever state we need for actually doing searches
|
38
|
+
} wp_query;
|
39
|
+
|
40
|
+
// API methods
|
41
|
+
|
42
|
+
// public: make a query node with a term
|
43
|
+
wp_query* wp_query_new_term(const char* field, const char* word);
|
44
|
+
|
45
|
+
// public: make a query node with a label
|
46
|
+
wp_query* wp_query_new_label(const char* label);
|
47
|
+
|
48
|
+
// public: make a query conjunction node
|
49
|
+
wp_query* wp_query_new_conjunction();
|
50
|
+
|
51
|
+
// public: make a query disjunction node
|
52
|
+
wp_query* wp_query_new_disjunction();
|
53
|
+
|
54
|
+
// public: make a query phrase node
|
55
|
+
wp_query* wp_query_new_phrase();
|
56
|
+
|
57
|
+
// public: make a query negation node
|
58
|
+
wp_query* wp_query_new_negation();
|
59
|
+
|
60
|
+
// public: make an empty query node.
|
61
|
+
wp_query* wp_query_new_empty();
|
62
|
+
|
63
|
+
// public: deep clone of a query, but dropping all search state.
|
64
|
+
wp_query* wp_query_clone(wp_query* other);
|
65
|
+
|
66
|
+
// public: add a query node as a child of another
|
67
|
+
wp_query* wp_query_add(wp_query* a, wp_query* b);
|
68
|
+
|
69
|
+
// private: set all children fields to a particular value
|
70
|
+
wp_query* wp_query_set_all_child_fields(wp_query* q, const char* field);
|
71
|
+
|
72
|
+
// public: free a query
|
73
|
+
void wp_query_free(wp_query* q);
|
74
|
+
|
75
|
+
// public: build a string representation of a query by writing at most n chars to buf
|
76
|
+
int wp_query_to_s(wp_query* q, size_t n, char* buf);
|
77
|
+
|
78
|
+
#endif
|