fuzz_ball 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/fuzz_ball/duple_index/DupleIndex.c +276 -0
- data/ext/fuzz_ball/duple_index/DupleIndex.h +60 -0
- data/ext/fuzz_ball/duple_index/extconf.rb +5 -0
- data/ext/fuzz_ball/duple_index/utarray.h +226 -0
- data/ext/fuzz_ball/duple_index/uthash.h +904 -0
- data/ext/fuzz_ball/duple_index/utlist.h +522 -0
- data/ext/fuzz_ball/duple_index/utstring.h +148 -0
- data/ext/fuzz_ball/smith_waterman/SmithWaterman.c +211 -0
- data/ext/fuzz_ball/smith_waterman/SmithWaterman.h +19 -0
- data/ext/fuzz_ball/smith_waterman/extconf.rb +5 -0
- data/lib/fuzz_ball.rb +6 -0
- data/lib/fuzz_ball/searcher.rb +92 -0
- metadata +66 -0
@@ -0,0 +1,148 @@
|
|
1
|
+
/*
|
2
|
+
Copyright (c) 2008-2011, Troy D. Hanson http://uthash.sourceforge.net
|
3
|
+
All rights reserved.
|
4
|
+
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
7
|
+
|
8
|
+
* Redistributions of source code must retain the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
10
|
+
|
11
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
12
|
+
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
13
|
+
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
14
|
+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
15
|
+
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
16
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
17
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
18
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
19
|
+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
20
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
21
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
22
|
+
*/
|
23
|
+
|
24
|
+
/* a dynamic string implementation using macros
|
25
|
+
* see http://uthash.sourceforge.net/utstring
|
26
|
+
*/
|
27
|
+
#ifndef UTSTRING_H
|
28
|
+
#define UTSTRING_H
|
29
|
+
|
30
|
+
#define UTSTRING_VERSION 1.9.4
|
31
|
+
|
32
|
+
/* _UNUSED_ marks a definition as intentionally unused. It expands to the
 * GCC-style attribute when __GNUC__ is defined and to nothing otherwise. */
#if defined(__GNUC__)
#  define _UNUSED_ __attribute__ ((__unused__))
#else
#  define _UNUSED_
#endif
|
37
|
+
|
38
|
+
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
41
|
+
/* Out-of-memory handler used by the allocating macros in this header:
 * it terminates the whole process with exit(-1). */
#define oom() exit(-1)
|
42
|
+
|
43
|
+
/* A growable byte string. The macros in this header keep d[i] == '\0'. */
typedef struct {
    char   *d;  /* heap buffer holding the string bytes */
    size_t  n;  /* allocated size of d, in bytes */
    size_t  i;  /* index of the first unused byte (current length) */
} UT_string;
|
48
|
+
|
49
|
+
/* utstring_reserve(s, amt)
 *
 * Make sure at least `amt` unused bytes are available after index s->i.
 * When fewer are free, the buffer is grown by exactly `amt` bytes with
 * realloc(); oom() is invoked if that fails.
 *
 * `amt` is parenthesized everywhere it is used so that arguments such as
 * `cond ? a : b` group correctly. Note that both arguments are still
 * evaluated more than once, so they must be free of side effects.
 */
#define utstring_reserve(s,amt)                                          \
do {                                                                     \
  if (((s)->n - (s)->i) < (size_t)(amt)) {                               \
    (s)->d = (char*)realloc((s)->d, (s)->n + (size_t)(amt));             \
    if ((s)->d == NULL) oom();                                           \
    (s)->n += (size_t)(amt);                                             \
  }                                                                      \
} while(0)
|
57
|
+
|
58
|
+
/* utstring_init(s)
 *
 * Initialize an already-allocated UT_string: reset its fields, reserve an
 * initial 100-byte buffer and store an empty, NUL-terminated string in it.
 */
#define utstring_init(s)                                   \
do {                                                       \
  (s)->d = NULL;                                           \
  (s)->i = 0;                                              \
  (s)->n = 0;                                              \
  utstring_reserve(s,100);                                 \
  (s)->d[0] = '\0';                                        \
} while(0)
|
64
|
+
|
65
|
+
/* utstring_done(s)
 *
 * Release the character buffer owned by `s` (if any) and record that no
 * space is allocated. The UT_string structure itself is not freed, and
 * s->d / s->i are left untouched.
 */
#define utstring_done(s)                                   \
do {                                                       \
  if ((s)->d) {                                            \
    free((s)->d);                                          \
  }                                                        \
  (s)->n = 0;                                              \
} while(0)
|
70
|
+
|
71
|
+
/* utstring_free(s)
 *
 * Destroy a heap-allocated UT_string: release its character buffer
 * (the same steps utstring_done performs) and then the structure itself.
 */
#define utstring_free(s)                                   \
do {                                                       \
  if ((s)->d != NULL) free((s)->d);                        \
  (s)->n = 0;                                              \
  free(s);                                                 \
} while(0)
|
76
|
+
|
77
|
+
/* utstring_new(s)
 *
 * Allocate a zeroed UT_string on the heap, assign it to the lvalue `s`
 * and initialize it. oom() is invoked when the allocation fails.
 */
#define utstring_new(s)                                    \
do {                                                       \
  s = (UT_string*)calloc(1, sizeof(UT_string));            \
  if (s == NULL) oom();                                    \
  utstring_init(s);                                        \
} while(0)
|
83
|
+
|
84
|
+
/* utstring_renew(s)
 *
 * Give the caller an empty string in `s`: create a new one when `s` is
 * NULL, otherwise clear the existing one in place.
 */
#define utstring_renew(s)                                  \
do {                                                       \
  if (!(s)) {                                              \
    utstring_new(s);                                       \
  } else {                                                 \
    utstring_clear(s);                                     \
  }                                                        \
} while(0)
|
92
|
+
|
93
|
+
/* utstring_clear(s)
 *
 * Truncate the string to length zero. The buffer is kept, so s->d must
 * already point at allocated storage.
 */
#define utstring_clear(s)                                  \
do {                                                       \
  (s)->d[0] = '\0';                                        \
  (s)->i = 0;                                              \
} while(0)
|
98
|
+
|
99
|
+
/* utstring_bincpy(s, b, l)
 *
 * Append `l` bytes starting at `b` to the string `s`, growing the buffer
 * as needed, and re-terminate it with a NUL byte.
 *
 * Every use of a macro parameter is parenthesized so that arguments such
 * as `&str` or `len + 1` expand correctly. The parameters are evaluated
 * more than once, so they must not have side effects.
 */
#define utstring_bincpy(s,b,l)                             \
do {                                                       \
  utstring_reserve((s),(l)+1);                             \
  if (l) memcpy(&(s)->d[(s)->i], (b), (l));                \
  (s)->i += (l);                                           \
  (s)->d[(s)->i]='\0';                                     \
} while(0)
|
106
|
+
|
107
|
+
/* utstring_concat(dst, src)
 *
 * Append the contents of UT_string `src` to UT_string `dst`, growing
 * `dst` as needed, and re-terminate it with a NUL byte.
 *
 * Every use of a macro parameter is parenthesized so that arguments such
 * as `&a` expand correctly. The parameters are evaluated more than once,
 * so they must not have side effects.
 */
#define utstring_concat(dst,src)                                   \
do {                                                               \
  utstring_reserve((dst),((src)->i)+1);                            \
  if ((src)->i) memcpy(&(dst)->d[(dst)->i], (src)->d, (src)->i);   \
  (dst)->i += (src)->i;                                            \
  (dst)->d[(dst)->i]='\0';                                         \
} while(0)
|
114
|
+
|
115
|
+
/* Current length of the string in bytes (s->i), as an unsigned int. */
#define utstring_len(s) ((unsigned int)(s)->i)

/* Pointer to the string's character buffer (s->d). */
#define utstring_body(s) ((s)->d)
|
118
|
+
|
119
|
+
/* utstring_printf_va
 *
 * Append printf-style formatted output to `s`, taking the arguments from
 * an existing va_list. The output is formatted directly into the unused
 * tail of the buffer; when it does not fit, the buffer is grown and the
 * formatting is retried.
 *
 * s    - destination string
 * fmt  - printf-style format string
 * ap   - arguments for fmt; a copy is made for every attempt so that the
 *        list can be traversed again after a failed try
 */
_UNUSED_ static void utstring_printf_va(UT_string *s, const char *fmt, va_list ap) {
   int n;
   size_t grow;
   va_list cp;

   for (;;) {
#ifdef _WIN32
      cp = ap;
#else
      va_copy(cp, ap);
#endif
      n = vsnprintf (&s->d[s->i], s->n-s->i, fmt, cp);
      va_end(cp);

      /* Success: the output, including its terminating NUL, fit. */
      if ((n > -1) && (n < (int)(s->n-s->i))) {
        s->i += n;
        return;
      }

      /* Otherwise try again with more space. The amount is computed into a
       * plain variable first so the reserve macro receives a simple operand. */
      if (n > -1) {
        grow = (size_t)n + 1;                     /* exact size is known */
      } else {
        /* Size unknown: add twice the current allocation. Fall back to a
         * fixed step when nothing is allocated yet, otherwise a request
         * for 0 bytes would never grow the buffer and this loop would
         * never terminate. */
        grow = (s->n > 0) ? (s->n * 2) : 100;
      }
      utstring_reserve(s, grow);
   }
}
|
141
|
+
/* utstring_printf
 *
 * Variadic front end for utstring_printf_va: appends printf-style
 * formatted output to `s`.
 */
_UNUSED_ static void utstring_printf(UT_string *s, const char *fmt, ...) {
   va_list args;

   va_start(args, fmt);
   utstring_printf_va(s, fmt, args);
   va_end(args);
}
|
147
|
+
|
148
|
+
#endif /* UTSTRING_H */
|
@@ -0,0 +1,211 @@
|
|
1
|
+
// Include the Ruby headers and goodies
|
2
|
+
#include <ruby.h>
|
3
|
+
#include <SmithWaterman.h>
|
4
|
+
|
5
|
+
// The initialization method for this module
|
6
|
+
void Init_smith_waterman() {
|
7
|
+
FuzzBall = rb_define_module("FuzzBall");
|
8
|
+
SmithWaterman = rb_define_class_under(FuzzBall, "SmithWaterman", rb_cObject);
|
9
|
+
|
10
|
+
rb_define_method(SmithWaterman, "initialize", method_initialize, 2);
|
11
|
+
rb_define_attr(SmithWaterman, "alignment", 1, 0);
|
12
|
+
rb_define_attr(SmithWaterman, "score", 1, 0);
|
13
|
+
}
|
14
|
+
|
15
|
+
/* method_initialize
|
16
|
+
*
|
17
|
+
* We use the Smith-Waterman algorithm to align each candidate string with the
|
18
|
+
* needle string and to see how well the two fit. The Smith-Waterman algorithm
|
19
|
+
* is a dynamic programming algorith that keeps track of different alignments
|
20
|
+
* between two strings using a matrix. The best alignment is determined using
|
21
|
+
* a recursive search through the alignment matrix. For more information, see:
|
22
|
+
*
|
23
|
+
* http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
|
24
|
+
*/
|
25
|
+
VALUE method_initialize(VALUE self, VALUE needle, VALUE candidate) {
|
26
|
+
int i, j, i_max, j_max;
|
27
|
+
int n_needle = (int) RARRAY_LEN(needle);
|
28
|
+
int n_candidate = (int) RARRAY_LEN(candidate);
|
29
|
+
int *c_needle, *c_candidate;
|
30
|
+
|
31
|
+
double max_score;
|
32
|
+
double **mat;
|
33
|
+
|
34
|
+
VALUE alignment = rb_ary_new();
|
35
|
+
|
36
|
+
alloc_vars(&mat, &c_needle, &c_candidate, n_needle, n_candidate);
|
37
|
+
|
38
|
+
// Copy the needle / candidate strings from their ruby versions
|
39
|
+
// into plain old C-integer arrays.
|
40
|
+
for (i=0; i<n_needle; i++) {
|
41
|
+
c_needle[i] = NUM2INT( RARRAY_PTR(needle)[i] );
|
42
|
+
}
|
43
|
+
|
44
|
+
for (i=0; i<n_candidate; i++) {
|
45
|
+
c_candidate[i] = NUM2INT( RARRAY_PTR(candidate)[i] );
|
46
|
+
}
|
47
|
+
|
48
|
+
assign_cells(mat, c_needle, c_candidate, n_needle, n_candidate, &i_max, &j_max, &max_score);
|
49
|
+
recurse_optimal_path(mat, i_max, j_max, alignment);
|
50
|
+
|
51
|
+
rb_iv_set(self, "@alignment", alignment);
|
52
|
+
rb_iv_set(self, "@score", DBL2NUM(max_score));
|
53
|
+
|
54
|
+
free_vars(&mat, &c_needle, &c_candidate, n_needle, n_candidate);
|
55
|
+
|
56
|
+
return self;
|
57
|
+
}
|
58
|
+
|
59
|
+
// A few convenience methods for determining the max of 2, 3, and 4 doubles
|
60
|
+
/* Larger of two doubles; returns b when a is not greater than b. */
double max(double a, double b) {
  if (a > b) {
    return a;
  }
  return b;
}
|
63
|
+
|
64
|
+
/* Largest of three doubles. */
double max3(double a, double b, double c) {
  double best_bc = (b > c) ? b : c;

  return (a > best_bc) ? a : best_bc;
}
|
67
|
+
|
68
|
+
/* Largest of four doubles. */
double max4(double a, double b, double c, double d) {
  double best = (c > d) ? c : d;

  best = (b > best) ? b : best;
  return (a > best) ? a : best;
}
|
71
|
+
|
72
|
+
/* alloc_vars
 *
 * Allocates the alignment matrix (n_needle rows of n_candidate doubles) as
 * well as the integer arrays that hold the characters of the needle
 * (n_needle ints) and candidate (n_candidate ints) strings. The matrix cells
 * are NOT initialized.
 *
 * On success all three output pointers are non-NULL and must be released
 * with free_vars(). If any allocation fails, everything allocated so far is
 * released, the outputs are reset to NULL and Ruby's NoMemoryError is raised
 * through rb_memerror(), so this function does not return in that case.
 */
void alloc_vars(
  double ***mat,
  int **needle,
  int **candidate,
  int n_needle,
  int n_candidate
) {
  int i;
  int failed = 0;

  // Always request at least one element so that a NULL result can only
  // mean "out of memory" (malloc(0) is allowed to return NULL).
  size_t rows = (n_needle    > 0) ? (size_t) n_needle    : 1;
  size_t cols = (n_candidate > 0) ? (size_t) n_candidate : 1;

  // calloc leaves every row pointer NULL, which keeps the cleanup below
  // safe when only some of the rows could be allocated.
  *mat       = calloc(rows, sizeof(double *));
  *needle    = malloc(rows * sizeof(int));
  *candidate = malloc(cols * sizeof(int));

  if (*mat == NULL || *needle == NULL || *candidate == NULL) {
    failed = 1;
  }

  for (i=0; !failed && i<n_needle; i++) {
    (*mat)[i] = malloc(cols * sizeof(double));
    if ((*mat)[i] == NULL) {
      failed = 1;
    }
  }

  if (failed) {
    if (*mat != NULL) {
      for (i=0; i<n_needle; i++) {
        free( (*mat)[i] );
      }
    }
    free(*mat);       *mat = NULL;
    free(*needle);    *needle = NULL;
    free(*candidate); *candidate = NULL;

    rb_memerror();
  }
}
|
95
|
+
|
96
|
+
/* free_vars
 *
 * Frees the memory associated with the alignment matrix (n_needle rows) and
 * with the needle and candidate arrays, then resets all three pointers to
 * NULL. Pointers that are already NULL are skipped, so calling this twice,
 * or after a failed allocation, is harmless.
 *
 * n_candidate is accepted for symmetry with alloc_vars; it is not needed to
 * release anything.
 */
void free_vars(
  double ***mat,
  int **needle,
  int **candidate,
  int n_needle,
  int n_candidate
) {
  int i;

  (void) n_candidate;

  free(*needle);    *needle = NULL;
  free(*candidate); *candidate = NULL;

  if (*mat != NULL) {
    for (i=0; i<n_needle; i++) {
      free( (*mat)[i] );
    }
    free( *mat );
    *mat = NULL;
  }
}
|
120
|
+
|
121
|
+
/* assign_cells
|
122
|
+
*
|
123
|
+
* Called within the smith_waterman loop; this is the function that assigns
|
124
|
+
* each cell of the alignment matrix; the value of each cell represents the
|
125
|
+
* score of that given alignment, up to that point, taking into account any
|
126
|
+
* deletions, additions, etc.. that result in that alignment. For two strings
|
127
|
+
* of length m, n the alignment matrix is m x n large. As the values are
|
128
|
+
* assigned, we keep track of the cell with the higest score. At the end of
|
129
|
+
* the assignment, we start at this highest-scoring cell and recursively walk
|
130
|
+
* backwards through the cells, maximizing the score at each step. This
|
131
|
+
* becomes the highest scoring alignment
|
132
|
+
*/
|
133
|
+
void assign_cells(
|
134
|
+
double **mat,
|
135
|
+
int *needle,
|
136
|
+
int *candidate,
|
137
|
+
int n_needle,
|
138
|
+
int n_candidate,
|
139
|
+
int *i_max,
|
140
|
+
int *j_max,
|
141
|
+
double *max_score
|
142
|
+
) {
|
143
|
+
int i, j;
|
144
|
+
double score, value;
|
145
|
+
|
146
|
+
for (i=0; i<n_needle; i++) {
|
147
|
+
for (j=0; j<n_candidate; j++) {
|
148
|
+
mat[i][j] = 0.0;
|
149
|
+
}
|
150
|
+
}
|
151
|
+
|
152
|
+
*max_score = -10000.0;
|
153
|
+
for (i=1; i<n_needle; i++) {
|
154
|
+
for (j=1; j<n_candidate; j++) {
|
155
|
+
if (needle[i-1] == candidate[j-1]) {
|
156
|
+
score = SCORE_MATCH;
|
157
|
+
} else {
|
158
|
+
score = SCORE_MISS;
|
159
|
+
}
|
160
|
+
|
161
|
+
value = max4(0.0, mat[i-1][j-1] + score, mat[i-1][j] + SCORE_DELETE, mat[i][j-1] + SCORE_INSERT);
|
162
|
+
mat[i][j] = value;
|
163
|
+
|
164
|
+
if (value > *max_score) {
|
165
|
+
*max_score = value;
|
166
|
+
*i_max = i;
|
167
|
+
*j_max = j;
|
168
|
+
}
|
169
|
+
}
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
/* recurse_optimal_path
|
174
|
+
*
|
175
|
+
* This method searches the alignment matrix from a starting point,
|
176
|
+
* and recursively walks backwards, maximizing the score at each step.
|
177
|
+
* This will result in an alignment of greatest score.
|
178
|
+
*/
|
179
|
+
void recurse_optimal_path(double **mat, int i, int j, VALUE alignment) {
|
180
|
+
int ii, jj;
|
181
|
+
double max_value;
|
182
|
+
|
183
|
+
// Push current position of the alignment into the array that stores it.
|
184
|
+
rb_ary_push(alignment, INT2NUM(i));
|
185
|
+
rb_ary_push(alignment, INT2NUM(j));
|
186
|
+
|
187
|
+
max_value = max3(mat[i-1][j-1], mat[i-1][j], mat[i][j-1]);
|
188
|
+
|
189
|
+
if (max_value == mat[i-1][j]) {
|
190
|
+
ii = i-1;
|
191
|
+
jj = j;
|
192
|
+
}
|
193
|
+
|
194
|
+
if (max_value == mat[i][j-1]) {
|
195
|
+
ii = i;
|
196
|
+
jj = j-1;
|
197
|
+
}
|
198
|
+
|
199
|
+
if (max_value == mat[i-1][j-1]) {
|
200
|
+
ii = i-1;
|
201
|
+
jj = j-1;
|
202
|
+
}
|
203
|
+
|
204
|
+
// The recursive loop. If we reach an edge (i.e., m(i,j) == 0), then stop
|
205
|
+
// the recursion. Otherwise, keep going!
|
206
|
+
if (mat[i][j] == 0) {
|
207
|
+
return;
|
208
|
+
} else {
|
209
|
+
return recurse_optimal_path(mat, ii, jj, alignment);
|
210
|
+
}
|
211
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#define SCORE_INSERT -0.1
|
2
|
+
#define SCORE_DELETE -1.0
|
3
|
+
#define SCORE_MISS -1.0
|
4
|
+
#define SCORE_MATCH 2.0
|
5
|
+
|
6
|
+
// Prototype some shit
|
7
|
+
VALUE FuzzBall = Qnil;
|
8
|
+
VALUE SmithWaterman = Qnil;
|
9
|
+
void Init_smith_waterman();
|
10
|
+
VALUE method_initialize(VALUE self, VALUE needle, VALUE candidate);
|
11
|
+
|
12
|
+
void assign_cells(double **mat, int *needle, int *candidate, int n_needle, int n_candidate, int *i_max, int *j_max, double *max_score);
|
13
|
+
void recurse_optimal_path(double **mat, int i, int j, VALUE alignment);
|
14
|
+
void alloc_vars(double ***mat, int **needle, int **candidate, int n_needle, int n_candidate);
|
15
|
+
void free_vars(double ***mat, int **needle, int **candidate, int n_needle, int n_candidate);
|
16
|
+
|
17
|
+
double max(double a, double b);
|
18
|
+
double max3(double a, double b, double c);
|
19
|
+
double max4(double a, double b, double c, double d);
|
data/lib/fuzz_ball.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
module FuzzBall
  # Fuzzy search over a list of strings.
  #
  # Strings are converted to arrays of Unicode code points, indexed in a
  # DupleIndex, and the best index matches for a needle are then scored
  # with SmithWaterman. Both constants are resolved inside the FuzzBall
  # namespace (presumably provided by the gem's C extensions).
  class Searcher

    attr_reader :files, :files_array, :options, :duple_index

    # @param files [Array<String>] strings to search. The array is kept by
    #   reference, so {#add} appends to the caller's array as well.
    # @param opts [Hash] options; +:ignore+ is a list of substrings that are
    #   removed from every indexed string and every needle.
    def initialize(files, opts = {})
      @options     = opts
      @files       = files
      @files_array = files.map { |f| str2arr(f) }

      index_duples!
    end

    # Adds one more string to the searchable set.
    #
    # @param str [String]
    # @return [true]
    def add(str)
      str_arr = str2arr(str)

      files << str
      files_array << str_arr

      duple_index.add(files_array.count - 1, str_arr)

      true
    end

    # Scores the best index matches for +needle+.
    #
    # @param needle [String]
    # @param opts [Hash] +:order+ => +:descending+ sorts best match first
    #   (any other value sorts ascending); +:limit+ => Integer keeps only the
    #   first N results after sorting.
    # @return [Array<Hash>] hashes with +:alignment+, +:score+ (alignment
    #   score divided by the candidate length) and +:string+ (the candidate
    #   rebuilt from its code points, i.e. after +:ignore+ stripping).
    def search(needle, opts = {})
      needle_ary = str2arr(needle)
      results    = []

      # Guard on what is actually aligned: the code points that remain after
      # :ignore stripping, not the raw needle.
      return results if needle_ary.length < 2

      decimate_strings!(needle_ary).each do |candidate|
        # An empty candidate cannot be aligned, and normalizing its score
        # would divide by zero.
        next if candidate.nil? || candidate.empty?

        smith = SmithWaterman.new(needle_ary, candidate)

        results << {
          :alignment => smith.alignment,
          # Normalize by string length; this favors shorter strings even if
          # a longer string has a higher smith score.
          :score     => (smith.score / candidate.length),
          :string    => arr2str(candidate)
        }
      end

      if opts[:order] == :descending
        results.sort! { |a, b| b[:score] <=> a[:score] }
      else
        results.sort! { |a, b| a[:score] <=> b[:score] }
      end

      # Integer rather than Fixnum: Fixnum no longer exists in current Rubies.
      results = results.first(opts[:limit]) if opts[:limit].is_a?(Integer)

      results
    end

    def inspect
      %Q[<FuzzBall::Searcher n_files=#{files_array.count}>]
    end

    private

    # (Re)builds the duple index from every entry of files_array.
    def index_duples!
      @duple_index = DupleIndex.new
      files_array.each_with_index do |str, index|
        duple_index.add(index, str)
      end
    end

    # Narrows the candidates down to the entries with the highest index
    # score. duple_index.match is expected to return a Hash that maps a
    # score to the indices (into files_array) reaching it.
    #
    # @return [Array<Array<Integer>>] code point arrays; empty when nothing
    #   matched.
    def decimate_strings!(needle)
      matches_by_score = duple_index.match(needle)
      max_score        = matches_by_score.keys.max
      return [] if max_score.nil?

      files_array.values_at(*matches_by_score[max_score])
    end

    # Converts a string to an array of Unicode code points, first removing
    # every occurrence of the substrings listed in options[:ignore].
    def str2arr(str)
      ignored = options[:ignore]
      return str.unpack("U*") unless ignored

      # Regexp.union escapes plain strings and joins them with "|".
      str.gsub(Regexp.union(Array(ignored)), "").unpack("U*")
    end

    # Inverse of str2arr: code points back to a string.
    def arr2str(arr)
      arr.pack("U*")
    end
  end
end
|