fuzz_ball 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,148 @@
1
+ /*
2
+ Copyright (c) 2008-2011, Troy D. Hanson http://uthash.sourceforge.net
3
+ All rights reserved.
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ * Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
12
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
13
+ TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
14
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
15
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
16
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
17
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
18
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
19
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
20
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
21
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22
+ */
23
+
24
+ /* a dynamic string implementation using macros
25
+ * see http://uthash.sourceforge.net/utstring
26
+ */
27
+ #ifndef UTSTRING_H
28
+ #define UTSTRING_H
29
+
30
+ #define UTSTRING_VERSION 1.9.4
31
+
32
/* _UNUSED_ tags a declaration with GCC's `unused` attribute when the compiler
 * defines __GNUC__, and expands to nothing otherwise. */
#if defined(__GNUC__)
#  define _UNUSED_ __attribute__ ((__unused__))
#else
#  define _UNUSED_
#endif
37
+
38
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
41
/* Out-of-memory handler invoked by the utstring macros when an allocation
 * fails. The default terminates the process; define oom() before including
 * this header to install a different handler. */
#ifndef oom
#define oom() exit(-1)
#endif
42
+
43
/* Growable character buffer manipulated through the utstring_* macros. */
typedef struct {
    char  *d;   /* buffer holding the string bytes */
    size_t n;   /* number of bytes allocated for d */
    size_t i;   /* offset of the first unused byte in d */
} UT_string;
48
+
49
/* utstring_reserve(s, amt)
 *
 * Makes sure at least `amt` unused bytes are available in s. When fewer are
 * free, the buffer is enlarged by `amt` bytes with realloc(); oom() is called
 * if that fails. `amt` may be any expression and is evaluated exactly once.
 */
#define utstring_reserve(s,amt)                                      \
do {                                                                 \
  size_t utstring_amt_ = (size_t)(amt);                              \
  if (((s)->n - (s)->i) < utstring_amt_) {                           \
    /* keep the old pointer until realloc() is known to succeed */   \
    char *utstring_tmp_ = (char*)realloc((s)->d,                     \
                                         (s)->n + utstring_amt_);    \
    if (utstring_tmp_ == NULL) oom();                                \
    (s)->d = utstring_tmp_;                                          \
    (s)->n += utstring_amt_;                                         \
  }                                                                  \
} while(0)
57
+
58
/* utstring_init(s)
 *
 * Puts an uninitialized UT_string into the empty state: the counters are
 * zeroed, an initial 100 bytes are reserved, and the buffer is set to the
 * empty string.
 */
#define utstring_init(s)                                   \
do {                                                       \
  (s)->d = NULL;                                           \
  (s)->i = 0;                                              \
  (s)->n = 0;                                              \
  utstring_reserve(s,100);                                 \
  *(s)->d = '\0';                                          \
} while(0)
64
+
65
/* utstring_done(s)
 *
 * Releases the buffer owned by s without freeing s itself. The pointer and
 * both counters are reset, so calling it again is harmless; s has to go
 * through utstring_init() before it can be used as a string again.
 */
#define utstring_done(s)                                   \
do {                                                       \
  free((s)->d);   /* free(NULL) is a no-op */              \
  (s)->d = NULL;                                           \
  (s)->n = 0;                                              \
  (s)->i = 0;                                              \
} while(0)
70
+
71
/* utstring_free(s)
 *
 * Destroys a string completely: first its buffer (utstring_done), then the
 * UT_string object itself, which is passed to free().
 */
#define utstring_free(s)        \
do {                            \
  utstring_done(s);             \
  free(s);                      \
} while(0)
76
+
77
/* utstring_new(s)
 *
 * Allocates a zeroed UT_string, stores its address in the lvalue `s`, and
 * initializes it with utstring_init(). oom() is called if the allocation
 * fails.
 */
#define utstring_new(s)                                    \
do {                                                       \
  (s) = (UT_string*)calloc(1, sizeof(UT_string));          \
  if ((s) == NULL) oom();                                  \
  utstring_init(s);                                        \
} while(0)
83
+
84
/* utstring_renew(s)
 *
 * Yields an empty string in `s`: a null pointer gets a freshly created string
 * (utstring_new), an existing string is emptied in place (utstring_clear).
 */
#define utstring_renew(s)                                  \
do {                                                       \
  if (!(s)) {                                              \
    utstring_new(s);                                       \
  } else {                                                 \
    utstring_clear(s);                                     \
  }                                                        \
} while(0)
92
+
93
/* utstring_clear(s)
 *
 * Empties the string while keeping its buffer: the first byte becomes the
 * terminator and the write offset returns to zero.
 */
#define utstring_clear(s)                                  \
do {                                                       \
  (s)->d[0] = '\0';                                        \
  (s)->i = 0;                                              \
} while(0)
98
+
99
/* utstring_bincpy(s, b, l)
 *
 * Appends `l` bytes starting at `b` to s and writes a terminating NUL after
 * them. Room for the bytes plus the terminator is reserved first. Every
 * argument is parenthesized so expressions such as `&str` or `len + 1` can be
 * passed; note that `l` is still evaluated more than once.
 */
#define utstring_bincpy(s,b,l)                             \
do {                                                       \
  utstring_reserve((s),(l)+1);                             \
  if (l) memcpy(&(s)->d[(s)->i], (b), (l));                \
  (s)->i += (l);                                           \
  (s)->d[(s)->i]='\0';                                     \
} while(0)
106
+
107
/* utstring_concat(dst, src)
 *
 * Appends the contents of src to dst and re-terminates dst. Room for the
 * bytes plus the terminator is reserved first. Both arguments are
 * parenthesized so address-of expressions such as `&a` can be passed.
 */
#define utstring_concat(dst,src)                                 \
do {                                                             \
  utstring_reserve((dst),((src)->i)+1);                          \
  if ((src)->i) memcpy(&(dst)->d[(dst)->i], (src)->d, (src)->i); \
  (dst)->i += (src)->i;                                          \
  (dst)->d[(dst)->i]='\0';                                       \
} while(0)
114
+
115
/* Offset of the first unused byte of s, i.e. the number of bytes stored
 * before the terminator. The size_t counter is narrowed to unsigned. */
#define utstring_len(s) ((unsigned)(s)->i)

/* Pointer to the character buffer of s. */
#define utstring_body(s) ((s)->d)
118
+
119
+ _UNUSED_ static void utstring_printf_va(UT_string *s, const char *fmt, va_list ap) {
120
+ int n;
121
+ va_list cp;
122
+ while (1) {
123
+ #ifdef _WIN32
124
+ cp = ap;
125
+ #else
126
+ va_copy(cp, ap);
127
+ #endif
128
+ n = vsnprintf (&s->d[s->i], s->n-s->i, fmt, cp);
129
+ va_end(cp);
130
+
131
+ if ((n > -1) && (n < (int)(s->n-s->i))) {
132
+ s->i += n;
133
+ return;
134
+ }
135
+
136
+ /* Else try again with more space. */
137
+ if (n > -1) utstring_reserve(s,n+1); /* exact */
138
+ else utstring_reserve(s,(s->n)*2); /* 2x */
139
+ }
140
+ }
141
+ _UNUSED_ static void utstring_printf(UT_string *s, const char *fmt, ...) {
142
+ va_list ap;
143
+ va_start(ap,fmt);
144
+ utstring_printf_va(s,fmt,ap);
145
+ va_end(ap);
146
+ }
147
+
148
+ #endif /* UTSTRING_H */
@@ -0,0 +1,211 @@
1
+ // Include the Ruby headers and goodies
2
+ #include <ruby.h>
3
+ #include <SmithWaterman.h>
4
+
5
+ // The initialization method for this module
6
+ void Init_smith_waterman() {
7
+ FuzzBall = rb_define_module("FuzzBall");
8
+ SmithWaterman = rb_define_class_under(FuzzBall, "SmithWaterman", rb_cObject);
9
+
10
+ rb_define_method(SmithWaterman, "initialize", method_initialize, 2);
11
+ rb_define_attr(SmithWaterman, "alignment", 1, 0);
12
+ rb_define_attr(SmithWaterman, "score", 1, 0);
13
+ }
14
+
15
+ /* method_initialize
16
+ *
17
+ * We use the Smith-Waterman algorithm to align each candidate string with the
18
+ * needle string and to see how well the two fit. The Smith-Waterman algorithm
19
+ * is a dynamic programming algorith that keeps track of different alignments
20
+ * between two strings using a matrix. The best alignment is determined using
21
+ * a recursive search through the alignment matrix. For more information, see:
22
+ *
23
+ * http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
24
+ */
25
+ VALUE method_initialize(VALUE self, VALUE needle, VALUE candidate) {
26
+ int i, j, i_max, j_max;
27
+ int n_needle = (int) RARRAY_LEN(needle);
28
+ int n_candidate = (int) RARRAY_LEN(candidate);
29
+ int *c_needle, *c_candidate;
30
+
31
+ double max_score;
32
+ double **mat;
33
+
34
+ VALUE alignment = rb_ary_new();
35
+
36
+ alloc_vars(&mat, &c_needle, &c_candidate, n_needle, n_candidate);
37
+
38
+ // Copy the needle / candidate strings from their ruby versions
39
+ // into plain old C-integer arrays.
40
+ for (i=0; i<n_needle; i++) {
41
+ c_needle[i] = NUM2INT( RARRAY_PTR(needle)[i] );
42
+ }
43
+
44
+ for (i=0; i<n_candidate; i++) {
45
+ c_candidate[i] = NUM2INT( RARRAY_PTR(candidate)[i] );
46
+ }
47
+
48
+ assign_cells(mat, c_needle, c_candidate, n_needle, n_candidate, &i_max, &j_max, &max_score);
49
+ recurse_optimal_path(mat, i_max, j_max, alignment);
50
+
51
+ rb_iv_set(self, "@alignment", alignment);
52
+ rb_iv_set(self, "@score", DBL2NUM(max_score));
53
+
54
+ free_vars(&mat, &c_needle, &c_candidate, n_needle, n_candidate);
55
+
56
+ return self;
57
+ }
58
+
59
// Convenience helpers returning the largest of 2, 3, or 4 doubles.
// max: yields a only when it compares strictly greater than b, otherwise b.
double max(double a, double b) {
    if (a > b) {
        return a;
    }
    return b;
}
63
+
64
// Largest of three doubles: the larger of b and c is compared against a.
double max3(double a, double b, double c) {
    double rest = (b > c) ? b : c;
    return (a > rest) ? a : rest;
}
67
+
68
// Largest of four doubles, folded from the right: d against c, the winner
// against b, and that winner against a.
double max4(double a, double b, double c, double d) {
    double best = (c > d) ? c : d;
    best = (b > best) ? b : best;
    return (a > best) ? a : best;
}
71
+
72
/* alloc_vars
 *
 * Allocates the alignment matrix (n_needle rows of n_candidate doubles) and
 * the integer arrays for the needle (n_needle ints) and the candidate
 * (n_candidate ints). All memory is zero-initialized.
 *
 * On success *mat, *needle and *candidate are non-NULL, even when a count is
 * zero. If any allocation fails, everything obtained so far is released and
 * all three outputs are set to NULL, so the caller can detect the failure and
 * has nothing to free.
 */
void alloc_vars(
    double ***mat,
    int **needle,
    int **candidate,
    int n_needle,
    int n_candidate
) {
    size_t n_rows = (n_needle > 0) ? (size_t) n_needle : 0;
    size_t n_cols = (n_candidate > 0) ? (size_t) n_candidate : 0;
    size_t done = 0;
    size_t r;

    /* Always request at least one element: calloc(0, ...) may return NULL,
     * which would be indistinguishable from an allocation failure. */
    *mat       = calloc(n_rows ? n_rows : 1, sizeof **mat);
    *needle    = calloc(n_rows ? n_rows : 1, sizeof **needle);
    *candidate = calloc(n_cols ? n_cols : 1, sizeof **candidate);

    if (*mat != NULL && *needle != NULL && *candidate != NULL) {
        for (done = 0; done < n_rows; done++) {
            (*mat)[done] = calloc(n_cols ? n_cols : 1, sizeof ***mat);
            if ((*mat)[done] == NULL) {
                break;
            }
        }
        if (done == n_rows) {
            return;
        }
    }

    /* Failure: unwind whatever was allocated. */
    if (*mat != NULL) {
        for (r = 0; r < done; r++) {
            free((*mat)[r]);
        }
    }
    free(*mat);
    free(*needle);
    free(*candidate);
    *mat = NULL;
    *needle = NULL;
    *candidate = NULL;
}
95
+
96
/* free_vars
 *
 * Releases the alignment matrix (n_needle rows) together with the needle and
 * candidate arrays, and resets all three pointers to NULL. Pointers that are
 * already NULL are skipped, so the function is safe to call after a failed
 * allocation or a second time. n_candidate is accepted for symmetry with
 * alloc_vars() and is not needed to free anything.
 */
void free_vars(
    double ***mat,
    int **needle,
    int **candidate,
    int n_needle,
    int n_candidate
) {
    int row;

    (void) n_candidate;

    free(*needle);
    *needle = NULL;

    free(*candidate);
    *candidate = NULL;

    if (*mat != NULL) {
        for (row = 0; row < n_needle; row++) {
            free((*mat)[row]);
        }
        free(*mat);
        *mat = NULL;
    }
}
120
+
121
+ /* assign_cells
122
+ *
123
+ * Called within the smith_waterman loop; this is the function that assigns
124
+ * each cell of the alignment matrix; the value of each cell represents the
125
+ * score of that given alignment, up to that point, taking into account any
126
+ * deletions, additions, etc.. that result in that alignment. For two strings
127
+ * of length m, n the alignment matrix is m x n large. As the values are
128
+ * assigned, we keep track of the cell with the higest score. At the end of
129
+ * the assignment, we start at this highest-scoring cell and recursively walk
130
+ * backwards through the cells, maximizing the score at each step. This
131
+ * becomes the highest scoring alignment
132
+ */
133
+ void assign_cells(
134
+ double **mat,
135
+ int *needle,
136
+ int *candidate,
137
+ int n_needle,
138
+ int n_candidate,
139
+ int *i_max,
140
+ int *j_max,
141
+ double *max_score
142
+ ) {
143
+ int i, j;
144
+ double score, value;
145
+
146
+ for (i=0; i<n_needle; i++) {
147
+ for (j=0; j<n_candidate; j++) {
148
+ mat[i][j] = 0.0;
149
+ }
150
+ }
151
+
152
+ *max_score = -10000.0;
153
+ for (i=1; i<n_needle; i++) {
154
+ for (j=1; j<n_candidate; j++) {
155
+ if (needle[i-1] == candidate[j-1]) {
156
+ score = SCORE_MATCH;
157
+ } else {
158
+ score = SCORE_MISS;
159
+ }
160
+
161
+ value = max4(0.0, mat[i-1][j-1] + score, mat[i-1][j] + SCORE_DELETE, mat[i][j-1] + SCORE_INSERT);
162
+ mat[i][j] = value;
163
+
164
+ if (value > *max_score) {
165
+ *max_score = value;
166
+ *i_max = i;
167
+ *j_max = j;
168
+ }
169
+ }
170
+ }
171
+ }
172
+
173
+ /* recurse_optimal_path
174
+ *
175
+ * This method searches the alignment matrix from a starting point,
176
+ * and recursively walks backwards, maximizing the score at each step.
177
+ * This will result in an alignment of greatest score.
178
+ */
179
+ void recurse_optimal_path(double **mat, int i, int j, VALUE alignment) {
180
+ int ii, jj;
181
+ double max_value;
182
+
183
+ // Push current position of the alignment into the array that stores it.
184
+ rb_ary_push(alignment, INT2NUM(i));
185
+ rb_ary_push(alignment, INT2NUM(j));
186
+
187
+ max_value = max3(mat[i-1][j-1], mat[i-1][j], mat[i][j-1]);
188
+
189
+ if (max_value == mat[i-1][j]) {
190
+ ii = i-1;
191
+ jj = j;
192
+ }
193
+
194
+ if (max_value == mat[i][j-1]) {
195
+ ii = i;
196
+ jj = j-1;
197
+ }
198
+
199
+ if (max_value == mat[i-1][j-1]) {
200
+ ii = i-1;
201
+ jj = j-1;
202
+ }
203
+
204
+ // The recursive loop. If we reach an edge (i.e., m(i,j) == 0), then stop
205
+ // the recursion. Otherwise, keep going!
206
+ if (mat[i][j] == 0) {
207
+ return;
208
+ } else {
209
+ return recurse_optimal_path(mat, ii, jj, alignment);
210
+ }
211
+ }
@@ -0,0 +1,19 @@
1
+ #define SCORE_INSERT -0.1
2
+ #define SCORE_DELETE -1.0
3
+ #define SCORE_MISS -1.0
4
+ #define SCORE_MATCH 2.0
5
+
6
+ // Prototype some shit
7
+ VALUE FuzzBall = Qnil;
8
+ VALUE SmithWaterman = Qnil;
9
+ void Init_smith_waterman();
10
+ VALUE method_initialize(VALUE self, VALUE needle, VALUE candidate);
11
+
12
+ void assign_cells(double **mat, int *needle, int *candidate, int n_needle, int n_candidate, int *i_max, int *j_max, double *max_score);
13
+ void recurse_optimal_path(double **mat, int i, int j, VALUE alignment);
14
+ void alloc_vars(double ***mat, int **needle, int **candidate, int n_needle, int n_candidate);
15
+ void free_vars(double ***mat, int **needle, int **candidate, int n_needle, int n_candidate);
16
+
17
+ double max(double a, double b);
18
+ double max3(double a, double b, double c);
19
+ double max4(double a, double b, double c, double d);
@@ -0,0 +1,5 @@
1
# Generates the Makefile for the fuzz_ball/smith_waterman extension.
require 'mkmf'

extension_name = "fuzz_ball/smith_waterman"

dir_config(extension_name)
create_makefile(extension_name)
5
+
data/lib/fuzz_ball.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'fuzz_ball/smith_waterman'
2
+ require 'fuzz_ball/duple_index'
3
+
4
# Namespace of the gem. Searcher is registered for autoloading from
# 'fuzz_ball/searcher' instead of being required up front.
module FuzzBall
  autoload(:Searcher, 'fuzz_ball/searcher')
end
@@ -0,0 +1,92 @@
1
module FuzzBall
  # Fuzzy search over a list of strings.
  #
  # Every string is stored twice: as given (+files+) and as an array of
  # Unicode code points (+files_array+). A DupleIndex built from the code
  # point arrays narrows a query down to candidates, which are then scored
  # against the needle with SmithWaterman.
  #
  # Options:
  #   :ignore - substring(s) removed from every string, and from the needle,
  #             before it is converted to code points
  class Searcher

    attr_reader :files, :files_array, :options, :duple_index

    # files - array of strings to search in
    # opts  - see the class comment
    def initialize(files, opts = {})
      @options     = opts
      @files       = files
      @files_array = files.collect { |f| str2arr(f) }

      index_duples!
    end

    # Adds one more string to the searchable set and indexes it.
    # Always returns true.
    def add(str)
      str_arr = str2arr(str)

      files << str
      files_array << str_arr

      duple_index.add(files_array.count - 1, str_arr)

      true
    end

    # Returns an array of result hashes for +needle+, each containing
    #   :alignment - alignment reported by SmithWaterman
    #   :score     - SmithWaterman score divided by the candidate length
    #   :string    - the candidate converted back to a string
    #
    # opts:
    #   :order - :descending sorts best score first; anything else sorts
    #            ascending
    #   :limit - Integer; keep only the first :limit results after sorting
    #
    # A needle with fewer than two code points left after :ignore processing
    # yields an empty result.
    def search(needle, opts = {})
      needle_ary = str2arr(needle)
      results    = []

      # Measure what is actually aligned: the :ignore option may have removed
      # characters from the raw needle.
      return results if needle_ary.length < 2

      decimate_strings!(needle_ary).each do |candidate|
        smith = SmithWaterman.new(needle_ary, candidate)

        results << {
          :alignment => smith.alignment,
          # normalize by string length; this favors shorter strings even if a
          # longer string has a higher smith score
          :score     => (smith.score / candidate.length),
          :string    => arr2str(candidate)
        }
      end

      if opts[:order] == :descending
        results.sort! { |a, b| b[:score] <=> a[:score] }
      else
        results.sort! { |a, b| a[:score] <=> b[:score] }
      end

      # Integer rather than Fixnum: the Fixnum constant was removed in Ruby 3.2.
      results = results.first(opts[:limit]) if opts[:limit].is_a?(Integer)

      results
    end

    def inspect
      %Q[<FuzzBall::Searcher n_files=#{files_array.count}>]
    end

    private

    # Builds a fresh DupleIndex from every stored code point array.
    def index_duples!
      @duple_index = DupleIndex.new
      files_array.each_with_index do |str, index|
        duple_index.add(index, str)
      end
    end

    # Returns the code point arrays stored under the highest key of the
    # index's match result. DupleIndex#match is expected to return a mapping
    # from score to indices into files_array (defined elsewhere).
    def decimate_strings!(needle)
      matches_by_score = duple_index.match(needle)
      max_score        = matches_by_score.keys.max
      return [] if max_score.nil?

      files_array.values_at(*matches_by_score[max_score])
    end

    # Converts a string to an array of Unicode code points, first removing
    # every occurrence of the :ignore substrings when that option is set.
    def str2arr(str)
      return str.unpack("U*") unless options[:ignore]

      # Regexp.union escapes plain strings itself.
      ignored = Regexp.union(Array(options[:ignore]))
      str.gsub(ignored, "").unpack("U*")
    end

    # Converts an array of Unicode code points back to a string.
    def arr2str(arr)
      arr.pack("U*")
    end
  end
end