code_zauker 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/BUGS.org +4 -0
- data/CHANGELOG.org +15 -0
- data/LICENSE.txt +674 -0
- data/Rakefile +11 -2
- data/bin/czindexer +46 -39
- data/bin/czsearch +85 -4
- data/code_zauker.gemspec +1 -0
- data/doc/CodeZauker.html +42 -13
- data/doc/CodeZauker/FileScanner.html +373 -183
- data/doc/CodeZauker/Util.html +360 -0
- data/doc/Grep.html +344 -0
- data/doc/_index.html +31 -3
- data/doc/class_list.html +1 -1
- data/doc/frames.html +1 -1
- data/doc/index.html +31 -3
- data/doc/method_list.html +48 -0
- data/doc/top-level-namespace.html +3 -3
- data/lib/code_zauker.rb +182 -41
- data/lib/code_zauker/constants.rb +19 -7
- data/lib/code_zauker/grep.rb +17 -13
- data/lib/code_zauker/version.rb +1 -1
- data/readme.org +19 -8
- data/test/fixture/TEST_LICENSE.txt +0 -970
- data/test/fixture/kurukku.txt +3 -1
- data/test/fixture/testArchive.zip +0 -0
- data/test/test_search.rb +87 -12
- metadata +25 -8
data/doc/_index.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<head>
|
5
5
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
6
6
|
<title>
|
7
|
-
Code Zauker 0.0.
|
7
|
+
Code Zauker 0.0.3 Documentation
|
8
8
|
|
9
9
|
</title>
|
10
10
|
|
@@ -52,7 +52,7 @@
|
|
52
52
|
|
53
53
|
<iframe id="search_frame"></iframe>
|
54
54
|
|
55
|
-
<div id="content"><h1 class="noborder title">Code Zauker 0.0.
|
55
|
+
<div id="content"><h1 class="noborder title">Code Zauker 0.0.3 Documentation</h1>
|
56
56
|
<div id="listing">
|
57
57
|
<h1 class="alphaindex">Alphabetic Index</h1>
|
58
58
|
|
@@ -94,6 +94,34 @@
|
|
94
94
|
</ul>
|
95
95
|
</ul>
|
96
96
|
|
97
|
+
|
98
|
+
<ul id="alpha_G" class="alpha">
|
99
|
+
<li class="letter">G</li>
|
100
|
+
<ul>
|
101
|
+
|
102
|
+
<li>
|
103
|
+
<span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
|
104
|
+
|
105
|
+
</li>
|
106
|
+
|
107
|
+
</ul>
|
108
|
+
</ul>
|
109
|
+
|
110
|
+
|
111
|
+
<ul id="alpha_U" class="alpha">
|
112
|
+
<li class="letter">U</li>
|
113
|
+
<ul>
|
114
|
+
|
115
|
+
<li>
|
116
|
+
<span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span>
|
117
|
+
|
118
|
+
<small>(CodeZauker)</small>
|
119
|
+
|
120
|
+
</li>
|
121
|
+
|
122
|
+
</ul>
|
123
|
+
</ul>
|
124
|
+
|
97
125
|
</td>
|
98
126
|
</tr>
|
99
127
|
</table>
|
@@ -103,7 +131,7 @@
|
|
103
131
|
</div>
|
104
132
|
|
105
133
|
<div id="footer">
|
106
|
-
Generated on Fri
|
134
|
+
Generated on Fri Feb 3 17:18:43 2012 by
|
107
135
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
108
136
|
0.7.4 (ruby-1.9.3).
|
109
137
|
</div>
|
data/doc/class_list.html
CHANGED
@@ -39,7 +39,7 @@
|
|
39
39
|
|
40
40
|
<ul id="full_list" class="class">
|
41
41
|
<li><span class='object_link'><a href="top-level-namespace.html" title=" (root)">Top Level Namespace</a></span></li>
|
42
|
-
<li><a class='toggle'></a> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span><small class='search_info'>Top Level Namespace</small></li><ul><li><span class='object_link'><a href="CodeZauker/FileScanner.html" title="CodeZauker::FileScanner (class)">FileScanner</a></span> < Object<small class='search_info'>CodeZauker</small></li></ul>
|
42
|
+
<li><a class='toggle'></a> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span><small class='search_info'>Top Level Namespace</small></li><ul><li><span class='object_link'><a href="CodeZauker/FileScanner.html" title="CodeZauker::FileScanner (class)">FileScanner</a></span> < Object<small class='search_info'>CodeZauker</small></li><li><span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span> < Object<small class='search_info'>CodeZauker</small></li></ul><li><span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span><small class='search_info'>Top Level Namespace</small></li>
|
43
43
|
|
44
44
|
</ul>
|
45
45
|
</div>
|
data/doc/frames.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
5
5
|
<head>
|
6
6
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
7
|
-
<title>Code Zauker 0.0.
|
7
|
+
<title>Code Zauker 0.0.3 Documentation</title>
|
8
8
|
</head>
|
9
9
|
<frameset cols="20%,*">
|
10
10
|
<frame name="list" src="class_list.html" />
|
data/doc/index.html
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
<head>
|
5
5
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
6
6
|
<title>
|
7
|
-
Code Zauker 0.0.
|
7
|
+
Code Zauker 0.0.3 Documentation
|
8
8
|
|
9
9
|
</title>
|
10
10
|
|
@@ -52,7 +52,7 @@
|
|
52
52
|
|
53
53
|
<iframe id="search_frame"></iframe>
|
54
54
|
|
55
|
-
<div id="content"><h1 class="noborder title">Code Zauker 0.0.
|
55
|
+
<div id="content"><h1 class="noborder title">Code Zauker 0.0.3 Documentation</h1>
|
56
56
|
<div id="listing">
|
57
57
|
<h1 class="alphaindex">Alphabetic Index</h1>
|
58
58
|
|
@@ -94,6 +94,34 @@
|
|
94
94
|
</ul>
|
95
95
|
</ul>
|
96
96
|
|
97
|
+
|
98
|
+
<ul id="alpha_G" class="alpha">
|
99
|
+
<li class="letter">G</li>
|
100
|
+
<ul>
|
101
|
+
|
102
|
+
<li>
|
103
|
+
<span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
|
104
|
+
|
105
|
+
</li>
|
106
|
+
|
107
|
+
</ul>
|
108
|
+
</ul>
|
109
|
+
|
110
|
+
|
111
|
+
<ul id="alpha_U" class="alpha">
|
112
|
+
<li class="letter">U</li>
|
113
|
+
<ul>
|
114
|
+
|
115
|
+
<li>
|
116
|
+
<span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span>
|
117
|
+
|
118
|
+
<small>(CodeZauker)</small>
|
119
|
+
|
120
|
+
</li>
|
121
|
+
|
122
|
+
</ul>
|
123
|
+
</ul>
|
124
|
+
|
97
125
|
</td>
|
98
126
|
</tr>
|
99
127
|
</table>
|
@@ -103,7 +131,7 @@
|
|
103
131
|
</div>
|
104
132
|
|
105
133
|
<div id="footer">
|
106
|
-
Generated on Fri
|
134
|
+
Generated on Fri Feb 3 17:18:43 2012 by
|
107
135
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
108
136
|
0.7.4 (ruby-1.9.3).
|
109
137
|
</div>
|
data/doc/method_list.html
CHANGED
@@ -48,6 +48,22 @@
|
|
48
48
|
</li>
|
49
49
|
|
50
50
|
|
51
|
+
<li class="r2 ">
|
52
|
+
<span class='object_link'><a href="CodeZauker/Util.html#ensureUTF8-instance_method" title="CodeZauker::Util#ensureUTF8 (method)">#ensureUTF8</a></span>
|
53
|
+
|
54
|
+
<small>CodeZauker::Util</small>
|
55
|
+
|
56
|
+
</li>
|
57
|
+
|
58
|
+
|
59
|
+
<li class="r1 ">
|
60
|
+
<span class='object_link'><a href="Grep.html#grep-instance_method" title="Grep#grep (method)">#grep</a></span>
|
61
|
+
|
62
|
+
<small>Grep</small>
|
63
|
+
|
64
|
+
</li>
|
65
|
+
|
66
|
+
|
51
67
|
<li class="r2 ">
|
52
68
|
<span class='object_link'><a href="CodeZauker/FileScanner.html#initialize-instance_method" title="CodeZauker::FileScanner#initialize (method)">#initialize</a></span>
|
53
69
|
|
@@ -57,6 +73,14 @@
|
|
57
73
|
|
58
74
|
|
59
75
|
<li class="r1 ">
|
76
|
+
<span class='object_link'><a href="CodeZauker/FileScanner.html#isearch-instance_method" title="CodeZauker::FileScanner#isearch (method)">#isearch</a></span>
|
77
|
+
|
78
|
+
<small>CodeZauker::FileScanner</small>
|
79
|
+
|
80
|
+
</li>
|
81
|
+
|
82
|
+
|
83
|
+
<li class="r2 ">
|
60
84
|
<span class='object_link'><a href="CodeZauker/FileScanner.html#load-instance_method" title="CodeZauker::FileScanner#load (method)">#load</a></span>
|
61
85
|
|
62
86
|
<small>CodeZauker::FileScanner</small>
|
@@ -64,6 +88,30 @@
|
|
64
88
|
</li>
|
65
89
|
|
66
90
|
|
91
|
+
<li class="r1 ">
|
92
|
+
<span class='object_link'><a href="CodeZauker/FileScanner.html#map_ids_to_files-instance_method" title="CodeZauker::FileScanner#map_ids_to_files (method)">#map_ids_to_files</a></span>
|
93
|
+
|
94
|
+
<small>CodeZauker::FileScanner</small>
|
95
|
+
|
96
|
+
</li>
|
97
|
+
|
98
|
+
|
99
|
+
<li class="r2 ">
|
100
|
+
<span class='object_link'><a href="CodeZauker/Util.html#mixCase-instance_method" title="CodeZauker::Util#mixCase (method)">#mixCase</a></span>
|
101
|
+
|
102
|
+
<small>CodeZauker::Util</small>
|
103
|
+
|
104
|
+
</li>
|
105
|
+
|
106
|
+
|
107
|
+
<li class="r1 ">
|
108
|
+
<span class='object_link'><a href="CodeZauker/FileScanner.html#reindex-instance_method" title="CodeZauker::FileScanner#reindex (method)">#reindex</a></span>
|
109
|
+
|
110
|
+
<small>CodeZauker::FileScanner</small>
|
111
|
+
|
112
|
+
</li>
|
113
|
+
|
114
|
+
|
67
115
|
<li class="r2 ">
|
68
116
|
<span class='object_link'><a href="CodeZauker/FileScanner.html#remove-instance_method" title="CodeZauker::FileScanner#remove (method)">#remove</a></span>
|
69
117
|
|
@@ -6,7 +6,7 @@
|
|
6
6
|
<title>
|
7
7
|
Top Level Namespace
|
8
8
|
|
9
|
-
— Code Zauker 0.0.
|
9
|
+
— Code Zauker 0.0.3 Documentation
|
10
10
|
|
11
11
|
</title>
|
12
12
|
|
@@ -78,7 +78,7 @@
|
|
78
78
|
<p class="children">
|
79
79
|
|
80
80
|
|
81
|
-
<strong class="modules">Modules:</strong> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span>
|
81
|
+
<strong class="modules">Modules:</strong> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span>, <span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
|
82
82
|
|
83
83
|
|
84
84
|
|
@@ -94,7 +94,7 @@
|
|
94
94
|
</div>
|
95
95
|
|
96
96
|
<div id="footer">
|
97
|
-
Generated on Fri
|
97
|
+
Generated on Fri Feb 3 17:18:44 2012 by
|
98
98
|
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
99
99
|
0.7.4 (ruby-1.9.3).
|
100
100
|
</div>
|
data/lib/code_zauker.rb
CHANGED
@@ -10,6 +10,70 @@ require 'set'
|
|
10
10
|
module CodeZauker
|
11
11
|
GRAM_SIZE=3
|
12
12
|
SPACE_GUY=" "*GRAM_SIZE
|
13
|
+
|
14
|
+
# = Basic utility class
|
15
|
+
class Util
|
16
|
+
# Compute all the possible case-mixed trigrams
|
17
|
+
# It works for every string size
|
18
|
+
# TODO: Very bad implementation, needs improvement
|
19
|
+
def mixCase(trigram)
|
20
|
+
caseMixedElements=[]
|
21
|
+
lx=trigram.length
|
22
|
+
combos=2**lx
|
23
|
+
startString=trigram.downcase
|
24
|
+
#puts "Combos... 1..#{combos}... #{startString}"
|
25
|
+
for c in 0..(combos-1) do
|
26
|
+
# Make binary
|
27
|
+
maskForStuff=c.to_s(2)
|
28
|
+
p=0
|
29
|
+
#puts maskForStuff
|
30
|
+
currentMix=""
|
31
|
+
# Pad it
|
32
|
+
if maskForStuff.length < lx
|
33
|
+
maskForStuff = ("0"*(lx-maskForStuff.length)) +maskForStuff
|
34
|
+
end
|
35
|
+
maskForStuff.each_char { | x |
|
36
|
+
#putc x
|
37
|
+
if x=="1"
|
38
|
+
currentMix +=startString[p].upcase
|
39
|
+
else
|
40
|
+
currentMix +=startString[p].downcase
|
41
|
+
end
|
42
|
+
#puts currentMix
|
43
|
+
p+=1
|
44
|
+
}
|
45
|
+
caseMixedElements.push(currentMix)
|
46
|
+
end
|
47
|
+
return caseMixedElements
|
48
|
+
end
|
49
|
+
|
50
|
+
# = Ensure Data are correctly imported
|
51
|
+
# http://blog.grayproductions.net/articles/ruby_19s_string
|
52
|
+
# This code tries to "guess" the right encoding,
|
53
|
+
# switching to ISO-8859-1 if UTF-8 is not valid.
|
54
|
+
# Typical use case: Italian source code wrongly interpreted as UTF-8
|
55
|
+
# whereas it is actually ISO-8859 (Windows) encoded.
|
56
|
+
def ensureUTF8(untrusted_string)
|
57
|
+
if untrusted_string.valid_encoding?()==false
|
58
|
+
#puts "DEBUG Trouble on #{untrusted_string}"
|
59
|
+
untrusted_string.force_encoding("ISO-8859-1")
|
60
|
+
# We try ISO-8859-1, typical on Windows
|
61
|
+
begin
|
62
|
+
valid_string=untrusted_string.encode("UTF-8", { :undef =>:replace, :invalid => :replace} )
|
63
|
+
rescue Encoding::InvalidByteSequenceError => e
|
64
|
+
raise e
|
65
|
+
end
|
66
|
+
# if valid_string != untrusted_string
|
67
|
+
# puts "CONVERTED #{valid_string} Works?#{valid_string.valid_encoding?}"
|
68
|
+
# end
|
69
|
+
return valid_string
|
70
|
+
else
|
71
|
+
return untrusted_string
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
end
|
76
|
+
|
13
77
|
# Scan a file and push it inside redis...
|
14
78
|
# then it can provide handy methods to find files containing the trigram...
|
15
79
|
class FileScanner
|
@@ -20,34 +84,66 @@ module CodeZauker
|
|
20
84
|
@redis=redisConnection
|
21
85
|
end
|
22
86
|
end
|
23
|
-
|
87
|
+
|
88
|
+
|
89
|
+
def disconnect()
|
24
90
|
@redis.quit
|
25
91
|
end
|
26
92
|
|
93
|
+
|
27
94
|
|
28
95
|
|
96
|
+
|
29
97
|
def pushTrigramsSet(s, fid, filename)
|
30
|
-
|
31
|
-
|
98
|
+
case_insensitive_trigram_failed=false
|
99
|
+
showlog=false
|
100
|
+
if s.length > (TRIGRAM_DEFAULT_PUSH_SIZE/2)
|
32
101
|
puts " >Pushing...#{s.length} for id #{fid}=#{filename}"
|
102
|
+
showlog=true
|
33
103
|
end
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
104
|
+
# Ask for a protected transaction
|
105
|
+
# Sometimes can fail...
|
106
|
+
welldone=false
|
107
|
+
tryCounter=0
|
108
|
+
while welldone == false do
|
38
109
|
begin
|
39
|
-
|
40
|
-
|
41
|
-
|
110
|
+
tryCounter +=1
|
111
|
+
case_insensitive_trigram_failed=pushTrigramsSetRecoverable(s,fid,filename)
|
112
|
+
welldone=true
|
113
|
+
rescue Errno::EAGAIN =>ea
|
114
|
+
if tryCounter >=MAX_PUSH_TRIGRAM_RETRIES
|
115
|
+
puts "FATAL: Too many Errno::EAGAIN Errors"
|
116
|
+
raise ea
|
117
|
+
else
|
118
|
+
puts "Trouble storing #{s.length} data. Retrying..."
|
119
|
+
welldone=false
|
120
|
+
end
|
42
121
|
end
|
43
122
|
end
|
44
|
-
if
|
123
|
+
if showlog
|
45
124
|
puts " <Pushed #{s.length}..."
|
46
|
-
|
47
|
-
|
125
|
+
end
|
126
|
+
puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if case_insensitive_trigram_failed
|
48
127
|
end
|
49
128
|
|
50
|
-
|
129
|
+
def pushTrigramsSetRecoverable(s, fid, filename)
|
130
|
+
error=false
|
131
|
+
@redis.multi do
|
132
|
+
s.each do | trigram |
|
133
|
+
@redis.sadd "trigram:#{trigram}",fid
|
134
|
+
@redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
|
135
|
+
# Add the case-insensitive-trigram
|
136
|
+
begin
|
137
|
+
@redis.sadd "trigram:ci:#{trigram.downcase}",fid
|
138
|
+
rescue ArgumentError
|
139
|
+
error=true
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end # multi
|
143
|
+
return error
|
144
|
+
end
|
145
|
+
private :pushTrigramsSetRecoverable
|
146
|
+
|
51
147
|
|
52
148
|
def load(filename, noReload=false)
|
53
149
|
# Define my redis id...
|
@@ -61,7 +157,7 @@ module CodeZauker
|
|
61
157
|
@redis.set "fscan:id2filename:#{fid}",filename
|
62
158
|
else
|
63
159
|
if noReload
|
64
|
-
puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
|
160
|
+
#puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
|
65
161
|
return nil
|
66
162
|
end
|
67
163
|
end
|
@@ -73,10 +169,12 @@ module CodeZauker
|
|
73
169
|
# before sending it to redis. This avoids
|
74
170
|
# a lot of spurious work
|
75
171
|
s=Set.new
|
76
|
-
File.open(filename,"r")
|
172
|
+
File.open(filename,"r") { |f|
|
77
173
|
lines=f.readlines()
|
78
|
-
adaptiveSize=
|
79
|
-
|
174
|
+
adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
|
175
|
+
util=Util.new()
|
176
|
+
lines.each do |lineNotUTF8|
|
177
|
+
l= util.ensureUTF8(lineNotUTF8)
|
80
178
|
# Split each line into 3-char chunks, and store in a redis set
|
81
179
|
i=0
|
82
180
|
for istart in 0...(l.length-GRAM_SIZE)
|
@@ -95,7 +193,7 @@ module CodeZauker
|
|
95
193
|
#puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
|
96
194
|
end
|
97
195
|
end
|
98
|
-
|
196
|
+
}
|
99
197
|
|
100
198
|
if s.length > 0
|
101
199
|
pushTrigramsSet(s,fid,filename)
|
@@ -107,22 +205,14 @@ module CodeZauker
|
|
107
205
|
trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}"
|
108
206
|
@redis.sadd "fscan:processedFiles", "#{filename}"
|
109
207
|
trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
|
110
|
-
if trigramRatio < 10 or trigramRatio >75
|
111
|
-
puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} "
|
208
|
+
if trigramRatio < 10 or trigramRatio >75
|
209
|
+
puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70
|
112
210
|
end
|
113
211
|
return nil
|
114
212
|
end
|
115
213
|
|
116
|
-
|
117
|
-
|
118
|
-
# The search string is padded into trigrams
|
119
|
-
def search(term)
|
120
|
-
if term.length < GRAM_SIZE
|
121
|
-
raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
|
122
|
-
end
|
123
|
-
#puts " ** Searching: #{term}"
|
124
|
-
# split the term in a padded trigram
|
125
|
-
trigramInAnd=[]
|
214
|
+
def split_in_trigrams(term, prefix)
|
215
|
+
trigramInAnd=Set.new()
|
126
216
|
# Search=> Sea AND ear AND arc AND rch
|
127
217
|
for j in 0...term.length
|
128
218
|
currentTrigram=term[j,GRAM_SIZE]
|
@@ -130,22 +220,59 @@ module CodeZauker
|
|
130
220
|
# We are at the end...
|
131
221
|
break
|
132
222
|
end
|
133
|
-
trigramInAnd.
|
223
|
+
trigramInAnd.add("#{prefix}:#{currentTrigram}")
|
134
224
|
end
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
fileIds= @redis.sinter(*trigramInAnd)
|
225
|
+
return trigramInAnd
|
226
|
+
end
|
227
|
+
|
228
|
+
def map_ids_to_files(fileIds)
|
140
229
|
filenames=[]
|
141
230
|
# fscan:id2filename:#{fid}....
|
142
231
|
fileIds.each do | id |
|
143
|
-
|
232
|
+
file_name=@redis.get("fscan:id2filename:#{id}")
|
233
|
+
filenames.push(file_name) if !file_name.nil?
|
144
234
|
end
|
145
235
|
#puts " ** Files found:#{filenames} from ids #{fileIds}"
|
146
236
|
return filenames
|
147
237
|
end
|
148
238
|
|
239
|
+
|
240
|
+
|
241
|
+
|
242
|
+
# = Do a case-insensitive search
|
243
|
+
# using the special set of trigrams
|
244
|
+
# "trigram:ci:*"
|
245
|
+
# all downcase
|
246
|
+
def isearch(term)
|
247
|
+
termLowercase=term.downcase()
|
248
|
+
trigramInAnd=split_in_trigrams(termLowercase,"trigram:ci")
|
249
|
+
if trigramInAnd.length==0
|
250
|
+
return []
|
251
|
+
end
|
252
|
+
fileIds= @redis.sinter(*trigramInAnd)
|
253
|
+
return map_ids_to_files(fileIds)
|
254
|
+
end
|
255
|
+
|
256
|
+
|
257
|
+
# = search
|
258
|
+
# Find a list of file candidates to a search string
|
259
|
+
# The search string is padded into trigrams
|
260
|
+
def search(term)
|
261
|
+
if term.length < GRAM_SIZE
|
262
|
+
raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
|
263
|
+
end
|
264
|
+
#puts " ** Searching: #{term}"
|
265
|
+
trigramInAnd=split_in_trigrams(term,"trigram")
|
266
|
+
#puts "Trigam conversion /#{term}/ into #{trigramInAnd}"
|
267
|
+
if trigramInAnd.length==0
|
268
|
+
return []
|
269
|
+
end
|
270
|
+
fileIds= @redis.sinter(*trigramInAnd)
|
271
|
+
fileNames=map_ids_to_files(fileIds)
|
272
|
+
#puts "DEBUG #{fileIds} #{fileNames}"
|
273
|
+
return fileNames
|
274
|
+
end
|
275
|
+
|
149
276
|
def reindex(fileList)
|
150
277
|
#puts "Reindexing... #{fileList.length} files..."
|
151
278
|
fileList.each do |current_file |
|
@@ -156,7 +283,14 @@ module CodeZauker
|
|
156
283
|
|
157
284
|
# Remove all the keys
|
158
285
|
def removeAll()
|
159
|
-
|
286
|
+
tokill=[]
|
287
|
+
tokill=@redis.keys("fscan:*")
|
288
|
+
tokill.push(*(@redis.keys("trigram*")))
|
289
|
+
tokill.each do | x |
|
290
|
+
@redis.del x
|
291
|
+
#puts "Deleted #x"
|
292
|
+
end
|
293
|
+
@redis.del "fscan:processedFiles"
|
160
294
|
end
|
161
295
|
|
162
296
|
# Remove the files from the index, updating trigrams
|
@@ -178,22 +312,29 @@ module CodeZauker
|
|
178
312
|
if trigramsToExpurge.length==0
|
179
313
|
puts "?Nothing to do on #{filename}"
|
180
314
|
end
|
181
|
-
puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
|
315
|
+
puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
|
182
316
|
trigramsToExpurge.each do | ts |
|
183
317
|
@redis.srem "trigram:#{ts}", fid
|
184
318
|
begin
|
185
319
|
@redis.srem "trigram:ci:#{ts.downcase}",fid
|
320
|
+
#putc "."
|
186
321
|
rescue ArgumentError
|
187
322
|
# Ignore "ArgumentError: invalid byte sequence in UTF-8"
|
188
323
|
# and proceed...
|
189
324
|
end
|
190
325
|
end
|
326
|
+
#putc "\n"
|
191
327
|
|
192
|
-
@redis.del
|
328
|
+
@redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
|
193
329
|
@redis.srem "fscan:processedFiles", filename
|
194
330
|
end
|
195
331
|
return nil
|
196
332
|
end
|
197
333
|
|
334
|
+
private :pushTrigramsSet
|
335
|
+
private :split_in_trigrams
|
336
|
+
#private :map_ids_to_files
|
337
|
+
|
338
|
+
|
198
339
|
end
|
199
340
|
end
|