code_zauker 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/doc/_index.html CHANGED
@@ -4,7 +4,7 @@
4
4
  <head>
5
5
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
6
6
  <title>
7
- Code Zauker 0.0.2 Documentation
7
+ Code Zauker 0.0.3 Documentation
8
8
 
9
9
  </title>
10
10
 
@@ -52,7 +52,7 @@
52
52
 
53
53
  <iframe id="search_frame"></iframe>
54
54
 
55
- <div id="content"><h1 class="noborder title">Code Zauker 0.0.2 Documentation</h1>
55
+ <div id="content"><h1 class="noborder title">Code Zauker 0.0.3 Documentation</h1>
56
56
  <div id="listing">
57
57
  <h1 class="alphaindex">Alphabetic Index</h1>
58
58
 
@@ -94,6 +94,34 @@
94
94
  </ul>
95
95
  </ul>
96
96
 
97
+
98
+ <ul id="alpha_G" class="alpha">
99
+ <li class="letter">G</li>
100
+ <ul>
101
+
102
+ <li>
103
+ <span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
104
+
105
+ </li>
106
+
107
+ </ul>
108
+ </ul>
109
+
110
+
111
+ <ul id="alpha_U" class="alpha">
112
+ <li class="letter">U</li>
113
+ <ul>
114
+
115
+ <li>
116
+ <span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span>
117
+
118
+ <small>(CodeZauker)</small>
119
+
120
+ </li>
121
+
122
+ </ul>
123
+ </ul>
124
+
97
125
  </td>
98
126
  </tr>
99
127
  </table>
@@ -103,7 +131,7 @@
103
131
  </div>
104
132
 
105
133
  <div id="footer">
106
- Generated on Fri Jan 27 14:54:06 2012 by
134
+ Generated on Fri Feb 3 17:18:43 2012 by
107
135
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
108
136
  0.7.4 (ruby-1.9.3).
109
137
  </div>
data/doc/class_list.html CHANGED
@@ -39,7 +39,7 @@
39
39
 
40
40
  <ul id="full_list" class="class">
41
41
  <li><span class='object_link'><a href="top-level-namespace.html" title=" (root)">Top Level Namespace</a></span></li>
42
- <li><a class='toggle'></a> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span><small class='search_info'>Top Level Namespace</small></li><ul><li><span class='object_link'><a href="CodeZauker/FileScanner.html" title="CodeZauker::FileScanner (class)">FileScanner</a></span> &lt; Object<small class='search_info'>CodeZauker</small></li></ul>
42
+ <li><a class='toggle'></a> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span><small class='search_info'>Top Level Namespace</small></li><ul><li><span class='object_link'><a href="CodeZauker/FileScanner.html" title="CodeZauker::FileScanner (class)">FileScanner</a></span> &lt; Object<small class='search_info'>CodeZauker</small></li><li><span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span> &lt; Object<small class='search_info'>CodeZauker</small></li></ul><li><span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span><small class='search_info'>Top Level Namespace</small></li>
43
43
 
44
44
  </ul>
45
45
  </div>
data/doc/frames.html CHANGED
@@ -4,7 +4,7 @@
4
4
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
5
  <head>
6
6
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
7
- <title>Code Zauker 0.0.2 Documentation</title>
7
+ <title>Code Zauker 0.0.3 Documentation</title>
8
8
  </head>
9
9
  <frameset cols="20%,*">
10
10
  <frame name="list" src="class_list.html" />
data/doc/index.html CHANGED
@@ -4,7 +4,7 @@
4
4
  <head>
5
5
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
6
6
  <title>
7
- Code Zauker 0.0.2 Documentation
7
+ Code Zauker 0.0.3 Documentation
8
8
 
9
9
  </title>
10
10
 
@@ -52,7 +52,7 @@
52
52
 
53
53
  <iframe id="search_frame"></iframe>
54
54
 
55
- <div id="content"><h1 class="noborder title">Code Zauker 0.0.2 Documentation</h1>
55
+ <div id="content"><h1 class="noborder title">Code Zauker 0.0.3 Documentation</h1>
56
56
  <div id="listing">
57
57
  <h1 class="alphaindex">Alphabetic Index</h1>
58
58
 
@@ -94,6 +94,34 @@
94
94
  </ul>
95
95
  </ul>
96
96
 
97
+
98
+ <ul id="alpha_G" class="alpha">
99
+ <li class="letter">G</li>
100
+ <ul>
101
+
102
+ <li>
103
+ <span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
104
+
105
+ </li>
106
+
107
+ </ul>
108
+ </ul>
109
+
110
+
111
+ <ul id="alpha_U" class="alpha">
112
+ <li class="letter">U</li>
113
+ <ul>
114
+
115
+ <li>
116
+ <span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span>
117
+
118
+ <small>(CodeZauker)</small>
119
+
120
+ </li>
121
+
122
+ </ul>
123
+ </ul>
124
+
97
125
  </td>
98
126
  </tr>
99
127
  </table>
@@ -103,7 +131,7 @@
103
131
  </div>
104
132
 
105
133
  <div id="footer">
106
- Generated on Fri Jan 27 14:54:06 2012 by
134
+ Generated on Fri Feb 3 17:18:43 2012 by
107
135
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
108
136
  0.7.4 (ruby-1.9.3).
109
137
  </div>
data/doc/method_list.html CHANGED
@@ -48,6 +48,22 @@
48
48
  </li>
49
49
 
50
50
 
51
+ <li class="r2 ">
52
+ <span class='object_link'><a href="CodeZauker/Util.html#ensureUTF8-instance_method" title="CodeZauker::Util#ensureUTF8 (method)">#ensureUTF8</a></span>
53
+
54
+ <small>CodeZauker::Util</small>
55
+
56
+ </li>
57
+
58
+
59
+ <li class="r1 ">
60
+ <span class='object_link'><a href="Grep.html#grep-instance_method" title="Grep#grep (method)">#grep</a></span>
61
+
62
+ <small>Grep</small>
63
+
64
+ </li>
65
+
66
+
51
67
  <li class="r2 ">
52
68
  <span class='object_link'><a href="CodeZauker/FileScanner.html#initialize-instance_method" title="CodeZauker::FileScanner#initialize (method)">#initialize</a></span>
53
69
 
@@ -57,6 +73,14 @@
57
73
 
58
74
 
59
75
  <li class="r1 ">
76
+ <span class='object_link'><a href="CodeZauker/FileScanner.html#isearch-instance_method" title="CodeZauker::FileScanner#isearch (method)">#isearch</a></span>
77
+
78
+ <small>CodeZauker::FileScanner</small>
79
+
80
+ </li>
81
+
82
+
83
+ <li class="r2 ">
60
84
  <span class='object_link'><a href="CodeZauker/FileScanner.html#load-instance_method" title="CodeZauker::FileScanner#load (method)">#load</a></span>
61
85
 
62
86
  <small>CodeZauker::FileScanner</small>
@@ -64,6 +88,30 @@
64
88
  </li>
65
89
 
66
90
 
91
+ <li class="r1 ">
92
+ <span class='object_link'><a href="CodeZauker/FileScanner.html#map_ids_to_files-instance_method" title="CodeZauker::FileScanner#map_ids_to_files (method)">#map_ids_to_files</a></span>
93
+
94
+ <small>CodeZauker::FileScanner</small>
95
+
96
+ </li>
97
+
98
+
99
+ <li class="r2 ">
100
+ <span class='object_link'><a href="CodeZauker/Util.html#mixCase-instance_method" title="CodeZauker::Util#mixCase (method)">#mixCase</a></span>
101
+
102
+ <small>CodeZauker::Util</small>
103
+
104
+ </li>
105
+
106
+
107
+ <li class="r1 ">
108
+ <span class='object_link'><a href="CodeZauker/FileScanner.html#reindex-instance_method" title="CodeZauker::FileScanner#reindex (method)">#reindex</a></span>
109
+
110
+ <small>CodeZauker::FileScanner</small>
111
+
112
+ </li>
113
+
114
+
67
115
  <li class="r2 ">
68
116
  <span class='object_link'><a href="CodeZauker/FileScanner.html#remove-instance_method" title="CodeZauker::FileScanner#remove (method)">#remove</a></span>
69
117
 
@@ -6,7 +6,7 @@
6
6
  <title>
7
7
  Top Level Namespace
8
8
 
9
- &mdash; Code Zauker 0.0.2 Documentation
9
+ &mdash; Code Zauker 0.0.3 Documentation
10
10
 
11
11
  </title>
12
12
 
@@ -78,7 +78,7 @@
78
78
  <p class="children">
79
79
 
80
80
 
81
- <strong class="modules">Modules:</strong> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span>
81
+ <strong class="modules">Modules:</strong> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span>, <span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
82
82
 
83
83
 
84
84
 
@@ -94,7 +94,7 @@
94
94
  </div>
95
95
 
96
96
  <div id="footer">
97
- Generated on Fri Jan 27 14:54:06 2012 by
97
+ Generated on Fri Feb 3 17:18:44 2012 by
98
98
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
99
99
  0.7.4 (ruby-1.9.3).
100
100
  </div>
data/lib/code_zauker.rb CHANGED
@@ -10,6 +10,70 @@ require 'set'
10
10
  module CodeZauker
11
11
  GRAM_SIZE=3
12
12
  SPACE_GUY=" "*GRAM_SIZE
13
+
# = Basic utility class
# Small helpers for building case-mixed search terms and for normalising
# the encoding of text read from source files.
class Util
  # Compute all the possible case-mixed variants of +trigram+.
  # It works for every string size, including the empty string
  # (which yields <tt>[""]</tt>).
  #
  # The result has 2**length entries, ordered as a binary counter whose
  # most significant bit maps to the first character: a set bit means the
  # character is upcased, a clear bit means it stays downcased.
  # For "ab" the result is ["ab", "aB", "Ab", "AB"].
  # Characters without a case distinction produce duplicate entries.
  #
  # @param trigram [String] the text to expand (any case)
  # @return [Array<String>] every upper/lower case combination
  def mixCase(trigram)
    letters = trigram.downcase.chars.to_a
    width = letters.length
    (0...(2**width)).map do |mask|
      letters.each_with_index.map { |ch, idx|
        # Integer#[] reads a single bit; bit (width-1) drives the first char.
        mask[width - 1 - idx] == 1 ? ch.upcase : ch
      }.join
    end
  end

  # = Ensure data are correctly imported
  # http://blog.grayproductions.net/articles/ruby_19s_string
  # This code tries to "guess" the right encoding, falling back to
  # ISO-8859-1 when the string is not valid in its current encoding.
  # Typical use case: Italian source code wrongly interpreted as UTF-8
  # whereas it is an ISO-8859 Windows file.
  #
  # The argument is never modified: the fallback works on a copy, so
  # frozen strings are accepted and the caller keeps its original bytes.
  #
  # @param untrusted_string [String] text whose encoding may be wrong
  # @return [String] the same object when it is already valid; otherwise a
  #   new UTF-8 string transcoded from ISO-8859-1, with undefined or
  #   invalid sequences replaced
  def ensureUTF8(untrusted_string)
    return untrusted_string if untrusted_string.valid_encoding?

    # Options are passed as keywords (not a positional Hash) so the call
    # is accepted by both old and current Ruby versions.
    untrusted_string.dup.force_encoding("ISO-8859-1").encode("UTF-8", :undef => :replace, :invalid => :replace)
  end
end
76
+
13
77
  # Scan a file and push it inside redis...
14
78
  # then it provides handy methods to find the files containing a trigram...
15
79
  class FileScanner
@@ -20,34 +84,66 @@ module CodeZauker
20
84
  @redis=redisConnection
21
85
  end
22
86
  end
23
- def disconnect()
87
+
88
+
89
+ def disconnect()
24
90
  @redis.quit
25
91
  end
26
92
 
93
+
27
94
 
28
95
 
96
+
# Push the trigram set +s+ of one file, retrying when the push raises
# Errno::EAGAIN.
#
# Progress lines are printed only when the set holds more than half of
# TRIGRAM_DEFAULT_PUSH_SIZE entries. After MAX_PUSH_TRIGRAM_RETRIES failed
# attempts the last Errno::EAGAIN is re-raised. A warning is printed when
# pushTrigramsSetRecoverable reports that a case-insensitive trigram
# could not be stored.
#
# s::        collection of trigrams (responds to +length+)
# fid::      id of the file the trigrams belong to
# filename:: name of the file, used only in the log messages
def pushTrigramsSet(s, fid, filename)
  verbose = s.length > (TRIGRAM_DEFAULT_PUSH_SIZE/2)
  puts " >Pushing...#{s.length} for id #{fid}=#{filename}" if verbose

  attempts = 0
  begin
    attempts += 1
    ci_failed = pushTrigramsSetRecoverable(s, fid, filename)
  rescue Errno::EAGAIN => ea
    if attempts >= MAX_PUSH_TRIGRAM_RETRIES
      puts "FATAL: Too many Errno::EAGAIN Errors"
      raise ea
    end
    puts "Trouble storing #{s.length} data. Retrying..."
    retry
  end

  puts " <Pushed #{s.length}..." if verbose
  puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if ci_failed
end
49
128
 
50
- private :pushTrigramsSet
# Issue every SADD for one file from inside a single @redis.multi block.
#
# For each trigram three sets are updated: "trigram:<t>" (file ids),
# "fscan:trigramsOnFile:<fid>" (trigrams of this file) and
# "trigram:ci:<downcased t>" (case-insensitive index). An ArgumentError
# raised while handling the case-insensitive entry is swallowed and only
# remembered.
#
# Returns true when at least one case-insensitive entry failed that way.
# NOTE(review): presumably the multi block maps to a Redis MULTI/EXEC
# transaction -- confirm against the redis client in use.
def pushTrigramsSetRecoverable(s, fid, filename)
  ci_failed = false
  @redis.multi do
    s.each { |trigram|
      @redis.sadd "trigram:#{trigram}", fid
      @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
      # Add the case-insensitive trigram
      begin
        @redis.sadd "trigram:ci:#{trigram.downcase}", fid
      rescue ArgumentError
        ci_failed = true
      end
    }
  end
  ci_failed
end
145
+ private :pushTrigramsSetRecoverable
146
+
51
147
 
52
148
  def load(filename, noReload=false)
53
149
  # Define my redis id...
@@ -61,7 +157,7 @@ module CodeZauker
61
157
  @redis.set "fscan:id2filename:#{fid}",filename
62
158
  else
63
159
  if noReload
64
- puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
160
+ #puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
65
161
  return nil
66
162
  end
67
163
  end
@@ -73,10 +169,12 @@ module CodeZauker
73
169
  # before sending it to redis. This avoids
74
170
  # a lot of spurious work
75
171
  s=Set.new
76
- File.open(filename,"r") do |f|
172
+ File.open(filename,"r") { |f|
77
173
  lines=f.readlines()
78
- adaptiveSize= 6000
79
- lines.each do |l|
174
+ adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
175
+ util=Util.new()
176
+ lines.each do |lineNotUTF8|
177
+ l= util.ensureUTF8(lineNotUTF8)
80
178
  # Split each line into 3-char chunks, and store in a redis set
81
179
  i=0
82
180
  for istart in 0...(l.length-GRAM_SIZE)
@@ -95,7 +193,7 @@ module CodeZauker
95
193
  #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
96
194
  end
97
195
  end
98
- end
196
+ }
99
197
 
100
198
  if s.length > 0
101
199
  pushTrigramsSet(s,fid,filename)
@@ -107,22 +205,14 @@ module CodeZauker
107
205
  trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}"
108
206
  @redis.sadd "fscan:processedFiles", "#{filename}"
109
207
  trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
110
- if trigramRatio < 10 or trigramRatio >75
111
- puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} "
208
+ if trigramRatio < 10 or trigramRatio >75
209
+ puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70
112
210
  end
113
211
  return nil
114
212
  end
115
213
 
116
- # = search
117
- # Find a list of file candidates to a search string
118
- # The search string is padded into trigrams
119
- def search(term)
120
- if term.length < GRAM_SIZE
121
- raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
122
- end
123
- #puts " ** Searching: #{term}"
124
- # split the term in a padded trigram
125
- trigramInAnd=[]
214
+ def split_in_trigrams(term, prefix)
215
+ trigramInAnd=Set.new()
126
216
  # Search=> Sea AND ear AND arc AND rch
127
217
  for j in 0...term.length
128
218
  currentTrigram=term[j,GRAM_SIZE]
@@ -130,22 +220,59 @@ module CodeZauker
130
220
  # We are at the end...
131
221
  break
132
222
  end
133
- trigramInAnd.push("trigram:#{currentTrigram}")
223
+ trigramInAnd.add("#{prefix}:#{currentTrigram}")
134
224
  end
135
- #puts "Trigam conversion /#{term}/ into #{trigramInAnd}"
136
- if trigramInAnd.length==0
137
- return []
138
- end
139
- fileIds= @redis.sinter(*trigramInAnd)
225
+ return trigramInAnd
226
+ end
227
+
# Translate file ids into file names by reading "fscan:id2filename:<id>"
# for every id. Ids for which redis returns nil are left out of the result.
#
# fileIds:: collection of file ids
# Returns an Array of file names, in the order of +fileIds+.
def map_ids_to_files(fileIds)
  fileIds.map { |id| @redis.get("fscan:id2filename:#{id}") }.compact
end
148
238
 
239
+
240
+
241
+
# = Do a case-insensitive search
# The term is downcased and split into trigrams that are looked up in the
# special "trigram:ci:*" sets (all downcase). The file ids common to
# every trigram are then mapped to file names.
# Returns an empty Array when the term produces no trigram.
def isearch(term)
  ci_keys = split_in_trigrams(term.downcase, "trigram:ci")
  return [] if ci_keys.length == 0

  map_ids_to_files(@redis.sinter(*ci_keys))
end
255
+
256
+
# = search
# Find the list of files that are candidates for a search string.
# The search string is split into trigrams ("trigram:*" sets) and the
# file ids common to every trigram are mapped to file names.
# Raises a RuntimeError when +term+ is shorter than GRAM_SIZE.
# Returns an empty Array when the term produces no trigram.
def search(term)
  raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character" if term.length < GRAM_SIZE

  and_keys = split_in_trigrams(term, "trigram")
  return [] if and_keys.length == 0

  map_ids_to_files(@redis.sinter(*and_keys))
end
275
+
149
276
  def reindex(fileList)
150
277
  #puts "Reindexing... #{fileList.length} files..."
151
278
  fileList.each do |current_file |
@@ -156,7 +283,14 @@ module CodeZauker
156
283
 
# Remove all the keys: everything matching "fscan:*" and "trigram*" is
# deleted one key at a time, then "fscan:processedFiles" is deleted
# explicitly.
# Returns the result of that last delete.
def removeAll()
  doomed = @redis.keys("fscan:*")
  doomed.concat(@redis.keys("trigram*"))
  doomed.each { |key| @redis.del key }
  @redis.del "fscan:processedFiles"
end
161
295
 
162
296
  # Remove the files from the index, updating trigrams
@@ -178,22 +312,29 @@ module CodeZauker
178
312
  if trigramsToExpurge.length==0
179
313
  puts "?Nothing to do on #{filename}"
180
314
  end
181
- puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
315
+ puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
182
316
  trigramsToExpurge.each do | ts |
183
317
  @redis.srem "trigram:#{ts}", fid
184
318
  begin
185
319
  @redis.srem "trigram:ci:#{ts.downcase}",fid
320
+ #putc "."
186
321
  rescue ArgumentError
187
322
  # Ignore "ArgumentError: invalid byte sequence in UTF-8"
188
323
  # and proceed...
189
324
  end
190
325
  end
326
+ #putc "\n"
191
327
 
192
- @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
328
+ @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
193
329
  @redis.srem "fscan:processedFiles", filename
194
330
  end
195
331
  return nil
196
332
  end
197
333
 
334
+ private :pushTrigramsSet
335
+ private :split_in_trigrams
336
+ #private :map_ids_to_files
337
+
338
+
198
339
  end
199
340
  end