code_zauker 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/doc/_index.html CHANGED
@@ -4,7 +4,7 @@
4
4
  <head>
5
5
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
6
6
  <title>
7
- Code Zauker 0.0.2 Documentation
7
+ Code Zauker 0.0.3 Documentation
8
8
 
9
9
  </title>
10
10
 
@@ -52,7 +52,7 @@
52
52
 
53
53
  <iframe id="search_frame"></iframe>
54
54
 
55
- <div id="content"><h1 class="noborder title">Code Zauker 0.0.2 Documentation</h1>
55
+ <div id="content"><h1 class="noborder title">Code Zauker 0.0.3 Documentation</h1>
56
56
  <div id="listing">
57
57
  <h1 class="alphaindex">Alphabetic Index</h1>
58
58
 
@@ -94,6 +94,34 @@
94
94
  </ul>
95
95
  </ul>
96
96
 
97
+
98
+ <ul id="alpha_G" class="alpha">
99
+ <li class="letter">G</li>
100
+ <ul>
101
+
102
+ <li>
103
+ <span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
104
+
105
+ </li>
106
+
107
+ </ul>
108
+ </ul>
109
+
110
+
111
+ <ul id="alpha_U" class="alpha">
112
+ <li class="letter">U</li>
113
+ <ul>
114
+
115
+ <li>
116
+ <span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span>
117
+
118
+ <small>(CodeZauker)</small>
119
+
120
+ </li>
121
+
122
+ </ul>
123
+ </ul>
124
+
97
125
  </td>
98
126
  </tr>
99
127
  </table>
@@ -103,7 +131,7 @@
103
131
  </div>
104
132
 
105
133
  <div id="footer">
106
- Generated on Fri Jan 27 14:54:06 2012 by
134
+ Generated on Fri Feb 3 17:18:43 2012 by
107
135
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
108
136
  0.7.4 (ruby-1.9.3).
109
137
  </div>
data/doc/class_list.html CHANGED
@@ -39,7 +39,7 @@
39
39
 
40
40
  <ul id="full_list" class="class">
41
41
  <li><span class='object_link'><a href="top-level-namespace.html" title=" (root)">Top Level Namespace</a></span></li>
42
- <li><a class='toggle'></a> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span><small class='search_info'>Top Level Namespace</small></li><ul><li><span class='object_link'><a href="CodeZauker/FileScanner.html" title="CodeZauker::FileScanner (class)">FileScanner</a></span> &lt; Object<small class='search_info'>CodeZauker</small></li></ul>
42
+ <li><a class='toggle'></a> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span><small class='search_info'>Top Level Namespace</small></li><ul><li><span class='object_link'><a href="CodeZauker/FileScanner.html" title="CodeZauker::FileScanner (class)">FileScanner</a></span> &lt; Object<small class='search_info'>CodeZauker</small></li><li><span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span> &lt; Object<small class='search_info'>CodeZauker</small></li></ul><li><span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span><small class='search_info'>Top Level Namespace</small></li>
43
43
 
44
44
  </ul>
45
45
  </div>
data/doc/frames.html CHANGED
@@ -4,7 +4,7 @@
4
4
  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
5
  <head>
6
6
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
7
- <title>Code Zauker 0.0.2 Documentation</title>
7
+ <title>Code Zauker 0.0.3 Documentation</title>
8
8
  </head>
9
9
  <frameset cols="20%,*">
10
10
  <frame name="list" src="class_list.html" />
data/doc/index.html CHANGED
@@ -4,7 +4,7 @@
4
4
  <head>
5
5
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
6
6
  <title>
7
- Code Zauker 0.0.2 Documentation
7
+ Code Zauker 0.0.3 Documentation
8
8
 
9
9
  </title>
10
10
 
@@ -52,7 +52,7 @@
52
52
 
53
53
  <iframe id="search_frame"></iframe>
54
54
 
55
- <div id="content"><h1 class="noborder title">Code Zauker 0.0.2 Documentation</h1>
55
+ <div id="content"><h1 class="noborder title">Code Zauker 0.0.3 Documentation</h1>
56
56
  <div id="listing">
57
57
  <h1 class="alphaindex">Alphabetic Index</h1>
58
58
 
@@ -94,6 +94,34 @@
94
94
  </ul>
95
95
  </ul>
96
96
 
97
+
98
+ <ul id="alpha_G" class="alpha">
99
+ <li class="letter">G</li>
100
+ <ul>
101
+
102
+ <li>
103
+ <span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
104
+
105
+ </li>
106
+
107
+ </ul>
108
+ </ul>
109
+
110
+
111
+ <ul id="alpha_U" class="alpha">
112
+ <li class="letter">U</li>
113
+ <ul>
114
+
115
+ <li>
116
+ <span class='object_link'><a href="CodeZauker/Util.html" title="CodeZauker::Util (class)">Util</a></span>
117
+
118
+ <small>(CodeZauker)</small>
119
+
120
+ </li>
121
+
122
+ </ul>
123
+ </ul>
124
+
97
125
  </td>
98
126
  </tr>
99
127
  </table>
@@ -103,7 +131,7 @@
103
131
  </div>
104
132
 
105
133
  <div id="footer">
106
- Generated on Fri Jan 27 14:54:06 2012 by
134
+ Generated on Fri Feb 3 17:18:43 2012 by
107
135
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
108
136
  0.7.4 (ruby-1.9.3).
109
137
  </div>
data/doc/method_list.html CHANGED
@@ -48,6 +48,22 @@
48
48
  </li>
49
49
 
50
50
 
51
+ <li class="r2 ">
52
+ <span class='object_link'><a href="CodeZauker/Util.html#ensureUTF8-instance_method" title="CodeZauker::Util#ensureUTF8 (method)">#ensureUTF8</a></span>
53
+
54
+ <small>CodeZauker::Util</small>
55
+
56
+ </li>
57
+
58
+
59
+ <li class="r1 ">
60
+ <span class='object_link'><a href="Grep.html#grep-instance_method" title="Grep#grep (method)">#grep</a></span>
61
+
62
+ <small>Grep</small>
63
+
64
+ </li>
65
+
66
+
51
67
  <li class="r2 ">
52
68
  <span class='object_link'><a href="CodeZauker/FileScanner.html#initialize-instance_method" title="CodeZauker::FileScanner#initialize (method)">#initialize</a></span>
53
69
 
@@ -57,6 +73,14 @@
57
73
 
58
74
 
59
75
  <li class="r1 ">
76
+ <span class='object_link'><a href="CodeZauker/FileScanner.html#isearch-instance_method" title="CodeZauker::FileScanner#isearch (method)">#isearch</a></span>
77
+
78
+ <small>CodeZauker::FileScanner</small>
79
+
80
+ </li>
81
+
82
+
83
+ <li class="r2 ">
60
84
  <span class='object_link'><a href="CodeZauker/FileScanner.html#load-instance_method" title="CodeZauker::FileScanner#load (method)">#load</a></span>
61
85
 
62
86
  <small>CodeZauker::FileScanner</small>
@@ -64,6 +88,30 @@
64
88
  </li>
65
89
 
66
90
 
91
+ <li class="r1 ">
92
+ <span class='object_link'><a href="CodeZauker/FileScanner.html#map_ids_to_files-instance_method" title="CodeZauker::FileScanner#map_ids_to_files (method)">#map_ids_to_files</a></span>
93
+
94
+ <small>CodeZauker::FileScanner</small>
95
+
96
+ </li>
97
+
98
+
99
+ <li class="r2 ">
100
+ <span class='object_link'><a href="CodeZauker/Util.html#mixCase-instance_method" title="CodeZauker::Util#mixCase (method)">#mixCase</a></span>
101
+
102
+ <small>CodeZauker::Util</small>
103
+
104
+ </li>
105
+
106
+
107
+ <li class="r1 ">
108
+ <span class='object_link'><a href="CodeZauker/FileScanner.html#reindex-instance_method" title="CodeZauker::FileScanner#reindex (method)">#reindex</a></span>
109
+
110
+ <small>CodeZauker::FileScanner</small>
111
+
112
+ </li>
113
+
114
+
67
115
  <li class="r2 ">
68
116
  <span class='object_link'><a href="CodeZauker/FileScanner.html#remove-instance_method" title="CodeZauker::FileScanner#remove (method)">#remove</a></span>
69
117
 
@@ -6,7 +6,7 @@
6
6
  <title>
7
7
  Top Level Namespace
8
8
 
9
- &mdash; Code Zauker 0.0.2 Documentation
9
+ &mdash; Code Zauker 0.0.3 Documentation
10
10
 
11
11
  </title>
12
12
 
@@ -78,7 +78,7 @@
78
78
  <p class="children">
79
79
 
80
80
 
81
- <strong class="modules">Modules:</strong> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span>
81
+ <strong class="modules">Modules:</strong> <span class='object_link'><a href="CodeZauker.html" title="CodeZauker (module)">CodeZauker</a></span>, <span class='object_link'><a href="Grep.html" title="Grep (module)">Grep</a></span>
82
82
 
83
83
 
84
84
 
@@ -94,7 +94,7 @@
94
94
  </div>
95
95
 
96
96
  <div id="footer">
97
- Generated on Fri Jan 27 14:54:06 2012 by
97
+ Generated on Fri Feb 3 17:18:44 2012 by
98
98
  <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
99
99
  0.7.4 (ruby-1.9.3).
100
100
  </div>
data/lib/code_zauker.rb CHANGED
@@ -10,6 +10,70 @@ require 'set'
10
10
  module CodeZauker
11
11
  GRAM_SIZE=3
12
12
  SPACE_GUY=" "*GRAM_SIZE
13
+
14
# = Basic utility class
# Helpers for case-mix expansion of a token and for recovering strings
# whose bytes are not valid in their declared encoding.
class Util
  # Compute all the possible case-mixed variants of +trigram+.
  # It works for every string size, not only for three characters.
  #
  # The input is downcased first. Then, for every integer mask in
  # 0...(2**length), the character at position i is upcased when the
  # matching bit of the mask is 1 (the most significant bit maps to the
  # first character). The result holds 2**length strings, from the
  # all-lowercase form to the all-uppercase form. Characters without a
  # case produce repeated entries; they are kept so the size and order of
  # the result do not change.
  #
  # trigram:: the String to expand.
  #
  # Returns an Array of String. An empty input yields [""] (the single
  # combination of zero characters) instead of failing on a nil character.
  def mixCase(trigram)
    chars = trigram.downcase.each_char.to_a
    width = chars.length
    (0...(2**width)).map do |mask|
      chars.each_with_index.map do |ch, idx|
        # Integer#[] reads one bit; bit (width-1-idx) belongs to char idx.
        mask[width - 1 - idx] == 1 ? ch.upcase : ch.downcase
      end.join
    end
  end

  # = Ensure data are correctly imported
  # http://blog.grayproductions.net/articles/ruby_19s_string
  # This code tries to "guess" the right encoding, falling back to
  # ISO-8859-1 when the string is not valid in its declared encoding.
  # Typical use case: an Italian source file wrongly read as UTF-8
  # whereas it was saved with an ISO-8859 Windows editor.
  #
  # untrusted_string:: the String to check.
  #
  # Returns +untrusted_string+ itself when it is already valid; otherwise
  # a new UTF-8 String obtained by reading the same bytes as ISO-8859-1.
  # Invalid or undefined characters are replaced rather than raising.
  # The argument is never modified, so frozen strings are accepted.
  def ensureUTF8(untrusted_string)
    return untrusted_string if untrusted_string.valid_encoding?

    # Passing the source encoding to encode reinterprets the bytes without
    # relabelling the caller's object. Options are given as keywords since
    # a positional Hash is not taken as options by newer Ruby versions.
    untrusted_string.encode("UTF-8", "ISO-8859-1", :invalid => :replace, :undef => :replace)
  end
end
76
+
13
77
  # Scan a file and push it inside redis...
14
78
  # then it can provide handy methods to find files containing the trigram...
15
79
  class FileScanner
@@ -20,34 +84,66 @@ module CodeZauker
20
84
  @redis=redisConnection
21
85
  end
22
86
  end
23
- def disconnect()
87
+
88
+
89
+ def disconnect()
24
90
  @redis.quit
25
91
  end
26
92
 
93
+
27
94
 
28
95
 
96
+
29
97
  def pushTrigramsSet(s, fid, filename)
30
- error=false
31
- if s.length > 5000
98
+ case_insensitive_trigram_failed=false
99
+ showlog=false
100
+ if s.length > (TRIGRAM_DEFAULT_PUSH_SIZE/2)
32
101
  puts " >Pushing...#{s.length} for id #{fid}=#{filename}"
102
+ showlog=true
33
103
  end
34
- s.each do | trigram |
35
- @redis.sadd "trigram:#{trigram}",fid
36
- @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
37
- # Add the case-insensitive-trigram
104
+ # Ask for a protected transaction
105
+ # Sometimes can fail...
106
+ welldone=false
107
+ tryCounter=0
108
+ while welldone == false do
38
109
  begin
39
- @redis.sadd "trigram:ci:#{trigram.downcase}",fid
40
- rescue ArgumentError
41
- error=true
110
+ tryCounter +=1
111
+ case_insensitive_trigram_failed=pushTrigramsSetRecoverable(s,fid,filename)
112
+ welldone=true
113
+ rescue Errno::EAGAIN =>ea
114
+ if tryCounter >=MAX_PUSH_TRIGRAM_RETRIES
115
+ puts "FATAL: Too many Errno::EAGAIN Errors"
116
+ raise ea
117
+ else
118
+ puts "Trouble storing #{s.length} data. Retrying..."
119
+ welldone=false
120
+ end
42
121
  end
43
122
  end
44
- if s.length > 5000
123
+ if showlog
45
124
  puts " <Pushed #{s.length}..."
46
- puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if error
47
- end
125
+ end
126
+ puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if case_insensitive_trigram_failed
48
127
  end
49
128
 
50
- private :pushTrigramsSet
129
# Send every trigram of one file to redis inside a single @redis.multi block.
#
# For each trigram three set additions are issued:
# * the file id is added to "trigram:<trigram>"
# * the trigram is added to "fscan:trigramsOnFile:<fid>"
# * the file id is added to "trigram:ci:<downcased trigram>"
#
# An ArgumentError raised by the case-insensitive addition is recorded
# and the loop goes on with the next trigram.
#
# s::        collection of trigrams (iterated with each).
# fid::      id of the file the trigrams belong to.
# filename:: not used by this method.
#
# Returns true when at least one case-insensitive addition raised
# ArgumentError, false otherwise. Any other exception propagates to the
# caller.
def pushTrigramsSetRecoverable(s, fid, filename)
  ci_failed = false
  @redis.multi {
    s.each { |gram|
      @redis.sadd("trigram:#{gram}", fid)
      @redis.sadd("fscan:trigramsOnFile:#{fid}", gram)
      begin
        # Case-insensitive index entry
        @redis.sadd("trigram:ci:#{gram.downcase}", fid)
      rescue ArgumentError
        ci_failed = true
      end
    }
  }
  ci_failed
end
145
+ private :pushTrigramsSetRecoverable
146
+
51
147
 
52
148
  def load(filename, noReload=false)
53
149
  # Define my redis id...
@@ -61,7 +157,7 @@ module CodeZauker
61
157
  @redis.set "fscan:id2filename:#{fid}",filename
62
158
  else
63
159
  if noReload
64
- puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
160
+ #puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
65
161
  return nil
66
162
  end
67
163
  end
@@ -73,10 +169,12 @@ module CodeZauker
73
169
  # before sending it to redis. This avoid
74
170
  # a lot of spourios work
75
171
  s=Set.new
76
- File.open(filename,"r") do |f|
172
+ File.open(filename,"r") { |f|
77
173
  lines=f.readlines()
78
- adaptiveSize= 6000
79
- lines.each do |l|
174
+ adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
175
+ util=Util.new()
176
+ lines.each do |lineNotUTF8|
177
+ l= util.ensureUTF8(lineNotUTF8)
80
178
  # Split each line into 3-char chunks, and store in a redis set
81
179
  i=0
82
180
  for istart in 0...(l.length-GRAM_SIZE)
@@ -95,7 +193,7 @@ module CodeZauker
95
193
  #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
96
194
  end
97
195
  end
98
- end
196
+ }
99
197
 
100
198
  if s.length > 0
101
199
  pushTrigramsSet(s,fid,filename)
@@ -107,22 +205,14 @@ module CodeZauker
107
205
  trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}"
108
206
  @redis.sadd "fscan:processedFiles", "#{filename}"
109
207
  trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
110
- if trigramRatio < 10 or trigramRatio >75
111
- puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} "
208
+ if trigramRatio < 10 or trigramRatio >75
209
+ puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70
112
210
  end
113
211
  return nil
114
212
  end
115
213
 
116
- # = search
117
- # Find a list of file candidates to a search string
118
- # The search string is padded into trigrams
119
- def search(term)
120
- if term.length < GRAM_SIZE
121
- raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
122
- end
123
- #puts " ** Searching: #{term}"
124
- # split the term in a padded trigram
125
- trigramInAnd=[]
214
+ def split_in_trigrams(term, prefix)
215
+ trigramInAnd=Set.new()
126
216
  # Search=> Sea AND ear AND arc AND rch
127
217
  for j in 0...term.length
128
218
  currentTrigram=term[j,GRAM_SIZE]
@@ -130,22 +220,59 @@ module CodeZauker
130
220
  # We are at the end...
131
221
  break
132
222
  end
133
- trigramInAnd.push("trigram:#{currentTrigram}")
223
+ trigramInAnd.add("#{prefix}:#{currentTrigram}")
134
224
  end
135
- #puts "Trigam conversion /#{term}/ into #{trigramInAnd}"
136
- if trigramInAnd.length==0
137
- return []
138
- end
139
- fileIds= @redis.sinter(*trigramInAnd)
225
+ return trigramInAnd
226
+ end
227
+
228
+ def map_ids_to_files(fileIds)
140
229
  filenames=[]
141
230
  # fscan:id2filename:#{fid}....
142
231
  fileIds.each do | id |
143
- filenames.push(@redis.get("fscan:id2filename:#{id}"))
232
+ file_name=@redis.get("fscan:id2filename:#{id}")
233
+ filenames.push(file_name) if !file_name.nil?
144
234
  end
145
235
  #puts " ** Files found:#{filenames} from ids #{fileIds}"
146
236
  return filenames
147
237
  end
148
238
 
239
+
240
+
241
+
242
# = Do a case-insensitive search
# The term is downcased and split into trigrams keyed under the
# special "trigram:ci" prefix (all downcase). The sets stored under
# those keys are intersected and the resulting ids are mapped to
# file names.
#
# term:: the String to look for.
#
# Returns the file names produced by map_ids_to_files, or [] when the
# term yields no trigram at all.
def isearch(term)
  ci_keys = split_in_trigrams(term.downcase, "trigram:ci")
  return [] if ci_keys.length == 0

  map_ids_to_files(@redis.sinter(*ci_keys))
end
255
+
256
+
257
# = search
# Find a list of file candidates for a search string.
# The search string is split into trigrams under the "trigram" prefix;
# the sets stored under those keys are intersected and the resulting
# ids are mapped to file names.
#
# term:: the String to look for (case-sensitive).
#
# Returns the file names produced by map_ids_to_files, or [] when the
# term yields no trigram.
# Raises RuntimeError when the term is shorter than GRAM_SIZE.
def search(term)
  raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character" if term.length < GRAM_SIZE

  keys = split_in_trigrams(term, "trigram")
  return [] if keys.length == 0

  matching_ids = @redis.sinter(*keys)
  map_ids_to_files(matching_ids)
end
275
+
149
276
  def reindex(fileList)
150
277
  #puts "Reindexing... #{fileList.length} files..."
151
278
  fileList.each do |current_file |
@@ -156,7 +283,14 @@ module CodeZauker
156
283
 
157
284
  # Remove all the keys
158
285
  def removeAll()
159
- self.remove(nil)
286
+ tokill=[]
287
+ tokill=@redis.keys("fscan:*")
288
+ tokill.push(*(@redis.keys("trigram*")))
289
+ tokill.each do | x |
290
+ @redis.del x
291
+ #puts "Deleted #x"
292
+ end
293
+ @redis.del "fscan:processedFiles"
160
294
  end
161
295
 
162
296
  # Remove the files from the index, updating trigrams
@@ -178,22 +312,29 @@ module CodeZauker
178
312
  if trigramsToExpurge.length==0
179
313
  puts "?Nothing to do on #{filename}"
180
314
  end
181
- puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
315
+ puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
182
316
  trigramsToExpurge.each do | ts |
183
317
  @redis.srem "trigram:#{ts}", fid
184
318
  begin
185
319
  @redis.srem "trigram:ci:#{ts.downcase}",fid
320
+ #putc "."
186
321
  rescue ArgumentError
187
322
  # Ignore "ArgumentError: invalid byte sequence in UTF-8"
188
323
  # and proceed...
189
324
  end
190
325
  end
326
+ #putc "\n"
191
327
 
192
- @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
328
+ @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
193
329
  @redis.srem "fscan:processedFiles", filename
194
330
  end
195
331
  return nil
196
332
  end
197
333
 
334
+ private :pushTrigramsSet
335
+ private :split_in_trigrams
336
+ #private :map_ids_to_files
337
+
338
+
198
339
  end
199
340
  end