code_zauker 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
module CodeZauker
  # Default list of file-name suffixes to exclude.
  # NOTE(review): presumably consulted by the indexer to skip binary or
  # otherwise unsearchable files -- confirm against the caller.
  # The entries are suffixes rather than strict extensions (see "~").
  DEFAULT_EXCLUDED_EXTENSION =
    %w[.pdf .xps .zip] +
    %w[.ppt .xls .rtf .vsd] +
    %w[.dll .exe .out .elf .lib .so] +
    %w[.jar .class] +
    %w[.tar .gz] +
    %w[.dropbox] +
    %w[.svn-base .pdb .cache] +
    # MS Office zip-like files...
    %w[.pptx .docx .xlsx] +
    # Music exclusion
    %w[.mp3 .mp4 .wav] +
    # Image exclusion
    %w[.png .gif] +
    # Temp stuff
    %w[.tmp ~] +
    # Oracle exports...
    %w[.exp]
end
@@ -0,0 +1,138 @@
1
+ # GG Customized to suit Code Zauker needs
2
+ # Refer to https://rubygems.org/gems/grep
3
+ # for the original gem/code
4
+ # Skeleton module for the 'grep' routine.
5
+ #
6
+ # Ideally, one would do this in their code to import the "grep" call
7
+ # directly into their current namespace:
8
+ #
9
+ # require 'grep'
10
+ # include Grep
11
+ # # do something with grep()
12
+ #
13
+ #
14
+ # It is recommended that you look at the documentation for the grep()
15
+ # call directly for specific usage.
16
+ #
17
+ #--
18
+ #
19
+ # The compilation of software known as grep.rb is distributed under the
20
+ # following terms:
21
+ # Copyright (C) 2005-2006 Erik Hollensbe. All rights reserved.
22
+ #
23
+ # Redistribution and use in source form, with or without
24
+ # modification, are permitted provided that the following conditions
25
+ # are met:
26
+ # 1. Redistributions of source code must retain the above copyright
27
+ # notice, this list of conditions and the following disclaimer.
28
+ #
29
+ # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
30
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32
+ # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
33
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35
+ # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38
+ # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39
+ # SUCH DAMAGE.
40
+ #
41
+ #++
42
+
43
module Grep

  #
  # Grep works like a shell grep. `file' can be either a string,
  # containing the name of a file to load and handle, or an IO object
  # (such as $stdin) to deal with. `pattern' can be either a string or
  # Regexp object which contains a pattern. Patterns as strings treat
  # no part of the string as `special', such as '.' or '?' in a
  # regex. `pre_context' and `post_context' determine the amount of
  # lines to return that came before or after the content that was
  # matched, respectively. If there are overlaps in the context, no
  # duplicates will be returned.
  #
  # Every returned element (match, pre-context and post-context alike)
  # has the form "<line number>:<line>", with 1-based line numbers and
  # the line terminator preserved.
  #
  # `print_filename' is accepted for interface compatibility only; it
  # is currently ignored.
  #
  # A file opened here from a name is closed before returning; an IO
  # object passed by the caller is left open.
  #
  # Raises IOError when `file' is neither a file name nor an IO object,
  # and ArgumentError when `pattern' is neither a String nor a Regexp.
  # Errors raised while opening a named file (e.g. Errno::ENOENT) are
  # propagated unchanged.
  #

  def grep(file, pattern, pre_context=0, post_context=0, print_filename=true)
    opened_here = false
    if file.kind_of? String
      file = File.new(file, "r")
      opened_here = true
    end

    begin
      unless file.kind_of? IO
        raise IOError, "File must be the name of an existing file or IO object"
      end

      if pattern.kind_of? String
        pattern = /#{Regexp.escape(pattern)}/
      end

      unless pattern.kind_of? Regexp
        raise ArgumentError, "Pattern must be string or regexp"
      end

      lines = []
      cache = []        # not-yet-emitted lines preceding the current one
      pending_post = 0  # post-context lines still owed to the last match
      currentline = 0

      file.each_line do |line|
        currentline += 1
        numbered = "#{currentline}:#{line}"

        if line =~ pattern
          # Emit the buffered pre-context, then the match itself.
          lines.concat(cache)
          cache.clear
          lines.push(numbered)
          pending_post = post_context
        elsif pending_post > 0
          # Already emitted as post-context, so it must never be
          # buffered as pre-context of a later match (no duplicates).
          lines.push(numbered)
          pending_post -= 1
        elsif pre_context > 0
          # Keep at most pre_context lines *before* the current one.
          cache.shift if cache.length >= pre_context
          cache.push(numbered)
        end
      end

      return lines
    ensure
      file.close if opened_here
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module CodeZauker
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/code_zauker.rb CHANGED
@@ -1,10 +1,11 @@
1
1
  # -*- mode:ruby ; -*- -*
2
2
  require "code_zauker/version"
3
+ require "code_zauker/constants"
3
4
  require 'redis/connection/hiredis'
4
5
  require 'redis'
5
6
  require 'set'
6
- # This module try to implement a simple reverse indexer
7
- # based on redis
7
+ # This module implements a simple reverse indexer
8
+ # based on Redis
8
9
  # The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
9
10
  module CodeZauker
10
11
  GRAM_SIZE=3
@@ -12,19 +13,52 @@ module CodeZauker
12
13
  # Scan a file and push it inside redis...
13
14
  # then it can provide handy methods to find files containing the trigram...
14
15
  class FileScanner
15
- def initialize()
16
# Build a scanner bound to a Redis connection.
# redisConnection - connection to use; when nil a new connection is
#                   created with Redis.new.
def initialize(redisConnection=nil)
  @redis = redisConnection.nil? ? Redis.new : redisConnection
end
23
# Send quit on the Redis connection held by this scanner.
def disconnect
  @redis.quit
end
26
+
27
+
28
+
29
# Push a batch of trigrams found in one file into Redis.
#
# For every trigram three set members are added:
#   "trigram:<trigram>"             <- fid  (case-sensitive index)
#   "fscan:trigramsOnFile:<fid>"    <- trigram
#   "trigram:ci:<trigram.downcase>" <- fid  (case-insensitive index)
#
# s        - collection of trigrams (must respond to length and each)
# fid      - id of the file the trigrams belong to
# filename - name of the file, used only in the printed messages
#
# Progress messages are printed only for batches larger than 5000
# trigrams. The invalid-UTF-8 warning is printed for a batch of any
# size, so a degraded case-insensitive index is never silent.
def pushTrigramsSet(s, fid, filename)
  error=false
  verbose = s.length > 5000
  puts " >Pushing...#{s.length} for id #{fid}=#{filename}" if verbose
  s.each do | trigram |
    @redis.sadd "trigram:#{trigram}",fid
    @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
    # Add the case-insensitive-trigram
    begin
      @redis.sadd "trigram:ci:#{trigram.downcase}",fid
    rescue ArgumentError
      # Remember the failure and keep going with the other trigrams.
      error=true
    end
  end
  puts " <Pushed #{s.length}..." if verbose
  puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if error
end
49
+
50
+ private :pushTrigramsSet
51
+
17
52
  def load(filename, noReload=false)
18
- # Define my redis id...
19
- r=Redis.new
53
+ # Define my redis id...
20
54
  # Already exists?...
21
- fid=r.get "fscan:id:#{filename}"
55
+ fid=@redis.get "fscan:id:#{filename}"
22
56
  if fid==nil
23
- r.setnx "fscan:nextId",0
24
- fid=r.incr "fscan:nextId"
57
+ @redis.setnx "fscan:nextId",0
58
+ fid=@redis.incr "fscan:nextId"
25
59
  # BUG: Consider storing it at the END of the processing
26
- r.set "fscan:id:#{filename}", fid
27
- r.set "fscan:id2filename:#{fid}",filename
60
+ @redis.set "fscan:id:#{filename}", fid
61
+ @redis.set "fscan:id2filename:#{fid}",filename
28
62
  else
29
63
  if noReload
30
64
  puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
@@ -54,12 +88,7 @@ module CodeZauker
54
88
  # push the trigram to redis (highly optimized)
55
89
  s.add(trigram)
56
90
  if s.length > adaptiveSize
57
- puts " >Pushing...#{s.length}"
58
- s.each do | trigram |
59
- r.sadd "trigram:#{trigram}",fid
60
- r.sadd "fscan:trigramsOnFile:#{fid}", trigram
61
- end
62
- puts " <Pushed #{s.length}..."
91
+ pushTrigramsSet(s,fid,filename)
63
92
  s=Set.new()
64
93
  end
65
94
  trigramScanned += 1
@@ -69,19 +98,18 @@ module CodeZauker
69
98
  end
70
99
 
71
100
  if s.length > 0
72
- s.each do | trigram |
73
- r.sadd "trigram:#{trigram}",fid
74
- r.sadd "fscan:trigramsOnFile:#{fid}", trigram
75
- end
101
+ pushTrigramsSet(s,fid,filename)
102
+ s=nil
76
103
  #puts "Final push of #{s.length}"
77
104
  end
78
105
 
79
106
 
80
- trigramsOnFile=r.scard "fscan:trigramsOnFile:#{fid}"
81
- r.sadd "fscan:processedFiles", "fscan:id:#{filename}"
107
+ trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}"
108
+ @redis.sadd "fscan:processedFiles", "#{filename}"
82
109
  trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
83
- puts "File processed. Unique Trigrams for #{filename}: #{trigramsOnFile} Total Scanned: #{trigramScanned} Ratio:#{trigramRatio}"
84
- r.quit
110
+ if trigramRatio < 10 or trigramRatio >75
111
+ puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} "
112
+ end
85
113
  return nil
86
114
  end
87
115
 
@@ -89,6 +117,9 @@ module CodeZauker
89
117
  # Find a list of file candidates to a search string
90
118
  # The search string is padded into trigrams
91
119
  def search(term)
120
+ if term.length < GRAM_SIZE
121
+ raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
122
+ end
92
123
  #puts " ** Searching: #{term}"
93
124
  # split the term in a padded trigram
94
125
  trigramInAnd=[]
@@ -104,25 +135,65 @@ module CodeZauker
104
135
  #puts "Trigam conversion /#{term}/ into #{trigramInAnd}"
105
136
  if trigramInAnd.length==0
106
137
  return []
107
- end
108
- r=Redis.new
109
- fileIds= r.sinter(*trigramInAnd)
138
+ end
139
+ fileIds= @redis.sinter(*trigramInAnd)
110
140
  filenames=[]
111
141
  # fscan:id2filename:#{fid}....
112
142
  fileIds.each do | id |
113
- filenames.push(r.get("fscan:id2filename:#{id}"))
114
- end
115
- r.quit
143
+ filenames.push(@redis.get("fscan:id2filename:#{id}"))
144
+ end
116
145
  #puts " ** Files found:#{filenames} from ids #{fileIds}"
117
146
  return filenames
118
147
  end
119
-
120
- # This function accepts a very simple search query like
121
- # Gio*
122
- # will match Giovanni, Giovedi, Giorno...
123
- # Giova*ni
124
- # will match Giovanni, Giovani, Giovannini
125
- def searchSimpleRegexp(termWithStar)
148
+
149
# Re-index every file in fileList: each file is first removed from the
# index and then loaded again (with noReload set to false).
def reindex(fileList)
  #puts "Reindexing... #{fileList.length} files..."
  fileList.each do |path|
    self.remove([path])
    self.load(path, false)
  end
end
156
+
157
+ # Remove all the keys
158
# Remove every indexed file: delegates to remove with a nil file list.
def removeAll
  remove(nil)
end
161
+
162
+ # Remove the files from the index, updating trigrams
163
# Remove files from the index, updating the trigram sets.
#
# filePaths - array of file names, spelled as they were when indexed
#             (they are looked up under "fscan:id:<name>"). When nil,
#             every name that has a "fscan:id:*" key is removed.
#
# For each known file its id is removed from the "trigram:*" and
# "trigram:ci:*" sets of all its trigrams, then the per-file keys and
# its "fscan:processedFiles" entry are deleted.
#
# Returns nil.
def remove(filePaths=nil)
  id_prefix="fscan:id:"
  if filePaths==nil
    # Strip only the leading prefix, so a path which itself contains
    # "fscan:id:" is not truncated.
    fileList=@redis.keys("#{id_prefix}*").map { |fileKey| fileKey[id_prefix.length..-1] }
  else
    fileList=filePaths
  end
  # puts "Files to remove from index...#{fileList.length}"
  fileList.each do |filename|
    fid=@redis.get "#{id_prefix}#{filename}"
    if fid.nil?
      # Unknown file: without an id the per-file keys cannot be named,
      # so only the processed-files entry is cleared.
      puts "?Nothing to do on #{filename}"
      @redis.srem "fscan:processedFiles", filename
      next
    end
    trigramsToExpurge=@redis.smembers "fscan:trigramsOnFile:#{fid}"
    if trigramsToExpurge.length==0
      puts "?Nothing to do on #{filename}"
    end
    puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
    trigramsToExpurge.each do | ts |
      @redis.srem "trigram:#{ts}", fid
      begin
        @redis.srem "trigram:ci:#{ts.downcase}",fid
      rescue ArgumentError
        # Ignore "ArgumentError: invalid byte sequence in UTF-8"
        # and proceed...
      end
    end

    @redis.del "#{id_prefix}#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
    @redis.srem "fscan:processedFiles", filename
  end
  return nil
end
197
+
127
198
  end
128
199
  end
data/readme.org CHANGED
@@ -1,10 +1,34 @@
1
+ * Code Zauker: your code, indexed
2
+ Code Zauker is a search engine for programming languages.
3
+ Code Zauker is based on ideas taken from the old Google Code Search and uses Redis as a basic platform
4
+
5
+ For news and discussion: http://gioorgi.com/tag/code-zauker/
6
+
7
+
1
8
  * INSTALL
2
- To install Code Zauker, you must simply build and install the gem as usual
9
+ To install Code Zauker, simply issue
10
+ #+BEGIN_SRC sh
11
+ gem install code_zauker
12
+ #+END_SRC
13
+ You also need [[http://redis.io/][redis-2.4.6]] or better.
14
+ For a sample redis configuration see the etc/ directory of the project
3
15
 
4
- * DEVELOPING
5
- For developing with Code Zauker you need bundler 1.0.21 or above
16
+
17
+ * Release Notes
18
+ ** 0.0.1
19
+ First "we are here" release.
20
+ ** 0.0.2
21
+ Code Cleanup, reindexing features, and sample redis server config
22
+ czsearch no longer relies on unix grep, so it is easier to use with jruby
23
+ czsearch/czindexer supports options
6
24
 
7
25
  * Release History
8
26
  | Version | Date | Summary |
9
27
  | 0.0.1 | 26 Jan 2012 | First RubyGems Release (for testing purpose only) |
28
+ | 0.0.2 | 29 Jan 2012 | Removed dependency on unix find for czindexer. |
10
29
  | | | |
30
+
31
+ * DEVELOPING
32
+ For developing with Code Zauker you need bundler 1.0.21 or above
33
+ See devel.org file
34
+
data/test/test_search.rb CHANGED
@@ -13,7 +13,7 @@ class FileScannerBasicSearch < Test::Unit::TestCase
13
13
  fs=CodeZauker::FileScanner.new()
14
14
  time = Benchmark.bm(7) do |x|
15
15
  x.report ("kurukku.txt") { fs.load("./test/fixture/kurukku.txt") }
16
- x.report ("BigFile") { fs.load("./test/fixture/TEST_LICENSE.txt")}
16
+ #x.report ("BigFile") { fs.load("./test/fixture/TEST_LICENSE.txt")}
17
17
  x.report("Search common words"){ fs.search("and"); fs.search("terms") }
18
18
  end
19
19
  puts "Bench Result..."
@@ -47,11 +47,13 @@ class FileScannerBasicSearch < Test::Unit::TestCase
47
47
  assert(files.include?("./test/fixture/kurukku.txt") ==true)
48
48
  end
49
49
 
50
- def test_less_then3_must_not_give_Redis_error
50
+ def test_less_then3_must_give_error
51
51
  fs=CodeZauker::FileScanner.new()
52
52
  fs.load("./test/fixture/kurukku.txt")
53
- files=fs.search("di")
54
- assert_equal 0, files.length
53
+ assert_raise RuntimeError do
54
+ files=fs.search("di")
55
+ end
56
+ #assert_equal 0, files.length
55
57
  end
56
58
 
57
59
  def test_small4
@@ -69,6 +71,29 @@ class FileScannerBasicSearch < Test::Unit::TestCase
69
71
  assert files.include?("./test/fixture/TEST_LICENSE.txt")==true
70
72
  end
71
73
 
74
# After a file is removed from the index, a search for a phrase must
# return no results.
def test_remove
  fixture="./test/fixture/kurukku.txt"
  scanner=CodeZauker::FileScanner.new()
  scanner.load(fixture, true)
  scanner.remove([fixture])
  hits=scanner.search("\"Be hungry, be foolish\"")
  assert hits.length ==0,
    "Expected zero search results after removal from index. Found instead:#{hits}"
  #assert(files[0].include?("test/fixture/kurukku.txt")==true)
end
83
+
84
# After removeAll, the only key left in the database must be the
# "fscan:nextId" counter.
def test_removeAll
  require 'redis/connection/hiredis'
  require 'redis'
  connection=Redis.new
  scanner=CodeZauker::FileScanner.new(connection)
  scanner.load("./test/fixture/kurukku.txt", true)
  scanner.removeAll()
  leftover=connection.keys "*"
  #puts "Keys at empty db:#{leftover}"
  assert leftover.length==1, "Expected only one key at empty db. Found instead #{leftover}"
  assert leftover[0]=="fscan:nextId", "Expected only the fscan:nextId key at empty db. Found instead #{leftover}"
end
96
+
72
97
 
73
98
  end
74
99
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: code_zauker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-01-26 00:00:00.000000000 Z
12
+ date: 2012-01-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: yard
16
- requirement: &72883350 !ruby/object:Gem::Requirement
16
+ requirement: &78315190 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.7'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *72883350
24
+ version_requirements: *78315190
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: hiredis
27
- requirement: &72883030 !ruby/object:Gem::Requirement
27
+ requirement: &78314880 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.3'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *72883030
35
+ version_requirements: *78314880
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: redis
38
- requirement: &72882680 !ruby/object:Gem::Requirement
38
+ requirement: &78314600 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '2.2'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *72882680
46
+ version_requirements: *78314600
47
47
  description: Code Zauker is based from ideas taken by old Google Code Search and uses
48
48
  Redis as a basic platform
49
49
  email:
@@ -51,6 +51,7 @@ email:
51
51
  executables:
52
52
  - czindexer
53
53
  - czsearch
54
+ - startRedis
54
55
  extensions: []
55
56
  extra_rdoc_files: []
56
57
  files:
@@ -59,8 +60,28 @@ files:
59
60
  - Rakefile
60
61
  - bin/czindexer
61
62
  - bin/czsearch
63
+ - bin/startRedis
62
64
  - code_zauker.gemspec
65
+ - devel.org
66
+ - doc/CodeZauker.html
67
+ - doc/CodeZauker/FileScanner.html
68
+ - doc/_index.html
69
+ - doc/class_list.html
70
+ - doc/css/common.css
71
+ - doc/css/full_list.css
72
+ - doc/css/style.css
73
+ - doc/file_list.html
74
+ - doc/frames.html
75
+ - doc/index.html
76
+ - doc/js/app.js
77
+ - doc/js/full_list.js
78
+ - doc/js/jquery.js
79
+ - doc/method_list.html
80
+ - doc/top-level-namespace.html
81
+ - etc/redis.conf
63
82
  - lib/code_zauker.rb
83
+ - lib/code_zauker/constants.rb
84
+ - lib/code_zauker/grep.rb
64
85
  - lib/code_zauker/version.rb
65
86
  - readme.org
66
87
  - test/fixture/TEST_LICENSE.txt