code_zauker 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ module CodeZauker
2
+ DEFAULT_EXCLUDED_EXTENSION=[
3
+ ".pdf",
4
+ ".xps",
5
+ ".zip",
6
+ ".ppt",".xls",".rtf",".vsd",
7
+ ".dll",".exe",".out",".elf",".lib",".so",
8
+ ".jar",".class",
9
+ ".tar",
10
+ ".gz",
11
+ ".dropbox",
12
+ ".svn-base",".pdb",".cache",
13
+ # MS Office zip-like files...
14
+ ".pptx",".docx",".xlsx",
15
+ # Music exclusion
16
+ ".mp3",".mp4",".wav",
17
+ # Image exclusion
18
+ ".png",".gif",
19
+ # Temp stuff
20
+ ".tmp","~",
21
+ # Oracle exports...
22
+ ".exp"
23
+ ]
24
+ end
@@ -0,0 +1,138 @@
1
+ # GG Customized to suit Code Zauker needs
2
+ # Refer to https://rubygems.org/gems/grep
3
+ # for the original gem/code
4
+ # Skeleton module for the 'grep' routine.
5
+ #
6
+ # Ideally, one would do this in their code to import the "grep" call
7
+ # directly into their current namespace:
8
+ #
9
+ # require 'grep'
10
+ # include Grep
11
+ # # do something with grep()
12
+ #
13
+ #
14
+ # It is recommended that you look at the documentation for the grep()
15
+ # call directly for specific usage.
16
+ #
17
+ #--
18
+ #
19
+ # The compilation of software known as grep.rb is distributed under the
20
+ # following terms:
21
+ # Copyright (C) 2005-2006 Erik Hollensbe. All rights reserved.
22
+ #
23
+ # Redistribution and use in source form, with or without
24
+ # modification, are permitted provided that the following conditions
25
+ # are met:
26
+ # 1. Redistributions of source code must retain the above copyright
27
+ # notice, this list of conditions and the following disclaimer.
28
+ #
29
+ # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
30
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32
+ # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
33
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35
+ # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37
+ # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38
+ # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39
+ # SUCH DAMAGE.
40
+ #
41
+ #++
42
+
43
+ module Grep
44
+
45
+ #
46
+ # Grep works like a shell grep. `file' can be either a string,
47
+ # containing the name of a file to load and handle, or an IO object
48
+ # (such as $stdin) to deal with. `pattern' can be either a string or
49
+ # Regexp object which contains a pattern. Patterns as strings treat
50
+ # no part of the string as `special', such as '.' or '?' in a
51
+ # regex. `pre_context' and `post_context' determine the amount of
52
+ # lines to return that came before or after the content that was
53
+ # matched, respectively. If there are overlaps in the context, no
54
+ # duplicates will be printed.
55
+ #
56
+
57
+ def grep(file, pattern, pre_context=0, post_context=0, print_filename=true)
58
+ currentline=0
59
+ if file.kind_of? String
60
+ fileName=file
61
+ file = File.new(file, "r")
62
+ else
63
+ fileName=""
64
+ end
65
+
66
+ if ! file.kind_of? IO
67
+ throw IOError.new("File must be the name of an existing file or IO object")
68
+ end
69
+
70
+ if pattern.kind_of? String
71
+ pattern = /#{Regexp.escape(pattern)}/
72
+ end
73
+
74
+ if ! pattern.kind_of? Regexp
75
+ throw StandardError.new("Pattern must be string or regexp")
76
+ end
77
+
78
+ cache = []
79
+ lines = []
80
+
81
+ loop do
82
+ begin
83
+ line = file.readline
84
+ currentline +=1
85
+ cache.shift unless cache.length < pre_context
86
+
87
+ # GG Patch
88
+ # if print_filename==true
89
+ # cache.push("#{fileName}:#{line}")
90
+ # else
91
+ cache.push("#{currentline}:#{line}")
92
+ # end
93
+
94
+
95
+
96
+
97
+ if line =~ pattern
98
+ lines += cache
99
+ cache = []
100
+ if post_context > 0
101
+ post_context.times do
102
+ begin
103
+ lines.push(file.readline)
104
+ currentline +=1
105
+ rescue IOError => e
106
+ break
107
+ end
108
+ end
109
+ end
110
+ end
111
+ rescue IOError => e
112
+ break
113
+ end
114
+ end
115
+
116
+
117
+ file.each_line do |line|
118
+ cache.shift unless cache.length < pre_context
119
+ cache.push(line)
120
+
121
+ if line =~ pattern
122
+ lines += cache
123
+ if post_context > 0
124
+ post_context.times do
125
+ begin
126
+ lines.push(file.readline)
127
+ currentline +=1
128
+ rescue Exception => e
129
+ break
130
+ end
131
+ end
132
+ end
133
+ end
134
+ end
135
+
136
+ return lines
137
+ end
138
+ end
@@ -1,3 +1,3 @@
1
1
  module CodeZauker
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/code_zauker.rb CHANGED
@@ -1,10 +1,11 @@
1
1
  # -*- mode:ruby ; -*- -*
2
2
  require "code_zauker/version"
3
+ require "code_zauker/constants"
3
4
  require 'redis/connection/hiredis'
4
5
  require 'redis'
5
6
  require 'set'
6
- # This module try to implement a simple reverse indexer
7
- # based on redis
7
+ # This module implements a simple reverse indexer
8
+ # based on Redis
8
9
  # The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
9
10
  module CodeZauker
10
11
  GRAM_SIZE=3
@@ -12,19 +13,52 @@ module CodeZauker
12
13
  # Scan a file and push it inside redis...
13
14
  # then it can provide handy methods to find files containing the trigram...
14
15
  class FileScanner
15
- def initialize()
16
+ def initialize(redisConnection=nil)
17
+ if redisConnection==nil
18
+ @redis=Redis.new
19
+ else
20
+ @redis=redisConnection
21
+ end
22
+ end
23
+ def disconnect()
24
+ @redis.quit
16
25
  end
26
+
27
+
28
+
29
+ def pushTrigramsSet(s, fid, filename)
30
+ error=false
31
+ if s.length > 5000
32
+ puts " >Pushing...#{s.length} for id #{fid}=#{filename}"
33
+ end
34
+ s.each do | trigram |
35
+ @redis.sadd "trigram:#{trigram}",fid
36
+ @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
37
+ # Add the case-insensitive-trigram
38
+ begin
39
+ @redis.sadd "trigram:ci:#{trigram.downcase}",fid
40
+ rescue ArgumentError
41
+ error=true
42
+ end
43
+ end
44
+ if s.length > 5000
45
+ puts " <Pushed #{s.length}..."
46
+ puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if error
47
+ end
48
+ end
49
+
50
+ private :pushTrigramsSet
51
+
17
52
  def load(filename, noReload=false)
18
- # Define my redis id...
19
- r=Redis.new
53
+ # Define my redis id...
20
54
  # Already exists?...
21
- fid=r.get "fscan:id:#{filename}"
55
+ fid=@redis.get "fscan:id:#{filename}"
22
56
  if fid==nil
23
- r.setnx "fscan:nextId",0
24
- fid=r.incr "fscan:nextId"
57
+ @redis.setnx "fscan:nextId",0
58
+ fid=@redis.incr "fscan:nextId"
25
59
  # BUG: Consider storing it at the END of the processing
26
- r.set "fscan:id:#{filename}", fid
27
- r.set "fscan:id2filename:#{fid}",filename
60
+ @redis.set "fscan:id:#{filename}", fid
61
+ @redis.set "fscan:id2filename:#{fid}",filename
28
62
  else
29
63
  if noReload
30
64
  puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
@@ -54,12 +88,7 @@ module CodeZauker
54
88
  # push the trigram to redis (highly optimized)
55
89
  s.add(trigram)
56
90
  if s.length > adaptiveSize
57
- puts " >Pushing...#{s.length}"
58
- s.each do | trigram |
59
- r.sadd "trigram:#{trigram}",fid
60
- r.sadd "fscan:trigramsOnFile:#{fid}", trigram
61
- end
62
- puts " <Pushed #{s.length}..."
91
+ pushTrigramsSet(s,fid,filename)
63
92
  s=Set.new()
64
93
  end
65
94
  trigramScanned += 1
@@ -69,19 +98,18 @@ module CodeZauker
69
98
  end
70
99
 
71
100
  if s.length > 0
72
- s.each do | trigram |
73
- r.sadd "trigram:#{trigram}",fid
74
- r.sadd "fscan:trigramsOnFile:#{fid}", trigram
75
- end
101
+ pushTrigramsSet(s,fid,filename)
102
+ s=nil
76
103
  #puts "Final push of #{s.length}"
77
104
  end
78
105
 
79
106
 
80
- trigramsOnFile=r.scard "fscan:trigramsOnFile:#{fid}"
81
- r.sadd "fscan:processedFiles", "fscan:id:#{filename}"
107
+ trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}"
108
+ @redis.sadd "fscan:processedFiles", "#{filename}"
82
109
  trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
83
- puts "File processed. Unique Trigrams for #{filename}: #{trigramsOnFile} Total Scanned: #{trigramScanned} Ratio:#{trigramRatio}"
84
- r.quit
110
+ if trigramRatio < 10 or trigramRatio >75
111
+ puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} "
112
+ end
85
113
  return nil
86
114
  end
87
115
 
@@ -89,6 +117,9 @@ module CodeZauker
89
117
  # Find a list of file candidates to a search string
90
118
  # The search string is padded into trigrams
91
119
  def search(term)
120
+ if term.length < GRAM_SIZE
121
+ raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
122
+ end
92
123
  #puts " ** Searching: #{term}"
93
124
  # split the term in a padded trigram
94
125
  trigramInAnd=[]
@@ -104,25 +135,65 @@ module CodeZauker
104
135
  #puts "Trigam conversion /#{term}/ into #{trigramInAnd}"
105
136
  if trigramInAnd.length==0
106
137
  return []
107
- end
108
- r=Redis.new
109
- fileIds= r.sinter(*trigramInAnd)
138
+ end
139
+ fileIds= @redis.sinter(*trigramInAnd)
110
140
  filenames=[]
111
141
  # fscan:id2filename:#{fid}....
112
142
  fileIds.each do | id |
113
- filenames.push(r.get("fscan:id2filename:#{id}"))
114
- end
115
- r.quit
143
+ filenames.push(@redis.get("fscan:id2filename:#{id}"))
144
+ end
116
145
  #puts " ** Files found:#{filenames} from ids #{fileIds}"
117
146
  return filenames
118
147
  end
119
-
120
- # This function accepts a very simple search query like
121
- # Gio*
122
- # will match Giovanni, Giovedi, Giorno...
123
- # Giova*ni
124
- # will match Giovanni, Giovani, Giovannini
125
- def searchSimpleRegexp(termWithStar)
148
+
149
+ def reindex(fileList)
150
+ #puts "Reindexing... #{fileList.length} files..."
151
+ fileList.each do |current_file |
152
+ self.remove([current_file])
153
+ self.load(current_file,noReload=false)
154
+ end
126
155
  end
156
+
157
+ # Remove all the keys
158
+ def removeAll()
159
+ self.remove(nil)
160
+ end
161
+
162
+ # Remove the files from the index, updating trigrams
163
+ def remove(filePaths=nil)
164
+ if filePaths==nil
165
+ fileList=[]
166
+ storedFiles=@redis.keys "fscan:id:*"
167
+ storedFiles.each do |fileKey|
168
+ filename=fileKey.split("fscan:id:")[1]
169
+ fileList.push(filename)
170
+ end
171
+ else
172
+ fileList=filePaths
173
+ end
174
+ # puts "Files to remove from index...#{fileList.length}"
175
+ fileList.each do |filename|
176
+ fid=@redis.get "fscan:id:#{filename}"
177
+ trigramsToExpurge=@redis.smembers "fscan:trigramsOnFile:#{fid}"
178
+ if trigramsToExpurge.length==0
179
+ puts "?Nothing to do on #{filename}"
180
+ end
181
+ puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
182
+ trigramsToExpurge.each do | ts |
183
+ @redis.srem "trigram:#{ts}", fid
184
+ begin
185
+ @redis.srem "trigram:ci:#{ts.downcase}",fid
186
+ rescue ArgumentError
187
+ # Ignore "ArgumentError: invalid byte sequence in UTF-8"
188
+ # and proceed...
189
+ end
190
+ end
191
+
192
+ @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
193
+ @redis.srem "fscan:processedFiles", filename
194
+ end
195
+ return nil
196
+ end
197
+
127
198
  end
128
199
  end
data/readme.org CHANGED
@@ -1,10 +1,34 @@
1
+ * Code Zauker: your code, indexed
2
+ Code Zauker is a search engine for programming languages.
3
+ Code Zauker is based on ideas taken from the old Google Code Search and uses Redis as its underlying platform
4
+
5
+ For news and discussion: http://gioorgi.com/tag/code-zauker/
6
+
7
+
1
8
  * INSTALL
2
- To install Code Zauker, you must simply build and install the gem as usual
9
+ To install Code Zauker, simply run
10
+ #+BEGIN_SRC sh
11
+ gem install code_zauker
12
+ #+END_SRC
13
+ You also need [[http://redis.io/][redis-2.4.6]] or later.
14
+ For a sample redis configuration see the etc/ directory of the project
3
15
 
4
- * DEVELOPING
5
- For developing with Code Zauker you need bundler 1.0.21 or above
16
+
17
+ * Release Notes
18
+ ** 0.0.1
19
+ First "we are here" release.
20
+ ** 0.0.2
21
+ Code Cleanup, reindexing features, and sample redis server config
22
+ czsearch no longer relies on Unix grep, so it is easier to use with JRuby
23
+ czsearch/czindexer support options
6
24
 
7
25
  * Release History
8
26
  | Version | Date | Summary |
9
27
  | 0.0.1 | 26 Jan 2012 | First RubyGems Release (for testing purpose only) |
28
+ | 0.0.2 | 29 Jan 2012 | Removed dependency on unix find for czindexer. |
10
29
  | | | |
30
+
31
+ * DEVELOPING
32
+ For developing with Code Zauker you need bundler 1.0.21 or above
33
+ See the devel.org file
34
+
data/test/test_search.rb CHANGED
@@ -13,7 +13,7 @@ class FileScannerBasicSearch < Test::Unit::TestCase
13
13
  fs=CodeZauker::FileScanner.new()
14
14
  time = Benchmark.bm(7) do |x|
15
15
  x.report ("kurukku.txt") { fs.load("./test/fixture/kurukku.txt") }
16
- x.report ("BigFile") { fs.load("./test/fixture/TEST_LICENSE.txt")}
16
+ #x.report ("BigFile") { fs.load("./test/fixture/TEST_LICENSE.txt")}
17
17
  x.report("Search common words"){ fs.search("and"); fs.search("terms") }
18
18
  end
19
19
  puts "Bench Result..."
@@ -47,11 +47,13 @@ class FileScannerBasicSearch < Test::Unit::TestCase
47
47
  assert(files.include?("./test/fixture/kurukku.txt") ==true)
48
48
  end
49
49
 
50
- def test_less_then3_must_not_give_Redis_error
50
+ def test_less_then3_must_give_error
51
51
  fs=CodeZauker::FileScanner.new()
52
52
  fs.load("./test/fixture/kurukku.txt")
53
- files=fs.search("di")
54
- assert_equal 0, files.length
53
+ assert_raise RuntimeError do
54
+ files=fs.search("di")
55
+ end
56
+ #assert_equal 0, files.length
55
57
  end
56
58
 
57
59
  def test_small4
@@ -69,6 +71,29 @@ class FileScannerBasicSearch < Test::Unit::TestCase
69
71
  assert files.include?("./test/fixture/TEST_LICENSE.txt")==true
70
72
  end
71
73
 
74
+ def test_remove
75
+ fs=CodeZauker::FileScanner.new()
76
+ fs.load("./test/fixture/kurukku.txt", noReload=true)
77
+ fs.remove(["./test/fixture/kurukku.txt"])
78
+ files=fs.search("\"Be hungry, be foolish\"")
79
+ assert files.length ==0,
80
+ "Expected zero search results after removal from index. Found instead:#{files}"
81
+ #assert(files[0].include?("test/fixture/kurukku.txt")==true)
82
+ end
83
+
84
+ def test_removeAll
85
+ require 'redis/connection/hiredis'
86
+ require 'redis'
87
+ redis=Redis.new
88
+ fs=CodeZauker::FileScanner.new(redis)
89
+ fs.load("./test/fixture/kurukku.txt", noReload=true)
90
+ fs.removeAll()
91
+ foundKeys=redis.keys "*"
92
+ #puts "Keys at empty db:#{foundKeys}"
93
+ assert foundKeys.length==1, "Expected only one key at empty db. Found instead #{foundKeys}"
94
+ assert foundKeys[0]=="fscan:nextId", "Expected only the fscan:nextId key at empty db. Found instead #{foundKeys}"
95
+ end
96
+
72
97
 
73
98
  end
74
99
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: code_zauker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-01-26 00:00:00.000000000 Z
12
+ date: 2012-01-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: yard
16
- requirement: &72883350 !ruby/object:Gem::Requirement
16
+ requirement: &78315190 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.7'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *72883350
24
+ version_requirements: *78315190
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: hiredis
27
- requirement: &72883030 !ruby/object:Gem::Requirement
27
+ requirement: &78314880 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.3'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *72883030
35
+ version_requirements: *78314880
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: redis
38
- requirement: &72882680 !ruby/object:Gem::Requirement
38
+ requirement: &78314600 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '2.2'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *72882680
46
+ version_requirements: *78314600
47
47
  description: Code Zauker is based from ideas taken by old Google Code Search and uses
48
48
  Redis as a basic platform
49
49
  email:
@@ -51,6 +51,7 @@ email:
51
51
  executables:
52
52
  - czindexer
53
53
  - czsearch
54
+ - startRedis
54
55
  extensions: []
55
56
  extra_rdoc_files: []
56
57
  files:
@@ -59,8 +60,28 @@ files:
59
60
  - Rakefile
60
61
  - bin/czindexer
61
62
  - bin/czsearch
63
+ - bin/startRedis
62
64
  - code_zauker.gemspec
65
+ - devel.org
66
+ - doc/CodeZauker.html
67
+ - doc/CodeZauker/FileScanner.html
68
+ - doc/_index.html
69
+ - doc/class_list.html
70
+ - doc/css/common.css
71
+ - doc/css/full_list.css
72
+ - doc/css/style.css
73
+ - doc/file_list.html
74
+ - doc/frames.html
75
+ - doc/index.html
76
+ - doc/js/app.js
77
+ - doc/js/full_list.js
78
+ - doc/js/jquery.js
79
+ - doc/method_list.html
80
+ - doc/top-level-namespace.html
81
+ - etc/redis.conf
63
82
  - lib/code_zauker.rb
83
+ - lib/code_zauker/constants.rb
84
+ - lib/code_zauker/grep.rb
64
85
  - lib/code_zauker/version.rb
65
86
  - readme.org
66
87
  - test/fixture/TEST_LICENSE.txt