code_zauker 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +7 -0
- data/Rakefile +21 -0
- data/bin/czindexer +15 -0
- data/bin/czsearch +17 -0
- data/code_zauker.gemspec +32 -0
- data/lib/code_zauker/version.rb +3 -0
- data/lib/code_zauker.rb +128 -0
- data/readme.org +10 -0
- data/test/fixture/TEST_LICENSE.txt +1000 -0
- data/test/fixture/foolish.txt +1 -0
- data/test/fixture/kurukku.txt +2 -0
- data/test/test_search.rb +74 -0
- metadata +95 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
# Fetch gems over TLS; a plain-http source can be tampered with in transit.
source "https://rubygems.org"

# The gem's dependencies are declared in code_zauker.gemspec.
# GG From http://asciicasts.com/episodes/245-new-gem-with-bundler:
# It's better to manage the gem's dependencies inside the Gemspec file and let Bundler
# load them automatically through the Gemfile
gemspec
|
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- coding: utf-8 ; mode: ruby; -*-
require "bundler/gem_tasks"

# Rake tutorial: http://jasonseifer.com/2010/04/06/rake-tutorial
require 'rake/testtask'

# Test task; reference: http://rake.rubyforge.org/classes/Rake/TestTask.html
Rake::TestTask.new do |test_task|
  # Extra directories could be added to $LOAD_PATH here (the default is 'lib'):
  #test_task.libs << 'test'
  test_task.verbose = true
  test_task.test_files = FileList['test/test*.rb']
end

require 'yard'

# Documentation task, built from the library sources.
YARD::Rake::YardocTask.new do |doc_task|
  doc_task.files = ['lib/**/*.rb']   # optional
  #doc_task.options = ['--any', '--extra', '--opts'] # optional
end
|
20
|
+
|
21
|
+
|
data/bin/czindexer
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Indexes every path given on the command line.
#
# Suggested execution is mixing find / xargs with the parallel (P) parameters:
#   find test/fixture/ -type f | xargs -P 5 -n 10 ./bin/czindexer
# will fire 5 czindexer each with 10 files to process...
#
# When an argument is a directory, this script re-invokes itself that way.
require 'shellwords'
require 'code_zauker'

ARGV.each do |l|
  # Dir.exist? replaces the deprecated Dir.exists? (removed in Ruby 3.2).
  if Dir.exist?(l)
    puts "Processing via find+xargs"
    # The pipeline needs a shell, so escape everything interpolated into it.
    # -print0 / -0 keep file names containing whitespace in one piece.
    dir = Shellwords.escape(l)
    indexer = Shellwords.escape($0)
    system("find #{dir} -type f -print0 | xargs -0 -P 5 -n 10 #{indexer}")
  else
    puts "Meganoids indexing #{l}"
    fs = CodeZauker::FileScanner.new()
    # Second argument is noReload: false means an already-known file is indexed again.
    fs.load(l, false)
  end
end
|
data/bin/czsearch
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
#== czsearch is a useful command to search via the Code Zauker facility
# For each term given on the command line, the index is asked for the
# candidate files and grep is then run on them to show the matching lines.

# Equivalent to sending something like -W0 to ruby, for a cleaner output
$VERBOSE=nil
require 'code_zauker'

ARGV.each do |s|
  #puts "Code Zauker Searching for #{s}"
  fs = CodeZauker::FileScanner.new()
  files = fs.search(s)
  next if files.empty?

  # -H forces to print file name also with only one match.
  # The multi-argument form of system runs grep directly, without a shell,
  # so quotes or spaces in the term or in a file name cannot break (or
  # inject into) the command line. "-e" keeps a term starting with "-" from
  # being parsed as an option; "--" does the same for the file names.
  system("grep", "-H", "--color", "-n", "-e", s, "--", *files)
end
|
data/code_zauker.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 ; mode: ruby; -*-
# Make the gem's own lib directory loadable so the version constant can be read.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "code_zauker/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "code_zauker"
  spec.version     = CodeZauker::VERSION
  spec.authors     = ["Giovanni Giorgi"]
  spec.email       = ["jj@gioorgi.com"]
  spec.homepage    = "http://gioorgi.com/tag/code-zauker/"
  spec.summary     = "A search engine for programming languages"
  spec.description = "Code Zauker is based from ideas taken by old Google Code Search and uses Redis as a basic platform"

  spec.rubyforge_project = "code_zauker"

  # Packaged content is whatever git tracks.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  bin_paths          = `git ls-files -- bin/*`.split("\n")
  spec.executables   = bin_paths.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development-only dependencies, for example:
  # spec.add_development_dependency "rspec"
  spec.add_development_dependency "yard", "~>0.7"

  # Runtime dependencies.
  spec.add_runtime_dependency "hiredis", "~> 0.3"
  spec.add_runtime_dependency "redis", "~> 2.2"

  ## Install and require the hiredis gem before redis-rb for maximum performances.
  #spec.add_runtime_dependency "redis", "~> 2.2", :require => ["redis/connection/hiredis", "redis"]
end
|
data/lib/code_zauker.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
# -*- mode:ruby ; -*- -*
|
2
|
+
require "code_zauker/version"
|
3
|
+
require 'redis/connection/hiredis'
|
4
|
+
require 'redis'
|
5
|
+
require 'set'
|
6
|
+
# This module tries to implement a simple reverse indexer
|
7
|
+
# based on redis
|
8
|
+
# The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
|
9
|
+
module CodeZauker
  # Length of the character n-grams used as index keys.
  GRAM_SIZE=3
  # The all-blank gram; it is never stored in the index.
  SPACE_GUY=" "*GRAM_SIZE

  # Scans a file and pushes its trigrams into redis;
  # it can then find the files containing all the trigrams of a search term.
  #
  # Redis keys used:
  #   fscan:nextId                 counter handing out file ids
  #   fscan:id:<filename>          filename -> file id
  #   fscan:id2filename:<id>       file id -> filename
  #   trigram:<trigram>            set of file ids containing the trigram
  #   fscan:trigramsOnFile:<id>    set of trigrams found in the file
  #   fscan:processedFiles         set of "fscan:id:<filename>" keys already processed
  class FileScanner
    def initialize()
    end

    # Index the trigrams of +filename+.
    #
    # filename:: path of the file to scan
    # noReload:: when true and the file already has an id, nothing is indexed
    #
    # Prints progress on standard output and always returns nil.
    def load(filename, noReload=false)
      r=Redis.new
      begin
        # Already exists?...
        fid=r.get "fscan:id:#{filename}"
        if fid.nil?
          r.setnx "fscan:nextId",0
          fid=r.incr "fscan:nextId"
          # BUG: Consider storing it at the END of the processing
          r.set "fscan:id:#{filename}", fid
          r.set "fscan:id2filename:#{fid}",filename
        elsif noReload
          puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
          return nil
        end

        # TEST_LICENSE.txt: 3290 Total Scanned: 24628
        # Below 13% of the scanned trigrams are unique for very big files,
        # so trigrams are collected in an in-memory set and sent to redis
        # in batches, which avoids a lot of redundant round trips.
        trigramScanned=0
        adaptiveSize=6000
        s=Set.new
        File.open(filename,"r") do |f|
          f.each_line do |l|
            # Drop the line terminator and walk up to the LAST full gram
            # (inclusive range), so the final trigram of a line is indexed
            # even when the line has no trailing newline.
            text=l.chomp
            (0..(text.length-GRAM_SIZE)).each do |istart|
              trigram=text[istart, GRAM_SIZE]
              # Avoid storing the 3-space guy entirely
              next if trigram==SPACE_GUY
              s.add(trigram)
              if s.length > adaptiveSize
                puts " >Pushing...#{s.length}"
                push_trigrams(r, fid, s)
                puts " <Pushed #{s.length}..."
                s=Set.new
              end
              trigramScanned += 1
            end
          end
        end
        # Final push of whatever is left in the batch.
        push_trigrams(r, fid, s)

        trigramsOnFile=r.scard "fscan:trigramsOnFile:#{fid}"
        r.sadd "fscan:processedFiles", "fscan:id:#{filename}"
        # Nothing scanned (e.g. empty file): report 0.0 instead of NaN.
        trigramRatio=trigramScanned.zero? ? 0.0 : ((trigramsOnFile*1.0) / trigramScanned) * 100.0
        puts "File processed. Unique Trigrams for #{filename}: #{trigramsOnFile} Total Scanned: #{trigramScanned} Ratio:#{trigramRatio}"
      ensure
        # Close the connection on every exit path, including the early
        # return above and any exception.
        r.quit
      end
      nil
    end

    # = search
    # Find a list of file candidates to a search string.
    # The search string is split into overlapping trigrams
    # (Search => Sea AND ear AND arc AND rch) and only the files containing
    # all of them are returned.
    #
    # term:: the string to look for
    #
    # Returns an array of filenames; it is empty when the term is shorter
    # than GRAM_SIZE or when no file matches.
    def search(term)
      trigramInAnd=(0..(term.length-GRAM_SIZE)).map { |j| "trigram:#{term[j,GRAM_SIZE]}" }.uniq
      return [] if trigramInAnd.empty?

      r=Redis.new
      begin
        fileIds=r.sinter(*trigramInAnd)
        # Resolve ids through fscan:id2filename:<id>; ids without a stored name are dropped.
        fileIds.map { |id| r.get("fscan:id2filename:#{id}") }.compact
      ensure
        r.quit
      end
    end

    # This function accepts a very simple search query like
    # Gio*
    # will match Giovanni, Giovedi, Giorno...
    # Giova*ni
    # will match Giovanni, Giovani, Giovannini
    #
    # Not implemented yet: the body is empty, so it returns nil.
    def searchSimpleRegexp(termWithStar)
    end

    private

    # Record each trigram of the batch for file +fid+, in both directions:
    # trigram -> file ids, and file id -> trigrams.
    def push_trigrams(r, fid, trigrams)
      trigrams.each do |trigram|
        r.sadd "trigram:#{trigram}",fid
        r.sadd "fscan:trigramsOnFile:#{fid}", trigram
      end
    end
  end
end
|
data/readme.org
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
* INSTALL
|
2
|
+
To install Code Zauker, you must simply build and install the gem as usual
|
3
|
+
|
4
|
+
* DEVELOPING
|
5
|
+
For developing with Code Zauker you need bundler 1.0.21 or above
|
6
|
+
|
7
|
+
* Release History
|
8
|
+
| Version | Date | Summary |
|
9
|
+
| 0.0.1 | 26 Jan 2012 | First RubyGems Release (for testing purpose only) |
|
10
|
+
| | | |
|