code_zauker 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +7 -0
- data/Rakefile +21 -0
- data/bin/czindexer +15 -0
- data/bin/czsearch +17 -0
- data/code_zauker.gemspec +32 -0
- data/lib/code_zauker/version.rb +3 -0
- data/lib/code_zauker.rb +128 -0
- data/readme.org +10 -0
- data/test/fixture/TEST_LICENSE.txt +1000 -0
- data/test/fixture/foolish.txt +1 -0
- data/test/fixture/kurukku.txt +2 -0
- data/test/test_search.rb +74 -0
- metadata +95 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
# Bundler entry point for developing code_zauker.
# The gem index is fetched over TLS instead of plain HTTP.
source "https://rubygems.org"

# Specify your gem's dependencies in code_zauker.gemspec
# GG From http://asciicasts.com/episodes/245-new-gem-with-bundler:
# It's better to manage the gem's dependencies inside the Gemspec file and let Bundler
# load them automatically through the Gemfile
gemspec
|
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- coding: utf-8 ; mode: ruby; -*-
require "bundler/gem_tasks"

# See http://jasonseifer.com/2010/04/06/rake-tutorial
require 'rake/testtask'

# Test task: runs every file matching test/test*.rb.
# See http://rake.rubyforge.org/classes/Rake/TestTask.html
Rake::TestTask.new do |t|
  # List of directories to be added to $LOAD_PATH before running the tests (default is 'lib')
  #t.libs << 'test'
  t.test_files = FileList['test/test*.rb']
  t.verbose = true
end

# Documentation task. yard is only a development dependency, so when it is
# not installed the task is skipped instead of making every rake task fail
# while this file is being loaded.
begin
  require 'yard'
  YARD::Rake::YardocTask.new do |t|
    t.files = ['lib/**/*.rb'] # optional
    #t.options = ['--any', '--extra', '--opts'] # optional
  end
rescue LoadError
  warn "yard is not installed: the documentation task is disabled"
end
|
20
|
+
|
21
|
+
|
data/bin/czindexer
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
# czindexer indexes every path given on the command line.
#
# Suggested execution is mixing find / xargs with the parallel (P) parameters:
#   find test/fixture/ -type f | xargs -P 5 -n 10 ./bin/czindexer
# will fire 5 czindexer each with 10 files to process...
require 'shellwords'
require 'code_zauker'

ARGV.each do |path|
  if File.directory?(path)
    puts "Processing via find+xargs"
    # Paths are shell-escaped and passed NUL-separated (-print0 / -0) so that
    # names containing spaces, quotes or shell metacharacters are neither
    # split nor interpreted by the shell.
    system("find #{Shellwords.escape(path)} -type f -print0 | " \
           "xargs -0 -P 5 -n 10 #{Shellwords.escape($0)}")
  else
    puts "Meganoids indexing #{path}"
    fs = CodeZauker::FileScanner.new
    # Second argument is noReload: false means an already known file is indexed again.
    fs.load(path, false)
  end
end
|
data/bin/czsearch
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
#== czsearch is a useful command to search via the Code Zauker facility
#
# For every term given on the command line, the index is asked for the
# candidate files and grep is then run on those files to show the matches.

# Same effect as sending something like -W0 to ruby, for a cleaner output
$VERBOSE=nil
require 'code_zauker'

ARGV.each do |term|
  #puts "Code Zauker Searching for #{term}"
  fs = CodeZauker::FileScanner.new
  # Drop ids whose file name could not be resolved (nil entries).
  files = fs.search(term).compact
  next if files.empty?

  # -H forces to print file name also with only one match.
  # The argument-list form of system bypasses the shell, so quotes or spaces
  # in the term and in file names are passed to grep verbatim; -e and --
  # keep a leading dash from being read as an option.
  system("grep", "-H", "--color", "-n", "-e", term, "--", *files)
end
|
data/code_zauker.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 ; mode: ruby; -*-
$:.push File.expand_path("../lib", __FILE__)
require "code_zauker/version"

# Gem specification for code_zauker. The packaged files, test files and
# executables are taken from what git tracks in this repository.
Gem::Specification.new do |s|
  s.name        = "code_zauker"
  s.version     = CodeZauker::VERSION
  s.authors     = ["Giovanni Giorgi"]
  s.email       = ["jj@gioorgi.com"]
  s.homepage    = "http://gioorgi.com/tag/code-zauker/"
  s.summary     = %q{A search engine for programming languages}
  s.description = %q{Code Zauker is based from ideas taken by old Google Code Search and uses Redis as a basic platform}

  # s.rubyforge_project is intentionally not set: RubyGems deprecated the
  # attribute with no replacement and it has no effect on the built gem.

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  # s.add_development_dependency "rspec"
  s.add_development_dependency "yard", "~>0.7"

  s.add_runtime_dependency "hiredis", "~> 0.3"
  s.add_runtime_dependency "redis", "~> 2.2"

  ## Install and require the hiredis gem before redis-rb for maximum performances.
  #s.add_runtime_dependency "redis", "~> 2.2", :require => ["redis/connection/hiredis", "redis"]
end
|
data/lib/code_zauker.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
# -*- mode:ruby ; -*- -*
|
2
|
+
require "code_zauker/version"
|
3
|
+
require 'redis/connection/hiredis'
|
4
|
+
require 'redis'
|
5
|
+
require 'set'
|
6
|
+
# This module implements a simple reverse indexer based on redis:
# every file is split into trigrams and each trigram is mapped to the ids of
# the files containing it.
# The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
module CodeZauker
  # Number of characters of every indexed chunk.
  GRAM_SIZE=3
  # The all-spaces trigram: it is never stored in the index.
  SPACE_GUY=" "*GRAM_SIZE

  # Scans a file and pushes its trigrams inside redis;
  # then it provides a handy method to find the files containing the
  # trigrams of a search term.
  class FileScanner
    def initialize()
    end

    # Index +filename+ into redis.
    #
    # Redis keys written:
    #   fscan:nextId               counter used to allocate file ids
    #   fscan:id:<filename>        id assigned to the file
    #   fscan:id2filename:<id>     reverse mapping from id to file name
    #   trigram:<trigram>          set of the ids of the files containing it
    #   fscan:trigramsOnFile:<id>  set of the trigrams pushed for the file
    #   fscan:processedFiles       set of the processed "fscan:id:<filename>" keys
    #
    # filename:: path of the file to scan
    # noReload:: when true, a file which already has an id is not scanned again
    #
    # Always returns nil. The redis connection is closed on every exit path,
    # including the noReload early return and errors raised while scanning.
    def load(filename, noReload=false)
      r=Redis.new
      begin
        # Already exists?...
        fid=r.get "fscan:id:#{filename}"
        if fid.nil?
          r.setnx "fscan:nextId",0
          fid=r.incr "fscan:nextId"
          # BUG: Consider storing it at the END of the processing
          r.set "fscan:id:#{filename}", fid
          r.set "fscan:id2filename:#{fid}",filename
        elsif noReload
          puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
          return nil
        end

        # fid is the set key!...
        # TEST_LICENSE.txt: 3290 Total Scanned: 24628
        # The ratio is below 13% of total trigrams are unique for very big files
        # So we avoid a huge roundtrip to redis, and store the trigrams on a
        # memory-based set before sending them to redis. This avoids a lot of
        # spurious work.
        adaptive_size=6000
        trigram_scanned=0
        pending=Set.new
        File.foreach(filename) do |line|
          # Inclusive upper bound: the last start offset is length-GRAM_SIZE,
          # otherwise the final trigram of a line is never produced (this
          # matters for a last line without trailing newline). Lines shorter
          # than GRAM_SIZE give an empty range.
          (0..(line.length-GRAM_SIZE)).each do |istart|
            trigram=line[istart, GRAM_SIZE]
            # Avoid storing the 3-space guy entirely
            next if trigram==SPACE_GUY
            pending.add(trigram)
            if pending.length > adaptive_size
              puts " >Pushing...#{pending.length}"
              push_trigrams(r, fid, pending)
              puts " <Pushed #{pending.length}..."
              pending=Set.new
            end
            trigram_scanned += 1
          end
        end
        push_trigrams(r, fid, pending) unless pending.empty?

        trigrams_on_file=r.scard "fscan:trigramsOnFile:#{fid}"
        r.sadd "fscan:processedFiles", "fscan:id:#{filename}"
        # Nothing scanned (empty file or only very short lines): report 0.0
        # instead of dividing zero by zero.
        trigram_ratio=
          if trigram_scanned.zero?
            0.0
          else
            ((trigrams_on_file*1.0) / trigram_scanned) * 100.0
          end
        puts "File processed. Unique Trigrams for #{filename}: #{trigrams_on_file} Total Scanned: #{trigram_scanned} Ratio:#{trigram_ratio}"
      ensure
        r.quit
      end
      nil
    end

    # = search
    # Find a list of file candidates to a search string.
    # The search string is split into trigrams and only the files containing
    # all of them are returned:
    #   Search=> Sea AND ear AND arc AND rch
    # The all-spaces trigram is skipped because the indexer never stores it;
    # requiring it would make every term with three consecutive spaces fail.
    #
    # term:: the string to look for
    #
    # Returns an array with the file names registered for the matching ids;
    # it is empty when the term yields no usable trigram (for example when it
    # is shorter than GRAM_SIZE).
    def search(term)
      wanted=(0..(term.length-GRAM_SIZE)).map { |j| term[j, GRAM_SIZE] }
      wanted.delete(SPACE_GUY)
      keys=wanted.uniq.map { |trigram| "trigram:#{trigram}" }
      return [] if keys.empty?

      r=Redis.new
      begin
        # fscan:id2filename:#{fid}....
        r.sinter(*keys).map { |id| r.get("fscan:id2filename:#{id}") }
      ensure
        r.quit
      end
    end

    # This function is meant to accept a very simple search query like
    #    Gio*
    # which will match Giovanni, Giovedi, Giorno...
    #    Giova*ni
    # which will match Giovanni, Giovani, Giovannini
    #
    # Not implemented yet: the body is empty and nil is returned.
    def searchSimpleRegexp(termWithStar)
    end

    private

    # Register every trigram of +trigrams+ for the file id +fid+ using the
    # redis connection +r+.
    def push_trigrams(r, fid, trigrams)
      trigrams.each do |trigram|
        r.sadd "trigram:#{trigram}",fid
        r.sadd "fscan:trigramsOnFile:#{fid}", trigram
      end
    end
  end
end
|
data/readme.org
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
* INSTALL
|
2
|
+
To install Code Zauker, you must simply build and install the gem as usual
|
3
|
+
|
4
|
+
* DEVELOPING
|
5
|
+
For developing with Code Zauker you need bundler 1.0.21 or above
|
6
|
+
|
7
|
+
* Release History
|
8
|
+
| Version | Date | Summary |
|
9
|
+
| 0.0.1 | 26 Jan 2012 | First RubyGems Release (for testing purpose only) |
|
10
|
+
| | | |
|