code_zauker 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
# Fetch gems over TLS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# The gem's dependencies are specified in code_zauker.gemspec.
# GG From http://asciicasts.com/episodes/245-new-gem-with-bundler:
# It's better to manage the gem's dependencies inside the gemspec file and let Bundler
# load them automatically through the Gemfile.
gemspec
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
# -*- coding: utf-8 ; mode: ruby; -*-
# Rake tasks for the gem: the bundler gem tasks, the test task and the
# YARD documentation task.
require "bundler/gem_tasks"

# See http://jasonseifer.com/2010/04/06/rake-tutorial
# and http://rake.rubyforge.org/classes/Rake/TestTask.html
require 'rake/testtask'
Rake::TestTask.new { |test_task|
  # Directories added to $LOAD_PATH before running the tests default to 'lib';
  # uncomment to add the test directory as well:
  #test_task.libs << 'test'
  test_task.verbose = true
  test_task.test_files = FileList['test/test*.rb']
}

require 'yard'
YARD::Rake::YardocTask.new { |doc_task|
  doc_task.files = ['lib/**/*.rb'] # optional
  #doc_task.options = ['--any', '--extra', '--opts'] # optional
}
20
+
21
+
data/bin/czindexer ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby
# czindexer indexes the files given on the command line with Code Zauker.
# A directory argument is expanded with find and handed back to this same
# script through xargs, so several indexers run in parallel.
#
# Suggested execution is mixing find / xargs with the parallel (-P) parameter:
#   find test/fixture/ -type f | xargs -P 5 -n 10 ./bin/czindexer
# will fire 5 czindexer processes, each with 10 files to process...
require 'shellwords'
require 'code_zauker'

ARGV.each do |path|
  if File.directory?(path)
    puts "Processing via find+xargs"
    # Escape everything interpolated into the shell command, and use
    # NUL-separated file names (-print0 / -0) so that paths containing
    # spaces, quotes or newlines are passed through intact.
    system("find #{Shellwords.escape(path)} -type f -print0 | " \
           "xargs -0 -P 5 -n 10 #{Shellwords.escape($0)}")
  else
    puts "Meganoids indexing #{path}"
    scanner = CodeZauker::FileScanner.new
    # Second argument is noReload: false means an already known file is indexed again.
    scanner.load(path, false)
  end
end
data/bin/czsearch ADDED
@@ -0,0 +1,17 @@
1
#!/usr/bin/env ruby
#== czsearch is a useful command to search via the Code Zauker facility
# Each command line argument is looked up in the index, then grep is run on
# the candidate files to show the matching lines.
#
# Silence warnings (the same effect as sending -W0 to ruby) for a cleaner output
$VERBOSE=nil
require 'code_zauker'

ARGV.each do |term|
  #puts "Code Zauker Searching for #{term}"
  scanner = CodeZauker::FileScanner.new
  # Drop nil entries: an id without a stored file name must not reach grep.
  candidates = scanner.search(term).compact
  next if candidates.empty?

  # -H forces to print file name also with only one match.
  # Arguments are passed as a list, so no shell is involved and neither the
  # term nor the file names can be mangled by quoting or word splitting.
  # -e keeps a term starting with "-" from being read as an option, and
  # "--" does the same for file names.
  system("grep", "-H", "--color", "-n", "-e", term, "--", *candidates)
end
@@ -0,0 +1,32 @@
1
# -*- encoding: utf-8 ; mode: ruby; -*-
# Gem specification for Code Zauker.
# The version number is read from lib/code_zauker/version.rb, so lib/ is
# put on the load path first.
$LOAD_PATH.push(File.expand_path("../lib", __FILE__))
require "code_zauker/version"

Gem::Specification.new do |gem|
  gem.name        = "code_zauker"
  gem.version     = CodeZauker::VERSION
  gem.authors     = ["Giovanni Giorgi"]
  gem.email       = ["jj@gioorgi.com"]
  gem.homepage    = "http://gioorgi.com/tag/code-zauker/"
  gem.summary     = %q{A search engine for programming languages}
  gem.description = %q{Code Zauker is based from ideas taken by old Google Code Search and uses Redis as a basic platform}

  gem.rubyforge_project = "code_zauker"

  # The packaged files are whatever git tracks.
  gem.files         = `git ls-files`.split("\n")
  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  gem.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  # specify any dependencies here; for example:
  # gem.add_development_dependency "rspec"
  gem.add_development_dependency("yard", "~>0.7")

  gem.add_runtime_dependency("hiredis", "~> 0.3")
  gem.add_runtime_dependency("redis", "~> 2.2")

  ## Install and require the hiredis gem before redis-rb for maximum performances.
  #gem.add_runtime_dependency "redis", "~> 2.2", :require => ["redis/connection/hiredis", "redis"]
end
@@ -0,0 +1,3 @@
1
module CodeZauker
  # Version of the gem; read by the gemspec.
  # Frozen so the constant's string cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,128 @@
1
+ # -*- mode:ruby ; -*- -*
2
+ require "code_zauker/version"
3
+ require 'redis/connection/hiredis'
4
+ require 'redis'
5
+ require 'set'
6
# This module implements a simple reverse indexer based on redis.
# The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
module CodeZauker
  # Length of the character chunks ("trigrams") used as index keys.
  GRAM_SIZE=3
  # The all-spaces trigram; it is never stored nor searched.
  SPACE_GUY=" "*GRAM_SIZE

  # Scans a file and pushes its trigrams inside redis...
  # then provides a handy method to find the files containing the trigrams
  # of a search term.
  class FileScanner
    # Number of distinct trigrams collected in memory before they are
    # flushed to redis.
    PUSH_THRESHOLD = 6000

    def initialize()
    end

    # Index one file.
    #
    # The file gets a numeric id (taken from the "fscan:nextId" counter the
    # first time it is seen); every trigram found is added to the redis set
    # "trigram:<trigram>" (holding file ids) and to
    # "fscan:trigramsOnFile:<id>" (holding the trigrams of that file).
    #
    # @param filename [String] path of the file to index
    # @param noReload [Boolean] when true, a file that already has an id is
    #   reported and skipped instead of being indexed again
    # @return [nil]
    def load(filename, noReload=false)
      r=Redis.new
      # Already exists?...
      fid=r.get "fscan:id:#{filename}"
      if fid.nil?
        r.setnx "fscan:nextId",0
        fid=r.incr "fscan:nextId"
        # BUG: Consider storing it at the END of the processing
        r.set "fscan:id:#{filename}", fid
        r.set "fscan:id2filename:#{fid}",filename
      elsif noReload
        puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
        return nil
      end

      trigramScanned=0
      # TEST_LICENSE.txt: 3290 Total Scanned: 24628
      # For very big files fewer than 13% of the scanned trigrams are unique,
      # so they are first collected in an in-memory set and sent to redis in
      # batches. This avoids a lot of spurious round trips.
      pending=Set.new
      File.open(filename,"r") do |f|
        # Read line by line instead of loading the whole file in memory.
        f.each_line do |line|
          each_trigram(line) do |trigram|
            pending.add(trigram)
            trigramScanned += 1
            next unless pending.length > PUSH_THRESHOLD
            puts " >Pushing...#{pending.length}"
            push_trigrams(r, fid, pending)
            puts " <Pushed #{pending.length}..."
            pending=Set.new
          end
        end
      end
      # Final push of what is left in the buffer.
      push_trigrams(r, fid, pending)

      trigramsOnFile=r.scard "fscan:trigramsOnFile:#{fid}"
      r.sadd "fscan:processedFiles", "fscan:id:#{filename}"
      # An empty file has no trigram at all: avoid printing NaN.
      trigramRatio=trigramScanned.zero? ? 0.0 : (trigramsOnFile.to_f / trigramScanned) * 100.0
      puts "File processed. Unique Trigrams for #{filename}: #{trigramsOnFile} Total Scanned: #{trigramScanned} Ratio:#{trigramRatio}"
      return nil
    ensure
      # Always release the connection, also on the early return and on errors.
      r.quit if r
    end

    # = search
    # Find a list of file candidates for a search string.
    # The search string is split into trigrams which are put in AND:
    # Search=> Sea AND ear AND arc AND rch
    #
    # The all-spaces trigram is skipped, because load never indexes it and
    # it would otherwise make every term containing three spaces unmatchable.
    #
    # @param term [String] the text to look for
    # @return [Array] the file names stored for the matching ids; empty when
    #   the term yields no usable trigram (for instance when it is shorter
    #   than GRAM_SIZE)
    def search(term)
      trigramInAnd=[]
      0.upto(term.length - GRAM_SIZE) do |j|
        currentTrigram=term[j,GRAM_SIZE]
        next if currentTrigram==SPACE_GUY
        trigramInAnd.push("trigram:#{currentTrigram}")
      end
      trigramInAnd.uniq!
      return [] if trigramInAnd.empty?

      r=Redis.new
      begin
        fileIds=r.sinter(*trigramInAnd)
        # fscan:id2filename:#{fid}....
        return fileIds.map { |id| r.get("fscan:id2filename:#{id}") }
      ensure
        r.quit
      end
    end

    # This function is meant to accept a very simple search query like
    #  Gio*
    # which will match Giovanni, Giovedi, Giorno...
    #  Giova*ni
    # which will match Giovanni, Giovani, Giovannini
    #
    # It is not implemented yet: the body is empty and it returns nil.
    def searchSimpleRegexp(termWithStar)
    end

    private

    # Yield every trigram of a line, except the all-spaces one.
    # The trailing newline is not part of any trigram, while the last
    # trigram of a line without trailing newline (the last line of a file)
    # is yielded too.
    def each_trigram(line)
      # Plain slicing is used (no regexp) so that lines holding invalid byte
      # sequences can still be processed.
      text=line[-1, 1] == "\n" ? line[0, line.length - 1] : line
      0.upto(text.length - GRAM_SIZE) do |start|
        trigram=text[start, GRAM_SIZE]
        yield trigram unless trigram==SPACE_GUY
      end
    end

    # Store a batch of trigrams for the file id fid using connection r.
    def push_trigrams(r, fid, trigrams)
      trigrams.each do |trigram|
        r.sadd "trigram:#{trigram}",fid
        r.sadd "fscan:trigramsOnFile:#{fid}", trigram
      end
    end
  end
end
data/readme.org ADDED
@@ -0,0 +1,10 @@
1
+ * INSTALL
2
+ To install Code Zauker, you must simply build and install the gem as usual
3
+
4
+ * DEVELOPING
5
+ For developing with Code Zauker you need bundler 1.0.21 or above
6
+
7
+ * Release History
8
+ | Version | Date | Summary |
9
+ | 0.0.1 | 26 Jan 2012 | First RubyGems Release (for testing purposes only) |
10
+ | | | |