code_zauker 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
# Fetch gems over TLS; a plain-http source lets gem downloads be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in code_zauker.gemspec
# GG From http://asciicasts.com/episodes/245-new-gem-with-bundler:
# It's better to manage the gem's dependencies inside the Gemspec file and let Bundler
# load them automatically through the Gemfile
gemspec
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
# -*- coding: utf-8 ; mode: ruby; -*-
require "bundler/gem_tasks"

# Test task setup.
# Background reading: http://jasonseifer.com/2010/04/06/rake-tutorial
require 'rake/testtask'
# API reference: http://rake.rubyforge.org/classes/Rake/TestTask.html
Rake::TestTask.new do |test_task|
  # Directories to add to $LOAD_PATH before running the tests (default is 'lib'):
  #test_task.libs << 'test'
  test_task.test_files = FileList['test/test*.rb']
  test_task.verbose = true
end


# API documentation task, generated from every Ruby file under lib/.
require 'yard'
YARD::Rake::YardocTask.new do |doc_task|
  doc_task.files = ['lib/**/*.rb']   # optional
  #doc_task.options = ['--any', '--extra', '--opts'] # optional
end
20
+
21
+
data/bin/czindexer ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby
# Indexes every path given on the command line into Code Zauker.
#
# Suggested execution is mixing find / xargs with the parallel (P) parameter:
#    find test/fixture/ -type f | xargs -P 5 -n 10 ./bin/czindexer
# will fire 5 czindexer each with 10 files to process...
require 'shellwords'
require 'code_zauker'

ARGV.each do |path|
  if Dir.exist?(path)
    # A directory is expanded by re-invoking this very script on its files.
    puts "Processing via find+xargs"
    # The directory and the script path are shell-escaped, and file names are
    # handed over NUL-delimited, so names containing spaces or quotes survive.
    system("find #{Shellwords.escape(path)} -type f -print0 | xargs -0 -P 5 -n 10 #{Shellwords.escape($0)}")
  else
    puts "Meganoids indexing #{path}"
    # Second argument is noReload: false means an already-indexed file is scanned again.
    CodeZauker::FileScanner.new.load(path, false)
  end
end
data/bin/czsearch ADDED
@@ -0,0 +1,17 @@
1
#!/usr/bin/env ruby
#== czsearch is a useful command to search via the Code Zauker facility
# Each command-line argument is a search term: the index yields the candidate
# files, then grep prints the matching lines.
#
# Silence interpreter warnings (like passing -W0 to ruby), for a cleaner output
$VERBOSE=nil
require 'code_zauker'

ARGV.each do |term|
  #puts "Code Zauker Searching for #{term}"
  # Drop ids that no longer map to a file name so grep never receives a blank argument.
  files = CodeZauker::FileScanner.new.search(term).compact
  next if files.empty?

  # -H forces grep to print the file name even with only one candidate file.
  # The command is passed as an argument vector, so no shell is involved and
  # neither the term nor the file names can be reinterpreted as shell syntax;
  # -e and -- keep a leading dash from being parsed as a grep option.
  system("grep", "-H", "--color", "-n", "-e", term, "--", *files)
end
@@ -0,0 +1,32 @@
1
# -*- encoding: utf-8 ; mode: ruby; -*-
# Make lib/ loadable so the version constant can be read below.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "code_zauker/version"

Gem::Specification.new do |spec|
  # Identity and authorship
  spec.name        = "code_zauker"
  spec.version     = CodeZauker::VERSION
  spec.authors     = ["Giovanni Giorgi"]
  spec.email       = ["jj@gioorgi.com"]
  spec.homepage    = "http://gioorgi.com/tag/code-zauker/"
  spec.summary     = "A search engine for programming languages"
  spec.description = "Code Zauker is based from ideas taken by old Google Code Search and uses Redis as a basic platform"

  spec.rubyforge_project = "code_zauker"

  # Packaged content is whatever git tracks.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # specify any dependencies here; for example:
  # spec.add_development_dependency "rspec"
  spec.add_development_dependency "yard", "~>0.7"

  spec.add_runtime_dependency "hiredis", "~> 0.3"
  spec.add_runtime_dependency "redis", "~> 2.2"

  ## Install and require the hiredis gem before redis-rb for maximum performances.
  #spec.add_runtime_dependency "redis", "~> 2.2", :require => ["redis/connection/hiredis", "redis"]
end
@@ -0,0 +1,3 @@
1
module CodeZauker
  # Gem version string; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,128 @@
1
+ # -*- mode:ruby ; -*- -*
2
+ require "code_zauker/version"
3
+ require 'redis/connection/hiredis'
4
+ require 'redis'
5
+ require 'set'
6
# This module implements a simple reverse indexer
# based on redis.
# The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
module CodeZauker
  # Number of characters in every indexed n-gram (3 => trigrams).
  GRAM_SIZE=3
  # The all-spaces n-gram; it is never stored in the index.
  SPACE_GUY=(" "*GRAM_SIZE).freeze

  # Scans a file and pushes its trigrams into redis;
  # then provides handy methods to find the files containing given trigrams.
  class FileScanner
    # Distinct trigrams buffered in memory before they are flushed to redis.
    PUSH_THRESHOLD = 6000

    def initialize()
    end

    # Index one file: every trigram of every line is recorded in redis under
    # "trigram:<trigram>" (a set of file ids) and under
    # "fscan:trigramsOnFile:<fid>" (the set of trigrams seen for this file).
    #
    # @param filename [String] path of the file to scan
    # @param noReload [Boolean] when true, a file that already has an id is left untouched
    # @return [nil]
    def load(filename, noReload=false)
      r = Redis.new
      begin
        # Already exists?...
        fid = r.get "fscan:id:#{filename}"
        if fid.nil?
          r.setnx "fscan:nextId", 0
          fid = r.incr "fscan:nextId"
          # BUG: Consider storing it at the END of the processing
          r.set "fscan:id:#{filename}", fid
          r.set "fscan:id2filename:#{fid}", filename
        elsif noReload
          puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
          return nil
        end
        # NOTE(review): on a reload the trigrams recorded by a previous scan are
        # not removed, so the index may keep stale entries -- confirm if intended.

        trigramScanned = 0
        # TEST_LICENSE.txt: 3290 Total Scanned: 24628
        # For very big files fewer than 13% of the scanned trigrams are unique,
        # so they are de-duplicated in an in-memory set first; this avoids a
        # lot of spurious round trips to redis.
        pending = Set.new
        File.open(filename, "r") do |f|
          f.each_line do |line|
            # The line terminator is not part of the searchable text.
            text = line.chomp
            # Inclusive upper bound: the trigram ending on the last character
            # of the line must be indexed too.
            (0..(text.length - GRAM_SIZE)).each do |istart|
              trigram = text[istart, GRAM_SIZE]
              # Avoid storing the 3-space guy entirely
              next if trigram == SPACE_GUY
              pending.add(trigram)
              if pending.length > PUSH_THRESHOLD
                puts " >Pushing...#{pending.length}"
                push_trigrams(r, fid, pending)
                puts " <Pushed #{pending.length}..."
                pending = Set.new
              end
              trigramScanned += 1
            end
          end
        end
        # Final push of whatever is still buffered.
        push_trigrams(r, fid, pending)

        trigramsOnFile = r.scard "fscan:trigramsOnFile:#{fid}"
        r.sadd "fscan:processedFiles", "fscan:id:#{filename}"
        # An empty file scans zero trigrams; report 0.0 instead of dividing by zero.
        trigramRatio = trigramScanned.zero? ? 0.0 : ((trigramsOnFile * 1.0) / trigramScanned) * 100.0
        puts "File processed. Unique Trigrams for #{filename}: #{trigramsOnFile} Total Scanned: #{trigramScanned} Ratio:#{trigramRatio}"
        nil
      ensure
        # Release the connection on every exit path (early return and errors included).
        r.quit
      end
    end

    # = search
    # Find a list of file candidates for a search string.
    # The search string is split into overlapping trigrams which are ANDed:
    #   Search => Sea AND ear AND arc AND rch
    #
    # @param term [String] text to look for
    # @return [Array<String>] names of the files containing every trigram of +term+;
    #   empty when +term+ yields no indexable trigram (e.g. it is shorter than GRAM_SIZE)
    def search(term)
      trigramInAnd = []
      (0..(term.length - GRAM_SIZE)).each do |j|
        currentTrigram = term[j, GRAM_SIZE]
        # The indexer never stores the all-spaces trigram, so requiring it
        # here would make every such search come back empty.
        next if currentTrigram == SPACE_GUY
        trigramInAnd.push("trigram:#{currentTrigram}")
      end
      return [] if trigramInAnd.empty?

      r = Redis.new
      begin
        fileIds = r.sinter(*trigramInAnd)
        # Ids without a stored file name are dropped rather than returned as nil.
        fileIds.map { |id| r.get("fscan:id2filename:#{id}") }.compact
      ensure
        r.quit
      end
    end

    # This function is meant to accept a very simple search query like
    #   Gio*
    # will match Giovanni, Giovedi, Giorno...
    #   Giova*ni
    # will match Giovanni, Giovani, Giovannini
    #
    # Not implemented yet: the body is empty and the method returns nil.
    def searchSimpleRegexp(termWithStar)
    end

    private

    # Record every buffered trigram for file +fid+ on both sides of the index.
    def push_trigrams(redis, fid, trigrams)
      trigrams.each do |gram|
        redis.sadd "trigram:#{gram}", fid
        redis.sadd "fscan:trigramsOnFile:#{fid}", gram
      end
    end
  end
end
data/readme.org ADDED
@@ -0,0 +1,10 @@
1
+ * INSTALL
2
+ To install Code Zauker, build the gem from code_zauker.gemspec and install it as usual (gem build, then gem install).
3
+
4
+ * DEVELOPING
5
+ For developing with Code Zauker you need bundler 1.0.21 or above
6
+
7
+ * Release History
8
+ | Version | Date | Summary |
9
+ | 0.0.1 | 26 Jan 2012 | First RubyGems Release (for testing purpose only) |
10
+ | | | |