RubyGems - code_zauker - Versions diffs - 0.0.1 - Mend

code_zauker 0.0.1

Files changed (14) hide show

data/.gitignore +4 -0
data/Gemfile +7 -0
data/Rakefile +21 -0
data/bin/czindexer +15 -0
data/bin/czsearch +17 -0
data/code_zauker.gemspec +32 -0
data/lib/code_zauker/version.rb +3 -0
data/lib/code_zauker.rb +128 -0
data/readme.org +10 -0
data/test/fixture/TEST_LICENSE.txt +1000 -0
data/test/fixture/foolish.txt +1 -0
data/test/fixture/kurukku.txt +2 -0
data/test/test_search.rb +74 -0
metadata +95 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,4 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*

data/Gemfile ADDED Viewed

@@ -0,0 +1,7 @@
+# Fetch gems over TLS: a plain http:// source can be tampered with in transit.
+source "https://rubygems.org"
+# Specify your gem's dependencies in code_zauker.gemspec
+# GG From http://asciicasts.com/episodes/245-new-gem-with-bundler:
+# It's better to manage the gem's dependencies inside the gemspec file and let Bundler
+# load them automatically through the Gemfile
+gemspec

data/Rakefile ADDED Viewed

@@ -0,0 +1,21 @@
+# -*- coding: utf-8 ; mode: ruby; -*-
+# Rake tasks for the gem: Bundler's gem tasks, a test task and a YARD documentation task.
+require "bundler/gem_tasks"
+# See http://jasonseifer.com/2010/04/06/rake-tutorial
+require 'rake/testtask'
+# See http://rake.rubyforge.org/classes/Rake/TestTask.html
+Rake::TestTask.new do |t|
+  # List of directories to be added to $LOAD_PATH before running the tests. (default is ‘lib’)
+  #t.libs << 'test'
+  # Run every file matching test/test*.rb
+  t.test_files = FileList['test/test*.rb']
+  t.verbose = true
+end
+require 'yard'
+# Build the API documentation from the Ruby sources under lib/
+YARD::Rake::YardocTask.new do |t|
+  t.files   = ['lib/**/*.rb']   # optional
+  #t.options = ['--any', '--extra', '--opts'] # optional
+end

data/bin/czindexer ADDED Viewed

@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+# Suggested execution is mixing find / xargs with the parallel (P) parameters:
+# find test/fixture/ -type f | xargs -P 5  -n 10 ./bin/czindexer
+# will fire 5 czindexer each with 10 files to process...
+require 'code_zauker'
+ARGV.each do | l |
+  if Dir.exists?(l)
+    puts "Processing via find+xargs"
+    system("find #{l}  -type f | xargs -P 5  -n 10 #{$0}")
+  else
+    puts "Meganoids indexing #{l}"
+    fs=CodeZauker::FileScanner.new()
+    fs.load(l,noReload=false)
+  end
+end

data/bin/czsearch ADDED Viewed

@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+#== czsearch is a userful command to search via the Code Zauker facility
+# Send somethiing like -W0 to ruby, for a cleaner output
+$VERBOSE=nil
+require 'code_zauker'
+ARGV.each do | s |
+  #puts "Code Zauker Searching for #{s}"
+  fs=CodeZauker::FileScanner.new()
+  files=fs.search(s)
+  if files.length >0
+    fline=files.join(" ")
+    # -H forces to print file name also with only one match
+    cmd="grep -H --color -n '#{s}' #{fline}"
+    #puts cmd
+    system(cmd)
+  end
+end

data/code_zauker.gemspec ADDED Viewed

@@ -0,0 +1,32 @@
+# -*- encoding: utf-8 ; mode: ruby; -*-
+# Gem specification for code_zauker.
+# Put lib/ on the load path so the version constant can be required below.
+$:.push File.expand_path("../lib", __FILE__)
+require "code_zauker/version"
+Gem::Specification.new do |s|
+  s.name        = "code_zauker"
+  s.version     = CodeZauker::VERSION
+  s.authors     = ["Giovanni Giorgi"]
+  s.email       = ["jj@gioorgi.com"]
+  s.homepage    = "http://gioorgi.com/tag/code-zauker/"
+  s.summary     = %q{A search engine for programming languages}
+  s.description = %q{Code Zauker is based from ideas taken by old Google Code Search and uses Redis as a basic platform}
+  # NOTE(review): rubyforge_project is reported as deprecated by recent RubyGems - confirm before the next release
+  s.rubyforge_project = "code_zauker"
+  # The file lists below come from `git ls-files`, so this gemspec has to be
+  # evaluated inside a git checkout with git available on the PATH.
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  # Every file under bin/ is exposed as an executable, by base name.
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  # specify any dependencies here; for example:
+  # s.add_development_dependency "rspec"
+  s.add_development_dependency "yard", "~>0.7"
+  s.add_runtime_dependency "hiredis", "~> 0.3"
+  s.add_runtime_dependency "redis", "~> 2.2"
+  ## Install and require the hiredis gem before redis-rb for maximum performance.
+  #s.add_runtime_dependency "redis", "~> 2.2", :require => ["redis/connection/hiredis", "redis"]
+end

data/lib/code_zauker/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Namespace of the Code Zauker gem.
+module CodeZauker
+  # Version string of the gem; code_zauker.gemspec reads it as the gem version.
+  VERSION = "0.0.1"
+end

data/lib/code_zauker.rb ADDED Viewed

@@ -0,0 +1,128 @@
+# -*- mode:ruby ; -*- -*
+require "code_zauker/version"
+require 'redis/connection/hiredis'
+require 'redis'
+require 'set'
+# This module try to implement a simple reverse indexer
+# based on redis
+# The idea is ispired by http://swtch.com/~rsc/regexp/regexp4.html
+module CodeZauker
+  GRAM_SIZE=3
+  SPACE_GUY=" "*GRAM_SIZE
+  # Scan a file and push it inside redis...
+  # then it can provide handy method to find file scontaining the trigram...
+  class FileScanner
+    def initialize()
+    end
+    def load(filename, noReload=false)
+      # Define my redis id...
+      r=Redis.new
+      # Already exists?...
+      fid=r.get "fscan:id:#{filename}"
+      if fid==nil
+        r.setnx "fscan:nextId",0
+        fid=r.incr "fscan:nextId"
+        # BUG: Consider storing it at the END of the processing
+        r.set "fscan:id:#{filename}", fid
+        r.set "fscan:id2filename:#{fid}",filename
+      else
+        if noReload
+          puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
+          return nil
+        end
+      end
+      # fid is the set key!...
+      trigramScanned=0
+      # TEST_LICENSE.txt: 3290 Total Scanned: 24628
+      # The ratio is below 13% of total trigrams are unique for very big files
+      # So we avoid a huge roundtrip to redis, and store the trigram on a memory-based set
+      # before sending it to redis. This avoid
+      # a lot of spourios work
+      s=Set.new
+      File.open(filename,"r") do |f|
+        lines=f.readlines()
+        adaptiveSize= 6000
+        lines.each do  |l|
+          # Split each line into 3-char chunks, and store in a redis set
+          i=0
+          for istart in 0...(l.length-GRAM_SIZE)
+            trigram = l[istart, GRAM_SIZE]
+            # Avoid storing the 3space guy enterely
+            if trigram==SPACE_GUY
+              next
+            end
+            # push the trigram to redis (highly optimized)
+            s.add(trigram)
+            if s.length > adaptiveSize
+              puts " >Pushing...#{s.length}"
+              s.each do | trigram |
+                r.sadd "trigram:#{trigram}",fid
+                r.sadd "fscan:trigramsOnFile:#{fid}", trigram
+              end
+              puts " <Pushed #{s.length}..."
+              s=Set.new()
+            end
+            trigramScanned += 1
+            #puts "#{istart} Trigram fscan:#{trigram}/  FileId: #{fid}"
+          end
+        end
+      end
+      if s.length > 0
+        s.each do | trigram |
+          r.sadd "trigram:#{trigram}",fid
+          r.sadd "fscan:trigramsOnFile:#{fid}", trigram
+        end
+        #puts "Final push of #{s.length}"
+      end
+      trigramsOnFile=r.scard "fscan:trigramsOnFile:#{fid}"
+      r.sadd "fscan:processedFiles", "fscan:id:#{filename}"
+      trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
+      puts "File processed. Unique Trigrams for #{filename}: #{trigramsOnFile} Total Scanned: #{trigramScanned} Ratio:#{trigramRatio}"
+      r.quit
+      return nil
+    end
+    # = search
+    # Find a list of file candidates to a search string
+    # The search string is padded into trigrams
+    def search(term)
+      #puts " ** Searching: #{term}"
+      # split the term in a padded trigram
+      trigramInAnd=[]
+      # Search=> Sea AND ear AND arc AND rch
+      for j in 0...term.length
+        currentTrigram=term[j,GRAM_SIZE]
+        if currentTrigram.length <GRAM_SIZE
+          # We are at the end...
+          break
+        end
+        trigramInAnd.push("trigram:#{currentTrigram}")
+      end
+      #puts "Trigam conversion /#{term}/ into #{trigramInAnd}"
+      if trigramInAnd.length==0
+        return []
+      end
+      r=Redis.new
+      fileIds=    r.sinter(*trigramInAnd)
+      filenames=[]
+      # fscan:id2filename:#{fid}....
+      fileIds.each do | id |
+        filenames.push(r.get("fscan:id2filename:#{id}"))
+      end
+      r.quit
+      #puts " ** Files found:#{filenames} from ids #{fileIds}"
+      return filenames
+    end
+    # This function accepts a very simple search query like
+    # Gio*
+    # will match Giovanni, Giovedi, Giorno...
+    # Giova*ni
+    # will match Giovanni, Giovani, Giovannini
+    def searchSimpleRegexp(termWithStar)
+    end
+  end
+end

data/readme.org ADDED Viewed

@@ -0,0 +1,10 @@
+* INSTALL
+To install Code Zauker, simply build and install the gem as usual
+* DEVELOPING
+For developing with Code Zauker you need bundler 1.0.21 or above
+* Release History
+  | Version | Date        | Summary                                           |
+  |   0.0.1 | 26 Jan 2012 | First RubyGems Release (for testing purpose only) |
+  |         |             |                                                   |