RubyGems - picolena - Versions diffs - 0.1.7 → 0.1.8 - Mend

picolena 0.1.7 → 0.1.8

Files changed (36) hide show

data/History.txt CHANGED Viewed

@@ -1,3 +1,13 @@
+== 0.1.8  2008-05-08
+* 2 minor enhancements:
+  * New IndexerLogger with basic statistics
+  * More specs & documentation.
+* 2 bug fixes:
+  * Binary documents without extension are not considered supported anymore
+  * Ensure that index is locked system-wide by using lock file.
 == 0.1.7  2008-04-30
 * 5 minor enhancements:

data/Manifest.txt CHANGED Viewed

@@ -50,6 +50,7 @@ lib/picolena/templates/lang/ui/en.yml
 lib/picolena/templates/lang/ui/es.yml
 lib/picolena/templates/lang/ui/fr.yml
 lib/picolena/templates/lib/core_exts.rb
+lib/picolena/templates/lib/indexer_logger.rb
 lib/picolena/templates/lib/plain_text_extractor_DSL.rb
 lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
 lib/picolena/templates/lib/plain_text_extractors/html.rb

data/lib/picolena/templates/app/controllers/documents_controller.rb CHANGED Viewed

@@ -22,9 +22,9 @@ class DocumentsController < ApplicationController
   def show
     start=Time.now
       @query=[params[:id],params.delete(:format)].compact.join('.')
-      @sort=params[:sort]
+      @sort_by=params[:sort_by]
       page=params[:page]||1
-      finder=Finder.new(@query,@sort,page)
+      finder=Finder.new(@query,@sort_by,page)
       finder.execute!
       pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
         finder.matching_documents
@@ -64,7 +64,7 @@ class DocumentsController < ApplicationController
   def ensure_index_is_created
     Indexer.ensure_index_existence
-    while Indexer.do_not_disturb_while_indexing do
+    while Indexer.locked? do
       sleep 1
     end
   end

data/lib/picolena/templates/app/helpers/documents_helper.rb CHANGED Viewed

@@ -6,10 +6,10 @@ module DocumentsHelper
   # Very basic pagination.
   # Provides links to Next, Prev and FirstPage when needed.
-  def should_paginate(page,query, sort)
-    [(link_to("&larr;&larr;", :action => :show, :id => query, :sort=>sort) if page.number>2),
-     (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort=>sort) if page.prev?),
-     (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort=>sort) if page.next?)].compact.join(" | ")
+  def should_paginate(page,query, sort_by)
+    [(link_to("&larr;&larr;", :action => :show, :id => query, :sort_by=>sort_by) if page.number>2),
+     (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort_by=>sort_by) if page.prev?),
+     (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort_by=>sort_by) if page.next?)].compact.join(" | ")
   end
   # Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
@@ -81,7 +81,7 @@ module DocumentsHelper
   end
   def sort_by_date_or_relevance(query)
-    [link_to_unless_current('By date', document_path(query, :sort=>'by_date')),
+    [link_to_unless_current('By date', document_path(query, :sort_by=>'date')),
      link_to_unless_current('By relevance', document_path(query))].join("&nbsp;")
   end
 end

data/lib/picolena/templates/app/models/document.rb CHANGED Viewed

@@ -11,10 +11,18 @@ class Document
   end
   #Delegating properties to File::method_name(complete_path)
-  [:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
+  [:dirname, :basename, :extname, :ext_as_sym, :file?, :plain_text?, :size, :ext_as_sym].each{|method_name|
     define_method(method_name){File.send(method_name,complete_path)}
   }
   alias_method :filename, :basename
+  alias_method :to_s, :complete_path
+  def inspect
+    [self,("(#{pretty_score})" if @score),("(language:#{language})" if language)].compact.join(" ")
+  end
   # Returns filename without extension
   #   "buildings.odt" => "buildings"
@@ -50,7 +58,7 @@ class Document
   #  Document.new("presentation.pdf").supported? => true
   #  Document.new("presentation.some_weird_extension").supported? => false
   def supported?
-    PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
+    PlainTextExtractor.supported_extensions.include?(self.ext_as_sym) unless ext_as_sym==:no_extension and !plain_text?
   end
   # Retrieves content as it is *now*.
@@ -91,6 +99,10 @@ class Document
     from_index[:language]
   end
+  def pretty_score
+    "%3.1f%" % (@score*100)
+  end
   # Fields that are shared between every document.
   def self.default_fields_for(complete_path)
     {
@@ -103,6 +115,7 @@ class Document
     }
   end
   private
   # FIXME: Is there a way to easily retrieve doc_id for a given document?
@@ -138,4 +151,4 @@ class Document
   def validate_in_indexed_directory
     raise ArgumentError, "required document is not in indexed directory" unless in_indexed_directory?
   end
-end
+end

data/lib/picolena/templates/app/models/finder.rb CHANGED Viewed

@@ -2,23 +2,24 @@ class Finder
   attr_reader :query
   def index
-    @@index ||= Indexer.index
+    @@index ||= Indexer.index
   end
-  def initialize(raw_query,by_date=false, page=1,results_per_page=Picolena::ResultsPerPage)
+  def initialize(raw_query,sort_by='relevance', page=1,results_per_page=Picolena::ResultsPerPage)
     @query = Query.extract_from(raw_query)
     @raw_query= raw_query
     Indexer.ensure_index_existence
+    reload_index! if should_be_reloaded?
     @per_page=results_per_page
     @offset=(page.to_i-1)*results_per_page
-    @by_date=by_date
+    @sort_by=sort_by
     index_should_have_documents
   end
   def execute!
     @matching_documents=[]
     start=Time.now
-      @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @by_date)){|index_id, score|
+      @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @sort_by=='date')){|index_id, score|
         begin
           found_doc=Document.new(index[index_id][:complete_path])
           found_doc.matching_content=index.highlight(query, index_id,
@@ -52,11 +53,21 @@ class Finder
     }
   }
-  def self.reload!
+  private
+  def reload_index!
+    Indexer.close
     @@index = nil
+    @@last_reload = Time.now
   end
-  private
+  def should_be_reloaded?
+    Indexer.reload_file_mtime > last_reload
+  end
+  def last_reload
+    @@last_reload ||= Time.at(0)
+  end
   def sort_by_date
     Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
@@ -65,4 +76,4 @@ class Finder
   def index_should_have_documents
     raise IndexError, "no document found" unless index.size > 0
   end
-end
+end

data/lib/picolena/templates/app/models/indexer.rb CHANGED Viewed

@@ -1,63 +1,74 @@
+# Indexer is used to index (duh!) documents contained in IndexedDirectories
+# It can create, update, delete and prune the index, and takes care that only
+# one IndexWriter exists at any given time, even when used in a multi-threaded
+# way.
+require 'indexer_logger'
 class Indexer
   # This regexp defines which files should *not* be indexed.
   @@exclude          = /(Thumbs\.db)/
   # Number of threads that will be used during indexing process
   @@threads_number = 8
-  cattr_reader :do_not_disturb_while_indexing
   class << self
+    # Finds every document included in IndexedDirectories, parses them with
+    # PlainTextExtractor and adds them to the index.
+    #
+    # Updates the index unless remove_first parameter is set to true, in which
+    # case it removes the index first before re-creating it.
     def index_every_directory(remove_first=false)
-      @@do_not_disturb_while_indexing=true
       clear! if remove_first
+      lock!
       @from_scratch = remove_first
-      # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
-      Finder.reload!
-      log :debug => "Indexing every directory"
-      start=Time.now
+      logger.start_indexing
       Picolena::IndexedDirectories.each{|dir, alias_dir|
         index_directory_with_multithreads(dir)
       }
-      log :debug => "Now optimizing index"
+      logger.debug "Now optimizing index"
       index.optimize
-      @@do_not_disturb_while_indexing=false
-      log :debug => "Indexing done in #{Time.now-start} s."
+      index_time_dbm_file['last']=Time.now._dump
+      unlock!
+      logger.show_report
     end
+    # Indexes a given directory, using @@threads_number threads.
+    # To do so, it retrieves a list of every included document, cuts it in
+    # @@threads_number chunks, and creates a new indexing thread for every chunk.
     def index_directory_with_multithreads(dir)
-      log :debug => "Indexing #{dir}, #{@@threads_number} threads"
+      logger.debug "Indexing #{dir}, #{@@threads_number} threads"
       indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
         File.file?(filename) && filename !~ @@exclude
       }
       indexing_list_chunks=indexing_list.in_transposed_slices(@@threads_number)
       prepare_multi_threads_environment
       indexing_list_chunks.each_with_thread{|chunk|
         chunk.each{|complete_path|
-          last_itime=index_time_dbm_file[complete_path]
-          if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
+          if should_index_this_document?(complete_path) then
             add_or_update_file(complete_path)
           else
-            log :debug => "Identical : #{complete_path}"
+            logger.debug "Identical : #{complete_path}"
           end
           index_time_dbm_file[complete_path] = Time.now._dump
         }
       }
     end
+    # Retrieves content and language from a given document, and adds it to the index.
+    # Since Document#probably_unique_id is used as index :key, no document will be added
+    # twice to the index, and the old document will just get updated.
+    #
+    # If for some reason (no content found or no defined PlainTextExtractor), content cannot
+    # be found, some basic information about the document (mtime, filename, complete_path)
+    # gets indexed anyway.
     def add_or_update_file(complete_path)
-      default_fields = Document.default_fields_for(complete_path)
+      document = Document.default_fields_for(complete_path)
       begin
-        document = PlainTextExtractor.extract_content_and_language_from(complete_path)
+        document.merge! PlainTextExtractor.extract_content_and_language_from(complete_path)
         raise "empty document #{complete_path}" if document[:content].strip.empty?
-        document.merge! default_fields
-        log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join
+        logger.add_document document
       rescue => e
-        log :debug => "\tindexing without content: #{e.message}"
-        document = default_fields
+        logger.reject_document document, e
       end
       index << document
     end
@@ -73,11 +84,9 @@ class Indexer
     # ensures that a new Index is instantiated next time index is called.
     def close
       @@index.close rescue nil
-      # Ferret will SEGFAULT otherwise.
       @@index = nil
     end
     # Checks for indexed files that are missing from filesystem
     # and removes them from index & dbm file.
     def prune_index
@@ -85,7 +94,7 @@ class Indexer
       missing_files.each{|filename, itime|
         index.writer.delete(:complete_path, filename)
         index_time_dbm_file.delete(filename)
-        log :debug => "Removed : #{filename}"
+        logger.debug "Removed : #{filename}"
       }
       index.optimize
     end
@@ -97,6 +106,7 @@ class Indexer
       @@index ||= Ferret::Index::Index.new(default_index_params)
     end
+    # Creates the index unless it already exists.
     def ensure_index_existence
       index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
     end
@@ -106,11 +116,66 @@ class Indexer
       index.size
     end
+    # Returns the time at which the index was last created/updated.
+    # Returns "none" if it doesn't exist.
+    def last_update
+      Time._load(index_time_dbm_file['last']) rescue "none"
+    end
+    # Returns the time at which the reload file was last touched.
+    # Useful to know if other processes have modified the shared index,
+    # and if the Indexer should be reloaded.
+    def reload_file_mtime
+      touch_reload_file! unless File.exists?(reload_file)
+      File.mtime(reload_file)
+    end
+    # For a given document, it retrieves the time it was last indexed, compares it to
+    # its modification time and returns false unless the file has been
+    # modified after the last indexing process.
+    def should_index_this_document?(complete_path)
+      last_itime=index_time_dbm_file[complete_path]
+      @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime)
+    end
+    def locked?
+      File.exists?(lock_file)
+    end
     private
+    def touch_reload_file!
+      FileUtils.touch(reload_file)
+      # To ensure that every process can touch reload_file, even if Picolena
+      # is launched as a special user.
+      FileUtils.chmod(0666, reload_file)
+    end
+    def reload_file
+      File.join(Picolena::MetaIndexPath,'reload')
+    end
+    def lock!
+      FileUtils.touch(lock_file)
+    end
+    def unlock!
+      FileUtils.rm(lock_file)
+      # Forces Finder.index to be reloaded.
+      touch_reload_file!
+    end
+    def lock_file
+      File.join(Picolena::MetaIndexPath,'lock')
+    end
+    def logger
+      @@logger ||= IndexerLogger.new
+    end
     # Copied from Ferret book, By David Balmain
     def index_time_dbm_file
-      @@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
+      @@dbm_file ||= DBM.open(File.join(Picolena::MetaIndexPath, 'added_at'))
     end
     def index_exists?
@@ -121,12 +186,6 @@ class Indexer
       Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
     end
-    def log(hash)
-      hash.each{|level,message|
-        IndexerLogger.send(level,message)
-      }
-    end
     def default_index_params
       {
         :path        => Picolena::IndexSavePath,

data/lib/picolena/templates/app/models/query.rb CHANGED Viewed

@@ -4,6 +4,11 @@ class Query
     def extract_from(raw_query)
       parser.parse(convert_to_english(raw_query))
     end
+    # Returns terms related to content. Useful for cache highlighting
+    def content_terms(raw_query)
+      Query.extract_from(raw_query).terms(Indexer.index.searcher).select{|term| term.field==:content}.collect{|term| term.text}.uniq
+    end
     private

data/lib/picolena/templates/app/views/documents/_document.html.haml CHANGED Viewed

@@ -3,14 +3,15 @@
   =language_icon_for(document)
   %small=number_to_percentage(document.score*100, :precision=>1)
 =highlight_matching_content(document)
-%p=link_to_containing_directory(document)
--if document.supported?
-  %p
+%p
+  =link_to_containing_directory(document)
+  %br/
+  -if document.supported?
     =link_to_plain_text_content(document)
     &#45;
-    =number_to_human_size(document.size)
-    &#45;
-    =document.pretty_date
-    &#45;
     =link_to_cached_content(document,query)
-%hr/
+    &#45;
+  =number_to_human_size(document.size)
+  &#45;
+  =document.pretty_date
+%hr/

data/lib/picolena/templates/config/environment.rb CHANGED Viewed

@@ -7,8 +7,6 @@
 # Specifies gem version of Rails to use when vendor/rails is not present
 RAILS_GEM_VERSION = '2.0.2' unless defined? RAILS_GEM_VERSION
-IndexerLogger=Logger.new($stdout)
 # Bootstrap the Rails environment, frameworks, and default configuration
 require File.join(File.dirname(__FILE__), 'boot')

data/lib/picolena/templates/config/environments/development.rb CHANGED Viewed

@@ -16,6 +16,3 @@ config.action_view.cache_template_extensions         = false
 # Don't care if the mailer can't send
 config.action_mailer.raise_delivery_errors = false
-IndexerLogger.level = Logger::DEBUG

data/lib/picolena/templates/config/environments/production.rb CHANGED Viewed

@@ -17,5 +17,3 @@ config.action_view.cache_template_loading            = true
 # Disable delivery errors, bad email addresses will be ignored
 # config.action_mailer.raise_delivery_errors = false
-IndexerLogger.level = Logger::INFO

data/lib/picolena/templates/config/environments/test.rb CHANGED Viewed

@@ -20,6 +20,3 @@ config.action_controller.allow_forgery_protection    = false
 # The :test delivery method accumulates sent emails in the
 # ActionMailer::Base.deliveries array.
 config.action_mailer.delivery_method = :test
-IndexerLogger.level = Logger::WARN

data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb CHANGED Viewed

@@ -7,4 +7,7 @@ module Picolena
   }
   IndexSavePath=File.join(IndexesSavePath,ENV["RAILS_ENV"] || "development")
+  FileUtils.mkpath IndexSavePath
+  MetaIndexPath= File.join(IndexSavePath,'meta')
+  FileUtils.mkpath MetaIndexPath
 end

data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb CHANGED Viewed

@@ -3,4 +3,4 @@ module Picolena
   YAML.load_file('config/custom/indexing_performance.yml').each_pair{|param, value|
     IndexingConfiguration[param.to_sym]= value=~/^[\d_]+$/ ? value.to_i : value
   }
-end
+end

data/lib/picolena/templates/lang/ui/de.yml CHANGED Viewed

@@ -22,5 +22,5 @@ LIKE: WIE
 filename: filename|file|datei
 filetype: erweiterung|ext
 content: inhalt
-modified: jahr|zeit|geändert
-language: lang|sprache
+modified: jahr|zeit|geändert|geaendert|geandert
+language: lang|sprache

data/lib/picolena/templates/lang/ui/en.yml CHANGED Viewed

@@ -20,7 +20,7 @@ LIKE: LIKE
 ## Fields
 filename: filename|file
-filetype: filetype|ext
+filetype: filetype|ext|extension
 content: content
 modified: year|date|modified
 language: lang|language

data/lib/picolena/templates/lang/ui/es.yml CHANGED Viewed

@@ -20,7 +20,7 @@ LIKE: COMO
 ## Fields
 filename: filename|file|archivo
-filetype: extensión|ext
+filetype: extensión|ext|extension
 content: contenido
 modified: fecha|año|anho|modificado
 language: lang|idioma

data/lib/picolena/templates/lang/ui/fr.yml CHANGED Viewed

@@ -22,5 +22,5 @@ LIKE: COMME
 filename: filename|file|fichier
 filetype: extension|ext
 content: contenu
-modified: année|date|annee|modifie
-language: lang|langue
+modified: année|date|annee|modifie|modifié
+language: lang|langue

data/lib/picolena/templates/lib/core_exts.rb CHANGED Viewed

@@ -1,20 +1,3 @@
-class MimeType
-  @@all=[]
-  def self.all
-    @@all
-  end
-  def self.add(exts,mime_name)
-    all<<new(exts,mime_name)
-  end
-  attr_reader :exts, :name
-  def initialize(exts,mime_name)
-    @exts,@name=exts,mime_name
-  end
-end
 class String
   # Creates a "probably unique" id with the desired length, composed only of lowercase letters.
   def base26_hash(length=Picolena::HashLength)
@@ -23,6 +6,9 @@ class String
 end
 module Enumerable
+  # Similar to Enumerable#each, but creates a new thread for each element.
+  # Used for the indexer to make it multi-threaded.
+  # It ensures that threads are joined together before returning.
   def each_with_thread(&block)
     tds=self.collect{|elem|
       Thread.new(elem) {|elem|
@@ -57,17 +43,31 @@ class Array
   end
 end
+class Hash
+  def add(category)
+    self[category]||={:size=>0}
+    self[category][:size]+=1
+  end
+end
 class File
+  # Returns the filetype of filename as a symbol.
+  # Returns :no_extension unless an extension is found
+  #  >> File.ext_as_sym("test.pdf")
+  #  => :pdf
+  #  >> File.ext_as_sym("test.tar.gz")
+  #  => :gz
+  #  >> File.ext_as_sym("test")
+  #  => :no_extension
   def self.ext_as_sym(filename)
     File.extname(filename).sub(/^\./,'').downcase.to_sym rescue :no_extension
   end
-  def self.mime(filename)
-    ext=ext_as_sym(filename)
-    m=MimeType.all.find{|m| m.exts.include?(ext)}
-    m ? m.name : 'application/octet-stream'
-  end
+  # Returns a probable encoding for a given plain text file
+  # If source is a html file, it parses for metadata to retrieve encoding,
+  # and uses file -i otherwise.
+  # Returns iso-8859-15 instead of iso-8859-1, to be sure € char can be
+  # encoded
   def self.encoding(source)
     parse_for_charset="grep -io charset=[a-z0-9\\-]* | sed 's/charset=//i'"
     if File.extname(source)[0,4]==".htm" then
@@ -86,9 +86,18 @@ class File
      end
   end
+  # Returns the content of a file and removes it after.
+  # Could be used to read temporary output file written by a PlainTextExtractor.
   def self.read_and_remove(filename)
     content=read(filename)
     FileUtils.rm filename, :force=>true
     content
   end
+  # Returns nil unless filename is a plain text file.
+  # It requires file command.
+  # NOTE: What to use for Win32?
+  def self.plain_text?(filename)
+    %x{file -i "#{filename}"} =~ /: text\//
+  end
 end

data/lib/picolena/templates/lib/indexer_logger.rb ADDED Viewed

@@ -0,0 +1,45 @@
+class IndexerLogger<Logger
+  def initialize
+    super($stdout)
+    #FIXME: Should be defined in config/environments/*.rb
+    levels={
+      "development"=>Logger::DEBUG,
+      "production" =>Logger::INFO,
+      "test"       =>Logger::WARN
+    }
+    @level=levels[RAILS_ENV]
+    @found_languages={}
+    @supported_filetypes={}
+    @unsupported_filetypes={}
+  end
+  def start_indexing
+    @start_time=Time.now
+    debug "Indexing every directory"
+  end
+  def add_document(document)
+    debug ["Added : #{document[:complete_path]}",document[:language] && " ("<<document[:language]<<")"].join
+    @found_languages.add(document[:language]) if document[:language]
+    @supported_filetypes.add(document[:filetype])
+  end
+  def reject_document(document, error)
+    @unsupported_filetypes.add(document[:filetype])
+    debug "Added without content (#{error.message}) : #{document[:complete_path]}"
+  end
+  def show_report
+    describe :found_languages, :supported_filetypes, :unsupported_filetypes
+    info "Time needed              : #{Time.now-@start_time} s."
+  end
+  private
+  def describe(*instance_variable_names)
+    instance_variable_names.each{|var_name|
+      hash=instance_variable_get("@#{var_name}")
+      info var_name.to_s.humanize.ljust(25)<<": "<<hash.reject{|k,v| k.blank?}.sort_by{|k,v| v[:size]}.reverse.collect{|k,v| "#{k.downcase} (#{v[:size]})"}.join(", ") unless hash.empty?
+    }
+  end
+end

data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb CHANGED Viewed

@@ -16,7 +16,6 @@ module PlainTextExtractorDSL
     @content_and_file_examples=[]
     self.instance_eval(&block)
     PlainTextExtractor.add(self)
-    MimeType.add(self.exts,self.mime_name)
   end
   def every(*exts)

data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb CHANGED Viewed

@@ -3,12 +3,12 @@ PlainTextExtractor.new {
   as "application/plain"
   aka "plain text file"
   with {|source|
+    raise "binary file" unless File.plain_text?(source)
     encoding=File.encoding(source)
-    #TODO: Return "binary file" if binary
     if encoding.empty? then
-       File.read(source)
+      File.read(source)
     else
-       %x{iconv -f #{encoding} -t utf8  "#{source}" 2>/dev/null}
+      %x{iconv -f #{encoding} -t utf8  "#{source}" 2>/dev/null}
     end
   }
   # for dependencies spec

data/lib/picolena/templates/lib/tasks/index.rake CHANGED Viewed

@@ -25,10 +25,15 @@ namespace :index do
     puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
   end
+  desc 'Returns the last time the index was created/update'
+  task :last_update => :environment do
+    puts Indexer.last_update
+  end
   # Search index with query "some query" :
   # rake index:search query="some query"
   desc 'Search index'
   task :search => :environment do
-    Finder.new(ENV["query"]).matching_documents.entries.each{|doc| puts doc.to_s}
+    puts Finder.new(ENV["query"]).matching_documents.entries.collect{|doc| doc.inspect}.join("\n"<<"#"*80<<"\n")
   end
 end

data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb CHANGED Viewed

@@ -1,8 +1,6 @@
 require File.dirname(__FILE__) + '/../spec_helper'
 describe DocumentsHelper do
-  it "shouldn't raise if matching not in content field"
   PlainTextExtractor.supported_extensions.each{|ext|
     it "should have an icon for .#{ext} filetype" do
       icon_for(ext).should_not be_nil

data/lib/picolena/templates/spec/models/basic_finder_spec.rb CHANGED Viewed

@@ -7,10 +7,13 @@ describe "Finder without index on disk" do
     @original_indexed_dirs=Picolena::IndexedDirectories.dup
     @new_index_path=File.join(Dir::tmpdir,'ferret_tst')
     Picolena::IndexSavePath.replace(@new_index_path)
+    Picolena::MetaIndexPath.replace(File.join(@new_index_path,'meta'))
+    FileUtils.mkpath Picolena::MetaIndexPath
   end
   before(:each) do
     Indexer.clear!
+    Finder.send(:class_variable_set,'@@last_reload',nil)
   end
   it "should create index" do
@@ -29,6 +32,7 @@ describe "Finder without index on disk" do
   after(:all) do
     Picolena::IndexedDirectories.replace(@original_indexed_dirs)
     Picolena::IndexSavePath.replace(@original_index_path)
+    Picolena::MetaIndexPath.replace(File.join(@original_index_path,'meta'))
   end
 end

data/lib/picolena/templates/spec/models/document_spec.rb CHANGED Viewed

@@ -78,6 +78,12 @@ describe Document do
     @valid_document.should be_supported
     Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
   end
+  it "should not be considered supported if binary" do
+    Document.new("spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION").should_not be_supported
+  end
   it "should know its language when enough content is available" do
     Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"

data/lib/picolena/templates/spec/models/finder_spec.rb CHANGED Viewed

@@ -123,7 +123,6 @@ describe Finder do
     end
   end
-  it "should not index content of binary files"
   # Ferret sometimes SEGFAULT crashed with '*.pdf' queries
   it "should not crash while looking for *.pdf" do

data/lib/picolena/templates/spec/models/indexer_spec.rb CHANGED Viewed

@@ -4,4 +4,13 @@ describe Indexer do
   it "should have at least 32MB memory allocated" do
     Indexer.index.writer.max_buffer_memory.should > 2**25-1
   end
+  it "should know the time it was updated" do
+    Indexer.should respond_to(:last_update)
+    begin
+      Indexer.last_update.should be_kind_of(Time)
+    rescue
+      Indexer.last_update.should == "none"
+    end
+  end
 end

data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb CHANGED Viewed

@@ -27,4 +27,9 @@ describe "PlainTextExtractors" do
       end
     }
   }
+  it "should not extract content of binary files" do
+    bin_file="spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION"
+    lambda{PlainTextExtractor.extract_content_from(bin_file)}.should raise_error(RuntimeError, "binary file")
+  end
 end

data/lib/picolena/templates/spec/models/query_spec.rb CHANGED Viewed

@@ -37,7 +37,21 @@ describe Query do
     }
   end
-  it "should accept field terms in different languages"
+  it "should accept field terms in different languages" do
+    Globalite.language = :en
+      english_query_with_french_words = Query.extract_from("absorption language:fr extension:pdf")
+      english_query_with_german_words = Query.extract_from("Unabhängigkeit modified:>2005 filename:job.txt")
+    Globalite.language = :de
+      Query.extract_from("absorption sprache:fr erweiterung:pdf").should == english_query_with_french_words
+      Query.extract_from("Unabhängigkeit geändert:>2005 datei:job.txt").should == english_query_with_german_words
+    Globalite.language = :fr
+      Query.extract_from("absorption langue:fr extension:pdf").should == english_query_with_french_words
+      Query.extract_from("Unabhängigkeit modifié:>2005 fichier:job.txt").should == english_query_with_german_words
+    Globalite.language = :es
+      Query.extract_from("absorption idioma:fr extensión:pdf").should == english_query_with_french_words
+      Query.extract_from("Unabhängigkeit modificado:>2005 archivo:job.txt").should == english_query_with_german_words
+  end
   it "should use AND as default boolean ops" do
     query_without_and = Query.extract_from("one AND two")
@@ -62,4 +76,14 @@ describe Query do
     Query.extract_from("test").should == Query.extract_from("tesT")
     Query.extract_from("test").should_not == Query.extract_from("tesTe")
   end
-end
+  it "should be able to extract search terms related to :content" do
+    Query.content_terms("plain text").should == %w(plain text)
+    Query.content_terms("plain text extension:pdf").should == %w(plain text)
+    Query.content_terms("plain AND text").should == %w(plain text)
+    Query.content_terms("absorption OR adsorption").should ==%w(absorption adsorption)
+    Query.content_terms("filename:plain_text").should be_empty
+    Globalite.language = :en
+    Query.content_terms("LIKE absorption").include?("adsorption").should be_true
+  end
+end

data/lib/picolena/version.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Picolena #:nodoc:
   module VERSION #:nodoc:
     MAJOR = 0
     MINOR = 1
-    TINY  = 7
+    TINY  = 8
     STRING = [MAJOR, MINOR, TINY].join('.')
   end

data/website/index.html CHANGED Viewed

@@ -33,7 +33,7 @@
     <h1>Picolena</h1>
     <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
       <p>Get Version</p>
-      <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.7</a>
+      <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.8</a>
     </div>
     <h1>&#x2192; &#8216;picolena&#8217;</h1>

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: picolena
 version: !ruby/object:Gem::Version
-  version: 0.1.7
+  version: 0.1.8
 platform: ruby
 authors:
 - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
   qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
   -----END CERTIFICATE-----
-date: 2008-04-30 00:00:00 +02:00
+date: 2008-05-08 00:00:00 +02:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -185,6 +185,7 @@ files:
 - lib/picolena/templates/lang/ui/es.yml
 - lib/picolena/templates/lang/ui/fr.yml
 - lib/picolena/templates/lib/core_exts.rb
+- lib/picolena/templates/lib/indexer_logger.rb
 - lib/picolena/templates/lib/plain_text_extractor_DSL.rb
 - lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
 - lib/picolena/templates/lib/plain_text_extractors/html.rb

metadata.gz.sig CHANGED Viewed

Binary file