RubyGems - picolena - Versions diffs - 0.1.6 → 0.1.7 - Mend

picolena 0.1.6 → 0.1.7

Files changed (36) hide show

data/History.txt +12 -3
data/Manifest.txt +2 -0
data/bin/picolena +1 -1
data/config/files_to_clean +1 -0
data/lib/picolena/config/basic.rb +6 -2
data/lib/picolena/config/indexing_performance.yml +30 -0
data/lib/picolena/picolena_generator.rb +9 -4
data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -1
data/lib/picolena/templates/app/helpers/documents_helper.rb +18 -9
data/lib/picolena/templates/app/models/document.rb +20 -3
data/lib/picolena/templates/app/models/finder.rb +19 -19
data/lib/picolena/templates/app/models/indexer.rb +36 -9
data/lib/picolena/templates/app/views/documents/_document.html.haml +7 -2
data/lib/picolena/templates/app/views/documents/cached.html.haml +2 -2
data/lib/picolena/templates/app/views/documents/show.html.haml +5 -2
data/lib/picolena/templates/config/environment.rb +1 -1
data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +6 -0
data/lib/picolena/templates/lib/tasks/index.rake +6 -1
data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
data/lib/picolena/templates/public/stylesheets/style.css +17 -1
data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -4
data/lib/picolena/templates/spec/models/document_spec.rb +65 -16
data/lib/picolena/templates/spec/models/finder_spec.rb +4 -3
data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +8 -0
data/lib/picolena/templates/spec/models/indexer_spec.rb +2 -2
data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +0 -12
data/lib/picolena/templates/spec/models/query_spec.rb +10 -1
data/lib/picolena/version.rb +1 -1
data/website/index.html +1 -1
data/website/index.txt +0 -0
data/website/index_devjavu +0 -0
data/website/javascripts/rounded_corners_lite.inc.js +0 -0
data/website/stylesheets/screen.css +0 -0
data.tar.gz.sig +0 -0
metadata +4 -2
metadata.gz.sig +3 -1

data/History.txt CHANGED Viewed

@@ -1,10 +1,19 @@
+== 0.1.7  2008-04-30
+* 5 minor enhancements:
+  * added cache highlighting à la Google
+  * rake index:update implemented as described in Ferret book by David Balmain
+  * rake index:prune removes missing files from indexer
+  * possibility to sort results by relevance / by date
+  * one configuration file for performance tweaks
 == 0.1.6  2008-04-25
 * 1 minor enhancement:
   * replaced index key by Document#probably_unique_id
 * bug fixes:
-  * Added forgotten public/images/flags to generator file.
+  * Added forgotten public/images/flags to generator file
 == 0.1.5  2008-04-25
@@ -24,7 +33,7 @@
 == 0.1.3  2008-04-20
 * 1 bug fix:
-  * removed verbose debug info.
+  * removed verbose debug info
 == 0.1.2  2008-04-20
@@ -49,7 +58,7 @@
 * 3 minor enhancements:
   * can now be installed on win32 (doesn't pass every spec though)
   * moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
-  * shorter and prettier base26_hash id for documents.
+  * shorter and prettier base26_hash id for documents
 == 0.0.99  2008-04-06

data/Manifest.txt CHANGED Viewed

@@ -11,6 +11,7 @@ lib/picolena/USAGE
 lib/picolena/config/basic.rb
 lib/picolena/config/icons_and_filetypes.yml
 lib/picolena/config/indexed_directories.yml
+lib/picolena/config/indexing_performance.yml
 lib/picolena/config/title_and_names_and_links.yml
 lib/picolena/config/white_list_ip.yml
 lib/picolena/picolena_generator.rb
@@ -42,6 +43,7 @@ lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
 lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
 lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
 lib/picolena/templates/config/initializers/006_load_icons.rb
+lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
 lib/picolena/templates/config/routes.rb
 lib/picolena/templates/lang/ui/de.yml
 lib/picolena/templates/lang/ui/en.yml

data/bin/picolena CHANGED Viewed

@@ -11,7 +11,7 @@ if %w(-v --version).include? ARGV.first
   exit(0)
 end
-action= ARGV.include?("--spec-only") ? "testing" : "installing"
+action= ARGV.any?{|opt| opt[0,6]=="--spec"} ? "testing" : "installing"
 require 'rubigen/scripts/generate'
 source = RubiGen::PathSource.new(:application,

data/config/files_to_clean CHANGED Viewed

@@ -5,6 +5,7 @@ lib/picolena/templates/config/custom/indexed_directories.yml
 lib/picolena/templates/config/custom/white_list_ip.yml
 lib/picolena/templates/config/custom/title_and_names_and_links.yml
 lib/picolena/templates/config/custom/icons_and_filetypes.yml
+lib/picolena/templates/config/custom/indexing_performance.yml
 lib/picolena/templates/log
 lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
 lib/picolena/templates/tmp

data/lib/picolena/config/basic.rb CHANGED Viewed

@@ -42,5 +42,9 @@ module Picolena
   # Specify the default Levenshtein distance when using FuzzyQuery
   # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
   Ferret::Search::FuzzyQuery.default_min_similarity=0.6
-  Analyzer=Ferret::Analysis::StandardAnalyzer.new
-end
+  # PerFieldAnalyzer is used to prevent queries like "language:it" from being broken by StopFilter.
+  per_field_analyzer=Ferret::Analysis::PerFieldAnalyzer.new(Ferret::Analysis::StandardAnalyzer.new)
+  per_field_analyzer[:language]=Ferret::Analysis::WhiteSpaceAnalyzer.new
+  Analyzer=per_field_analyzer
+end

data/lib/picolena/config/indexing_performance.yml ADDED Viewed

@@ -0,0 +1,30 @@
+# You probably shouldn't change these parameters
+# if you don't know what they represent.
+# For more information, refer to:
+#  http://ferret.davebalmain.com/api/classes/Ferret/Index/IndexWriter.html
+## Main performance parameters
+# Allowed memory for indexing process.
+# 128MB by default, or 2^27
+max_buffer_memory: 134_217_728
+# High value => fast indexing, slow searching
+# Low  value => slow indexing, fast searching
+# 10 by default
+merge_factor: 10
+# Maximum number of extracted terms for any given document
+max_field_length: 10_000
+## Other parameters
+# 1MB by default, or 2**20
+chunk_size: 1_048_576
+max_buffered_docs: 10_000
+# NOTE: Be extra careful with this parameter: setting it to -1 (infinite)
+# multiplied indexing time by an order of magnitude.
+# max_merge_docs: -1
+use_compound_file: true
+index_skip_interval: 128
+doc_skip_interval: 16

data/lib/picolena/picolena_generator.rb CHANGED Viewed

@@ -16,10 +16,14 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
     usage if args.empty? and !options[:spec_only]
     @destination_root = options[:destination]
-    @directories_to_index=ARGV.collect{|relative_path|
-      abs_dir=Pathname.new(relative_path).realpath.to_s
-      "\"#{abs_dir}\" : \"#{abs_dir}\""
-    }.join("\n  ")
+    @directories_to_index=if options[:spec_only] then
+       "/whatever : /whatever"
+    else
+      ARGV.collect{|relative_path|
+        abs_dir=Pathname.new(relative_path).realpath.to_s
+        "\"#{abs_dir}\" : \"#{abs_dir}\""
+      }.join("\n  ")
+    end
     extract_options
   end
@@ -63,6 +67,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
       m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
       m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
       m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
+      m.file '../config/indexing_performance.yml', 'config/custom/indexing_performance.yml'
       # README, License & Rakefile
       m.file 'MIT-LICENSE', 'LICENSE'

data/lib/picolena/templates/app/controllers/documents_controller.rb CHANGED Viewed

@@ -22,8 +22,9 @@ class DocumentsController < ApplicationController
   def show
     start=Time.now
       @query=[params[:id],params.delete(:format)].compact.join('.')
+      @sort=params[:sort]
       page=params[:page]||1
-      finder=Finder.new(@query,page)
+      finder=Finder.new(@query,@sort,page)
       finder.execute!
       pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
         finder.matching_documents
@@ -47,6 +48,7 @@ class DocumentsController < ApplicationController
   # Returns the content of the document identified by probably_unique_id, as it was at the time it was indexed.
   # similar to Google cache.
   def cached
+    @query=[params[:query],params.delete(:format)].compact.join('.')
   end
   private

data/lib/picolena/templates/app/helpers/documents_helper.rb CHANGED Viewed

@@ -3,15 +3,15 @@ module DocumentsHelper
   def nothing_found?
     @matching_documents.nil? or @matching_documents.entries.empty?
   end
   # Very basic pagination.
   # Provides links to Next, Prev and FirstPage when needed.
-  def should_paginate(page,query)
-    [(link_to("&larr;&larr;", :action => :show, :id => query, :page => 1) if page.number>2),
-     (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
-     (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
+  def should_paginate(page,query, sort)
+    [(link_to("&larr;&larr;", :action => :show, :id => query, :sort=>sort) if page.number>2),
+     (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort=>sort) if page.prev?),
+     (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort=>sort) if page.next?)].compact.join(" | ")
   end
   # Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
   # "Résultats 1-2 parmi 2 pour whatever (0.012s)"
   def describe_results(page, total_hits, dt, query)
@@ -24,7 +24,7 @@ module DocumentsHelper
     show_time_needed(dt)
     ].join(' ')
   end
   # Returns the time needed to treat the query and launch the search, with a ms precision : (0.472s)
   def show_time_needed(dt)
     content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
@@ -71,8 +71,17 @@ module DocumentsHelper
   end
   # For any indexed document, returns a link to show its cached content.
-  def link_to_cached_content(document)
+  def link_to_cached_content(document, query)
     link_name="("<<content_tag(:small,:cached.l)<<")"
-    link_to link_name, cached_document_path(document.probably_unique_id)
+    link_to link_name, cached_document_path(:id => document.probably_unique_id, :query => query)
+  end
+  def highlighted_cache(document, query)
+    h(document.highlighted_cache(query)).gsub(/\n/,'<br/>').gsub(/&lt;&lt;(.*?)&gt;&gt;/,content_tag(:span, '\1', :class=>"matching_content"))
+  end
+  def sort_by_date_or_relevance(query)
+    [link_to_unless_current('By date', document_path(query, :sort=>'by_date')),
+     link_to_unless_current('By relevance', document_path(query))].join("&nbsp;")
   end
 end

data/lib/picolena/templates/app/models/document.rb CHANGED Viewed

@@ -11,7 +11,7 @@ class Document
   end
   #Delegating properties to File::method_name(complete_path)
-  [:dirname, :basename, :extname, :ext_as_sym, :file?, :ext_as_sym].each{|method_name|
+  [:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
     define_method(method_name){File.send(method_name,complete_path)}
   }
   alias_method :filename, :basename
@@ -63,11 +63,22 @@ class Document
   def cached
     from_index[:content]
   end
+  def highlighted_cache(raw_query)
+    #TODO: Report to Ferret. Highlight should accept :key and not only :doc_id.
+    Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
+                            :field => :content, :excerpt_length => :all,
+                            :pre_tag => "<<", :post_tag => ">>"
+    ).first
+  end
-  # FIXME: Not just date anymore.
   # Returns the last modification date before the document got indexed.
   # Useful to know how old a document is, and to which version the cache corresponds.
-  def date
+  def pretty_date
+    from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})\d{6}/,'\1-\2-\3')
+  end
+  def pretty_mtime
     from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
   end
@@ -93,6 +104,12 @@ class Document
   end
   private
+  # FIXME: Is there a way to easily retrieve doc_id for a given document?
+  # Better yet, fix Index#highlight to accept :probably_unique_id and stop using :doc_id.
+  def doc_id
+    Indexer.index.search(Ferret::Search::TermQuery.new(:probably_unique_id,probably_unique_id)).hits.first.doc
+  end
   # Retrieves the document from the index.
   # Useful to get meta-info about it.

data/lib/picolena/templates/app/models/finder.rb CHANGED Viewed

@@ -5,36 +5,34 @@ class Finder
     @@index ||= Indexer.index
   end
-  def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
+  def initialize(raw_query,by_date=false, page=1,results_per_page=Picolena::ResultsPerPage)
     @query = Query.extract_from(raw_query)
     @raw_query= raw_query
     Indexer.ensure_index_existence
     @per_page=results_per_page
     @offset=(page.to_i-1)*results_per_page
+    @by_date=by_date
     index_should_have_documents
   end
   def execute!
     @matching_documents=[]
     start=Time.now
-    top_docs=index.search(query, :limit => @per_page, :offset=>@offset)
-    top_docs.hits.each{|hit|
-      index_id,score=hit.doc,hit.score
-      begin
-        found_doc=Document.new(index[index_id][:complete_path])
-        found_doc.matching_content=index.highlight(query, index_id,
-                                                   :field => :content, :excerpt_length => 80,
-                                                   :pre_tag => "<<", :post_tag => ">>"
-        ) unless @raw_query=~/^\*+\.\w*$/
-        found_doc.score=score
-        @matching_documents<<found_doc
-      rescue Errno::ENOENT
-        #"File has been moved/deleted!"
-      end
+      @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @by_date)){|index_id, score|
+        begin
+          found_doc=Document.new(index[index_id][:complete_path])
+          found_doc.matching_content=index.highlight(query, index_id,
+                                                     :field => :content, :excerpt_length => 80,
+                                                     :pre_tag => "<<", :post_tag => ">>"
+          )
+          found_doc.score=score
+          @matching_documents<<found_doc
+        rescue Errno::ENOENT
+          #"File has been moved/deleted!"
+        end
       }
       @executed=true
-      @time_needed=Time.now-start
-      @total_hits=top_docs.total_hits
+    @time_needed=Time.now-start
   end
   # Returns true if it has been executed.
@@ -54,13 +52,15 @@ class Finder
     }
   }
   def self.reload!
     @@index = nil
   end
   private
+  def sort_by_date
+    Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
+  end
   def index_should_have_documents
     raise IndexError, "no document found" unless index.size > 0

data/lib/picolena/templates/app/models/indexer.rb CHANGED Viewed

@@ -10,6 +10,7 @@ class Indexer
     def index_every_directory(remove_first=false)
       @@do_not_disturb_while_indexing=true
       clear! if remove_first
+      @from_scratch = remove_first
       # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
       Finder.reload!
       log :debug => "Indexing every directory"
@@ -35,13 +36,19 @@ class Indexer
       prepare_multi_threads_environment
       indexing_list_chunks.each_with_thread{|chunk|
-        chunk.each{|filename|
-          add_file(filename)
+        chunk.each{|complete_path|
+          last_itime=index_time_dbm_file[complete_path]
+          if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
+            add_or_update_file(complete_path)
+          else
+            log :debug => "Identical : #{complete_path}"
+          end
+          index_time_dbm_file[complete_path] = Time.now._dump
         }
       }
     end
-    def add_file(complete_path)
+    def add_or_update_file(complete_path)
       default_fields = Document.default_fields_for(complete_path)
       begin
         document = PlainTextExtractor.extract_content_and_language_from(complete_path)
@@ -69,6 +76,19 @@ class Indexer
       # Ferret will SEGFAULT otherwise.
       @@index = nil
     end
+    # Checks for indexed files that are missing from filesystem
+    # and removes them from index & dbm file.
+    def prune_index
+      missing_files=index_time_dbm_file.reject{|filename,itime| File.exists?(filename) && Picolena::IndexedDirectories.any?{|dir,alias_path| filename.starts_with?(dir)}}
+      missing_files.each{|filename, itime|
+        index.writer.delete(:complete_path, filename)
+        index_time_dbm_file.delete(filename)
+        log :debug => "Removed : #{filename}"
+      }
+      index.optimize
+    end
     # Only one IndexWriter should be instantiated.
     # If one index already exists, returns it.
@@ -81,11 +101,17 @@ class Indexer
       index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
     end
-    def doc_count
-      index.writer.doc_count
+    # Returns how many files are indexed.
+    def size
+      index.size
     end
     private
+    # Copied from Ferret book, By David Balmain
+    def index_time_dbm_file
+      @@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
+    end
     def index_exists?
       index_filename and File.exists?(index_filename)
@@ -108,7 +134,7 @@ class Indexer
         :field_infos => default_field_infos,
         # Great way to ensure that no file is indexed twice!
         :key         => :probably_unique_id
-        }
+        }.merge Picolena::IndexingConfiguration
     end
     def default_field_infos
@@ -120,7 +146,7 @@ class Indexer
         field_infos.add_field(:filetype,           :store => :no,  :index => :yes, :boost => 1.5)
         field_infos.add_field(:modified,           :store => :yes, :index => :untokenized)
         field_infos.add_field(:probably_unique_id, :store => :no,  :index => :untokenized)
-        field_infos.add_field(:language,           :store => :yes, :index => :yes)
+        field_infos.add_field(:language,           :store => :yes, :index => :untokenized)
       end
     end
@@ -130,7 +156,8 @@ class Indexer
       # an IndexWriter at the same time, and get a
       #  Ferret::Store::Lock::LockError
       index
-      # NOTE: is it really necessary?
+      # Opens dbm file to dump indexing time.
+      index_time_dbm_file
       # ActiveSupport sometimes raises
       #  Expected Object is NOT missing constant
       # without.
@@ -140,4 +167,4 @@ class Indexer
       PlainTextExtractor
     end
   end
-end
+end

data/lib/picolena/templates/app/views/documents/_document.html.haml CHANGED Viewed

@@ -7,5 +7,10 @@
 -if document.supported?
   %p
     =link_to_plain_text_content(document)
-    =link_to_cached_content(document)
-%hr/
+    &#45;
+    =number_to_human_size(document.size)
+    &#45;
+    =document.pretty_date
+    &#45;
+    =link_to_cached_content(document,query)
+%hr/

data/lib/picolena/templates/app/views/documents/cached.html.haml CHANGED Viewed

@@ -2,7 +2,7 @@
   =link_to icon_and_filename_for(@document), download_document_path(@probably_unique_id)
   (
   =:as_it_was_indexed_on.l
-  =@document.date
+  =@document.pretty_date
   )
 %p=link_to_containing_directory(@document)
-%blockquote=h(@document.cached).gsub(/\n/,'<br/>')
+%blockquote=highlighted_cache(@document, @query)

data/lib/picolena/templates/app/views/documents/show.html.haml CHANGED Viewed

@@ -7,6 +7,9 @@
         %strong=h(@query)
         =show_time_needed(@time_needed)
       -else
-        %span{:class=>'pagination'}=should_paginate(@matching_documents, @query)
+        %span{:class=>'pagination'}=should_paginate(@matching_documents, @query, @sort)
         =describe_results(@matching_documents, @total_hits, @time_needed, h(@query))
-= render :partial =>'document', :collection => @matching_documents
+-unless nothing_found?
+  %p
+    %span{:class=>'sort_by'}=sort_by_date_or_relevance(@query)
+= render :partial =>'document', :collection => @matching_documents, :locals => { :query => @query}

data/lib/picolena/templates/config/environment.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-%w(rubygems paginator fileutils pathname logger thread).each{|lib| require lib}
+%w(rubygems paginator fileutils pathname logger thread dbm).each{|lib| require lib}
 # Uncomment below to force Rails into production mode when
 # you don't control web/app server and can't set it the proper way

data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb ADDED Viewed

@@ -0,0 +1,6 @@
+module Picolena
+  IndexingConfiguration={}
+  YAML.load_file('config/custom/indexing_performance.yml').each_pair{|param, value|
+    IndexingConfiguration[param.to_sym]= value=~/^[\d_]+$/ ? value.to_i : value
+  }
+end

data/lib/picolena/templates/lib/tasks/index.rake CHANGED Viewed

@@ -14,10 +14,15 @@ namespace :index do
   task :update => :environment do
     Indexer.index_every_directory
   end
+  desc 'Remove unneeded files from index'
+  task :prune => :environment do
+    Indexer.prune_index
+  end
   desc 'Returns the number of indexed documents'
   task :size => :environment do
-    puts "#{Indexer.doc_count} documents are currently indexed in #{Picolena::IndexSavePath}"
+    puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
   end
   # Search index with query "some query" :

data/lib/picolena/templates/lib/tasks/install_dependencies.rake CHANGED Viewed

@@ -30,7 +30,7 @@ namespace :install_dependencies do
   task :deb_packages do
     root_privileges_required!
     #TODO: Should load this list from defined PlainTextExtractor's
-    packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser}.join(" ")
+    packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser libdbm-ruby1.8}.join(" ")
     puts "Installing "<<packages
     system("apt-get install "<<packages)
   end

data/lib/picolena/templates/public/stylesheets/style.css CHANGED Viewed

@@ -82,6 +82,17 @@ h1, h2, h3, h4, h5, h6, p, form {
 	text-decoration:none;
 }
+.sort_by {
+	float:right;
+	font-size: 13px;
+	color:#000;
+}
+.sort_by a{
+	color: #EE8907;
+	text-decoration:none;
+}
 #mainimg input.btn{
    margin-right: 10px;
    height: 20px;
@@ -116,7 +127,7 @@ width: 80%;
 #results {
 	width:778px;
-	padding-top: 25px;
+	padding-top: 15px;
 }
 #results h2 a{
@@ -137,6 +148,11 @@ width: 80%;
 	padding:0px 20px;
 }
+#results .matching_content{
+	background-color:#ffff66;
+}
 #results a, #results small{
 	font-family:"Trebuchet MS";
 	font-size:11px;

data/lib/picolena/templates/spec/models/basic_finder_spec.rb CHANGED Viewed

@@ -50,16 +50,16 @@ describe "Basic Finder" do
     Indexer.index_every_directory(remove_first=true)
   end
-  it "should accept one parameter as query, and 2 optionals for paginating" do
+  it "should accept one parameter as query, 1 optional for sorting results and 2 optionals for paginating" do
     lambda {Finder.new}.should raise_error(ArgumentError, "wrong number of arguments (0 for 1)")
     # show first page with 10 results per page
     lambda {Finder.new("a b")}.should_not raise_error
     # show first page, sorted by date
-    lambda {Finder.new("a", 2)}.should_not raise_error
+    lambda {Finder.new("a", "by_date")}.should_not raise_error
     # show first page with 15 results
-    lambda {Finder.new("a", 1, 15)}.should_not raise_error
+    lambda {Finder.new("a",  "by_date", 1, 15)}.should_not raise_error
     # Too many parameters
-    lambda {Finder.new("a", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (4 for 3)")
+    lambda {Finder.new("a",  "by_date", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (5 for 4)")
   end
   it "should return matching documents if executed successfully" do

data/lib/picolena/templates/spec/models/document_spec.rb CHANGED Viewed

@@ -5,28 +5,30 @@ basic_pdf_attribute={
   :basename=>'basic',
   :complete_path=>File.join(RAILS_ROOT, '/spec/test_dirs/indexed/basic/basic.pdf'),
   :extname=>'.pdf',
-  :filename=>'basic.pdf'
+  :ext_as_sym => :pdf,
+  :filename=>'basic.pdf',
+  :size => 9380
 }
 describe Document do
   before(:each) do
-    @valid_random_doc=Document.find(:random) rescue Document.new("spec/test_dirs/indexed/basic/basic.pdf")
+    @valid_document=Document.new("spec/test_dirs/indexed/basic/basic.pdf")
   end
   it "should be an existing file" do
     lambda {Document.new("/patapouf.txt")}.should raise_error(Errno::ENOENT)
-    lambda {@valid_random_doc}.should_not raise_error
+    lambda {@valid_document}.should_not raise_error
     lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should_not raise_error(Errno::ENOENT)
   end
   it "should belong to an indexed directory" do
-    lambda {@valid_random_doc}.should_not raise_error
+    lambda {@valid_document}.should_not raise_error
     lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should raise_error(ArgumentError, "required document is not in indexed directory")
   end
   basic_pdf_attribute.each{|attribute,expected_value|
     it "should know its #{attribute}" do
-      @valid_random_doc.should respond_to(attribute)
+      @valid_document.should respond_to(attribute)
       @basic_pdf=Document.new('spec/test_dirs/indexed/basic/basic.pdf')
       @basic_pdf.send(attribute).should == expected_value
     end
@@ -36,23 +38,70 @@ describe Document do
     another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
     another_doc.content.should == "just a content test\nin a txt file"
   end
+  it "should know its cached content" do
+    another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
+    another_doc.cached.should == "just a content test\nin a txt file"
+  end
+  it "should know its highlighted cached content for a given query" do
+    another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
+    another_doc.highlighted_cache('a content test').should == "just a <<content>> <<test>>\nin a txt file"
+  end
   it "should know its alias_path" do
-    @valid_random_doc.should respond_to(:alias_path)
-    @valid_random_doc.alias_path.starts_with?("http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed").should be_true
+    @valid_document.should respond_to(:alias_path)
+    @valid_document.alias_path.starts_with?("http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed").should be_true
+  end
+  it "should know its probably_unique_id" do
+    @valid_document.should respond_to(:probably_unique_id)
+    @valid_document.probably_unique_id.should =~/^[a-z]+$/
+    @valid_document.probably_unique_id.size.should == Picolena::HashLength
   end
+  it "should know its modification date" do
+    @valid_document.pretty_date.class.should == String
+    @valid_document.pretty_date.should =~/^\d{4}\-\d{2}\-\d{2}$/
+  end
+  it "should know its modification time and returns it in a pretty way" do
+    @valid_document.should respond_to(:mtime)
+    @valid_document.mtime.should be_kind_of(Integer)
+    @valid_document.should respond_to(:pretty_mtime)
+    @valid_document.pretty_mtime.class.should == String
+    @valid_document.pretty_mtime.should =~/^\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}$/
+  end
+  it "should know if its content can be extracted" do
+    @valid_document.should respond_to(:supported?)
+    @valid_document.should be_supported
+    Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
+  end
+  it "should know its language when enough content is available" do
+    Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
+    Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
+    Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
+    Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
+  end if Picolena::UseLanguageRecognition
+  it "should not try to guess language when file is too small" do
+    Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
+    Document.new("spec/test_dirs/indexed/README").language.should be_nil
+  end if Picolena::UseLanguageRecognition
   it "should let finder specify its score" do
-    @valid_random_doc.should respond_to(:score)
-    @valid_random_doc.score.should be_nil
-    @valid_random_doc.score=25
-    @valid_random_doc.score.should == 25
+    @valid_document.should respond_to(:score)
+    @valid_document.score.should be_nil
+    @valid_document.score=25
+    @valid_document.score.should == 25
   end
   it "should let finder specify its matching content" do
-    @valid_random_doc.should respond_to(:matching_content)
-    @valid_random_doc.matching_content.should be_nil
-    @valid_random_doc.matching_content=["thermal cooling", "heat driven cooling"]
-    @valid_random_doc.matching_content.should include("thermal cooling")
+    @valid_document.should respond_to(:matching_content)
+    @valid_document.matching_content.should be_nil
+    @valid_document.matching_content=["thermal cooling", "heat driven cooling"]
+    @valid_document.matching_content.should include("thermal cooling")
   end
-end
+end

data/lib/picolena/templates/spec/models/finder_spec.rb CHANGED Viewed

@@ -8,9 +8,9 @@ end
 def matching_document_for(query)
-   # Returns matching document for any given query only if
-   # exactly one document is found.
-   # Specs don't pass otherwise.
+  # Returns matching document for any given query only if
+  # exactly one document is found.
+  # Specs don't pass otherwise.
   matching_documents=Finder.new(query).matching_documents
   matching_documents.size.should == 1
   matching_documents.first
@@ -19,6 +19,7 @@ end
 describe Finder do
   before(:all) do
+    Globalite.language = :en
     # SVN doesn't like non-ascii filenames.
     revert_changes!('spec/test_dirs/indexed/others/bäñüßé.txt',"just to know if files are indexed with utf8 filenames")

data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb CHANGED Viewed

@@ -13,10 +13,18 @@ describe "Host indexing system" do
   it "should know which IP addresses are allowed (config/custom/white_list_ip.yml)" do
     File.should be_readable('config/custom/white_list_ip.yml')
+    ip_conf=YAML.load_file('config/custom/white_list_ip.yml')
+    ip_conf.class.should == Hash
+    ip_conf['Allow'].should_not be_nil
   end
   it "should know which directories are to be indexed (config/custom/indexed_directories.yml)" do
     File.should be_readable('config/custom/indexed_directories.yml')
+    dirs_conf=YAML.load_file('config/custom/indexed_directories.yml')
+    dirs_conf.class.should == Hash
+    %w(development test production).all?{|env|
+      dirs_conf[env].should_not be_nil
+    }
   end
   it "should be able to calculate base26 hash from strings" do

data/lib/picolena/templates/spec/models/indexer_spec.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 require File.dirname(__FILE__) + '/../spec_helper'
 describe Indexer do
-  before(:each) do
-    @indexer = Indexer.new
+  it "should have at least 32MB memory allocated" do
+    Indexer.index.writer.max_buffer_memory.should > 2**25-1
   end
 end

data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb CHANGED Viewed

@@ -27,16 +27,4 @@ describe "PlainTextExtractors" do
       end
     }
   }
-  it "should guess language when enough content is available" do
-    Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
-    Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
-    Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
-    Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
-  end if Picolena::UseLanguageRecognition
-  it "should not try to guess language when file is too small" do
-    Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
-    Document.new("spec/test_dirs/indexed/README").language.should be_nil
-  end if Picolena::UseLanguageRecognition
 end

data/lib/picolena/templates/spec/models/query_spec.rb CHANGED Viewed

@@ -1,8 +1,16 @@
 require File.dirname(__FILE__) + '/../spec_helper'
 describe Query do
-  it "should return a BooleanQuery" do
+  it "should return a BooleanQuery, a TermQuery or a RangeQuery" do
     Query.extract_from("whatever").class.should == Ferret::Search::BooleanQuery
+    Query.extract_from("lang:de").class.should  == Ferret::Search::TermQuery
+    Query.extract_from("date:<1990").class.should  == Ferret::Search::RangeQuery
+  end
+  it "should not remove stop-words from TermQuery" do
+    # it means "Italian language", but also is a stop-word.
+    Query.extract_from("lang:it").class.should  == Ferret::Search::TermQuery
+    Query.extract_from("lang:it").to_s.should   == "language:it"
   end
   it "should translate LIKE, NOT, OR and AND boolean ops to English" do
@@ -12,6 +20,7 @@ describe Query do
       :fr=>["COMME","NON","OU","ET"]
     }
+    Globalite.language = :en
     english_query_with_like_and_not=Query.extract_from("LIKE something NOT something")
     english_query_with_or=Query.extract_from("test OR another")
     english_query_with_and=Query.extract_from("test AND another")

data/lib/picolena/version.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Picolena #:nodoc:
   module VERSION #:nodoc:
     MAJOR = 0
     MINOR = 1
-    TINY  = 6
+    TINY  = 7
     STRING = [MAJOR, MINOR, TINY].join('.')
   end

data/website/index.html CHANGED Viewed

@@ -33,7 +33,7 @@
     <h1>Picolena</h1>
     <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
       <p>Get Version</p>
-      <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.6</a>
+      <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.7</a>
     </div>
     <h1>&#x2192; &#8216;picolena&#8217;</h1>

data/website/index.txt CHANGED Viewed

File without changes

data/website/index_devjavu CHANGED Viewed

File without changes

data/website/javascripts/rounded_corners_lite.inc.js CHANGED Viewed

File without changes

data/website/stylesheets/screen.css CHANGED Viewed

File without changes

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: picolena
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
   qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
   -----END CERTIFICATE-----
-date: 2008-04-25 00:00:00 +02:00
+date: 2008-04-30 00:00:00 +02:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -146,6 +146,7 @@ files:
 - lib/picolena/config/basic.rb
 - lib/picolena/config/icons_and_filetypes.yml
 - lib/picolena/config/indexed_directories.yml
+- lib/picolena/config/indexing_performance.yml
 - lib/picolena/config/title_and_names_and_links.yml
 - lib/picolena/config/white_list_ip.yml
 - lib/picolena/picolena_generator.rb
@@ -177,6 +178,7 @@ files:
 - lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
 - lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
 - lib/picolena/templates/config/initializers/006_load_icons.rb
+- lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
 - lib/picolena/templates/config/routes.rb
 - lib/picolena/templates/lang/ui/de.yml
 - lib/picolena/templates/lang/ui/en.yml

metadata.gz.sig CHANGED Viewed

@@ -1 +1,2 @@
-��"#m��EZY����v��>��lmLWA�ft�-�����<d�B��w]��T�7��ꞅ�tR-i��X�W��%2e�8f�]b�M�<IAF�ͯ7]krz�)w���
+�;����U�=nƷ�8߿X�`>����B����2Ħ@,u!��~�u�9>�Ӽq�J1� ֖i�T������-.q�^l*�`�>"��m�8��ɏP�cWk��y%����W�:r=&����CtaO;c
+.&��}�e)�g(O�)0ة)!����s�
+�"��Fm��>8���n���q�?I�P'����`|����`�\�>{\a4�Ӷ�JǮ}�&�?�d�UM{