RubyGems - picolena - Versions diffs - 0.1.3 → 0.1.4 - Mend

picolena 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

data/History.txt CHANGED Viewed

@@ -1,3 +1,7 @@
+== 0.1.4  2008-04-23
+* 1 minor enhancement:
+  * minimal MacOS support
 == 0.1.3  2008-04-20
 * 1 bug fix:
@@ -5,7 +9,7 @@
 == 0.1.2  2008-04-20
-* major enhancement:
+* 3 major enhancements:
   * complete Indexer & Index rewrite
   * new DSL syntax
   * multi-threaded Indexer
@@ -13,7 +17,7 @@
 == 0.1.1  2008-04-12
-* major enhancement:
+* 1 major enhancement:
   * cache à la Google
 * minor enhancements:
@@ -24,14 +28,14 @@
 == 0.1.0  2008-04-08
-* minor enhancements:
+* 3 minor enhancements:
   * can now be installed on win32 (doesn't pass every spec though)
   * moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
   * shorter and prettier base26_hash id for documents.
 == 0.0.99  2008-04-06
-* minor enhancements:
+* 2 minor enhancements:
   * more complete specs
   * mtime is now indexed and available in queries as "date:20080406"

data/lib/picolena/templates/app/models/indexer.rb CHANGED Viewed

@@ -59,7 +59,7 @@ class Indexer
         File.file?(filename) && filename !~ @@exclude
       }
-      indexing_list_chunks=indexing_list.in_transposed_chunks(threads_number)
+      indexing_list_chunks=indexing_list.in_transposed_slices(threads_number)
       indexing_list_chunks.each_with_thread{|chunk|
         chunk.each{|filename|

data/lib/picolena/templates/app/models/plain_text_extractor.rb CHANGED Viewed

@@ -13,11 +13,6 @@ class PlainTextExtractor
       all<<extractor
     end
-    # Calls block for each extractor
-    def each(&block)
-      all.each(&block)
-    end
     # Returns every required dependency for every defined extractor
     def dependencies
       @@dependencies||=all.collect{|extractor| extractor.dependencies}.flatten.compact.uniq.sort
@@ -28,13 +23,19 @@ class PlainTextExtractor
       @@supported_exts||=all.collect{|extractor| extractor.exts}.flatten.compact.uniq
     end
-    # Finds which extractor should be used for a given file, according to its extension
+    # Finds which extractor should be used for a given file.
     # Raises if the file is unsupported.
     def find_by_filename(filename)
       ext=File.ext_as_sym(filename)
-      found_extractor=all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}")
-      found_extractor.source=filename
-      found_extractor
+      returning find_by_extension(ext) do |found_extractor|
+        found_extractor.source=filename
+      end
+    end
+    # Finds which extractor should be used for a given file, according to its extension
+    # Raises if the file is unsupported.
+    def find_by_extension(ext)
+      all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for .#{ext}")
     end
     # Launches extractor on given file and outputs plain text result
@@ -42,10 +43,13 @@ class PlainTextExtractor
       find_by_filename(source).extract_content
     end
+    # Launches extractor on given file and outputs plain text result and language (if found)
     def extract_content_and_language_from(source)
       find_by_filename(source).extract_content_and_language
     end
+    # Returns which language guesser should be used by the system.
+    # Returns nil if none is found.
     def language_guesser
       @@language_guesser||=('mguesser -n1' unless IO.popen("which mguesser"){|i| i.read}.empty?)
     end
@@ -105,8 +109,12 @@ class PlainTextExtractor
   # and if probability score is higher than 90%.
   def extract_content_and_language
     content=extract_content
-    # Language recognition is too unreliable for small files.
-    return [content, nil] unless Picolena::UseLanguageRecognition && PlainTextExtractor.language_guesser && content.size > 500
+    return [content, nil] unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
+                                  Picolena::UseLanguageRecognition,
+                                  # Is a language guesser already installed?
+                                  PlainTextExtractor.language_guesser,
+                                  # Language recognition is too unreliable for small files.
+                                  content.size > 500].all?
     language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
       lang_guesser.write content
       lang_guesser.close_write

data/lib/picolena/templates/lib/core_exts.rb CHANGED Viewed

@@ -34,9 +34,25 @@ module Enumerable
 end
 class Array
-  def in_transposed_chunks(n)
-    s=self.size
-    i=n-s%n
+  # Returns a partition of n arrays.
+  # Transposition is used to avoid getting arrays that are too different.
+  #   >> (0..17).to_a.in_transposed_slices(5)
+  #   => [[0, 5, 10, 15], [1, 6, 11, 16], [2, 7, 12, 17], [3, 8, 13], [4, 9, 14]]
+  # while
+  #   >> (0..17).enum_slice(5).to_a
+  #   => [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17]]
+  #
+  # If some folders contain big files and some others contain small ones,
+  # every indexing thread will get some of both!
+  def in_transposed_slices(n)
+    # no need to compute anything if n==1
+    return [self] if n==1
+    # Array#transpose would raise if Array is not a square array of arrays.
+    i=n-self.size%n
+    # Adds nils so that size is a multiple of n,
+    # cuts array in slices of size n,
+    # transposes to get n slices,
+    # and removes added nils.
     (self+[nil]*i).enum_slice(n).to_a.transpose.collect{|e| e.compact}
   end
 end

data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb CHANGED Viewed

@@ -54,19 +54,37 @@ module PlainTextExtractorDSL
     #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
     platform=case RUBY_PLATFORM
     when /linux/
-      :on_linux
+      :linux
     when /win/
-      :on_windows
+      :windows
+    when /darwin/
+      :mac_os
     end
     @command=case command_as_hash_or_string
     when String
       command_as_hash_or_string
     when Hash
-      #dup must be used, otherwise @command gets frozen. No idea why though....
-      command_as_hash_or_string.invert[platform].dup
+      # Allows to write
+      #     with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
+      #          "some other command" => :on_windows
+      #
+      # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
+      # on windows, it returns "some other command"
+      #
+      # If commands for linux & mac os were different :
+      #     with "some command"        => :on_linux,
+      #          "another command"     => :on_mac_os,
+      #          "yet another command" => :on_windows
+      #
+      #TODO: Make it clearer and more robust.
+      #NOTE: What to do when no command is defined for a given platform?
+      command_as_hash_or_string.invert.find{|platforms,command|
+        platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(platform)
+      }.last.dup
     else
       block || raise("No command defined for this extractor: #{description}")
     end
-    @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|'))
+    # TODO, replace it with Open3 or something.
+    @command<<' 2>/dev/null' if (@command.is_a?(String) && platform.to_s=~/(linux|mac_os)/ && !@command.include?('|'))
   end
 end

data/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb CHANGED Viewed

@@ -8,6 +8,7 @@ PlainTextExtractor.new {
   every :pdf
   as "application/pdf"
   aka "Adobe Portable Document Format"
-  with "pdftotext -enc UTF-8 SOURCE -" => :on_linux, "some other command" => :on_windows
+  with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
+       "some other command" => :on_windows
   which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
 }

data/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb CHANGED Viewed

@@ -4,7 +4,8 @@ PlainTextExtractor.new {
   every :xls
   as "application/excel"
   aka "Microsoft Office Excel document"
-  with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux, "some other command" => :on_windows
+  with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
+       "some other command" => :on_windows
   which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
 }

data/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb CHANGED Viewed

@@ -4,7 +4,8 @@ PlainTextExtractor.new {
   every :ppt, :pps
   as "application/powerpoint"
   aka "Microsoft Office Powerpoint document"
-  with "catppt SOURCE" => :on_linux, "some other command" => :on_windows
+  with "catppt SOURCE" => :on_linux_and_mac_os,
+       "some other command" => :on_windows
   which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
   #FIXME: it seems that catppt cannot open .pps files.
   #or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'

data/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb CHANGED Viewed

@@ -8,6 +8,7 @@ PlainTextExtractor.new {
   every :rtf
   as "application/rtf"
   aka "Microsoft Rich Text Format"
-  with "unrtf  SOURCE -t text" => :on_linux, "some other command" => :on_windows
+  with "unrtf  SOURCE -t text" => :on_linux_and_mac_os,
+       "some other command" => :on_windows
   which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
 }

data/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb CHANGED Viewed

@@ -4,7 +4,8 @@ PlainTextExtractor.new {
   every :doc, :dot
   as "application/msword"
   aka "Microsoft Office Word document"
-  with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
+  with "antiword SOURCE" => :on_linux_and_mac_os,
+       "some other command" => :on_windows
   which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
   or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
 }

data/lib/picolena/templates/spec/controllers/application_controller_spec.rb CHANGED Viewed

@@ -1,12 +1,17 @@
 require File.dirname(__FILE__) + '/../spec_helper'
 describe ApplicationController do
-  it "should give 403 when denying access" do
+  it "should give 403 status when denying access" do
     get 'access_denied'
     response.headers["Status"].should == "403 Forbidden"
   end
-  it "should flash with a wrong request" do
+  it "should render 'Access denied' text when denying access" do
+    get 'access_denied'
+    response.body.should == 'Access denied'
+  end
+  it "should flash a warning if given a wrong request" do
     get 'unknown_request'
     response.should be_redirect
     response.should redirect_to(documents_url)

data/lib/picolena/templates/spec/models/finder_spec.rb CHANGED Viewed

@@ -90,6 +90,7 @@ describe Finder do
   end
   it "should find documents according to their modification date" do
+    #FIXME: Specs don't pass in /tmp folder. They do pass in any other directory, though. Why???
     Finder.new("date:<1982").matching_documents.should be_empty
     Finder.new("19831209*").matching_document.basename.should == "office2003-word-template"
     Finder.new("date:<1983").matching_document.filename.should == "basic.pdf"

data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb CHANGED Viewed

@@ -5,7 +5,7 @@ describe "PlainTextExtractors" do
     IndexReader.ensure_existence
   end
-  PlainTextExtractor.each{|extractor|
+  PlainTextExtractor.all.each{|extractor|
     extractor.exts.each{|ext|
       should_extract= "should be able to extract content from #{extractor.description} (.#{ext})"
       content_and_file_examples_for_this_ext=extractor.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext}

data/lib/picolena/version.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Picolena #:nodoc:
   module VERSION #:nodoc:
     MAJOR = 0
     MINOR = 1
-    TINY  = 3
+    TINY  = 4
     STRING = [MAJOR, MINOR, TINY].join('.')
   end

data/website/index.html CHANGED Viewed

@@ -33,7 +33,7 @@
     <h1>Picolena</h1>
     <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
       <p>Get Version</p>
-      <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.3</a>
+      <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.4</a>
     </div>
     <h1>&#x2192; &#8216;picolena&#8217;</h1>

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: picolena
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
   qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
   -----END CERTIFICATE-----
-date: 2008-04-20 00:00:00 +02:00
+date: 2008-04-23 00:00:00 +02:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency

metadata.gz.sig CHANGED Viewed

Binary file