RubyGems - picolena - Versions diffs - 0.2.0 → 0.2.2 - Mend

picolena 0.2.0 → 0.2.2

Files changed (70) hide show

data/lib/picolena/templates/config/initializers/{006_load_icons.rb → 007_load_icons.rb} RENAMED

File without changes

data/lib/picolena/templates/config/initializers/{007_load_performance_tweaks.rb → 008_load_performance_tweaks.rb} RENAMED

File without changes

data/lib/picolena/templates/lib/core_exts.rb CHANGED

@@ -3,6 +3,15 @@ class String
   def base26_hash(length=Picolena::HashLength)
     Digest::MD5.hexdigest(self).to_i(16).to_s(26).tr('0-9a-p', 'a-z')[-length,length]
   end
+  # Returns true iff self is an available command on the system
+  # >> "grep".installed?
+  # => true
+  # >> "sdfgsdfgsdf".installed?
+  # => false
+  def installed?
+     !IO.popen("which #{self}"){|i| i.read}.empty?
+  end
 end
 module Enumerable
@@ -100,4 +109,47 @@ class File
   def self.plain_text?(filename)
     %x{file -i "#{filename}"} =~ /: text\//
   end
+  # For a given file, returns the path at which a thumbnail should be saved
+  def self.thumbnail_path(filename, public_dir=false)
+    thumb=expand_path(filename).base26_hash+'.jpg'
+    public_dir ? File.join('thumbnails', thumb) : File.join(RAILS_ROOT,  'public/images/thumbnails', thumb)
+  end
+end
+class Object
+  # [1,2,3].is_an?(Array) just looks better than [1,2,3].is_a?(Array)
+  alias_method :is_an?, :is_a?
+end
+module Kernel
+  require 'open3'
+  # Executes a command and returns stdout while silencing stderr
+  # NOTE: Restricted to systems on which forking is possible. How can this be done on Windows?
+  def silently_execute(command)
+    Open3.popen3(command){|i,e,o| e.read}
+  end
+end
+# A PlainTextExtractor.command can be either a String, a Block or undefined.
+class String
+  # For a given *nix command line, returns an Array of required commands:
+  #   >> "xls2csv SOURCE | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'".dependencies
+  #   => ["xls2csv", "grep", "sed"]
+  def dependencies
+    self.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first}
+  end
+end
+class Proc
+  def dependencies
+    []
+  end
+end
+class NilClass
+  def dependencies
+    []
+  end
 end

data/lib/picolena/templates/lib/development_helpers.rb ADDED

@@ -0,0 +1,35 @@
+# D.query displays matching_documents for query, and returns the document
+# with the highest score.
+# Useful for development and debugging purposes
+#
+# >> D.test
+#  71 document(s) found for test:
+#  for_test.txt
+#  some_test_files.zip
+#  plain.txt
+#  another_plain.text
+#  other_basic.PDF
+#  basic.pdf
+#  basic.odt
+#  basic.tex
+#  queens.for
+#  README
+#  ...........
+#  => "spec/test_dirs/indexed/just_one_doc/for_test.txt (82.7%)"
+class D
+  def self.method_missing(query,*params)
+    self[query.to_s] || super
+  end
+  def self.[](query)
+    f=Finder.new(query.to_s)
+    hits=f.total_hits
+    if hits > 0 then
+      puts "#{hits} document(s) found for #{query}:"
+      f.matching_documents.each{|doc| puts "  "+doc.filename}
+      puts "  ..........." if hits > f.matching_documents.size
+      f.matching_documents.first
+    else
+      puts "Nothing found for #{query}"
+    end
+  end
+end

data/lib/picolena/templates/lib/plain_text_extractor_dsl.rb ADDED

@@ -0,0 +1,128 @@
+# Defines plain text extractors with a DSL
+# For example, to convert "Microsoft Office Word document" to plain text
+#  PlainTextExtractor.new {
+#    every :doc, :dot
+#    as "application/msword"
+#    aka "Microsoft Office Word document"
+#    extract_content_with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
+#    which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
+#    or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
+#  }
+module PlainTextExtractorDSL
+  attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples, :thumbnail_command
+  def initialize(&block)
+    @content_and_file_examples=[]
+    self.instance_eval(&block)
+    PlainTextExtractor.add(self)
+  end
+  def every(*exts)
+    @exts ||=[]
+    @exts |= exts
+  end
+  def as(mime_name)
+    @mime_name=mime_name
+  end
+  def aka(description)
+    @description=description
+  end
+  def which_requires(*dependencies)
+    @dependencies=dependencies
+  end
+  #used by rspec to test extractors:
+  #  which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
+  #  or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
+  #
+  #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed,
+  #and if the plain text output of the extractor applied to 'basic.pdf' and 'yet_another.pdf' includes 'in a pdf file' and 'some other stuff inside another pdf file', respectively
+  def which_should_for_example_extract(content, file)
+    @content_and_file_examples << [content,file[:from]]
+  end
+  #it allows specs to be defined in this way:
+  #  which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
+  #  or_extract 'text inside!', :from => 'crossed.txt'
+  alias_method :or_extract, :which_should_for_example_extract
+  def extract_content_with(command_as_hash_or_string=nil,&block)
+    #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
+  @command=case command_as_hash_or_string
+  when String
+    command_as_hash_or_string
+  when Hash
+    command_for_current_platform(command_as_hash_or_string)
+    else
+      block || raise("No command defined for this extractor: #{description}")
+    end
+  end
+  def extract_thumbnail_with(command_as_hash_or_string=nil, &block)
+    #TODO: Don't ignore block and use it as in extract_content_with
+    @thumbnail_command=case command_as_hash_or_string
+    when String
+      command_as_hash_or_string
+    when Hash
+      command_for_current_platform(command_as_hash_or_string)
+    end
+  end
+  # Unpack an archive and extract content from every supported file
+  def extract_content_from_archive_with(unpack_command)
+    #FIXME: Cleaner code needed!
+    @command=lambda {|source|
+      begin
+        global_temp_dir   = File.join(Dir::tmpdir, 'picolena_archive_temp')
+        specific_temp_dir = File.join(global_temp_dir, source.base26_hash)
+        FileUtils.mkpath specific_temp_dir
+        specific_unpack_command=unpack_command.sub('SOURCE','"'<<source<<'"').sub(/TE?MPDIR/,'"'<<specific_temp_dir<<'"')
+        silently_execute(specific_unpack_command)
+        Dir["#{specific_temp_dir}/**/*"].select{|f| File.file?(f)}.map{|filename|
+          content=PlainTextExtractor.extract_content_from(filename) rescue "---"
+          ["##"<<filename.sub(specific_temp_dir,'').gsub('/', '>'),
+            content]
+        }.join("\n")
+      ensure
+        FileUtils.remove_entry_secure(specific_temp_dir)
+        FileUtils.rmdir(global_temp_dir) rescue "not empty"
+      end
+    }
+    (@dependencies||=[])<<unpack_command.dependencies
+  end
+  private
+  def command_for_current_platform(command_as_hash)
+    # Allows writing
+    #     with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
+    #          "some other command" => :on_windows
+    #
+    # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
+    # on windows, it returns "some other command"
+    #
+    # If commands for linux & mac os were different :
+    #     with "some command"        => :on_linux,
+    #          "another command"     => :on_mac_os,
+    #          "yet another command" => :on_windows
+    #
+    #NOTE: What to do when no command is defined for a given platform?
+    command_as_hash.invert.find{|platforms,command|
+      platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(current_platform_symbol)
+    }.last.dup
+  end
+  def current_platform_symbol
+    @@platform_symbol||=case RUBY_PLATFORM
+      when /linux/
+        :linux
+      when /win/
+        :windows
+      when /darwin/
+        :mac_os
+      end
+    end
+end

data/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb CHANGED

@@ -8,7 +8,7 @@ PlainTextExtractor.new {
   every :pdf
   as "application/pdf"
   aka "Adobe Portable Document Format"
-  with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
-       "some other command" => :on_windows
+  extract_content_with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
+                       "some other command" => :on_windows
   which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
 }

data/lib/picolena/templates/lib/plain_text_extractors/adobe.photoshop.rb ADDED

@@ -0,0 +1,12 @@
+PlainTextExtractor.new {
+  every :psd
+  as "image/adobe.photoshop"
+  aka "Adobe Photoshop Format"
+  #NOTE: PSD gets its own Extractor since the convert command differs from the one used for single-layer pictures
+  #      and needs the -flatten option
+  extract_thumbnail_with           'convert SOURCE -flatten -thumbnail 80x80 -quality 50 THUMBNAIL'
+  extract_content_with             'exiftool SOURCE'
+  which_should_for_example_extract '"Adobe Photoshop CS2 Windows" 584x150', :from => 'picolena.psd'
+}

data/lib/picolena/templates/lib/plain_text_extractors/html.rb CHANGED

@@ -2,7 +2,7 @@ PlainTextExtractor.new {
   every :html, :htm
   as "text/html"
   aka "HyperText Markup Language document"
-  with {|source|
+  extract_content_with {|source|
     encoding=File.encoding(source)
     if encoding.empty? or encoding.gsub(/[^\w]/,'').downcase=="utf8" then
       %x{html2text -nobs "#{source}"}

data/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb CHANGED

@@ -4,8 +4,8 @@ PlainTextExtractor.new {
   every :xls
   as "application/excel"
   aka "Microsoft Office Excel document"
-  with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
-       "some other command" => :on_windows
+  extract_content_with "xls2csv SOURCE | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
+                       "some other command" => :on_windows
   which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
 }
@@ -16,7 +16,7 @@ PlainTextExtractor.new {
   every :xlsx
   as 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
   aka "Microsoft Office 2007 Excel spreadsheet"
-  with {|source|
+  extract_content_with {|source|
     Zip::ZipFile.open(source){|zipfile|
       text_cells=zipfile.read("xl/sharedStrings.xml").split(/</).grep(/^t/).collect{|l|
         l.sub(/^[^>]+>/,'')
@@ -39,4 +39,4 @@ PlainTextExtractor.new {
 ##   Home page: http://www.winfield.demon.nl/
 ## MS OOXML excel to text conversion:
-## Ruby code written by Eric DUMINIL
+## Ruby code written by Eric DUMINIL

data/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb CHANGED

@@ -4,8 +4,8 @@ PlainTextExtractor.new {
   every :ppt, :pps
   as "application/powerpoint"
   aka "Microsoft Office Powerpoint document"
-  with "catppt SOURCE" => :on_linux_and_mac_os,
-       "some other command" => :on_windows
+  extract_content_with "catppt SOURCE" => :on_linux_and_mac_os,
+                       "some other command" => :on_windows
   which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
   #FIXME: it seems that catppt cannot open .pps files.
   #or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'
@@ -18,7 +18,7 @@ PlainTextExtractor.new {
   every :pptx
   as 'application/vnd.openxmlformats-officedocument.presentationml.presentation' #could that mime BE any longer?
   aka "Microsoft Office 2007 Powerpoint document"
-  with {|source|
+  extract_content_with {|source|
     Zip::ZipFile.open(source){|zipfile|
       slides=zipfile.entries.select{|l| l.name=~/^ppt\/slides\/slide\d+.xml/}
       slides.collect{|entry|
@@ -38,4 +38,4 @@ PlainTextExtractor.new {
 ##   Home page: http://www.wagner.pp.ru/~vitus/software/catdoc/
 ## MS OOXML powerpoint to text conversion:
-## Ruby code written by Eric DUMINIL
+## Ruby code written by Eric DUMINIL

data/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb CHANGED

@@ -8,7 +8,7 @@ PlainTextExtractor.new {
   every :rtf
   as "application/rtf"
   aka "Microsoft Rich Text Format"
-  with "unrtf  SOURCE -t text" => :on_linux_and_mac_os,
-       "some other command" => :on_windows
+  extract_content_with "unrtf  SOURCE -t text" => :on_linux_and_mac_os,
+                       "some other command" => :on_windows
   which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
-}
+}

data/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb CHANGED

@@ -4,8 +4,8 @@ PlainTextExtractor.new {
   every :doc, :dot
   as "application/msword"
   aka "Microsoft Office Word document"
-  with "antiword SOURCE" => :on_linux_and_mac_os,
-       "some other command" => :on_windows
+  extract_content_with "antiword SOURCE" => :on_linux_and_mac_os,
+                       "some other command" => :on_windows
   which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
   or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
 }
@@ -17,7 +17,7 @@ PlainTextExtractor.new {
   every :docx, :dotx
   as 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
   aka "Microsoft Office 2007 Word document"
-  with {|source|
+  extract_content_with {|source|
     Zip::ZipFile.open(source){|zipfile|
       zipfile.read("word/document.xml").split(/</).grep(/^w:t/).collect{|l|
         l.sub(/^[^>]+>/,'')
@@ -35,4 +35,4 @@ PlainTextExtractor.new {
 ##   Home page: http://www.winfield.demon.nl/
 ## MS OOXML word to text conversion:
-## Ruby code written by Eric DUMINIL
+## Ruby code written by Eric DUMINIL

data/lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb CHANGED

@@ -5,7 +5,7 @@ PlainTextExtractor.new {
   every :odp
   as 'application/vnd.oasis.opendocument.presentation'
   aka "Open Document Format for presentation"
-  with {|source|
+  extract_content_with {|source|
     Zip::ZipFile.open(source){|zipfile|
       zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
         l.sub(/^[^>]+>/,'')
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
     }
   }
   which_should_for_example_extract 'Picolena can it find me maybe!', :from => 'ubuntu_theme.odp'
-}
+}

data/lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb CHANGED

@@ -5,7 +5,7 @@ PlainTextExtractor.new {
   every :ods
   as 'application/vnd.oasis.opendocument.spreadsheet'
   aka "Open Document Format for spreadsheet"
-  with {|source|
+  extract_content_with {|source|
     Zip::ZipFile.open(source){|zipfile|
       zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
         l.sub(/^[^>]+>/,'')
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
     }
   }
   which_should_for_example_extract 'Cessna F-172P G-BIDF, serial number 2045', :from => 'weight_and_balance.ods'
-}
+}

data/lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb CHANGED

@@ -5,7 +5,7 @@ PlainTextExtractor.new {
   every :odt
   as 'application/vnd.oasis.opendocument.text'
   aka "Open Document Format for text"
-  with {|source|
+  extract_content_with {|source|
     Zip::ZipFile.open(source){|zipfile|
       zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
         l.sub(/^[^>]+>/,'')
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
     }
   }
   which_should_for_example_extract 'written with OpenOffice.org', :from => 'basic.odt'
-}
+}

data/lib/picolena/templates/lib/plain_text_extractors/pictures.rb CHANGED

@@ -1,8 +1,19 @@
 PlainTextExtractor.new {
-  every :bmp, :crw, :eps, :gif, :jpeg, :jpg, :nef, :png, :psd, :raw, :tif, :tiff
+  every :bmp, :crw, :eps, :gif, :jpeg, :jpg, :nef, :png, :raw, :tif, :tiff
   as "image/*"
   aka "some picture"
-  with 'exiftool SOURCE'
-  which_requires 'exiftool'
-  which_should_for_example_extract 'Eric Duminil Nikon D90', :from => 'crow.jpg'
+  extract_thumbnail_with           'convert -quality 50 -thumbnail 80x80 SOURCE THUMBNAIL'
+  extract_content_with             'exiftool SOURCE'
+  which_should_for_example_extract 'Eric Duminil Nikon D90'                      , :from => 'crow.jpg'
+  or_extract                       '64x64 BMP'                                   , :from => 'gnu.bmp'
+  or_extract                       'application/postscript 258x43'               , :from => 'diceface.eps'
+  or_extract                       'Panasonic DMC-FZ8 320x240'                   , :from => 'glass.png'
+  or_extract                       'Panasonic DMC-FZ8 "35mm equivalent: 432.0mm"', :from => 'cygnus.jpeg'
+  or_extract                       '"1990 bytes" 24x24 LZW'                      , :from => 'warning.tiff'
+  or_extract                       '"1978 bytes" 24x24 LZW'                      , :from => 'caution.tif'
+  or_extract                       'GIF 110x140'                                 , :from => 'rails_logo_remix.gif'
+  # Raw pictures (.nef, .crw, .raw) would also need to be tested, but their size doesn't make it worth including
+  # corresponding files in the repository. Specs will therefore stay with "Not Yet Implemented" status.
 }

data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb CHANGED

@@ -1,8 +1,15 @@
 PlainTextExtractor.new {
-  every :txt, :text, :tex, :for, :cpp, :c, :rb, :ins, :vee, :java, :no_extension
+  every :txt, :text
+  every :tex, :bib, :for, :cpp, :c, :rb, :ins, :vee, :java
+  every :ini
+  every :sub, :srt
+  #NOTE: Could be interesting to extract thumbnail from vCards
+  every :vcf, :vcard
+  every :no_extension
   as "application/plain"
   aka "plain text file"
-  with {|source|
+  extract_content_with {|source|
     raise "binary file" unless File.plain_text?(source)
     encoding=File.encoding(source)
     if encoding.empty? then

data/lib/picolena/templates/lib/plain_text_extractors/rar.rb ADDED

@@ -0,0 +1,18 @@
+PlainTextExtractor.new {
+  every :rar
+  as "archive/rar"
+  aka "RAR Archive"
+  # If a non-free version of unrar is available, use it,
+  # because unrar-nonfree supports more archives than unrar-free
+  if "unrar".installed? then
+   extract_content_from_archive_with "unrar x SOURCE TEMPDIR"
+  else
+   # falls back to unrar-free otherwise
+   extract_content_from_archive_with "unrar-free --extract SOURCE TEMPDIR"
+  end
+  which_should_for_example_extract 'IAE2ORREucIRPx+XgpYcYoO8Twz1TN5/LezRbdwWonlAqpDanBTR+McCehXpk7Pz',
+                                                                              :from => 'dumb_file.rar'
+  or_extract                       '"(Same file, but inside one directory)"', :from => 'dumb_file.rar'
+}