picolena 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ == 0.1.4 2008-04-23
2
+ * 1 minor enhancement:
3
+ * minimal MacOS support
4
+
1
5
  == 0.1.3 2008-04-20
2
6
 
3
7
  * 1 bug fix:
@@ -5,7 +9,7 @@
5
9
 
6
10
  == 0.1.2 2008-04-20
7
11
 
8
- * major enhancement:
12
+ * 3 major enhancements:
9
13
  * complete Indexer & Index rewrite
10
14
  * new DSL syntax
11
15
  * multi-threaded Indexer
@@ -13,7 +17,7 @@
13
17
 
14
18
  == 0.1.1 2008-04-12
15
19
 
16
- * major enhancement:
20
+ * 1 major enhancement:
17
21
  * cache à la Google
18
22
 
19
23
  * minor enhancements:
@@ -24,14 +28,14 @@
24
28
 
25
29
  == 0.1.0 2008-04-08
26
30
 
27
- * minor enhancements:
31
+ * 3 minor enhancements:
28
32
  * can now be installed on win32 (doesn't pass every spec though)
29
33
  * moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
30
34
  * shorter and prettier base26_hash id for documents.
31
35
 
32
36
  == 0.0.99 2008-04-06
33
37
 
34
- * minor enhancements:
38
+ * 2 minor enhancements:
35
39
  * more complete specs
36
40
  * mtime is now indexed and available in queries as "date:20080406"
37
41
 
@@ -59,7 +59,7 @@ class Indexer
59
59
  File.file?(filename) && filename !~ @@exclude
60
60
  }
61
61
 
62
- indexing_list_chunks=indexing_list.in_transposed_chunks(threads_number)
62
+ indexing_list_chunks=indexing_list.in_transposed_slices(threads_number)
63
63
 
64
64
  indexing_list_chunks.each_with_thread{|chunk|
65
65
  chunk.each{|filename|
@@ -13,11 +13,6 @@ class PlainTextExtractor
13
13
  all<<extractor
14
14
  end
15
15
 
16
- # Calls block for each extractor
17
- def each(&block)
18
- all.each(&block)
19
- end
20
-
21
16
  # Returns every required dependency for every defined extractor
22
17
  def dependencies
23
18
  @@dependencies||=all.collect{|extractor| extractor.dependencies}.flatten.compact.uniq.sort
@@ -28,13 +23,19 @@ class PlainTextExtractor
28
23
  @@supported_exts||=all.collect{|extractor| extractor.exts}.flatten.compact.uniq
29
24
  end
30
25
 
31
- # Finds which extractor should be used for a given file, according to its extension
26
+ # Finds which extractor should be used for a given file.
32
27
  # Raises if the file is unsupported.
33
28
  def find_by_filename(filename)
34
29
  ext=File.ext_as_sym(filename)
35
- found_extractor=all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}")
36
- found_extractor.source=filename
37
- found_extractor
30
+ returning find_by_extension(ext) do |found_extractor|
31
+ found_extractor.source=filename
32
+ end
33
+ end
34
+
35
+ # Finds which extractor should be used for a given file, according to its extension
36
+ # Raises if the file is unsupported.
37
+ def find_by_extension(ext)
38
+ all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for .#{ext}")
38
39
  end
39
40
 
40
41
  # Launches extractor on given file and outputs plain text result
@@ -42,10 +43,13 @@ class PlainTextExtractor
42
43
  find_by_filename(source).extract_content
43
44
  end
44
45
 
46
+ # Launches extractor on given file and outputs plain text result and language (if found)
45
47
  def extract_content_and_language_from(source)
46
48
  find_by_filename(source).extract_content_and_language
47
49
  end
48
50
 
51
+ # Returns which language guesser should be used by the system.
52
+ # Returns nil if none is found.
49
53
  def language_guesser
50
54
  @@language_guesser||=('mguesser -n1' unless IO.popen("which mguesser"){|i| i.read}.empty?)
51
55
  end
@@ -105,8 +109,12 @@ class PlainTextExtractor
105
109
  # and if probability score is higher than 90%.
106
110
  def extract_content_and_language
107
111
  content=extract_content
108
- # Language recognition is too unreliable for small files.
109
- return [content, nil] unless Picolena::UseLanguageRecognition && PlainTextExtractor.language_guesser && content.size > 500
112
+ return [content, nil] unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
113
+ Picolena::UseLanguageRecognition,
114
+ # Is a language guesser already installed?
115
+ PlainTextExtractor.language_guesser,
116
+ # Language recognition is too unreliable for small files.
117
+ content.size > 500].all?
110
118
  language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
111
119
  lang_guesser.write content
112
120
  lang_guesser.close_write
@@ -34,9 +34,25 @@ module Enumerable
34
34
  end
35
35
 
36
36
  class Array
37
- def in_transposed_chunks(n)
38
- s=self.size
39
- i=n-s%n
37
+ # Returns a partition of n arrays.
38
+ # Transposition is used to avoid getting arrays that are too different.
39
+ # >> (0..17).to_a.in_transposed_slices(5)
40
+ # => [[0, 5, 10, 15], [1, 6, 11, 16], [2, 7, 12, 17], [3, 8, 13], [4, 9, 14]]
41
+ # while
42
+ # >> (0..17).enum_slice(5).to_a
43
+ # => [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17]]
44
+ #
45
+ # If some folders contain big files and some others contain small ones,
46
+ # every indexing thread will get some of both!
47
+ def in_transposed_slices(n)
48
+ # no need to compute anything if n==1
49
+ return [self] if n==1
50
+ # Array#transpose would raise if the slices had different lengths (it needs a rectangular array of arrays).
51
+ i=n-self.size%n
52
+ # Adds nils so that size is a multiple of n,
53
+ # cuts array in slices of size n,
54
+ # transposes to get n slices,
55
+ # and removes added nils.
40
56
  (self+[nil]*i).enum_slice(n).to_a.transpose.collect{|e| e.compact}
41
57
  end
42
58
  end
@@ -54,19 +54,37 @@ module PlainTextExtractorDSL
54
54
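  #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD... (FIXME: /win/ also matches "darwin", so the /darwin/ branch below is never reached unless it is tested first.)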
55
55
  platform=case RUBY_PLATFORM
56
56
  when /linux/
57
- :on_linux
57
+ :linux
58
58
  when /win/
59
- :on_windows
59
+ :windows
60
+ when /darwin/
61
+ :mac_os
60
62
  end
61
63
  @command=case command_as_hash_or_string
62
64
  when String
63
65
  command_as_hash_or_string
64
66
  when Hash
65
- #dup must be used, otherwise @command gets frozen. No idea why though....
66
- command_as_hash_or_string.invert[platform].dup
67
+ # Allows one to write
68
+ # with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
69
+ # "some other command" => :on_windows
70
+ #
71
+ # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
72
+ # on windows, it returns "some other command"
73
+ #
74
+ # If commands for linux & mac os were different:
75
+ # with "some command" => :on_linux,
76
+ # "another command" => :on_mac_os,
77
+ # "yet another command" => :on_windows
78
+ #
79
+ #TODO: Make it clearer and more robust.
80
+ #NOTE: What to do when no command is defined for a given platform?
81
+ command_as_hash_or_string.invert.find{|platforms,command|
82
+ platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(platform)
83
+ }.last.dup
67
84
  else
68
85
  block || raise("No command defined for this extractor: #{description}")
69
86
  end
70
- @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|'))
87
+ # TODO: replace it with Open3 or something.
88
+ @command<<' 2>/dev/null' if (@command.is_a?(String) && platform.to_s=~/(linux|mac_os)/ && !@command.include?('|'))
71
89
  end
72
90
  end
@@ -8,6 +8,7 @@ PlainTextExtractor.new {
8
8
  every :pdf
9
9
  as "application/pdf"
10
10
  aka "Adobe Portable Document Format"
11
- with "pdftotext -enc UTF-8 SOURCE -" => :on_linux, "some other command" => :on_windows
11
+ with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
12
+ "some other command" => :on_windows
12
13
  which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
13
14
  }
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
4
4
  every :xls
5
5
  as "application/excel"
6
6
  aka "Microsoft Office Excel document"
7
- with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux, "some other command" => :on_windows
7
+ with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
8
9
  which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
9
10
  }
10
11
 
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
4
4
  every :ppt, :pps
5
5
  as "application/powerpoint"
6
6
  aka "Microsoft Office Powerpoint document"
7
- with "catppt SOURCE" => :on_linux, "some other command" => :on_windows
7
+ with "catppt SOURCE" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
8
9
  which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
9
10
  #FIXME: it seems that catppt cannot open .pps files.
10
11
  #or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'
@@ -8,6 +8,7 @@ PlainTextExtractor.new {
8
8
  every :rtf
9
9
  as "application/rtf"
10
10
  aka "Microsoft Rich Text Format"
11
- with "unrtf SOURCE -t text" => :on_linux, "some other command" => :on_windows
11
+ with "unrtf SOURCE -t text" => :on_linux_and_mac_os,
12
+ "some other command" => :on_windows
12
13
  which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
13
14
  }
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
4
4
  every :doc, :dot
5
5
  as "application/msword"
6
6
  aka "Microsoft Office Word document"
7
- with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
7
+ with "antiword SOURCE" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
8
9
  which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
9
10
  or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
10
11
  }
@@ -1,12 +1,17 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe ApplicationController do
4
- it "should give 403 when denying access" do
4
+ it "should give 403 status when denying access" do
5
5
  get 'access_denied'
6
6
  response.headers["Status"].should == "403 Forbidden"
7
7
  end
8
8
 
9
- it "should flash with a wrong request" do
9
+ it "should render 'Access denied' text when denying access" do
10
+ get 'access_denied'
11
+ response.body.should == 'Access denied'
12
+ end
13
+
14
+ it "should flash a warning if given a wrong request" do
10
15
  get 'unknown_request'
11
16
  response.should be_redirect
12
17
  response.should redirect_to(documents_url)
@@ -90,6 +90,7 @@ describe Finder do
90
90
  end
91
91
 
92
92
  it "should find documents according to their modification date" do
93
+ #FIXME: Specs don't pass in /tmp folder. They do pass in any other directory, though. Why???
93
94
  Finder.new("date:<1982").matching_documents.should be_empty
94
95
  Finder.new("19831209*").matching_document.basename.should == "office2003-word-template"
95
96
  Finder.new("date:<1983").matching_document.filename.should == "basic.pdf"
@@ -5,7 +5,7 @@ describe "PlainTextExtractors" do
5
5
  IndexReader.ensure_existence
6
6
  end
7
7
 
8
- PlainTextExtractor.each{|extractor|
8
+ PlainTextExtractor.all.each{|extractor|
9
9
  extractor.exts.each{|ext|
10
10
  should_extract= "should be able to extract content from #{extractor.description} (.#{ext})"
11
11
  content_and_file_examples_for_this_ext=extractor.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext}
@@ -2,7 +2,7 @@ module Picolena #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- TINY = 3
5
+ TINY = 4
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/website/index.html CHANGED
@@ -33,7 +33,7 @@
33
33
  <h1>Picolena</h1>
34
34
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
35
35
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.3</a>
36
+ <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.4</a>
37
37
  </div>
38
38
  <h1>&#x2192; &#8216;picolena&#8217;</h1>
39
39
 
data.tar.gz.sig CHANGED
@@ -1,3 +1,2 @@
1
- $��L���n�鏪Gr�ぅ{�y�Sw\
2
- cn��ڭb�fU�E�"��̺��; ظN�A<��n/�@BM�x�)���MT��[٩����\�F�c#�����܂Nk�ND2�[3���P��f�������C��a2�1�5zr�V��ײU��,'�X����v����'A��\�7��쳧�+\��VR��.tXPI�\���W��j�^1
3
- Y��E(��|�dX�kU���2.��m�Xd/�L�!�$�?��p�Ī'(
1
+ .��@�:���N6JQp9V�"QT�k�7~4*�D��w��u���%v��[��r�Y���hB�t:Cv=�,8ڽ��c���;I��V[$y�nj�ϓέN�3��x+��yCQ^ہ�C(L)�O7�-��2ZV�L�]���i~��JK"8F�|��:�eT��Vp��ߋU��] ��
2
+ ��V���[;#̧KM���$�;=X�~�>���� wYI7��3ksv��A߶� ��0GZTi7$�����>@
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picolena
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
30
30
  qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-04-20 00:00:00 +02:00
33
+ date: 2008-04-23 00:00:00 +02:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
metadata.gz.sig CHANGED
Binary file