picolena 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ == 0.1.4 2008-04-23
2
+ * 1 minor enhancement:
3
+ * minimal MacOS support
4
+
1
5
  == 0.1.3 2008-04-20
2
6
 
3
7
  * 1 bug fix:
@@ -5,7 +9,7 @@
5
9
 
6
10
  == 0.1.2 2008-04-20
7
11
 
8
- * major enhancement:
12
+ * 3 major enhancements:
9
13
  * complete Indexer & Index rewrite
10
14
  * new DSL syntax
11
15
  * multi-threaded Indexer
@@ -13,7 +17,7 @@
13
17
 
14
18
  == 0.1.1 2008-04-12
15
19
 
16
- * major enhancement:
20
+ * 1 major enhancement:
17
21
  * cache à la Google
18
22
 
19
23
  * minor enhancements:
@@ -24,14 +28,14 @@
24
28
 
25
29
  == 0.1.0 2008-04-08
26
30
 
27
- * minor enhancements:
31
+ * 3 minor enhancements:
28
32
  * can now be installed on win32 (doesn't pass every spec though)
29
33
  * moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
30
34
  * shorter and prettier base26_hash id for documents.
31
35
 
32
36
  == 0.0.99 2008-04-06
33
37
 
34
- * minor enhancements:
38
+ * 2 minor enhancements:
35
39
  * more complete specs
36
40
  * mtime is now indexed and available in queries as "date:20080406"
37
41
 
@@ -59,7 +59,7 @@ class Indexer
59
59
  File.file?(filename) && filename !~ @@exclude
60
60
  }
61
61
 
62
- indexing_list_chunks=indexing_list.in_transposed_chunks(threads_number)
62
+ indexing_list_chunks=indexing_list.in_transposed_slices(threads_number)
63
63
 
64
64
  indexing_list_chunks.each_with_thread{|chunk|
65
65
  chunk.each{|filename|
@@ -13,11 +13,6 @@ class PlainTextExtractor
13
13
  all<<extractor
14
14
  end
15
15
 
16
- # Calls block for each extractor
17
- def each(&block)
18
- all.each(&block)
19
- end
20
-
21
16
  # Returns every required dependency for every defined extractor
22
17
  def dependencies
23
18
  @@dependencies||=all.collect{|extractor| extractor.dependencies}.flatten.compact.uniq.sort
@@ -28,13 +23,19 @@ class PlainTextExtractor
28
23
  @@supported_exts||=all.collect{|extractor| extractor.exts}.flatten.compact.uniq
29
24
  end
30
25
 
31
- # Finds which extractor should be used for a given file, according to its extension
26
+ # Finds which extractor should be used for a given file.
32
27
  # Raises if the file is unsupported.
33
28
  def find_by_filename(filename)
34
29
  ext=File.ext_as_sym(filename)
35
- found_extractor=all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}")
36
- found_extractor.source=filename
37
- found_extractor
30
+ returning find_by_extension(ext) do |found_extractor|
31
+ found_extractor.source=filename
32
+ end
33
+ end
34
+
35
+ # Finds which extractor should be used for a given file, according to its extension
36
+ # Raises if the file is unsupported.
37
+ def find_by_extension(ext)
38
+ all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for .#{ext}")
38
39
  end
39
40
 
40
41
  # Launches extractor on given file and outputs plain text result
@@ -42,10 +43,13 @@ class PlainTextExtractor
42
43
  find_by_filename(source).extract_content
43
44
  end
44
45
 
46
+ # Launches extractor on given file and outputs plain text result and language (if found)
45
47
  def extract_content_and_language_from(source)
46
48
  find_by_filename(source).extract_content_and_language
47
49
  end
48
50
 
51
+ # Returns which language guesser should be used by the system.
52
+ # Returns nil if none is found.
49
53
  def language_guesser
50
54
  @@language_guesser||=('mguesser -n1' unless IO.popen("which mguesser"){|i| i.read}.empty?)
51
55
  end
@@ -105,8 +109,12 @@ class PlainTextExtractor
105
109
  # and if probability score is higher than 90%.
106
110
  def extract_content_and_language
107
111
  content=extract_content
108
- # Language recognition is too unreliable for small files.
109
- return [content, nil] unless Picolena::UseLanguageRecognition && PlainTextExtractor.language_guesser && content.size > 500
112
+ return [content, nil] unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
113
+ Picolena::UseLanguageRecognition,
114
+ # Is a language guesser already installed?
115
+ PlainTextExtractor.language_guesser,
116
+ # Language recognition is too unreliable for small files.
117
+ content.size > 500].all?
110
118
  language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
111
119
  lang_guesser.write content
112
120
  lang_guesser.close_write
@@ -34,9 +34,25 @@ module Enumerable
34
34
  end
35
35
 
36
36
  class Array
37
- def in_transposed_chunks(n)
38
- s=self.size
39
- i=n-s%n
37
+ # Returns a partition of the array into n sub-arrays.
38
+ # Transposition is used to avoid getting arrays that are too different.
39
+ # >> (0..17).to_a.in_transposed_slices(5)
40
+ # => [[0, 5, 10, 15], [1, 6, 11, 16], [2, 7, 12, 17], [3, 8, 13], [4, 9, 14]]
41
+ # while
42
+ # >> (0..17).enum_slice(5).to_a
43
+ # => [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17]]
44
+ #
45
+ # If some folders contain big files and some others contain small ones,
46
+ # every indexing thread will get some of both!
47
+ def in_transposed_slices(n)
48
+ # no need to compute anything if n==1
49
+ return [self] if n==1
50
+ # Array#transpose raises IndexError unless all sub-arrays have the same length, so compute how many nils are needed as padding.
51
+ i=n-self.size%n
52
+ # Adds nils so that size is a multiple of n,
53
+ # cuts array in slices of size n,
54
+ # transposes to get n slices,
55
+ # and removes added nils.
40
56
  (self+[nil]*i).enum_slice(n).to_a.transpose.collect{|e| e.compact}
41
57
  end
42
58
  end
@@ -54,19 +54,37 @@ module PlainTextExtractorDSL
54
54
  #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
55
55
  platform=case RUBY_PLATFORM
56
56
  when /linux/
57
- :on_linux
57
+ :linux
58
58
  when /win/
59
- :on_windows
59
+ :windows
60
+ when /darwin/
61
+ :mac_os
60
62
  end
61
63
  @command=case command_as_hash_or_string
62
64
  when String
63
65
  command_as_hash_or_string
64
66
  when Hash
65
- #dup must be used, otherwise @command gets frozen. No idea why though....
66
- command_as_hash_or_string.invert[platform].dup
67
+ # This allows writing:
68
+ # with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
69
+ # "some other command" => :on_windows
70
+ #
71
+ # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
72
+ # on windows, it returns "some other command"
73
+ #
74
+ # If commands for linux & mac os were different :
75
+ # with "some command" => :on_linux,
76
+ # "another command" => :on_mac_os,
77
+ # "yet another command" => :on_windows
78
+ #
79
+ #TODO: Make it clearer and more robust.
80
+ #NOTE: What to do when no command is defined for a given platform?
81
+ command_as_hash_or_string.invert.find{|platforms,command|
82
+ platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(platform)
83
+ }.last.dup
67
84
  else
68
85
  block || raise("No command defined for this extractor: #{description}")
69
86
  end
70
- @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|'))
87
+ # TODO: replace this stderr redirection with Open3 or something similar.
88
+ @command<<' 2>/dev/null' if (@command.is_a?(String) && platform.to_s=~/(linux|mac_os)/ && !@command.include?('|'))
71
89
  end
72
90
  end
@@ -8,6 +8,7 @@ PlainTextExtractor.new {
8
8
  every :pdf
9
9
  as "application/pdf"
10
10
  aka "Adobe Portable Document Format"
11
- with "pdftotext -enc UTF-8 SOURCE -" => :on_linux, "some other command" => :on_windows
11
+ with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
12
+ "some other command" => :on_windows
12
13
  which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
13
14
  }
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
4
4
  every :xls
5
5
  as "application/excel"
6
6
  aka "Microsoft Office Excel document"
7
- with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux, "some other command" => :on_windows
7
+ with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
8
9
  which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
9
10
  }
10
11
 
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
4
4
  every :ppt, :pps
5
5
  as "application/powerpoint"
6
6
  aka "Microsoft Office Powerpoint document"
7
- with "catppt SOURCE" => :on_linux, "some other command" => :on_windows
7
+ with "catppt SOURCE" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
8
9
  which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
9
10
  #FIXME: it seems that catppt cannot open .pps files.
10
11
  #or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'
@@ -8,6 +8,7 @@ PlainTextExtractor.new {
8
8
  every :rtf
9
9
  as "application/rtf"
10
10
  aka "Microsoft Rich Text Format"
11
- with "unrtf SOURCE -t text" => :on_linux, "some other command" => :on_windows
11
+ with "unrtf SOURCE -t text" => :on_linux_and_mac_os,
12
+ "some other command" => :on_windows
12
13
  which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
13
14
  }
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
4
4
  every :doc, :dot
5
5
  as "application/msword"
6
6
  aka "Microsoft Office Word document"
7
- with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
7
+ with "antiword SOURCE" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
8
9
  which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
9
10
  or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
10
11
  }
@@ -1,12 +1,17 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe ApplicationController do
4
- it "should give 403 when denying access" do
4
+ it "should give 403 status when denying access" do
5
5
  get 'access_denied'
6
6
  response.headers["Status"].should == "403 Forbidden"
7
7
  end
8
8
 
9
- it "should flash with a wrong request" do
9
+ it "should render 'Access denied' text when denying access" do
10
+ get 'access_denied'
11
+ response.body.should == 'Access denied'
12
+ end
13
+
14
+ it "should flash a warning if given a wrong request" do
10
15
  get 'unknown_request'
11
16
  response.should be_redirect
12
17
  response.should redirect_to(documents_url)
@@ -90,6 +90,7 @@ describe Finder do
90
90
  end
91
91
 
92
92
  it "should find documents according to their modification date" do
93
+ #FIXME: Specs don't pass in /tmp folder. They do pass in any other directory, though. Why???
93
94
  Finder.new("date:<1982").matching_documents.should be_empty
94
95
  Finder.new("19831209*").matching_document.basename.should == "office2003-word-template"
95
96
  Finder.new("date:<1983").matching_document.filename.should == "basic.pdf"
@@ -5,7 +5,7 @@ describe "PlainTextExtractors" do
5
5
  IndexReader.ensure_existence
6
6
  end
7
7
 
8
- PlainTextExtractor.each{|extractor|
8
+ PlainTextExtractor.all.each{|extractor|
9
9
  extractor.exts.each{|ext|
10
10
  should_extract= "should be able to extract content from #{extractor.description} (.#{ext})"
11
11
  content_and_file_examples_for_this_ext=extractor.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext}
@@ -2,7 +2,7 @@ module Picolena #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- TINY = 3
5
+ TINY = 4
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/website/index.html CHANGED
@@ -33,7 +33,7 @@
33
33
  <h1>Picolena</h1>
34
34
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
35
35
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.3</a>
36
+ <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.4</a>
37
37
  </div>
38
38
  <h1>&#x2192; &#8216;picolena&#8217;</h1>
39
39
 
data.tar.gz.sig CHANGED
@@ -1,3 +1,2 @@
1
- $��L���n�鏪Gr�ぅ{�y�Sw\
2
- cn��ڭb�fU�E�"��̺��; ظN�A<��n/�@BM�x�)���MT��[٩����\�F�c#�����܂Nk�ND2�[3���P��f�������C��a2�1�5zr�V��ײU��,'�X����v����'A��\�7��쳧�+\��VR��.tXPI�\���W��j�^1
3
- Y��E(��|�dX�kU���2.��m�Xd/�L�!�$�?��p�Ī'(
1
+ .��@�:���N6JQp9V�"QT�k�7~4*�D��w��u���%v��[��r�Y���hB�t:Cv=�,8ڽ��c���;I��V[$y�nj�ϓέN�3��x+��yCQ^ہ�C(L)�O7�-��2ZV�L�]���i~��JK"8F�|��:�eT��Vp��ߋU��] ��
2
+ ��V���[;#̧KM���$�;=X�~�>���� wYI7��3ksv��A߶� ��0GZTi7$�����>@
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picolena
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
30
30
  qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-04-20 00:00:00 +02:00
33
+ date: 2008-04-23 00:00:00 +02:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
metadata.gz.sig CHANGED
Binary file