picolena 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +8 -4
- data/lib/picolena/templates/app/models/indexer.rb +1 -1
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +19 -11
- data/lib/picolena/templates/lib/core_exts.rb +19 -3
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +23 -5
- data/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb +2 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb +2 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb +2 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb +2 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb +2 -1
- data/lib/picolena/templates/spec/controllers/application_controller_spec.rb +7 -2
- data/lib/picolena/templates/spec/models/finder_spec.rb +1 -0
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +1 -1
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- data.tar.gz.sig +2 -3
- metadata +2 -2
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
== 0.1.4 2008-04-23
|
2
|
+
* 1 minor enhancement:
|
3
|
+
* minimal MacOS support
|
4
|
+
|
1
5
|
== 0.1.3 2008-04-20
|
2
6
|
|
3
7
|
* 1 bug fix:
|
@@ -5,7 +9,7 @@
|
|
5
9
|
|
6
10
|
== 0.1.2 2008-04-20
|
7
11
|
|
8
|
-
* major
|
12
|
+
* 3 major enhancements:
|
9
13
|
* complete Indexer & Index rewrite
|
10
14
|
* new DSL syntax
|
11
15
|
* multi-threaded Indexer
|
@@ -13,7 +17,7 @@
|
|
13
17
|
|
14
18
|
== 0.1.1 2008-04-12
|
15
19
|
|
16
|
-
* major enhancement:
|
20
|
+
* 1 major enhancement:
|
17
21
|
* cache à la Google
|
18
22
|
|
19
23
|
* minor enhancements:
|
@@ -24,14 +28,14 @@
|
|
24
28
|
|
25
29
|
== 0.1.0 2008-04-08
|
26
30
|
|
27
|
-
* minor enhancements:
|
31
|
+
* 3 minor enhancements:
|
28
32
|
* can now be installed on win32 (doesn't pass every spec though)
|
29
33
|
* moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
|
30
34
|
* shorter and prettier base26_hash id for documents.
|
31
35
|
|
32
36
|
== 0.0.99 2008-04-06
|
33
37
|
|
34
|
-
* minor enhancements:
|
38
|
+
* 2 minor enhancements:
|
35
39
|
* more complete specs
|
36
40
|
* mtime is now indexed and available in queries as "date:20080406"
|
37
41
|
|
@@ -59,7 +59,7 @@ class Indexer
|
|
59
59
|
File.file?(filename) && filename !~ @@exclude
|
60
60
|
}
|
61
61
|
|
62
|
-
indexing_list_chunks=indexing_list.
|
62
|
+
indexing_list_chunks=indexing_list.in_transposed_slices(threads_number)
|
63
63
|
|
64
64
|
indexing_list_chunks.each_with_thread{|chunk|
|
65
65
|
chunk.each{|filename|
|
@@ -13,11 +13,6 @@ class PlainTextExtractor
|
|
13
13
|
all<<extractor
|
14
14
|
end
|
15
15
|
|
16
|
-
# Calls block for each extractor
|
17
|
-
def each(&block)
|
18
|
-
all.each(&block)
|
19
|
-
end
|
20
|
-
|
21
16
|
# Returns every required dependency for every defined extractor
|
22
17
|
def dependencies
|
23
18
|
@@dependencies||=all.collect{|extractor| extractor.dependencies}.flatten.compact.uniq.sort
|
@@ -28,13 +23,19 @@ class PlainTextExtractor
|
|
28
23
|
@@supported_exts||=all.collect{|extractor| extractor.exts}.flatten.compact.uniq
|
29
24
|
end
|
30
25
|
|
31
|
-
# Finds which extractor should be used for a given file
|
26
|
+
# Finds which extractor should be used for a given file.
|
32
27
|
# Raises if the file is unsupported.
|
33
28
|
def find_by_filename(filename)
|
34
29
|
ext=File.ext_as_sym(filename)
|
35
|
-
|
36
|
-
|
37
|
-
|
30
|
+
returning find_by_extension(ext) do |found_extractor|
|
31
|
+
found_extractor.source=filename
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Finds which extractor should be used for a given file, according to its extension
|
36
|
+
# Raises if the file is unsupported.
|
37
|
+
def find_by_extension(ext)
|
38
|
+
all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for .#{ext}")
|
38
39
|
end
|
39
40
|
|
40
41
|
# Launches extractor on given file and outputs plain text result
|
@@ -42,10 +43,13 @@ class PlainTextExtractor
|
|
42
43
|
find_by_filename(source).extract_content
|
43
44
|
end
|
44
45
|
|
46
|
+
# Launches extractor on given file and outputs plain text result and language (if found)
|
45
47
|
def extract_content_and_language_from(source)
|
46
48
|
find_by_filename(source).extract_content_and_language
|
47
49
|
end
|
48
50
|
|
51
|
+
# Returns which language guesser should be used by the system.
|
52
|
+
# Returns nil if none is found.
|
49
53
|
def language_guesser
|
50
54
|
@@language_guesser||=('mguesser -n1' unless IO.popen("which mguesser"){|i| i.read}.empty?)
|
51
55
|
end
|
@@ -105,8 +109,12 @@ class PlainTextExtractor
|
|
105
109
|
# and if probability score is higher than 90%.
|
106
110
|
def extract_content_and_language
|
107
111
|
content=extract_content
|
108
|
-
|
109
|
-
|
112
|
+
return [content, nil] unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
|
113
|
+
Picolena::UseLanguageRecognition,
|
114
|
+
# Is a language guesser already installed?
|
115
|
+
PlainTextExtractor.language_guesser,
|
116
|
+
# Language recognition is too unreliable for small files.
|
117
|
+
content.size > 500].all?
|
110
118
|
language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
|
111
119
|
lang_guesser.write content
|
112
120
|
lang_guesser.close_write
|
@@ -34,9 +34,25 @@ module Enumerable
|
|
34
34
|
end
|
35
35
|
|
36
36
|
class Array
|
37
|
-
|
38
|
-
|
39
|
-
|
37
|
+
# Returns a partition of n arrays.
|
38
|
+
# Transposition is used to avoid getting arrays that are too different.
|
39
|
+
# >> (0..17).to_a.in_transposed_slices(5)
|
40
|
+
# => [[0, 5, 10, 15], [1, 6, 11, 16], [2, 7, 12, 17], [3, 8, 13], [4, 9, 14]]
|
41
|
+
# while
|
42
|
+
# >> (0..17).enum_slice(5).to_a
|
43
|
+
# => [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17]]
|
44
|
+
#
|
45
|
+
# If some folders contain big files and some others contain small ones,
|
46
|
+
# every indexing thread will get some of both!
|
47
|
+
def in_transposed_slices(n)
|
48
|
+
# no need to compute anything if n==1
|
49
|
+
return [self] if n==1
|
50
|
+
# Array#transpose would raise if Array is not a square array of arrays.
|
51
|
+
i=n-self.size%n
|
52
|
+
# Adds nils so that size is a multiple of n,
|
53
|
+
# cuts array in slices of size n,
|
54
|
+
# transposes to get n slices,
|
55
|
+
# and removes added nils.
|
40
56
|
(self+[nil]*i).enum_slice(n).to_a.transpose.collect{|e| e.compact}
|
41
57
|
end
|
42
58
|
end
|
@@ -54,19 +54,37 @@ module PlainTextExtractorDSL
|
|
54
54
|
#TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
|
55
55
|
platform=case RUBY_PLATFORM
|
56
56
|
when /linux/
|
57
|
-
:
|
57
|
+
:linux
|
58
58
|
when /win/
|
59
|
-
:
|
59
|
+
:windows
|
60
|
+
when /darwin/
|
61
|
+
:mac_os
|
60
62
|
end
|
61
63
|
@command=case command_as_hash_or_string
|
62
64
|
when String
|
63
65
|
command_as_hash_or_string
|
64
66
|
when Hash
|
65
|
-
#
|
66
|
-
|
67
|
+
# Allows to write
|
68
|
+
# with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
|
69
|
+
# "some other command" => :on_windows
|
70
|
+
#
|
71
|
+
# On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
|
72
|
+
# on windows, it returns "some other command"
|
73
|
+
#
|
74
|
+
# If commands for linux & mac os were different :
|
75
|
+
# with "some command" => :on_linux,
|
76
|
+
# "another command" => :on_mac_os,
|
77
|
+
# "yet another command" => :on_windows
|
78
|
+
#
|
79
|
+
#TODO: Make it clearer and more robust.
|
80
|
+
#NOTE: What to do when no command is defined for a given platform?
|
81
|
+
command_as_hash_or_string.invert.find{|platforms,command|
|
82
|
+
platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(platform)
|
83
|
+
}.last.dup
|
67
84
|
else
|
68
85
|
block || raise("No command defined for this extractor: #{description}")
|
69
86
|
end
|
70
|
-
|
87
|
+
# TODO, replace it with Open3 or something.
|
88
|
+
@command<<' 2>/dev/null' if (@command.is_a?(String) && platform.to_s=~/(linux|mac_os)/ && !@command.include?('|'))
|
71
89
|
end
|
72
90
|
end
|
@@ -8,6 +8,7 @@ PlainTextExtractor.new {
|
|
8
8
|
every :pdf
|
9
9
|
as "application/pdf"
|
10
10
|
aka "Adobe Portable Document Format"
|
11
|
-
with "pdftotext -enc UTF-8 SOURCE -" => :
|
11
|
+
with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
|
12
|
+
"some other command" => :on_windows
|
12
13
|
which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
|
13
14
|
}
|
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :xls
|
5
5
|
as "application/excel"
|
6
6
|
aka "Microsoft Office Excel document"
|
7
|
-
with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :
|
7
|
+
with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
8
9
|
which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
|
9
10
|
}
|
10
11
|
|
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :ppt, :pps
|
5
5
|
as "application/powerpoint"
|
6
6
|
aka "Microsoft Office Powerpoint document"
|
7
|
-
with "catppt SOURCE" => :
|
7
|
+
with "catppt SOURCE" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
8
9
|
which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
|
9
10
|
#FIXME: it seems that catppt cannot open .pps files.
|
10
11
|
#or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'
|
@@ -8,6 +8,7 @@ PlainTextExtractor.new {
|
|
8
8
|
every :rtf
|
9
9
|
as "application/rtf"
|
10
10
|
aka "Microsoft Rich Text Format"
|
11
|
-
with "unrtf SOURCE -t text" => :
|
11
|
+
with "unrtf SOURCE -t text" => :on_linux_and_mac_os,
|
12
|
+
"some other command" => :on_windows
|
12
13
|
which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
|
13
14
|
}
|
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :doc, :dot
|
5
5
|
as "application/msword"
|
6
6
|
aka "Microsoft Office Word document"
|
7
|
-
with "antiword SOURCE" => :
|
7
|
+
with "antiword SOURCE" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
8
9
|
which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
|
9
10
|
or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
|
10
11
|
}
|
@@ -1,12 +1,17 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../spec_helper'
|
2
2
|
|
3
3
|
describe ApplicationController do
|
4
|
-
it "should give 403 when denying access" do
|
4
|
+
it "should give 403 status when denying access" do
|
5
5
|
get 'access_denied'
|
6
6
|
response.headers["Status"].should == "403 Forbidden"
|
7
7
|
end
|
8
8
|
|
9
|
-
it "should
|
9
|
+
it "should render 'Access denied' text when denying access" do
|
10
|
+
get 'access_denied'
|
11
|
+
response.body.should == 'Access denied'
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should flash a warning if given a wrong request" do
|
10
15
|
get 'unknown_request'
|
11
16
|
response.should be_redirect
|
12
17
|
response.should redirect_to(documents_url)
|
@@ -90,6 +90,7 @@ describe Finder do
|
|
90
90
|
end
|
91
91
|
|
92
92
|
it "should find documents according to their modification date" do
|
93
|
+
#FIXME: Specs don't pass in /tmp folder. They do pass in any other directory, though. Why???
|
93
94
|
Finder.new("date:<1982").matching_documents.should be_empty
|
94
95
|
Finder.new("19831209*").matching_document.basename.should == "office2003-word-template"
|
95
96
|
Finder.new("date:<1983").matching_document.filename.should == "basic.pdf"
|
@@ -5,7 +5,7 @@ describe "PlainTextExtractors" do
|
|
5
5
|
IndexReader.ensure_existence
|
6
6
|
end
|
7
7
|
|
8
|
-
PlainTextExtractor.each{|extractor|
|
8
|
+
PlainTextExtractor.all.each{|extractor|
|
9
9
|
extractor.exts.each{|ext|
|
10
10
|
should_extract= "should be able to extract content from #{extractor.description} (.#{ext})"
|
11
11
|
content_and_file_examples_for_this_ext=extractor.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext}
|
data/lib/picolena/version.rb
CHANGED
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.3</a>
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.4</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
data.tar.gz.sig
CHANGED
@@ -1,3 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
Y��E(��|�dX�kU���2.��m�Xd/�L�!�$�?��p�Ī'(
|
1
|
+
.��@�:���N6JQp�9V�"QT�k�7~4*�D��w��u���%v��[��r�Y���hB�t:Cv=�,8ڽ��c���;I��V[$y�nj�ϓέN�3��x+��yC�Q^ہ�C(�L)�O7�-��2ZV�L�]���i~��JK"8F�|��:�eT��Vp��ߋU��]��
|
2
|
+
��V���[;#̧KM���$�;=X�~�>���� wYI7��3ksv��A߶� ��0�GZTi7$�����>@
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-04-20 00:00:00 +02:00
|
33
|
+
date: 2008-04-23 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
metadata.gz.sig
CHANGED
Binary file
|