picolena 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +8 -4
- data/lib/picolena/templates/app/models/indexer.rb +1 -1
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +19 -11
- data/lib/picolena/templates/lib/core_exts.rb +19 -3
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +23 -5
- data/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb +2 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb +2 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb +2 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb +2 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb +2 -1
- data/lib/picolena/templates/spec/controllers/application_controller_spec.rb +7 -2
- data/lib/picolena/templates/spec/models/finder_spec.rb +1 -0
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +1 -1
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- data.tar.gz.sig +2 -3
- metadata +2 -2
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
== 0.1.4 2008-04-23
|
2
|
+
* 1 minor enhancement:
|
3
|
+
* minimal MacOS support
|
4
|
+
|
1
5
|
== 0.1.3 2008-04-20
|
2
6
|
|
3
7
|
* 1 bug fix:
|
@@ -5,7 +9,7 @@
|
|
5
9
|
|
6
10
|
== 0.1.2 2008-04-20
|
7
11
|
|
8
|
-
* major
|
12
|
+
* 3 major enhancements:
|
9
13
|
* complete Indexer & Index rewrite
|
10
14
|
* new DSL syntax
|
11
15
|
* multi-threaded Indexer
|
@@ -13,7 +17,7 @@
|
|
13
17
|
|
14
18
|
== 0.1.1 2008-04-12
|
15
19
|
|
16
|
-
* major enhancement:
|
20
|
+
* 1 major enhancement:
|
17
21
|
* cache à la Google
|
18
22
|
|
19
23
|
* minor enhancements:
|
@@ -24,14 +28,14 @@
|
|
24
28
|
|
25
29
|
== 0.1.0 2008-04-08
|
26
30
|
|
27
|
-
* minor enhancements:
|
31
|
+
* 3 minor enhancements:
|
28
32
|
* can now be installed on win32 (doesn't pass every spec though)
|
29
33
|
* moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
|
30
34
|
* shorter and prettier base26_hash id for documents.
|
31
35
|
|
32
36
|
== 0.0.99 2008-04-06
|
33
37
|
|
34
|
-
* minor enhancements:
|
38
|
+
* 2 minor enhancements:
|
35
39
|
* more complete specs
|
36
40
|
* mtime is now indexed and available in queries as "date:20080406"
|
37
41
|
|
@@ -59,7 +59,7 @@ class Indexer
|
|
59
59
|
File.file?(filename) && filename !~ @@exclude
|
60
60
|
}
|
61
61
|
|
62
|
-
indexing_list_chunks=indexing_list.
|
62
|
+
indexing_list_chunks=indexing_list.in_transposed_slices(threads_number)
|
63
63
|
|
64
64
|
indexing_list_chunks.each_with_thread{|chunk|
|
65
65
|
chunk.each{|filename|
|
@@ -13,11 +13,6 @@ class PlainTextExtractor
|
|
13
13
|
all<<extractor
|
14
14
|
end
|
15
15
|
|
16
|
-
# Calls block for each extractor
|
17
|
-
def each(&block)
|
18
|
-
all.each(&block)
|
19
|
-
end
|
20
|
-
|
21
16
|
# Returns every required dependency for every defined extractor
|
22
17
|
def dependencies
|
23
18
|
@@dependencies||=all.collect{|extractor| extractor.dependencies}.flatten.compact.uniq.sort
|
@@ -28,13 +23,19 @@ class PlainTextExtractor
|
|
28
23
|
@@supported_exts||=all.collect{|extractor| extractor.exts}.flatten.compact.uniq
|
29
24
|
end
|
30
25
|
|
31
|
-
# Finds which extractor should be used for a given file
|
26
|
+
# Finds which extractor should be used for a given file.
|
32
27
|
# Raises if the file is unsupported.
|
33
28
|
def find_by_filename(filename)
|
34
29
|
ext=File.ext_as_sym(filename)
|
35
|
-
|
36
|
-
|
37
|
-
|
30
|
+
returning find_by_extension(ext) do |found_extractor|
|
31
|
+
found_extractor.source=filename
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Finds which extractor should be used for a given file, according to its extension
|
36
|
+
# Raises if the file is unsupported.
|
37
|
+
def find_by_extension(ext)
|
38
|
+
all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for .#{ext}")
|
38
39
|
end
|
39
40
|
|
40
41
|
# Launches extractor on given file and outputs plain text result
|
@@ -42,10 +43,13 @@ class PlainTextExtractor
|
|
42
43
|
find_by_filename(source).extract_content
|
43
44
|
end
|
44
45
|
|
46
|
+
# Launches extractor on given file and outputs plain text result and language (if found)
|
45
47
|
def extract_content_and_language_from(source)
|
46
48
|
find_by_filename(source).extract_content_and_language
|
47
49
|
end
|
48
50
|
|
51
|
+
# Returns which language guesser should be used by the system.
|
52
|
+
# Returns nil if none is found.
|
49
53
|
def language_guesser
|
50
54
|
@@language_guesser||=('mguesser -n1' unless IO.popen("which mguesser"){|i| i.read}.empty?)
|
51
55
|
end
|
@@ -105,8 +109,12 @@ class PlainTextExtractor
|
|
105
109
|
# and if probability score is higher than 90%.
|
106
110
|
def extract_content_and_language
|
107
111
|
content=extract_content
|
108
|
-
|
109
|
-
|
112
|
+
return [content, nil] unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
|
113
|
+
Picolena::UseLanguageRecognition,
|
114
|
+
# Is a language guesser already installed?
|
115
|
+
PlainTextExtractor.language_guesser,
|
116
|
+
# Language recognition is too unreliable for small files.
|
117
|
+
content.size > 500].all?
|
110
118
|
language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
|
111
119
|
lang_guesser.write content
|
112
120
|
lang_guesser.close_write
|
@@ -34,9 +34,25 @@ module Enumerable
|
|
34
34
|
end
|
35
35
|
|
36
36
|
class Array
|
37
|
-
|
38
|
-
|
39
|
-
|
37
|
+
# Returns a partition of n arrays.
|
38
|
+
# Transposition is used to avoid getting arrays that are too different.
|
39
|
+
# >> (0..17).to_a.in_transposed_slices(5)
|
40
|
+
# => [[0, 5, 10, 15], [1, 6, 11, 16], [2, 7, 12, 17], [3, 8, 13], [4, 9, 14]]
|
41
|
+
# while
|
42
|
+
# >> (0..17).enum_slice(5).to_a
|
43
|
+
# => [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17]]
|
44
|
+
#
|
45
|
+
# If some folders contain big files and some others contain small ones,
|
46
|
+
# every indexing thread will get some of both!
|
47
|
+
def in_transposed_slices(n)
|
48
|
+
# no need to compute anything if n==1
|
49
|
+
return [self] if n==1
|
50
|
+
# Array#transpose would raise if Array is not a square array of arrays.
|
51
|
+
i=n-self.size%n
|
52
|
+
# Adds nils so that size is a multiple of n,
|
53
|
+
# cuts array in slices of size n,
|
54
|
+
# transposes to get n slices,
|
55
|
+
# and removes added nils.
|
40
56
|
(self+[nil]*i).enum_slice(n).to_a.transpose.collect{|e| e.compact}
|
41
57
|
end
|
42
58
|
end
|
@@ -54,19 +54,37 @@ module PlainTextExtractorDSL
|
|
54
54
|
#TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
|
55
55
|
platform=case RUBY_PLATFORM
|
56
56
|
when /linux/
|
57
|
-
:
|
57
|
+
:linux
|
58
58
|
when /win/
|
59
|
-
:
|
59
|
+
:windows
|
60
|
+
when /darwin/
|
61
|
+
:mac_os
|
60
62
|
end
|
61
63
|
@command=case command_as_hash_or_string
|
62
64
|
when String
|
63
65
|
command_as_hash_or_string
|
64
66
|
when Hash
|
65
|
-
#
|
66
|
-
|
67
|
+
# Allows to write
|
68
|
+
# with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
|
69
|
+
# "some other command" => :on_windows
|
70
|
+
#
|
71
|
+
# On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
|
72
|
+
# on windows, it returns "some other command"
|
73
|
+
#
|
74
|
+
# If commands for linux & mac os were different :
|
75
|
+
# with "some command" => :on_linux,
|
76
|
+
# "another command" => :on_mac_os,
|
77
|
+
# "yet another command" => :on_windows
|
78
|
+
#
|
79
|
+
#TODO: Make it clearer and more robust.
|
80
|
+
#NOTE: What to do when no command is defined for a given platform?
|
81
|
+
command_as_hash_or_string.invert.find{|platforms,command|
|
82
|
+
platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(platform)
|
83
|
+
}.last.dup
|
67
84
|
else
|
68
85
|
block || raise("No command defined for this extractor: #{description}")
|
69
86
|
end
|
70
|
-
|
87
|
+
# TODO, replace it with Open3 or something.
|
88
|
+
@command<<' 2>/dev/null' if (@command.is_a?(String) && platform.to_s=~/(linux|mac_os)/ && !@command.include?('|'))
|
71
89
|
end
|
72
90
|
end
|
@@ -8,6 +8,7 @@ PlainTextExtractor.new {
|
|
8
8
|
every :pdf
|
9
9
|
as "application/pdf"
|
10
10
|
aka "Adobe Portable Document Format"
|
11
|
-
with "pdftotext -enc UTF-8 SOURCE -" => :
|
11
|
+
with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
|
12
|
+
"some other command" => :on_windows
|
12
13
|
which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
|
13
14
|
}
|
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :xls
|
5
5
|
as "application/excel"
|
6
6
|
aka "Microsoft Office Excel document"
|
7
|
-
with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :
|
7
|
+
with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
8
9
|
which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
|
9
10
|
}
|
10
11
|
|
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :ppt, :pps
|
5
5
|
as "application/powerpoint"
|
6
6
|
aka "Microsoft Office Powerpoint document"
|
7
|
-
with "catppt SOURCE" => :
|
7
|
+
with "catppt SOURCE" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
8
9
|
which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
|
9
10
|
#FIXME: it seems that catppt cannot open .pps files.
|
10
11
|
#or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'
|
@@ -8,6 +8,7 @@ PlainTextExtractor.new {
|
|
8
8
|
every :rtf
|
9
9
|
as "application/rtf"
|
10
10
|
aka "Microsoft Rich Text Format"
|
11
|
-
with "unrtf SOURCE -t text" => :
|
11
|
+
with "unrtf SOURCE -t text" => :on_linux_and_mac_os,
|
12
|
+
"some other command" => :on_windows
|
12
13
|
which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
|
13
14
|
}
|
@@ -4,7 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :doc, :dot
|
5
5
|
as "application/msword"
|
6
6
|
aka "Microsoft Office Word document"
|
7
|
-
with "antiword SOURCE" => :
|
7
|
+
with "antiword SOURCE" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
8
9
|
which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
|
9
10
|
or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
|
10
11
|
}
|
@@ -1,12 +1,17 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../spec_helper'
|
2
2
|
|
3
3
|
describe ApplicationController do
|
4
|
-
it "should give 403 when denying access" do
|
4
|
+
it "should give 403 status when denying access" do
|
5
5
|
get 'access_denied'
|
6
6
|
response.headers["Status"].should == "403 Forbidden"
|
7
7
|
end
|
8
8
|
|
9
|
-
it "should
|
9
|
+
it "should render 'Access denied' text when denying access" do
|
10
|
+
get 'access_denied'
|
11
|
+
response.body.should == 'Access denied'
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should flash a warning if given a wrong request" do
|
10
15
|
get 'unknown_request'
|
11
16
|
response.should be_redirect
|
12
17
|
response.should redirect_to(documents_url)
|
@@ -90,6 +90,7 @@ describe Finder do
|
|
90
90
|
end
|
91
91
|
|
92
92
|
it "should find documents according to their modification date" do
|
93
|
+
#FIXME: Specs don't pass in /tmp folder. They do pass in any other directory, though. Why???
|
93
94
|
Finder.new("date:<1982").matching_documents.should be_empty
|
94
95
|
Finder.new("19831209*").matching_document.basename.should == "office2003-word-template"
|
95
96
|
Finder.new("date:<1983").matching_document.filename.should == "basic.pdf"
|
@@ -5,7 +5,7 @@ describe "PlainTextExtractors" do
|
|
5
5
|
IndexReader.ensure_existence
|
6
6
|
end
|
7
7
|
|
8
|
-
PlainTextExtractor.each{|extractor|
|
8
|
+
PlainTextExtractor.all.each{|extractor|
|
9
9
|
extractor.exts.each{|ext|
|
10
10
|
should_extract= "should be able to extract content from #{extractor.description} (.#{ext})"
|
11
11
|
content_and_file_examples_for_this_ext=extractor.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext}
|
data/lib/picolena/version.rb
CHANGED
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.3</a>
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.4</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
data.tar.gz.sig
CHANGED
@@ -1,3 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
Y��E(��|�dX�kU���2.��m�Xd/�L�!�$�?��p�Ī'(
|
1
|
+
.��@�:���N6JQp�9V�"QT�k�7~4*�D��w��u���%v��[��r�Y���hB�t:Cv=�,8ڽ��c���;I��V[$y�nj�ϓέN�3��x+��yC�Q^ہ�C(�L)�O7�-��2ZV�L�]���i~��JK"8F�|��:�eT��Vp��ߋU��]��
|
2
|
+
��V���[;#̧KM���$�;=X�~�>���� wYI7��3ksv��A߶� ��0�GZTi7$�����>@
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-04-20 00:00:00 +02:00
|
33
|
+
date: 2008-04-23 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
metadata.gz.sig
CHANGED
Binary file
|