picolena 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +14 -0
- data/Manifest.txt +28 -8
- data/config/files_to_clean +1 -0
- data/config/requirements.rb +1 -1
- data/lib/picolena/config/basic.rb +2 -1
- data/lib/picolena/config/icons_and_filetypes.yml +5 -0
- data/lib/picolena/picolena_generator.rb +3 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +4 -4
- data/lib/picolena/templates/app/models/document.rb +27 -4
- data/lib/picolena/templates/app/models/indexer.rb +6 -2
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +27 -13
- data/lib/picolena/templates/app/models/query.rb +2 -2
- data/lib/picolena/templates/app/views/documents/_document.html.haml +1 -1
- data/lib/picolena/templates/config/environments/development.rb +2 -0
- data/lib/picolena/templates/config/initializers/001_load_ferret.rb +17 -0
- data/lib/picolena/templates/config/initializers/{001_load_custom_config.rb → 002_load_custom_config.rb} +1 -2
- data/lib/picolena/templates/config/initializers/{002_load_indexed_dirs.rb → 003_load_indexed_dirs.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{003_load_white_list_IPs.rb → 004_load_white_list_IPs.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{004_load_plain_text_extractors.rb → 005_load_plain_text_extractors.rb} +1 -1
- data/lib/picolena/templates/config/initializers/{005_load_custom_title_and_names_and_links.rb → 006_load_custom_title_and_names_and_links.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{006_load_icons.rb → 007_load_icons.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{007_load_performance_tweaks.rb → 008_load_performance_tweaks.rb} +0 -0
- data/lib/picolena/templates/lib/core_exts.rb +52 -0
- data/lib/picolena/templates/lib/development_helpers.rb +35 -0
- data/lib/picolena/templates/lib/plain_text_extractor_dsl.rb +128 -0
- data/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/adobe.photoshop.rb +12 -0
- data/lib/picolena/templates/lib/plain_text_extractors/html.rb +1 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb +3 -3
- data/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/pictures.rb +15 -4
- data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb +9 -2
- data/lib/picolena/templates/lib/plain_text_extractors/rar.rb +18 -0
- data/lib/picolena/templates/lib/plain_text_extractors/videos.rb +13 -0
- data/lib/picolena/templates/lib/plain_text_extractors/zip.rb +17 -0
- data/lib/picolena/templates/lib/tasks/extract.rake +16 -0
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/public/images/thumbnails/NOTE +2 -0
- data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +8 -0
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +12 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +6 -4
- data/lib/picolena/templates/spec/models/document_spec.rb +24 -4
- data/lib/picolena/templates/spec/models/finder_spec.rb +18 -11
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +25 -8
- data/lib/picolena/templates/spec/models/query_spec.rb +4 -5
- data/lib/picolena/templates/spec/spec_helper.rb +9 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/archives/dumb_file.rar +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/archives/some_test_files.zip +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/basic/fake_thumbnailer +14 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/badminton.avi +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/caution.tif +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/cygnus.jpeg +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/diceface.eps +79 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/glass.png +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/gnu.bmp +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/picolena.psd +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/rails_logo_remix.gif +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/warning.tiff +0 -0
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- metadata +31 -32
- data.tar.gz.sig +0 -0
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +0 -88
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
@@ -1,3 +1,17 @@
|
|
1
|
+
== 0.2.2 2009-02-13
|
2
|
+
|
3
|
+
* 3 major enhancements :
|
4
|
+
* Thumbnails created for pictures & videos
|
5
|
+
* Support for .zip & .rar archives
|
6
|
+
* Alias_path is now LetterTokenized
|
7
|
+
|
8
|
+
* 2 minor enhancements:
|
9
|
+
* More specs
|
10
|
+
* Some aesthetic changes
|
11
|
+
|
12
|
+
* 1 bug fix :
|
13
|
+
* Displaying filetypes without any icon would raise an Exception
|
14
|
+
|
1
15
|
== 0.2.0 2009-02-02
|
2
16
|
|
3
17
|
* 1 major enhancement :
|
data/Manifest.txt
CHANGED
@@ -37,22 +37,25 @@ lib/picolena/templates/config/boot.rb
|
|
37
37
|
lib/picolena/templates/config/environments/development.rb
|
38
38
|
lib/picolena/templates/config/environments/production.rb
|
39
39
|
lib/picolena/templates/config/environments/test.rb
|
40
|
-
lib/picolena/templates/config/initializers/
|
41
|
-
lib/picolena/templates/config/initializers/
|
42
|
-
lib/picolena/templates/config/initializers/
|
43
|
-
lib/picolena/templates/config/initializers/
|
44
|
-
lib/picolena/templates/config/initializers/
|
45
|
-
lib/picolena/templates/config/initializers/
|
46
|
-
lib/picolena/templates/config/initializers/
|
40
|
+
lib/picolena/templates/config/initializers/001_load_ferret.rb
|
41
|
+
lib/picolena/templates/config/initializers/002_load_custom_config.rb
|
42
|
+
lib/picolena/templates/config/initializers/003_load_indexed_dirs.rb
|
43
|
+
lib/picolena/templates/config/initializers/004_load_white_list_IPs.rb
|
44
|
+
lib/picolena/templates/config/initializers/005_load_plain_text_extractors.rb
|
45
|
+
lib/picolena/templates/config/initializers/006_load_custom_title_and_names_and_links.rb
|
46
|
+
lib/picolena/templates/config/initializers/007_load_icons.rb
|
47
|
+
lib/picolena/templates/config/initializers/008_load_performance_tweaks.rb
|
47
48
|
lib/picolena/templates/config/routes.rb
|
48
49
|
lib/picolena/templates/lang/ui/de.yml
|
49
50
|
lib/picolena/templates/lang/ui/en.yml
|
50
51
|
lib/picolena/templates/lang/ui/es.yml
|
51
52
|
lib/picolena/templates/lang/ui/fr.yml
|
52
53
|
lib/picolena/templates/lib/core_exts.rb
|
54
|
+
lib/picolena/templates/lib/development_helpers.rb
|
53
55
|
lib/picolena/templates/lib/indexer_logger.rb
|
54
|
-
lib/picolena/templates/lib/
|
56
|
+
lib/picolena/templates/lib/plain_text_extractor_dsl.rb
|
55
57
|
lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
58
|
+
lib/picolena/templates/lib/plain_text_extractors/adobe.photoshop.rb
|
56
59
|
lib/picolena/templates/lib/plain_text_extractors/html.rb
|
57
60
|
lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
|
58
61
|
lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
|
@@ -63,7 +66,11 @@ lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
|
|
63
66
|
lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
|
64
67
|
lib/picolena/templates/lib/plain_text_extractors/pictures.rb
|
65
68
|
lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
|
69
|
+
lib/picolena/templates/lib/plain_text_extractors/rar.rb
|
70
|
+
lib/picolena/templates/lib/plain_text_extractors/videos.rb
|
71
|
+
lib/picolena/templates/lib/plain_text_extractors/zip.rb
|
66
72
|
lib/picolena/templates/lib/tasks/annotations.rake
|
73
|
+
lib/picolena/templates/lib/tasks/extract.rake
|
67
74
|
lib/picolena/templates/lib/tasks/index.rake
|
68
75
|
lib/picolena/templates/lib/tasks/install_dependencies.rake
|
69
76
|
lib/picolena/templates/lib/tasks/log.rake
|
@@ -112,6 +119,7 @@ lib/picolena/templates/public/images/icons/txt.png
|
|
112
119
|
lib/picolena/templates/public/images/icons/video.png
|
113
120
|
lib/picolena/templates/public/images/icons/xls.png
|
114
121
|
lib/picolena/templates/public/images/main_img.jpg
|
122
|
+
lib/picolena/templates/public/images/thumbnails/NOTE
|
115
123
|
lib/picolena/templates/public/images/zafh_net.png
|
116
124
|
lib/picolena/templates/public/robots.txt
|
117
125
|
lib/picolena/templates/public/stylesheets/style.css
|
@@ -147,12 +155,15 @@ lib/picolena/templates/spec/rcov.opts
|
|
147
155
|
lib/picolena/templates/spec/spec.opts
|
148
156
|
lib/picolena/templates/spec/spec_helper.rb
|
149
157
|
lib/picolena/templates/spec/test_dirs/indexed/README
|
158
|
+
lib/picolena/templates/spec/test_dirs/indexed/archives/dumb_file.rar
|
159
|
+
lib/picolena/templates/spec/test_dirs/indexed/archives/some_test_files.zip
|
150
160
|
lib/picolena/templates/spec/test_dirs/indexed/basic/another_plain.text
|
151
161
|
lib/picolena/templates/spec/test_dirs/indexed/basic/basic.odt
|
152
162
|
lib/picolena/templates/spec/test_dirs/indexed/basic/basic.pdf
|
153
163
|
lib/picolena/templates/spec/test_dirs/indexed/basic/basic.tex
|
154
164
|
lib/picolena/templates/spec/test_dirs/indexed/basic/crossed.text
|
155
165
|
lib/picolena/templates/spec/test_dirs/indexed/basic/crossed.txt
|
166
|
+
lib/picolena/templates/spec/test_dirs/indexed/basic/fake_thumbnailer
|
156
167
|
lib/picolena/templates/spec/test_dirs/indexed/basic/hello.rb
|
157
168
|
lib/picolena/templates/spec/test_dirs/indexed/basic/myfirstjavaprog.java
|
158
169
|
lib/picolena/templates/spec/test_dirs/indexed/basic/one_page.ppt
|
@@ -173,7 +184,16 @@ lib/picolena/templates/spec/test_dirs/indexed/lang/lorca
|
|
173
184
|
lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare
|
174
185
|
lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf
|
175
186
|
lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc
|
187
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/badminton.avi
|
188
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/caution.tif
|
176
189
|
lib/picolena/templates/spec/test_dirs/indexed/media/crow.jpg
|
190
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/cygnus.jpeg
|
191
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/diceface.eps
|
192
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/glass.png
|
193
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/gnu.bmp
|
194
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/picolena.psd
|
195
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/rails_logo_remix.gif
|
196
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/warning.tiff
|
177
197
|
lib/picolena/templates/spec/test_dirs/indexed/others/'weird'filename.txt
|
178
198
|
lib/picolena/templates/spec/test_dirs/indexed/others/7.html
|
179
199
|
lib/picolena/templates/spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION
|
data/config/files_to_clean
CHANGED
data/config/requirements.rb
CHANGED
@@ -46,5 +46,6 @@ module Picolena
|
|
46
46
|
# PerFieldAnalyzer is used to prevent queries like "language:it" to be broken by StopFilter.
|
47
47
|
per_field_analyzer=Ferret::Analysis::PerFieldAnalyzer.new(Ferret::Analysis::StandardAnalyzer.new)
|
48
48
|
per_field_analyzer[:language]=Ferret::Analysis::WhiteSpaceAnalyzer.new
|
49
|
+
per_field_analyzer[:alias_path]=Ferret::Analysis::LetterAnalyzerWithStopFilter.new
|
49
50
|
Analyzer=per_field_analyzer
|
50
|
-
end
|
51
|
+
end
|
@@ -87,7 +87,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
|
|
87
87
|
m.rake 'spec' unless options[:no_spec]
|
88
88
|
|
89
89
|
# Cleaning up temp folder if --spec-only
|
90
|
-
m.clean if
|
90
|
+
m.clean if options[:spec_only]
|
91
91
|
end
|
92
92
|
end
|
93
93
|
|
@@ -150,6 +150,7 @@ EOS
|
|
150
150
|
public/images
|
151
151
|
public/images/icons
|
152
152
|
public/images/flags
|
153
|
+
public/images/thumbnails
|
153
154
|
public/javascripts
|
154
155
|
public/stylesheets
|
155
156
|
spec
|
@@ -160,6 +161,7 @@ EOS
|
|
160
161
|
spec/test_dirs
|
161
162
|
spec/test_dirs/empty_folder
|
162
163
|
spec/test_dirs/indexed
|
164
|
+
spec/test_dirs/indexed/archives
|
163
165
|
spec/test_dirs/indexed/basic
|
164
166
|
spec/test_dirs/indexed/different_encodings
|
165
167
|
spec/test_dirs/indexed/just_one_doc
|
@@ -46,13 +46,13 @@ module DocumentsHelper
|
|
46
46
|
|
47
47
|
# Returns icon and filename for any given document.
|
48
48
|
def icon_and_filename_for(document)
|
49
|
-
[icon_for(document
|
49
|
+
[icon_for(document),document.filename].join(" ")
|
50
50
|
end
|
51
51
|
|
52
52
|
# Returns the location (if available) of the filetype icon.
|
53
|
-
def icon_for(
|
54
|
-
|
55
|
-
image_tag(
|
53
|
+
def icon_for(document)
|
54
|
+
path=document.icon_path
|
55
|
+
image_tag(document.icon_path) if path
|
56
56
|
end
|
57
57
|
|
58
58
|
# Returns a link to a backup search engine that could maybe find more results for the same query.
|
@@ -87,10 +87,11 @@ class Document
|
|
87
87
|
|
88
88
|
# Returns cached content with matching terms between '<<' '>>'.
|
89
89
|
def highlighted_cache(raw_query)
|
90
|
-
Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
|
90
|
+
excerpts=Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
|
91
91
|
:field => :content, :excerpt_length => :all,
|
92
92
|
:pre_tag => "<<", :post_tag => ">>"
|
93
|
-
|
93
|
+
)
|
94
|
+
excerpts.is_an?(Array) ? excerpts.first : ""
|
94
95
|
end
|
95
96
|
|
96
97
|
# Returns the last modification date before the document got indexed.
|
@@ -127,18 +128,40 @@ class Document
|
|
127
128
|
|
128
129
|
# Indexing fields that are shared between every document.
|
129
130
|
def self.default_fields_for(complete_path)
|
131
|
+
doc=Document.new(complete_path)
|
130
132
|
{
|
131
133
|
:complete_path => complete_path,
|
132
134
|
:probably_unique_id => complete_path.base26_hash,
|
135
|
+
:alias_path => doc.alias_path,
|
133
136
|
:filename => File.basename(complete_path),
|
134
137
|
:basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '),
|
135
138
|
:filetype => File.extname(complete_path),
|
136
139
|
:modified => File.mtime(complete_path).strftime("%Y%m%d%H%M%S")
|
137
140
|
}
|
138
141
|
end
|
139
|
-
|
140
|
-
|
142
|
+
|
143
|
+
# Returns thumbnail if available, mime icon otherwise
|
144
|
+
def icon_path
|
145
|
+
if File.exists?(thumbnail_path) then
|
146
|
+
thumbnail_path(:public_dir)
|
147
|
+
else
|
148
|
+
icon_symbol=Picolena::FiletypeToIconSymbol[ext_as_sym]
|
149
|
+
"icons/#{icon_symbol}.png" if icon_symbol
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
# Was at least one letter extracted from the document?
|
154
|
+
# This boolean is used in views to know if a link should be
|
155
|
+
# displayed to show the content
|
156
|
+
def has_content?
|
157
|
+
cached =~ /\w/
|
158
|
+
end
|
159
|
+
|
141
160
|
private
|
161
|
+
|
162
|
+
def thumbnail_path(public_dir=false)
|
163
|
+
File.thumbnail_path(complete_path,public_dir)
|
164
|
+
end
|
142
165
|
|
143
166
|
# FIXME: Is there a way to easily retrieve doc_id for a given document?
|
144
167
|
# Better yet, fix Index#highlight to accept :probably_unique_id and stop using :doc_id.
|
@@ -67,7 +67,8 @@ class Indexer
|
|
67
67
|
def add_or_update_file(complete_path)
|
68
68
|
document = Document.default_fields_for(complete_path)
|
69
69
|
begin
|
70
|
-
|
70
|
+
PlainTextExtractor.extract_thumbnail_from(complete_path)
|
71
|
+
document.merge! PlainTextExtractor.extract_information_from(complete_path)
|
71
72
|
raise "empty document #{complete_path}" if document[:content].strip.empty?
|
72
73
|
logger.add_document document
|
73
74
|
rescue => e
|
@@ -177,6 +178,8 @@ class Indexer
|
|
177
178
|
end
|
178
179
|
|
179
180
|
# Copied from Ferret book, By David Balmain
|
181
|
+
# FIXME : Find an alternative that doesn't need any more dependency.
|
182
|
+
# NOTE: Not supported on windows.
|
180
183
|
def index_time_dbm_file
|
181
184
|
@@dbm_file ||= DBM.open(File.join(Picolena::MetaIndexPath, 'added_at'))
|
182
185
|
end
|
@@ -201,13 +204,14 @@ class Indexer
|
|
201
204
|
|
202
205
|
def default_field_infos
|
203
206
|
returning Ferret::Index::FieldInfos.new do |field_infos|
|
207
|
+
field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
|
204
208
|
field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized)
|
205
209
|
field_infos.add_field(:content, :store => :yes, :index => :yes)
|
210
|
+
field_infos.add_field(:alias_path, :store => :no, :index => :yes, :boost => 0.5)
|
206
211
|
field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
|
207
212
|
field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5)
|
208
213
|
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
209
214
|
field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
|
210
|
-
field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
|
211
215
|
field_infos.add_field(:language, :store => :yes, :index => :untokenized)
|
212
216
|
end
|
213
217
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'plain_text_extractor_dsl'
|
2
2
|
|
3
3
|
# PlainTextExtractor is the class responsible for extracting plain text contents from
|
4
4
|
# different documents filetypes (.doc, .html, .pdf, .od?), as defined in
|
@@ -47,27 +47,29 @@ class PlainTextExtractor
|
|
47
47
|
end
|
48
48
|
|
49
49
|
# Launches extractor on given file and outputs plain text result and language (if found)
|
50
|
-
def
|
51
|
-
find_by_filename(source).
|
50
|
+
def extract_information_from(source)
|
51
|
+
find_by_filename(source).extract_information
|
52
|
+
end
|
53
|
+
|
54
|
+
# Tries to extract a thumbnail from source.
|
55
|
+
# Doesn't do anything if thumbnail_command isn't defined for the corresponding filetype.
|
56
|
+
def extract_thumbnail_from(source)
|
57
|
+
find_by_filename(source).extract_thumbnail
|
52
58
|
end
|
53
59
|
|
54
60
|
# Returns which language guesser should be used by the system.
|
55
61
|
# Returns nil if none is found.
|
56
62
|
def language_guesser
|
57
|
-
@@language_guesser||=('mguesser -n1'
|
63
|
+
@@language_guesser||=('mguesser -n1' if 'mguesser'.installed?)
|
58
64
|
end
|
59
65
|
end
|
60
66
|
|
61
67
|
attr_accessor :source
|
62
68
|
|
63
|
-
# Parses
|
69
|
+
# Parses commands in order to know which programs are needed.
|
64
70
|
# rspec will then check that every dependency is installed on the system
|
65
71
|
def dependencies
|
66
|
-
|
67
|
-
command.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first}
|
68
|
-
else
|
69
|
-
@dependencies
|
70
|
-
end
|
72
|
+
[@dependencies, command.dependencies, thumbnail_command.dependencies].flatten
|
71
73
|
end
|
72
74
|
|
73
75
|
## Conversion part
|
@@ -79,11 +81,11 @@ class PlainTextExtractor
|
|
79
81
|
# If command includes 'DESTINATION' keyword,
|
80
82
|
# launches the command and returns the content of
|
81
83
|
# DESTINATION file.
|
82
|
-
|
84
|
+
silently_execute(specific_command)
|
83
85
|
File.read_and_remove(destination)
|
84
86
|
else
|
85
87
|
# Otherwise, launches the command and returns STDOUT.
|
86
|
-
|
88
|
+
silently_execute(specific_command)
|
87
89
|
end
|
88
90
|
else
|
89
91
|
# command is a Block.
|
@@ -97,14 +99,16 @@ class PlainTextExtractor
|
|
97
99
|
# using mguesser to guess used language.
|
98
100
|
# This method only returns probable language if the content is bigger than 500 chars
|
99
101
|
# and if probability score is higher than 90%.
|
100
|
-
def
|
102
|
+
def extract_information
|
101
103
|
content=extract_content
|
104
|
+
|
102
105
|
return {:content => content} unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
|
103
106
|
Picolena::UseLanguageRecognition,
|
104
107
|
# Is a language guesser already installed?
|
105
108
|
PlainTextExtractor.language_guesser,
|
106
109
|
# Language recognition is too unreliable for small files.
|
107
110
|
content.size > 500].all?
|
111
|
+
|
108
112
|
language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
|
109
113
|
lang_guesser.write content
|
110
114
|
lang_guesser.close_write
|
@@ -115,9 +119,14 @@ class PlainTextExtractor
|
|
115
119
|
lang unless score<0.9
|
116
120
|
end
|
117
121
|
}
|
122
|
+
|
118
123
|
{:content => content, :language => language}
|
119
124
|
end
|
120
125
|
|
126
|
+
def extract_thumbnail
|
127
|
+
silently_execute(specific_thumbnail_command) if thumbnail_command
|
128
|
+
end
|
129
|
+
|
121
130
|
private
|
122
131
|
|
123
132
|
# destination method can be used by some conversion command that cannot output to stdout (example?)
|
@@ -131,4 +140,9 @@ class PlainTextExtractor
|
|
131
140
|
def specific_command
|
132
141
|
command.sub('SOURCE','"'<<source<<'"').sub('DESTINATION','"'<<destination<<'"')
|
133
142
|
end
|
143
|
+
|
144
|
+
# Replaces generic command with specific source and thumbnail (if specified) files
|
145
|
+
def specific_thumbnail_command
|
146
|
+
thumbnail_command.sub('SOURCE','"'<<source<<'"').sub('THUMBNAIL','"'<<File.thumbnail_path(source)<<'"')
|
147
|
+
end
|
134
148
|
end
|
@@ -32,7 +32,7 @@ class Query
|
|
32
32
|
|
33
33
|
# Instantiates a QueryParser once, and keeps it in cache.
|
34
34
|
def parser
|
35
|
-
@@parser ||= Ferret::QueryParser.new(:fields => [:content, :filename, :basename, :filetype, :modified], :or_default => false, :analyzer=>Picolena::Analyzer)
|
35
|
+
@@parser ||= Ferret::QueryParser.new(:fields => [:content, :filename, :basename, :alias_path, :filetype, :modified], :or_default => false, :analyzer=>Picolena::Analyzer)
|
36
36
|
end
|
37
37
|
end
|
38
|
-
end
|
38
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'ferret'
|
2
|
+
module Ferret
|
3
|
+
module Analysis
|
4
|
+
# Used for alias_path queries
|
5
|
+
class LetterAnalyzerWithStopFilter
|
6
|
+
def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
|
7
|
+
@lower = lower
|
8
|
+
@stop_words = stop_words
|
9
|
+
end
|
10
|
+
|
11
|
+
def token_stream(field, str)
|
12
|
+
ts = LetterTokenizer.new(str, @lower)
|
13
|
+
StopFilter.new(ts, @stop_words)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
File without changes
|
File without changes
|