picolena 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +14 -0
- data/Manifest.txt +28 -8
- data/config/files_to_clean +1 -0
- data/config/requirements.rb +1 -1
- data/lib/picolena/config/basic.rb +2 -1
- data/lib/picolena/config/icons_and_filetypes.yml +5 -0
- data/lib/picolena/picolena_generator.rb +3 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +4 -4
- data/lib/picolena/templates/app/models/document.rb +27 -4
- data/lib/picolena/templates/app/models/indexer.rb +6 -2
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +27 -13
- data/lib/picolena/templates/app/models/query.rb +2 -2
- data/lib/picolena/templates/app/views/documents/_document.html.haml +1 -1
- data/lib/picolena/templates/config/environments/development.rb +2 -0
- data/lib/picolena/templates/config/initializers/001_load_ferret.rb +17 -0
- data/lib/picolena/templates/config/initializers/{001_load_custom_config.rb → 002_load_custom_config.rb} +1 -2
- data/lib/picolena/templates/config/initializers/{002_load_indexed_dirs.rb → 003_load_indexed_dirs.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{003_load_white_list_IPs.rb → 004_load_white_list_IPs.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{004_load_plain_text_extractors.rb → 005_load_plain_text_extractors.rb} +1 -1
- data/lib/picolena/templates/config/initializers/{005_load_custom_title_and_names_and_links.rb → 006_load_custom_title_and_names_and_links.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{006_load_icons.rb → 007_load_icons.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{007_load_performance_tweaks.rb → 008_load_performance_tweaks.rb} +0 -0
- data/lib/picolena/templates/lib/core_exts.rb +52 -0
- data/lib/picolena/templates/lib/development_helpers.rb +35 -0
- data/lib/picolena/templates/lib/plain_text_extractor_dsl.rb +128 -0
- data/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/adobe.photoshop.rb +12 -0
- data/lib/picolena/templates/lib/plain_text_extractors/html.rb +1 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb +3 -3
- data/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/pictures.rb +15 -4
- data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb +9 -2
- data/lib/picolena/templates/lib/plain_text_extractors/rar.rb +18 -0
- data/lib/picolena/templates/lib/plain_text_extractors/videos.rb +13 -0
- data/lib/picolena/templates/lib/plain_text_extractors/zip.rb +17 -0
- data/lib/picolena/templates/lib/tasks/extract.rake +16 -0
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/public/images/thumbnails/NOTE +2 -0
- data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +8 -0
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +12 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +6 -4
- data/lib/picolena/templates/spec/models/document_spec.rb +24 -4
- data/lib/picolena/templates/spec/models/finder_spec.rb +18 -11
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +25 -8
- data/lib/picolena/templates/spec/models/query_spec.rb +4 -5
- data/lib/picolena/templates/spec/spec_helper.rb +9 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/archives/dumb_file.rar +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/archives/some_test_files.zip +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/basic/fake_thumbnailer +14 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/badminton.avi +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/caution.tif +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/cygnus.jpeg +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/diceface.eps +79 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/glass.png +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/gnu.bmp +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/picolena.psd +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/rails_logo_remix.gif +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/warning.tiff +0 -0
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- metadata +31 -32
- data.tar.gz.sig +0 -0
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +0 -88
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
|
+
== 0.2.2 2009-02-13
|
|
2
|
+
|
|
3
|
+
* 3 major enhancements :
|
|
4
|
+
* Thumbnails created for pictures & videos
|
|
5
|
+
* Support for .zip & .rar archives
|
|
6
|
+
* Alias_path is now LetterTokenized
|
|
7
|
+
|
|
8
|
+
* 2 minor enhancements:
|
|
9
|
+
* More specs
|
|
10
|
+
* Some aesthetical changes
|
|
11
|
+
|
|
12
|
+
* 1 bug fix :
|
|
13
|
+
* Displaying filetypes without any icon would raise an Exception
|
|
14
|
+
|
|
1
15
|
== 0.2.0 2009-02-02
|
|
2
16
|
|
|
3
17
|
* 1 major enhancement :
|
data/Manifest.txt
CHANGED
|
@@ -37,22 +37,25 @@ lib/picolena/templates/config/boot.rb
|
|
|
37
37
|
lib/picolena/templates/config/environments/development.rb
|
|
38
38
|
lib/picolena/templates/config/environments/production.rb
|
|
39
39
|
lib/picolena/templates/config/environments/test.rb
|
|
40
|
-
lib/picolena/templates/config/initializers/
|
|
41
|
-
lib/picolena/templates/config/initializers/
|
|
42
|
-
lib/picolena/templates/config/initializers/
|
|
43
|
-
lib/picolena/templates/config/initializers/
|
|
44
|
-
lib/picolena/templates/config/initializers/
|
|
45
|
-
lib/picolena/templates/config/initializers/
|
|
46
|
-
lib/picolena/templates/config/initializers/
|
|
40
|
+
lib/picolena/templates/config/initializers/001_load_ferret.rb
|
|
41
|
+
lib/picolena/templates/config/initializers/002_load_custom_config.rb
|
|
42
|
+
lib/picolena/templates/config/initializers/003_load_indexed_dirs.rb
|
|
43
|
+
lib/picolena/templates/config/initializers/004_load_white_list_IPs.rb
|
|
44
|
+
lib/picolena/templates/config/initializers/005_load_plain_text_extractors.rb
|
|
45
|
+
lib/picolena/templates/config/initializers/006_load_custom_title_and_names_and_links.rb
|
|
46
|
+
lib/picolena/templates/config/initializers/007_load_icons.rb
|
|
47
|
+
lib/picolena/templates/config/initializers/008_load_performance_tweaks.rb
|
|
47
48
|
lib/picolena/templates/config/routes.rb
|
|
48
49
|
lib/picolena/templates/lang/ui/de.yml
|
|
49
50
|
lib/picolena/templates/lang/ui/en.yml
|
|
50
51
|
lib/picolena/templates/lang/ui/es.yml
|
|
51
52
|
lib/picolena/templates/lang/ui/fr.yml
|
|
52
53
|
lib/picolena/templates/lib/core_exts.rb
|
|
54
|
+
lib/picolena/templates/lib/development_helpers.rb
|
|
53
55
|
lib/picolena/templates/lib/indexer_logger.rb
|
|
54
|
-
lib/picolena/templates/lib/
|
|
56
|
+
lib/picolena/templates/lib/plain_text_extractor_dsl.rb
|
|
55
57
|
lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
|
58
|
+
lib/picolena/templates/lib/plain_text_extractors/adobe.photoshop.rb
|
|
56
59
|
lib/picolena/templates/lib/plain_text_extractors/html.rb
|
|
57
60
|
lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
|
|
58
61
|
lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
|
|
@@ -63,7 +66,11 @@ lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
|
|
|
63
66
|
lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
|
|
64
67
|
lib/picolena/templates/lib/plain_text_extractors/pictures.rb
|
|
65
68
|
lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
|
|
69
|
+
lib/picolena/templates/lib/plain_text_extractors/rar.rb
|
|
70
|
+
lib/picolena/templates/lib/plain_text_extractors/videos.rb
|
|
71
|
+
lib/picolena/templates/lib/plain_text_extractors/zip.rb
|
|
66
72
|
lib/picolena/templates/lib/tasks/annotations.rake
|
|
73
|
+
lib/picolena/templates/lib/tasks/extract.rake
|
|
67
74
|
lib/picolena/templates/lib/tasks/index.rake
|
|
68
75
|
lib/picolena/templates/lib/tasks/install_dependencies.rake
|
|
69
76
|
lib/picolena/templates/lib/tasks/log.rake
|
|
@@ -112,6 +119,7 @@ lib/picolena/templates/public/images/icons/txt.png
|
|
|
112
119
|
lib/picolena/templates/public/images/icons/video.png
|
|
113
120
|
lib/picolena/templates/public/images/icons/xls.png
|
|
114
121
|
lib/picolena/templates/public/images/main_img.jpg
|
|
122
|
+
lib/picolena/templates/public/images/thumbnails/NOTE
|
|
115
123
|
lib/picolena/templates/public/images/zafh_net.png
|
|
116
124
|
lib/picolena/templates/public/robots.txt
|
|
117
125
|
lib/picolena/templates/public/stylesheets/style.css
|
|
@@ -147,12 +155,15 @@ lib/picolena/templates/spec/rcov.opts
|
|
|
147
155
|
lib/picolena/templates/spec/spec.opts
|
|
148
156
|
lib/picolena/templates/spec/spec_helper.rb
|
|
149
157
|
lib/picolena/templates/spec/test_dirs/indexed/README
|
|
158
|
+
lib/picolena/templates/spec/test_dirs/indexed/archives/dumb_file.rar
|
|
159
|
+
lib/picolena/templates/spec/test_dirs/indexed/archives/some_test_files.zip
|
|
150
160
|
lib/picolena/templates/spec/test_dirs/indexed/basic/another_plain.text
|
|
151
161
|
lib/picolena/templates/spec/test_dirs/indexed/basic/basic.odt
|
|
152
162
|
lib/picolena/templates/spec/test_dirs/indexed/basic/basic.pdf
|
|
153
163
|
lib/picolena/templates/spec/test_dirs/indexed/basic/basic.tex
|
|
154
164
|
lib/picolena/templates/spec/test_dirs/indexed/basic/crossed.text
|
|
155
165
|
lib/picolena/templates/spec/test_dirs/indexed/basic/crossed.txt
|
|
166
|
+
lib/picolena/templates/spec/test_dirs/indexed/basic/fake_thumbnailer
|
|
156
167
|
lib/picolena/templates/spec/test_dirs/indexed/basic/hello.rb
|
|
157
168
|
lib/picolena/templates/spec/test_dirs/indexed/basic/myfirstjavaprog.java
|
|
158
169
|
lib/picolena/templates/spec/test_dirs/indexed/basic/one_page.ppt
|
|
@@ -173,7 +184,16 @@ lib/picolena/templates/spec/test_dirs/indexed/lang/lorca
|
|
|
173
184
|
lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare
|
|
174
185
|
lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf
|
|
175
186
|
lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc
|
|
187
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/badminton.avi
|
|
188
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/caution.tif
|
|
176
189
|
lib/picolena/templates/spec/test_dirs/indexed/media/crow.jpg
|
|
190
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/cygnus.jpeg
|
|
191
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/diceface.eps
|
|
192
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/glass.png
|
|
193
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/gnu.bmp
|
|
194
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/picolena.psd
|
|
195
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/rails_logo_remix.gif
|
|
196
|
+
lib/picolena/templates/spec/test_dirs/indexed/media/warning.tiff
|
|
177
197
|
lib/picolena/templates/spec/test_dirs/indexed/others/'weird'filename.txt
|
|
178
198
|
lib/picolena/templates/spec/test_dirs/indexed/others/7.html
|
|
179
199
|
lib/picolena/templates/spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION
|
data/config/files_to_clean
CHANGED
data/config/requirements.rb
CHANGED
|
@@ -46,5 +46,6 @@ module Picolena
|
|
|
46
46
|
# PerFieldAnalyzer is used to prevent queries like "language:it" to be broken by StopFilter.
|
|
47
47
|
per_field_analyzer=Ferret::Analysis::PerFieldAnalyzer.new(Ferret::Analysis::StandardAnalyzer.new)
|
|
48
48
|
per_field_analyzer[:language]=Ferret::Analysis::WhiteSpaceAnalyzer.new
|
|
49
|
+
per_field_analyzer[:alias_path]=Ferret::Analysis::LetterAnalyzerWithStopFilter.new
|
|
49
50
|
Analyzer=per_field_analyzer
|
|
50
|
-
end
|
|
51
|
+
end
|
|
@@ -87,7 +87,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
|
|
|
87
87
|
m.rake 'spec' unless options[:no_spec]
|
|
88
88
|
|
|
89
89
|
# Cleaning up temp folder if --spec-only
|
|
90
|
-
m.clean if
|
|
90
|
+
m.clean if options[:spec_only]
|
|
91
91
|
end
|
|
92
92
|
end
|
|
93
93
|
|
|
@@ -150,6 +150,7 @@ EOS
|
|
|
150
150
|
public/images
|
|
151
151
|
public/images/icons
|
|
152
152
|
public/images/flags
|
|
153
|
+
public/images/thumbnails
|
|
153
154
|
public/javascripts
|
|
154
155
|
public/stylesheets
|
|
155
156
|
spec
|
|
@@ -160,6 +161,7 @@ EOS
|
|
|
160
161
|
spec/test_dirs
|
|
161
162
|
spec/test_dirs/empty_folder
|
|
162
163
|
spec/test_dirs/indexed
|
|
164
|
+
spec/test_dirs/indexed/archives
|
|
163
165
|
spec/test_dirs/indexed/basic
|
|
164
166
|
spec/test_dirs/indexed/different_encodings
|
|
165
167
|
spec/test_dirs/indexed/just_one_doc
|
|
@@ -46,13 +46,13 @@ module DocumentsHelper
|
|
|
46
46
|
|
|
47
47
|
# Returns icon and filename for any given document.
|
|
48
48
|
def icon_and_filename_for(document)
|
|
49
|
-
[icon_for(document
|
|
49
|
+
[icon_for(document),document.filename].join(" ")
|
|
50
50
|
end
|
|
51
51
|
|
|
52
52
|
# Returns the location (if avaible) of the filetype icon.
|
|
53
|
-
def icon_for(
|
|
54
|
-
|
|
55
|
-
image_tag(
|
|
53
|
+
def icon_for(document)
|
|
54
|
+
path=document.icon_path
|
|
55
|
+
image_tag(document.icon_path) if path
|
|
56
56
|
end
|
|
57
57
|
|
|
58
58
|
# Returns a link to a backup search engine that could maybe find more results for the same query.
|
|
@@ -87,10 +87,11 @@ class Document
|
|
|
87
87
|
|
|
88
88
|
# Returns cached content with matching terms between '<<' '>>'.
|
|
89
89
|
def highlighted_cache(raw_query)
|
|
90
|
-
Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
|
|
90
|
+
excerpts=Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
|
|
91
91
|
:field => :content, :excerpt_length => :all,
|
|
92
92
|
:pre_tag => "<<", :post_tag => ">>"
|
|
93
|
-
|
|
93
|
+
)
|
|
94
|
+
excerpts.is_an?(Array) ? excerpts.first : ""
|
|
94
95
|
end
|
|
95
96
|
|
|
96
97
|
# Returns the last modification date before the document got indexed.
|
|
@@ -127,18 +128,40 @@ class Document
|
|
|
127
128
|
|
|
128
129
|
# Indexing fields that are shared between every document.
|
|
129
130
|
def self.default_fields_for(complete_path)
|
|
131
|
+
doc=Document.new(complete_path)
|
|
130
132
|
{
|
|
131
133
|
:complete_path => complete_path,
|
|
132
134
|
:probably_unique_id => complete_path.base26_hash,
|
|
135
|
+
:alias_path => doc.alias_path,
|
|
133
136
|
:filename => File.basename(complete_path),
|
|
134
137
|
:basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '),
|
|
135
138
|
:filetype => File.extname(complete_path),
|
|
136
139
|
:modified => File.mtime(complete_path).strftime("%Y%m%d%H%M%S")
|
|
137
140
|
}
|
|
138
141
|
end
|
|
139
|
-
|
|
140
|
-
|
|
142
|
+
|
|
143
|
+
# Returns thumbnail if available, mime icon otherwise
|
|
144
|
+
def icon_path
|
|
145
|
+
if File.exists?(thumbnail_path) then
|
|
146
|
+
thumbnail_path(:public_dir)
|
|
147
|
+
else
|
|
148
|
+
icon_symbol=Picolena::FiletypeToIconSymbol[ext_as_sym]
|
|
149
|
+
"icons/#{icon_symbol}.png" if icon_symbol
|
|
150
|
+
end
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
# Did at least one letter got extracted from the document?
|
|
154
|
+
# This boolean is used in views to know if a link should be
|
|
155
|
+
# displayed to show the content
|
|
156
|
+
def has_content?
|
|
157
|
+
cached =~ /\w/
|
|
158
|
+
end
|
|
159
|
+
|
|
141
160
|
private
|
|
161
|
+
|
|
162
|
+
def thumbnail_path(public_dir=false)
|
|
163
|
+
File.thumbnail_path(complete_path,public_dir)
|
|
164
|
+
end
|
|
142
165
|
|
|
143
166
|
# FIXME: Is there a way to easily retrieve doc_id for a given document?
|
|
144
167
|
# Better yet, fix Index#highlight to accept :probably_unique_id and stop using :doc_id.
|
|
@@ -67,7 +67,8 @@ class Indexer
|
|
|
67
67
|
def add_or_update_file(complete_path)
|
|
68
68
|
document = Document.default_fields_for(complete_path)
|
|
69
69
|
begin
|
|
70
|
-
|
|
70
|
+
PlainTextExtractor.extract_thumbnail_from(complete_path)
|
|
71
|
+
document.merge! PlainTextExtractor.extract_information_from(complete_path)
|
|
71
72
|
raise "empty document #{complete_path}" if document[:content].strip.empty?
|
|
72
73
|
logger.add_document document
|
|
73
74
|
rescue => e
|
|
@@ -177,6 +178,8 @@ class Indexer
|
|
|
177
178
|
end
|
|
178
179
|
|
|
179
180
|
# Copied from Ferret book, By David Balmain
|
|
181
|
+
# FIXME : Find an alternative that doesn't need any more dependency.
|
|
182
|
+
# NOTE: Not supported on windows.
|
|
180
183
|
def index_time_dbm_file
|
|
181
184
|
@@dbm_file ||= DBM.open(File.join(Picolena::MetaIndexPath, 'added_at'))
|
|
182
185
|
end
|
|
@@ -201,13 +204,14 @@ class Indexer
|
|
|
201
204
|
|
|
202
205
|
def default_field_infos
|
|
203
206
|
returning Ferret::Index::FieldInfos.new do |field_infos|
|
|
207
|
+
field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
|
|
204
208
|
field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized)
|
|
205
209
|
field_infos.add_field(:content, :store => :yes, :index => :yes)
|
|
210
|
+
field_infos.add_field(:alias_path, :store => :no, :index => :yes, :boost => 0.5)
|
|
206
211
|
field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
|
|
207
212
|
field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5)
|
|
208
213
|
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
|
209
214
|
field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
|
|
210
|
-
field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
|
|
211
215
|
field_infos.add_field(:language, :store => :yes, :index => :untokenized)
|
|
212
216
|
end
|
|
213
217
|
end
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
require '
|
|
1
|
+
require 'plain_text_extractor_dsl'
|
|
2
2
|
|
|
3
3
|
# PlainTextExtractor is the class responsible for extracting plain text contents from
|
|
4
4
|
# different documents filetypes (.doc, .html, .pdf, .od?), as defined in
|
|
@@ -47,27 +47,29 @@ class PlainTextExtractor
|
|
|
47
47
|
end
|
|
48
48
|
|
|
49
49
|
# Launches extractor on given file and outputs plain text result and language (if found)
|
|
50
|
-
def
|
|
51
|
-
find_by_filename(source).
|
|
50
|
+
def extract_information_from(source)
|
|
51
|
+
find_by_filename(source).extract_information
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Tries to extract a thumbnail from source.
|
|
55
|
+
# Doesn't do anything if thumbnail_command isn't defined for the corresponding filetype.
|
|
56
|
+
def extract_thumbnail_from(source)
|
|
57
|
+
find_by_filename(source).extract_thumbnail
|
|
52
58
|
end
|
|
53
59
|
|
|
54
60
|
# Returns which language guesser should be used by the system.
|
|
55
61
|
# Returns nil if none is found.
|
|
56
62
|
def language_guesser
|
|
57
|
-
@@language_guesser||=('mguesser -n1'
|
|
63
|
+
@@language_guesser||=('mguesser -n1' if 'mguesser'.installed?)
|
|
58
64
|
end
|
|
59
65
|
end
|
|
60
66
|
|
|
61
67
|
attr_accessor :source
|
|
62
68
|
|
|
63
|
-
# Parses
|
|
69
|
+
# Parses commands in order to know which programs are needed.
|
|
64
70
|
# rspec will then check that every dependecy is installed on the system
|
|
65
71
|
def dependencies
|
|
66
|
-
|
|
67
|
-
command.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first}
|
|
68
|
-
else
|
|
69
|
-
@dependencies
|
|
70
|
-
end
|
|
72
|
+
[@dependencies, command.dependencies, thumbnail_command.dependencies].flatten
|
|
71
73
|
end
|
|
72
74
|
|
|
73
75
|
## Conversion part
|
|
@@ -79,11 +81,11 @@ class PlainTextExtractor
|
|
|
79
81
|
# If command includes 'DESTINATION' keyword,
|
|
80
82
|
# launches the command and returns the content of
|
|
81
83
|
# DESTINATION file.
|
|
82
|
-
|
|
84
|
+
silently_execute(specific_command)
|
|
83
85
|
File.read_and_remove(destination)
|
|
84
86
|
else
|
|
85
87
|
# Otherwise, launches the command and returns STDOUT.
|
|
86
|
-
|
|
88
|
+
silently_execute(specific_command)
|
|
87
89
|
end
|
|
88
90
|
else
|
|
89
91
|
# command is a Block.
|
|
@@ -97,14 +99,16 @@ class PlainTextExtractor
|
|
|
97
99
|
# using mguesser to guess used language.
|
|
98
100
|
# This method only returns probable language if the content is bigger than 500 chars
|
|
99
101
|
# and if probability score is higher than 90%.
|
|
100
|
-
def
|
|
102
|
+
def extract_information
|
|
101
103
|
content=extract_content
|
|
104
|
+
|
|
102
105
|
return {:content => content} unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
|
|
103
106
|
Picolena::UseLanguageRecognition,
|
|
104
107
|
# Is a language guesser already installed?
|
|
105
108
|
PlainTextExtractor.language_guesser,
|
|
106
109
|
# Language recognition is too unreliable for small files.
|
|
107
110
|
content.size > 500].all?
|
|
111
|
+
|
|
108
112
|
language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
|
|
109
113
|
lang_guesser.write content
|
|
110
114
|
lang_guesser.close_write
|
|
@@ -115,9 +119,14 @@ class PlainTextExtractor
|
|
|
115
119
|
lang unless score<0.9
|
|
116
120
|
end
|
|
117
121
|
}
|
|
122
|
+
|
|
118
123
|
{:content => content, :language => language}
|
|
119
124
|
end
|
|
120
125
|
|
|
126
|
+
def extract_thumbnail
|
|
127
|
+
silently_execute(specific_thumbnail_command) if thumbnail_command
|
|
128
|
+
end
|
|
129
|
+
|
|
121
130
|
private
|
|
122
131
|
|
|
123
132
|
# destination method can be used by some conversion command that cannot output to stdout (example?)
|
|
@@ -131,4 +140,9 @@ class PlainTextExtractor
|
|
|
131
140
|
def specific_command
|
|
132
141
|
command.sub('SOURCE','"'<<source<<'"').sub('DESTINATION','"'<<destination<<'"')
|
|
133
142
|
end
|
|
143
|
+
|
|
144
|
+
# Replaces generic command with specific source and thumbnail (if specified) files
|
|
145
|
+
def specific_thumbnail_command
|
|
146
|
+
thumbnail_command.sub('SOURCE','"'<<source<<'"').sub('THUMBNAIL','"'<<File.thumbnail_path(source)<<'"')
|
|
147
|
+
end
|
|
134
148
|
end
|
|
@@ -32,7 +32,7 @@ class Query
|
|
|
32
32
|
|
|
33
33
|
# Instantiates a QueryParser once, and keeps it in cache.
|
|
34
34
|
def parser
|
|
35
|
-
@@parser ||= Ferret::QueryParser.new(:fields => [:content, :filename, :basename, :filetype, :modified], :or_default => false, :analyzer=>Picolena::Analyzer)
|
|
35
|
+
@@parser ||= Ferret::QueryParser.new(:fields => [:content, :filename, :basename, :alias_path, :filetype, :modified], :or_default => false, :analyzer=>Picolena::Analyzer)
|
|
36
36
|
end
|
|
37
37
|
end
|
|
38
|
-
end
|
|
38
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'ferret'
|
|
2
|
+
module Ferret
|
|
3
|
+
module Analysis
|
|
4
|
+
# Used for alias_path queries
|
|
5
|
+
class LetterAnalyzerWithStopFilter
|
|
6
|
+
def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
|
|
7
|
+
@lower = lower
|
|
8
|
+
@stop_words = stop_words
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def token_stream(field, str)
|
|
12
|
+
ts = LetterTokenizer.new(str, @lower)
|
|
13
|
+
StopFilter.new(ts, @stop_words)
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
File without changes
|
|
File without changes
|