picolena 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +8 -0
- data/Manifest.txt +28 -15
- data/README.txt +1 -1
- data/config/files_to_clean +2 -1
- data/config/hoe.rb +1 -1
- data/lib/picolena/config/basic.rb +46 -35
- data/lib/picolena/config/icons_and_filetypes.yml +69 -0
- data/lib/picolena/config/indexed_directories.yml +1 -1
- data/lib/picolena/picolena_generator.rb +3 -1
- data/lib/picolena/templates/app/controllers/application.rb +2 -2
- data/lib/picolena/templates/app/controllers/documents_controller.rb +1 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +7 -26
- data/lib/picolena/templates/app/models/document.rb +32 -14
- data/lib/picolena/templates/app/models/finder.rb +21 -78
- data/lib/picolena/templates/app/models/index_reader.rb +56 -0
- data/lib/picolena/templates/app/models/index_writer.rb +36 -0
- data/lib/picolena/templates/app/models/indexer.rb +142 -0
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +122 -0
- data/lib/picolena/templates/app/models/query.rb +31 -0
- data/lib/picolena/templates/app/views/documents/_document.html.haml +2 -2
- data/lib/picolena/templates/config/environment.rb +2 -2
- data/lib/picolena/templates/config/environments/development.rb +1 -1
- data/lib/picolena/templates/config/environments/production.rb +1 -1
- data/lib/picolena/templates/config/environments/test.rb +1 -1
- data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +2 -0
- data/lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb +3 -1
- data/lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb +6 -0
- data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb +2 -0
- data/lib/picolena/templates/config/initializers/006_load_icons.rb +8 -0
- data/lib/picolena/templates/lib/core_exts.rb +20 -1
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +72 -0
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/adobe.pdf.rb +3 -3
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/html.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.excel.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.powerpoint.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.rtf.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.word.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.presentation.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.spreadsheet.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.text.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/plain_text.rb +3 -3
- data/lib/picolena/templates/lib/tasks/index.rake +4 -6
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +5 -5
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -13
- data/lib/picolena/templates/spec/models/document_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/finder_spec.rb +5 -70
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +6 -2
- data/lib/picolena/templates/spec/models/index_directories_spec.rb +4 -4
- data/lib/picolena/templates/spec/models/index_reader_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/index_writer_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/indexer_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +42 -0
- data/lib/picolena/templates/spec/models/query_spec.rb +56 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe +42 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo +83 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca +86 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare +90 -0
- data/lib/picolena/version.rb +1 -1
- data/tasks/hack.rake +2 -1
- data/website/index.html +2 -2
- data.tar.gz.sig +0 -0
- metadata +30 -17
- metadata.gz.sig +0 -0
- data/lib/picolena/templates/config/initializers/004_load_filters.rb +0 -6
- data/lib/picolena/templates/lib/ff.rb +0 -117
- data/lib/picolena/templates/lib/filter.rb +0 -75
- data/lib/picolena/templates/lib/filter_DSL.rb +0 -77
- data/lib/picolena/templates/spec/models/filters_spec.rb +0 -30
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
@@ -9,6 +9,7 @@ config/hoe.rb
|
|
9
9
|
config/requirements.rb
|
10
10
|
lib/picolena/USAGE
|
11
11
|
lib/picolena/config/basic.rb
|
12
|
+
lib/picolena/config/icons_and_filetypes.yml
|
12
13
|
lib/picolena/config/indexed_directories.yml
|
13
14
|
lib/picolena/config/title_and_names_and_links.yml
|
14
15
|
lib/picolena/config/white_list_ip.yml
|
@@ -21,6 +22,11 @@ lib/picolena/templates/app/helpers/application_helper.rb
|
|
21
22
|
lib/picolena/templates/app/helpers/documents_helper.rb
|
22
23
|
lib/picolena/templates/app/models/document.rb
|
23
24
|
lib/picolena/templates/app/models/finder.rb
|
25
|
+
lib/picolena/templates/app/models/index_reader.rb
|
26
|
+
lib/picolena/templates/app/models/index_writer.rb
|
27
|
+
lib/picolena/templates/app/models/indexer.rb
|
28
|
+
lib/picolena/templates/app/models/plain_text_extractor.rb
|
29
|
+
lib/picolena/templates/app/models/query.rb
|
24
30
|
lib/picolena/templates/app/views/documents/_document.html.haml
|
25
31
|
lib/picolena/templates/app/views/documents/cached.html.haml
|
26
32
|
lib/picolena/templates/app/views/documents/content.html.haml
|
@@ -35,27 +41,26 @@ lib/picolena/templates/config/environments/test.rb
|
|
35
41
|
lib/picolena/templates/config/initializers/001_load_custom_config.rb
|
36
42
|
lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb
|
37
43
|
lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
|
38
|
-
lib/picolena/templates/config/initializers/
|
44
|
+
lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
|
39
45
|
lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
|
46
|
+
lib/picolena/templates/config/initializers/006_load_icons.rb
|
40
47
|
lib/picolena/templates/config/routes.rb
|
41
48
|
lib/picolena/templates/lang/ui/de.yml
|
42
49
|
lib/picolena/templates/lang/ui/en.yml
|
43
50
|
lib/picolena/templates/lang/ui/es.yml
|
44
51
|
lib/picolena/templates/lang/ui/fr.yml
|
45
52
|
lib/picolena/templates/lib/core_exts.rb
|
46
|
-
lib/picolena/templates/lib/
|
47
|
-
lib/picolena/templates/lib/
|
48
|
-
lib/picolena/templates/lib/
|
49
|
-
lib/picolena/templates/lib/
|
50
|
-
lib/picolena/templates/lib/
|
51
|
-
lib/picolena/templates/lib/
|
52
|
-
lib/picolena/templates/lib/
|
53
|
-
lib/picolena/templates/lib/
|
54
|
-
lib/picolena/templates/lib/
|
55
|
-
lib/picolena/templates/lib/
|
56
|
-
lib/picolena/templates/lib/
|
57
|
-
lib/picolena/templates/lib/filters/opendocument.text.rb
|
58
|
-
lib/picolena/templates/lib/filters/plain_text.rb
|
53
|
+
lib/picolena/templates/lib/plain_text_extractor_DSL.rb
|
54
|
+
lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
55
|
+
lib/picolena/templates/lib/plain_text_extractors/html.rb
|
56
|
+
lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
|
57
|
+
lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
|
58
|
+
lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb
|
59
|
+
lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
|
60
|
+
lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb
|
61
|
+
lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
|
62
|
+
lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
|
63
|
+
lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
|
59
64
|
lib/picolena/templates/lib/tasks/annotations.rake
|
60
65
|
lib/picolena/templates/lib/tasks/index.rake
|
61
66
|
lib/picolena/templates/lib/tasks/install_dependencies.rake
|
@@ -112,10 +117,14 @@ lib/picolena/templates/spec/helpers/application_helper_spec.rb
|
|
112
117
|
lib/picolena/templates/spec/helpers/documents_helper_spec.rb
|
113
118
|
lib/picolena/templates/spec/models/basic_finder_spec.rb
|
114
119
|
lib/picolena/templates/spec/models/document_spec.rb
|
115
|
-
lib/picolena/templates/spec/models/filters_spec.rb
|
116
120
|
lib/picolena/templates/spec/models/finder_spec.rb
|
117
121
|
lib/picolena/templates/spec/models/host_indexing_system_spec.rb
|
118
122
|
lib/picolena/templates/spec/models/index_directories_spec.rb
|
123
|
+
lib/picolena/templates/spec/models/index_reader_spec.rb
|
124
|
+
lib/picolena/templates/spec/models/index_writer_spec.rb
|
125
|
+
lib/picolena/templates/spec/models/indexer_spec.rb
|
126
|
+
lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
|
127
|
+
lib/picolena/templates/spec/models/query_spec.rb
|
119
128
|
lib/picolena/templates/spec/rcov.opts
|
120
129
|
lib/picolena/templates/spec/spec.opts
|
121
130
|
lib/picolena/templates/spec/spec_helper.rb
|
@@ -140,6 +149,10 @@ lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-1.txt
|
|
140
149
|
lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-15.txt
|
141
150
|
lib/picolena/templates/spec/test_dirs/indexed/different_encodings/utf-8.txt
|
142
151
|
lib/picolena/templates/spec/test_dirs/indexed/just_one_doc/for_test.txt
|
152
|
+
lib/picolena/templates/spec/test_dirs/indexed/lang/goethe
|
153
|
+
lib/picolena/templates/spec/test_dirs/indexed/lang/hugo
|
154
|
+
lib/picolena/templates/spec/test_dirs/indexed/lang/lorca
|
155
|
+
lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare
|
143
156
|
lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf
|
144
157
|
lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc
|
145
158
|
lib/picolena/templates/spec/test_dirs/indexed/others/'weird'filename.txt
|
data/README.txt
CHANGED
@@ -14,7 +14,7 @@ Picolena is a lightweight ferret-powered documents search engine written in Ruby
|
|
14
14
|
|
15
15
|
Picolena has many advantages:
|
16
16
|
|
17
|
-
* it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files will full text search, and offers a very easy way to add new
|
17
|
+
* it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files with full text search, and offers a very easy way to add new extractors to index other filetypes.
|
18
18
|
* it is free as in free beer and as in free speech
|
19
19
|
* thanks to Ferret, it is very fast
|
20
20
|
* it keeps your data private. By default, only the computer on which it is installed can get access to the search engine. Other IP addresses can then be added to a white list.
|
data/config/files_to_clean
CHANGED
@@ -4,9 +4,10 @@ lib/picolena/templates/config/custom/picolena.rb
|
|
4
4
|
lib/picolena/templates/config/custom/indexed_directories.yml
|
5
5
|
lib/picolena/templates/config/custom/white_list_ip.yml
|
6
6
|
lib/picolena/templates/config/custom/title_and_names_and_links.yml
|
7
|
+
lib/picolena/templates/config/custom/icons_and_filetypes.yml
|
7
8
|
lib/picolena/templates/log
|
8
9
|
lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
|
9
10
|
lib/picolena/templates/tmp
|
10
11
|
lib/picolena/templates/vendor
|
11
12
|
lib/picolena/templates/coverage
|
12
|
-
lib/picolena/templates/doc
|
13
|
+
lib/picolena/templates/doc
|
data/config/hoe.rb
CHANGED
@@ -61,7 +61,7 @@ hoe = Hoe.new(GEM_NAME, VERS) do |p|
|
|
61
61
|
# == Optional
|
62
62
|
p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
|
63
63
|
p.extra_deps = [ ['rails', '>= 2.0.2'],
|
64
|
-
# Ferret 0.11.6 is not yet available for win32
|
64
|
+
# Ferret 0.11.6 is not yet available for win32 (and will never be, it is a fix release for *nix)
|
65
65
|
# FIXME: How to require 0.11.6 for *nix and 0.11.5 for win32?
|
66
66
|
['ferret', '>= 0.11.5'],
|
67
67
|
['haml', '>= 1.8.2'],
|
@@ -1,35 +1,46 @@
|
|
1
|
-
|
2
|
-
#
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
#
|
8
|
-
# English
|
9
|
-
#
|
10
|
-
#
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
#
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
#
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
#
|
34
|
-
#
|
35
|
-
|
1
|
+
module Picolena
|
2
|
+
# Specify indexes path.
|
3
|
+
# Storage should be sufficient in order to store all indexed data.
|
4
|
+
IndexesSavePath=File.join(RAILS_ROOT, 'tmp/ferret_indexes/')
|
5
|
+
|
6
|
+
|
7
|
+
# Which language should be used?
|
8
|
+
# English (:en), German (:de), French (:fr) and Spanish (:es) are currently supported
|
9
|
+
# English is chosen by default.
|
10
|
+
# If you'd like to use another language, you can find templates in #{RAILS_ROOT}/lang/ui,
|
11
|
+
# then add your own language in this directory, and modify this line:
|
12
|
+
Globalite.language = :en
|
13
|
+
|
14
|
+
|
15
|
+
# Is more than one language used in indexed documents?
|
16
|
+
# Picolena can try to recognise the language used, and save it in the index.
|
17
|
+
# It is then possible to look for documents according to their language.
|
18
|
+
#
|
19
|
+
# If every document is written in the same language, turning UseLanguageRecognition to false
|
20
|
+
# will speed up the indexing process
|
21
|
+
UseLanguageRecognition = true
|
22
|
+
|
23
|
+
# Specify which locale should be used by Ferret
|
24
|
+
Ferret.locale = "en_US.UTF-8"
|
25
|
+
|
26
|
+
|
27
|
+
# Results per page
|
28
|
+
ResultsPerPage = 10
|
29
|
+
|
30
|
+
|
31
|
+
# Length of "probably unique id" 's
|
32
|
+
# Those id's are used to characterize every document, thus allowing tiny URLs in Controllers
|
33
|
+
# HashLength = 10
|
34
|
+
# Document.new("whatever.pdf").probably_unique_id => "bbuxhynait"
|
35
|
+
# HashLength = 20
|
36
|
+
# Document.new("whatever.pdf").probably_unique_id => "jfzjkyfkfkbbuxhynait"
|
37
|
+
# The more documents you have, the bigger HashLength should be in order to avoid collisions.
|
38
|
+
# It would not be wise (and specs won't pass) to specify HashLength smaller than 10.
|
39
|
+
HashLength = 10
|
40
|
+
|
41
|
+
|
42
|
+
# Specify the default Levenshtein distance when using FuzzyQuery
|
43
|
+
# see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
|
44
|
+
Ferret::Search::FuzzyQuery.default_min_similarity=0.6
|
45
|
+
Analyzer=Ferret::Analysis::StandardAnalyzer.new
|
46
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
video:
|
2
|
+
avi
|
3
|
+
wmv
|
4
|
+
mpg
|
5
|
+
mpeg
|
6
|
+
ogg:
|
7
|
+
mp3
|
8
|
+
ogg
|
9
|
+
wma
|
10
|
+
wav
|
11
|
+
wmv
|
12
|
+
tee
|
13
|
+
txt:
|
14
|
+
txt
|
15
|
+
text
|
16
|
+
tex
|
17
|
+
bib
|
18
|
+
log
|
19
|
+
ini
|
20
|
+
no_extension
|
21
|
+
doc:
|
22
|
+
doc
|
23
|
+
odt
|
24
|
+
rtf
|
25
|
+
dot
|
26
|
+
docx
|
27
|
+
dotx
|
28
|
+
insel:
|
29
|
+
ins
|
30
|
+
vee
|
31
|
+
ppt:
|
32
|
+
ppt
|
33
|
+
pps
|
34
|
+
pptx
|
35
|
+
odp
|
36
|
+
pdf:
|
37
|
+
pdf
|
38
|
+
package:
|
39
|
+
gz
|
40
|
+
rar
|
41
|
+
zip
|
42
|
+
bak
|
43
|
+
code:
|
44
|
+
for
|
45
|
+
cpp
|
46
|
+
c
|
47
|
+
rb
|
48
|
+
java
|
49
|
+
html:
|
50
|
+
html
|
51
|
+
htm
|
52
|
+
xls:
|
53
|
+
xls
|
54
|
+
xlsx
|
55
|
+
ods
|
56
|
+
picture:
|
57
|
+
psd
|
58
|
+
jpg
|
59
|
+
png
|
60
|
+
gif
|
61
|
+
eps
|
62
|
+
bmp
|
63
|
+
ico
|
64
|
+
cad:
|
65
|
+
dwg
|
66
|
+
dxf
|
67
|
+
exe:
|
68
|
+
exe
|
69
|
+
dll
|
@@ -5,6 +5,6 @@ development:
|
|
5
5
|
#alias path could be any smb, http, ftp or local directory that is available to the end-user.
|
6
6
|
<%= directories_to_index %>
|
7
7
|
test:
|
8
|
-
"spec/test_dirs/indexed": "http://picolena.devjavu.com/browser/trunk/spec/test_dirs/indexed"
|
8
|
+
"spec/test_dirs/indexed": "http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed"
|
9
9
|
production:
|
10
10
|
<%= directories_to_index %>
|
@@ -62,6 +62,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
|
|
62
62
|
m.file '../config/basic.rb', 'config/custom/picolena.rb'
|
63
63
|
m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
|
64
64
|
m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
|
65
|
+
m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
|
65
66
|
|
66
67
|
# README, License & Rakefile
|
67
68
|
m.file 'MIT-LICENSE', 'LICENSE'
|
@@ -135,7 +136,7 @@ EOS
|
|
135
136
|
doc
|
136
137
|
lang/ui
|
137
138
|
lib
|
138
|
-
lib/
|
139
|
+
lib/plain_text_extractors
|
139
140
|
lib/tasks
|
140
141
|
log
|
141
142
|
public
|
@@ -155,6 +156,7 @@ EOS
|
|
155
156
|
spec/test_dirs/indexed/basic
|
156
157
|
spec/test_dirs/indexed/different_encodings
|
157
158
|
spec/test_dirs/indexed/just_one_doc
|
159
|
+
spec/test_dirs/indexed/lang
|
158
160
|
spec/test_dirs/indexed/literature
|
159
161
|
spec/test_dirs/indexed/others
|
160
162
|
spec/test_dirs/indexed/others/nested
|
@@ -23,9 +23,9 @@ class ApplicationController < ActionController::Base
|
|
23
23
|
# Tries to match remote IP address with the white list defined in config/custom/white_list_ip.yml
|
24
24
|
# Redirects to :access_denied if the remote IP is not white listed.
|
25
25
|
def should_only_be_available_for_white_list_IPs
|
26
|
-
unless request.remote_ip =~ WhiteListIPs
|
26
|
+
unless request.remote_ip =~ Picolena::WhiteListIPs
|
27
27
|
redirect_to :controller => 'application', :action=>'access_denied'
|
28
28
|
return false
|
29
29
|
end
|
30
30
|
end
|
31
|
-
end
|
31
|
+
end
|
@@ -24,7 +24,7 @@ class DocumentsController < ApplicationController
|
|
24
24
|
page=params[:page]||1
|
25
25
|
finder=Finder.new(@query,page)
|
26
26
|
finder.execute!
|
27
|
-
pager=::Paginator.new(finder.total_hits, ResultsPerPage) do
|
27
|
+
pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
|
28
28
|
finder.matching_documents
|
29
29
|
end
|
30
30
|
@matching_documents=pager.page(page)
|
@@ -3,13 +3,13 @@ module DocumentsHelper
|
|
3
3
|
def nothing_found?
|
4
4
|
@matching_documents.nil? or @matching_documents.entries.empty?
|
5
5
|
end
|
6
|
-
|
6
|
+
|
7
7
|
# Very basic pagination.
|
8
8
|
# Provides links to Next, Prev and FirstPage when needed.
|
9
9
|
def should_paginate(page,query)
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
[(link_to("←←", :action => :show, :id => query, :page => 1) if page.number>2),
|
11
|
+
(link_to("←", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
|
12
|
+
(link_to("→", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
|
13
13
|
end
|
14
14
|
|
15
15
|
# Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
|
@@ -30,7 +30,7 @@ module DocumentsHelper
|
|
30
30
|
content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
|
31
31
|
end
|
32
32
|
|
33
|
-
# When possible, highlights content of the document that
|
33
|
+
# When possible, highlights content of the document that matches the query.
|
34
34
|
def highlight_matching_content(document)
|
35
35
|
content_tag(:ul,document.matching_content.collect{|sentence|
|
36
36
|
content_tag(:li,h(sentence).gsub(/<<(.*?)>>/,'<strong>\1</strong>').gsub(/\v|\f/,''))
|
@@ -43,28 +43,9 @@ module DocumentsHelper
|
|
43
43
|
end
|
44
44
|
|
45
45
|
# Returns the location (if available) of the filetype icon.
|
46
|
-
# TODO: Move this hash to a .yml config file.
|
47
46
|
def icon_for(filetype)
|
48
|
-
|
49
|
-
|
50
|
-
:doc=>%w{doc odt rtf dot docx dotx},
|
51
|
-
:pdf=>%w{pdf},
|
52
|
-
:txt=>%w{txt text tex bib log ini no_extension},
|
53
|
-
:ogg=>%w{mp3 ogg wma wav wmv tee},
|
54
|
-
:html=>%w{html htm},
|
55
|
-
:ppt=>%w{ppt pps pptx odp},
|
56
|
-
:package=>%w{gz rar zip bak},
|
57
|
-
:picture=>%w{psd jpg png gif eps bmp ico},
|
58
|
-
:cad=>%w{dwg dxf},
|
59
|
-
:exe=>%w{exe dll},
|
60
|
-
:video=>%w{avi wmv mpg mpeg},
|
61
|
-
:code=>%w{for cpp c rb java},
|
62
|
-
:insel=>%w{ins vee}
|
63
|
-
}
|
64
|
-
pic=pic_for_exts.find{|pic, extensions|
|
65
|
-
extensions.any? { |ext| filetype.sub(/\./,'').downcase==ext}
|
66
|
-
}
|
67
|
-
image_tag("icons/#{pic.first}.png") if pic
|
47
|
+
icon_symbol=FiletypeToIconSymbol[filetype.downcase.sub(/^\./,'')]
|
48
|
+
image_tag("icons/#{icon_symbol}.png") if icon_symbol
|
68
49
|
end
|
69
50
|
|
70
51
|
# Returns a link to a backup search engine that could maybe find more results for the same query.
|
@@ -1,7 +1,8 @@
|
|
1
1
|
# Document class retrieves information from filesystem and the index for any given document.
|
2
2
|
class Document
|
3
3
|
attr_reader :complete_path
|
4
|
-
|
4
|
+
attr_writer :index_id
|
5
|
+
attr_accessor :user, :score, :matching_content
|
5
6
|
|
6
7
|
def initialize(path)
|
7
8
|
#To ensure @complete_path is an absolute path.
|
@@ -10,8 +11,6 @@ class Document
|
|
10
11
|
validate_in_indexed_directory
|
11
12
|
end
|
12
13
|
|
13
|
-
alias_method :to_param, :id
|
14
|
-
|
15
14
|
#Delegating properties to File::method_name(complete_path)
|
16
15
|
[:dirname, :basename, :extname, :size?, :file?, :read, :ext_as_sym].each{|method_name|
|
17
16
|
define_method(method_name){File.send(method_name,complete_path)}
|
@@ -38,7 +37,7 @@ class Document
|
|
38
37
|
# "http://www.mycompany.com/wiki/organigram.odp"
|
39
38
|
def alias_path
|
40
39
|
original_dir=indexed_directory
|
41
|
-
alias_dir=IndexedDirectories[original_dir]
|
40
|
+
alias_dir=Picolena::IndexedDirectories[original_dir]
|
42
41
|
dirname.sub(original_dir,alias_dir)
|
43
42
|
end
|
44
43
|
|
@@ -50,48 +49,67 @@ class Document
|
|
50
49
|
@probably_unique_id||=complete_path.base26_hash
|
51
50
|
end
|
52
51
|
|
53
|
-
# Returns true iff some
|
52
|
+
# Returns true iff some PlainTextExtractor has been defined to convert it to plain text.
|
54
53
|
# Document.new("presentation.pdf").supported? => true
|
55
54
|
# Document.new("presentation.some_weird_extension").supported? => false
|
56
55
|
def supported?
|
57
|
-
|
56
|
+
PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
|
58
57
|
end
|
59
58
|
|
60
59
|
# Retrieves content as it is *now*.
|
61
60
|
def content
|
62
|
-
|
61
|
+
PlainTextExtractor.extract_content_from(complete_path)
|
63
62
|
end
|
64
63
|
|
65
64
|
# Cache à la Google.
|
66
65
|
# Returns content as it was at the time it was indexed.
|
67
66
|
def cached
|
68
|
-
|
69
|
-
Finder.index[index_id][:content]
|
67
|
+
from_index[:content]
|
70
68
|
end
|
71
69
|
|
70
|
+
# FIXME: Not just date anymore.
|
72
71
|
# Returns the last modification date before the document got indexed.
|
73
72
|
# Useful to know how old a document is, and to which version the cache corresponds.
|
74
73
|
def date
|
75
|
-
|
76
|
-
|
74
|
+
from_index[:date].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
|
75
|
+
end
|
76
|
+
|
77
|
+
def mtime
|
78
|
+
from_index[:date].to_i
|
79
|
+
end
|
80
|
+
|
81
|
+
# Returns language.
|
82
|
+
def lang
|
83
|
+
from_index[:lang]
|
84
|
+
end
|
85
|
+
|
86
|
+
# Returns the id with which the document is indexed.
|
87
|
+
def index_id
|
88
|
+
@index_id ||= Document.find_by_complete_path(complete_path).index_id
|
77
89
|
end
|
78
90
|
|
79
91
|
private
|
80
92
|
|
81
|
-
|
82
|
-
|
93
|
+
# Retrieves the document from the index.
|
94
|
+
# Useful to get meta-info about it.
|
95
|
+
def from_index
|
96
|
+
IndexReader.new[index_id]
|
83
97
|
end
|
84
98
|
|
85
99
|
def self.find_by_unique_id(some_id)
|
86
100
|
Finder.new("probably_unique_id:"<<some_id).matching_document
|
87
101
|
end
|
88
102
|
|
103
|
+
def self.find_by_complete_path(complete_path)
|
104
|
+
Finder.new('complete_path:"'<<complete_path<<'"').matching_document
|
105
|
+
end
|
106
|
+
|
89
107
|
def in_indexed_directory?
|
90
108
|
!indexed_directory.nil?
|
91
109
|
end
|
92
110
|
|
93
111
|
def indexed_directory
|
94
|
-
IndexedDirectories.keys.find{|indexed_dir|
|
112
|
+
Picolena::IndexedDirectories.keys.find{|indexed_dir|
|
95
113
|
dirname.starts_with?(indexed_dir)
|
96
114
|
}
|
97
115
|
end
|
@@ -1,42 +1,36 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
class Finder
|
4
|
-
#FIXME: Should not use all those class methods to access index.
|
5
|
-
|
1
|
+
class Finder
|
6
2
|
attr_reader :query
|
7
3
|
|
8
|
-
def
|
4
|
+
def index
|
9
5
|
# caching index @@index ||=
|
10
6
|
# causes ferret-0.11.6/lib/ferret/index.rb:768: [BUG] Segmentation fault
|
11
|
-
|
7
|
+
IndexReader.new
|
12
8
|
end
|
13
9
|
|
14
|
-
def initialize(raw_query,page=1,results_per_page=ResultsPerPage)
|
15
|
-
|
16
|
-
@query = query_parser.parse(convert_to_english(raw_query))
|
10
|
+
def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
|
11
|
+
@query = Query.extract_from(raw_query)
|
17
12
|
@raw_query= raw_query
|
18
|
-
|
13
|
+
IndexReader.ensure_existence
|
19
14
|
@per_page=results_per_page
|
20
15
|
@offset=(page.to_i-1)*results_per_page
|
21
|
-
|
16
|
+
index.should_have_documents
|
22
17
|
end
|
23
18
|
|
24
19
|
def execute!
|
25
20
|
@matching_documents=[]
|
26
21
|
start=Time.now
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
@matching_documents<<found_doc
|
22
|
+
top_docs=index.search(query, :limit => @per_page, :offset=>@offset)
|
23
|
+
top_docs.hits.each{|hit|
|
24
|
+
index_id,score=hit.doc,hit.score
|
25
|
+
begin
|
26
|
+
found_doc=Document.new(index[index_id][:complete_path])
|
27
|
+
found_doc.matching_content=index.highlight(query, index_id,
|
28
|
+
:field => :content, :excerpt_length => 80,
|
29
|
+
:pre_tag => "<<", :post_tag => ">>"
|
30
|
+
) unless @raw_query=~/^\*+\.\w*$/
|
31
|
+
found_doc.score=score
|
32
|
+
found_doc.index_id=index_id
|
33
|
+
@matching_documents<<found_doc
|
40
34
|
rescue Errno::ENOENT
|
41
35
|
#"File has been moved/deleted!"
|
42
36
|
end
|
@@ -44,9 +38,6 @@ class Finder
|
|
44
38
|
@executed=true
|
45
39
|
@time_needed=Time.now-start
|
46
40
|
@total_hits=top_docs.total_hits
|
47
|
-
ensure
|
48
|
-
#index.close
|
49
|
-
end
|
50
41
|
end
|
51
42
|
|
52
43
|
# Returns true if it has been executed.
|
@@ -66,17 +57,7 @@ class Finder
|
|
66
57
|
}
|
67
58
|
}
|
68
59
|
|
69
|
-
# Returns
|
70
|
-
def self.has_index?
|
71
|
-
index_filename and File.exists?(index_filename)
|
72
|
-
end
|
73
|
-
|
74
|
-
# Returns true if there's at least one document indexed.
|
75
|
-
def has_documents?
|
76
|
-
Finder.index.size>0
|
77
|
-
end
|
78
|
-
|
79
|
-
# Returns matching document for any given query, if only
|
60
|
+
# Returns matching document for any given query only if
|
80
61
|
# exactly one document is found.
|
81
62
|
# Raises otherwise.
|
82
63
|
def matching_document
|
@@ -89,42 +70,4 @@ class Finder
|
|
89
70
|
raise IndexError, "More than one document found"
|
90
71
|
end
|
91
72
|
end
|
92
|
-
|
93
|
-
private
|
94
|
-
|
95
|
-
# Convert query keywords to english so they can be parsed by Ferret.
|
96
|
-
def convert_to_english(query)
|
97
|
-
to_en={
|
98
|
-
/\b#{:AND.l}\b/=>'AND',
|
99
|
-
/\b#{:OR.l}\b/=>'OR',
|
100
|
-
/\b#{:NOT.l}\b/=>'NOT',
|
101
|
-
/(#{:filetype.l}):/=>'filetype:',
|
102
|
-
/#{:content.l}:/ => 'content:',
|
103
|
-
/#{:date.l}:/ => 'date:',
|
104
|
-
/\b#{:LIKE.l}\s+(\S+)/=>'\1~'
|
105
|
-
}
|
106
|
-
to_en.inject(query){|mem,non_english_to_english_keyword|
|
107
|
-
mem.gsub(*non_english_to_english_keyword)
|
108
|
-
}
|
109
|
-
end
|
110
|
-
|
111
|
-
def self.index_filename
|
112
|
-
Dir.glob(File.join(IndexSavePath,'*.cfs')).first
|
113
|
-
end
|
114
|
-
|
115
|
-
def self.ensure_that_index_exists_on_disk
|
116
|
-
force_index_creation unless has_index? or RAILS_ENV=="production"
|
117
|
-
end
|
118
|
-
|
119
|
-
def self.force_index_creation
|
120
|
-
create_index(IndexedDirectories.keys)
|
121
|
-
end
|
122
|
-
|
123
|
-
def self.delete_index
|
124
|
-
FileUtils.rm(Dir.glob(File.join(IndexSavePath,'*.cfs'))) if has_index?
|
125
|
-
end
|
126
|
-
|
127
|
-
def validate_that_index_has_documents
|
128
|
-
raise IndexError, "no document found" unless has_documents?
|
129
|
-
end
|
130
|
-
end
|
73
|
+
end
|