picolena 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +8 -0
- data/Manifest.txt +28 -15
- data/README.txt +1 -1
- data/config/files_to_clean +2 -1
- data/config/hoe.rb +1 -1
- data/lib/picolena/config/basic.rb +46 -35
- data/lib/picolena/config/icons_and_filetypes.yml +69 -0
- data/lib/picolena/config/indexed_directories.yml +1 -1
- data/lib/picolena/picolena_generator.rb +3 -1
- data/lib/picolena/templates/app/controllers/application.rb +2 -2
- data/lib/picolena/templates/app/controllers/documents_controller.rb +1 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +7 -26
- data/lib/picolena/templates/app/models/document.rb +32 -14
- data/lib/picolena/templates/app/models/finder.rb +21 -78
- data/lib/picolena/templates/app/models/index_reader.rb +56 -0
- data/lib/picolena/templates/app/models/index_writer.rb +36 -0
- data/lib/picolena/templates/app/models/indexer.rb +142 -0
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +122 -0
- data/lib/picolena/templates/app/models/query.rb +31 -0
- data/lib/picolena/templates/app/views/documents/_document.html.haml +2 -2
- data/lib/picolena/templates/config/environment.rb +2 -2
- data/lib/picolena/templates/config/environments/development.rb +1 -1
- data/lib/picolena/templates/config/environments/production.rb +1 -1
- data/lib/picolena/templates/config/environments/test.rb +1 -1
- data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +2 -0
- data/lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb +3 -1
- data/lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb +6 -0
- data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb +2 -0
- data/lib/picolena/templates/config/initializers/006_load_icons.rb +8 -0
- data/lib/picolena/templates/lib/core_exts.rb +20 -1
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +72 -0
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/adobe.pdf.rb +3 -3
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/html.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.excel.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.powerpoint.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.rtf.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.word.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.presentation.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.spreadsheet.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.text.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/plain_text.rb +3 -3
- data/lib/picolena/templates/lib/tasks/index.rake +4 -6
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +5 -5
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -13
- data/lib/picolena/templates/spec/models/document_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/finder_spec.rb +5 -70
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +6 -2
- data/lib/picolena/templates/spec/models/index_directories_spec.rb +4 -4
- data/lib/picolena/templates/spec/models/index_reader_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/index_writer_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/indexer_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +42 -0
- data/lib/picolena/templates/spec/models/query_spec.rb +56 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe +42 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo +83 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca +86 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare +90 -0
- data/lib/picolena/version.rb +1 -1
- data/tasks/hack.rake +2 -1
- data/website/index.html +2 -2
- data.tar.gz.sig +0 -0
- metadata +30 -17
- metadata.gz.sig +0 -0
- data/lib/picolena/templates/config/initializers/004_load_filters.rb +0 -6
- data/lib/picolena/templates/lib/ff.rb +0 -117
- data/lib/picolena/templates/lib/filter.rb +0 -75
- data/lib/picolena/templates/lib/filter_DSL.rb +0 -77
- data/lib/picolena/templates/spec/models/filters_spec.rb +0 -30
data/History.txt
CHANGED
data/Manifest.txt
CHANGED
|
@@ -9,6 +9,7 @@ config/hoe.rb
|
|
|
9
9
|
config/requirements.rb
|
|
10
10
|
lib/picolena/USAGE
|
|
11
11
|
lib/picolena/config/basic.rb
|
|
12
|
+
lib/picolena/config/icons_and_filetypes.yml
|
|
12
13
|
lib/picolena/config/indexed_directories.yml
|
|
13
14
|
lib/picolena/config/title_and_names_and_links.yml
|
|
14
15
|
lib/picolena/config/white_list_ip.yml
|
|
@@ -21,6 +22,11 @@ lib/picolena/templates/app/helpers/application_helper.rb
|
|
|
21
22
|
lib/picolena/templates/app/helpers/documents_helper.rb
|
|
22
23
|
lib/picolena/templates/app/models/document.rb
|
|
23
24
|
lib/picolena/templates/app/models/finder.rb
|
|
25
|
+
lib/picolena/templates/app/models/index_reader.rb
|
|
26
|
+
lib/picolena/templates/app/models/index_writer.rb
|
|
27
|
+
lib/picolena/templates/app/models/indexer.rb
|
|
28
|
+
lib/picolena/templates/app/models/plain_text_extractor.rb
|
|
29
|
+
lib/picolena/templates/app/models/query.rb
|
|
24
30
|
lib/picolena/templates/app/views/documents/_document.html.haml
|
|
25
31
|
lib/picolena/templates/app/views/documents/cached.html.haml
|
|
26
32
|
lib/picolena/templates/app/views/documents/content.html.haml
|
|
@@ -35,27 +41,26 @@ lib/picolena/templates/config/environments/test.rb
|
|
|
35
41
|
lib/picolena/templates/config/initializers/001_load_custom_config.rb
|
|
36
42
|
lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb
|
|
37
43
|
lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
|
|
38
|
-
lib/picolena/templates/config/initializers/
|
|
44
|
+
lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
|
|
39
45
|
lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
|
|
46
|
+
lib/picolena/templates/config/initializers/006_load_icons.rb
|
|
40
47
|
lib/picolena/templates/config/routes.rb
|
|
41
48
|
lib/picolena/templates/lang/ui/de.yml
|
|
42
49
|
lib/picolena/templates/lang/ui/en.yml
|
|
43
50
|
lib/picolena/templates/lang/ui/es.yml
|
|
44
51
|
lib/picolena/templates/lang/ui/fr.yml
|
|
45
52
|
lib/picolena/templates/lib/core_exts.rb
|
|
46
|
-
lib/picolena/templates/lib/
|
|
47
|
-
lib/picolena/templates/lib/
|
|
48
|
-
lib/picolena/templates/lib/
|
|
49
|
-
lib/picolena/templates/lib/
|
|
50
|
-
lib/picolena/templates/lib/
|
|
51
|
-
lib/picolena/templates/lib/
|
|
52
|
-
lib/picolena/templates/lib/
|
|
53
|
-
lib/picolena/templates/lib/
|
|
54
|
-
lib/picolena/templates/lib/
|
|
55
|
-
lib/picolena/templates/lib/
|
|
56
|
-
lib/picolena/templates/lib/
|
|
57
|
-
lib/picolena/templates/lib/filters/opendocument.text.rb
|
|
58
|
-
lib/picolena/templates/lib/filters/plain_text.rb
|
|
53
|
+
lib/picolena/templates/lib/plain_text_extractor_DSL.rb
|
|
54
|
+
lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
|
55
|
+
lib/picolena/templates/lib/plain_text_extractors/html.rb
|
|
56
|
+
lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
|
|
57
|
+
lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
|
|
58
|
+
lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb
|
|
59
|
+
lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
|
|
60
|
+
lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb
|
|
61
|
+
lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
|
|
62
|
+
lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
|
|
63
|
+
lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
|
|
59
64
|
lib/picolena/templates/lib/tasks/annotations.rake
|
|
60
65
|
lib/picolena/templates/lib/tasks/index.rake
|
|
61
66
|
lib/picolena/templates/lib/tasks/install_dependencies.rake
|
|
@@ -112,10 +117,14 @@ lib/picolena/templates/spec/helpers/application_helper_spec.rb
|
|
|
112
117
|
lib/picolena/templates/spec/helpers/documents_helper_spec.rb
|
|
113
118
|
lib/picolena/templates/spec/models/basic_finder_spec.rb
|
|
114
119
|
lib/picolena/templates/spec/models/document_spec.rb
|
|
115
|
-
lib/picolena/templates/spec/models/filters_spec.rb
|
|
116
120
|
lib/picolena/templates/spec/models/finder_spec.rb
|
|
117
121
|
lib/picolena/templates/spec/models/host_indexing_system_spec.rb
|
|
118
122
|
lib/picolena/templates/spec/models/index_directories_spec.rb
|
|
123
|
+
lib/picolena/templates/spec/models/index_reader_spec.rb
|
|
124
|
+
lib/picolena/templates/spec/models/index_writer_spec.rb
|
|
125
|
+
lib/picolena/templates/spec/models/indexer_spec.rb
|
|
126
|
+
lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
|
|
127
|
+
lib/picolena/templates/spec/models/query_spec.rb
|
|
119
128
|
lib/picolena/templates/spec/rcov.opts
|
|
120
129
|
lib/picolena/templates/spec/spec.opts
|
|
121
130
|
lib/picolena/templates/spec/spec_helper.rb
|
|
@@ -140,6 +149,10 @@ lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-1.txt
|
|
|
140
149
|
lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-15.txt
|
|
141
150
|
lib/picolena/templates/spec/test_dirs/indexed/different_encodings/utf-8.txt
|
|
142
151
|
lib/picolena/templates/spec/test_dirs/indexed/just_one_doc/for_test.txt
|
|
152
|
+
lib/picolena/templates/spec/test_dirs/indexed/lang/goethe
|
|
153
|
+
lib/picolena/templates/spec/test_dirs/indexed/lang/hugo
|
|
154
|
+
lib/picolena/templates/spec/test_dirs/indexed/lang/lorca
|
|
155
|
+
lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare
|
|
143
156
|
lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf
|
|
144
157
|
lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc
|
|
145
158
|
lib/picolena/templates/spec/test_dirs/indexed/others/'weird'filename.txt
|
data/README.txt
CHANGED
|
@@ -14,7 +14,7 @@ Picolena is a lightweight ferret-powered documents search engine written in Ruby
|
|
|
14
14
|
|
|
15
15
|
Picolena has many advantages:
|
|
16
16
|
|
|
17
|
-
* it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files will full text search, and offers a very easy way to add new
|
|
17
|
+
* it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files with full text search, and offers a very easy way to add new extractors to index other filetypes.
|
|
18
18
|
* it is free as in free beer and as in free speech
|
|
19
19
|
* thanks to Ferret, it is very fast
|
|
20
20
|
* it keeps your data private. By default, only the computer on which it is installed can get access to the search engine. Other IP addresses can then be added to a white list.
|
data/config/files_to_clean
CHANGED
|
@@ -4,9 +4,10 @@ lib/picolena/templates/config/custom/picolena.rb
|
|
|
4
4
|
lib/picolena/templates/config/custom/indexed_directories.yml
|
|
5
5
|
lib/picolena/templates/config/custom/white_list_ip.yml
|
|
6
6
|
lib/picolena/templates/config/custom/title_and_names_and_links.yml
|
|
7
|
+
lib/picolena/templates/config/custom/icons_and_filetypes.yml
|
|
7
8
|
lib/picolena/templates/log
|
|
8
9
|
lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
|
|
9
10
|
lib/picolena/templates/tmp
|
|
10
11
|
lib/picolena/templates/vendor
|
|
11
12
|
lib/picolena/templates/coverage
|
|
12
|
-
lib/picolena/templates/doc
|
|
13
|
+
lib/picolena/templates/doc
|
data/config/hoe.rb
CHANGED
|
@@ -61,7 +61,7 @@ hoe = Hoe.new(GEM_NAME, VERS) do |p|
|
|
|
61
61
|
# == Optional
|
|
62
62
|
p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
|
|
63
63
|
p.extra_deps = [ ['rails', '>= 2.0.2'],
|
|
64
|
-
# Ferret 0.11.6 is not yet available for win32
|
|
64
|
+
# Ferret 0.11.6 is not yet available for win32 (and will never be, it is a fix release for *nix)
|
|
65
65
|
# FIXME: How to require 0.11.6 for *nix and 0.11.5 for win32?
|
|
66
66
|
['ferret', '>= 0.11.5'],
|
|
67
67
|
['haml', '>= 1.8.2'],
|
|
@@ -1,35 +1,46 @@
|
|
|
1
|
-
|
|
2
|
-
#
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
#
|
|
8
|
-
# English
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
#
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
#
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
#
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
#
|
|
34
|
-
#
|
|
35
|
-
|
|
1
|
+
module Picolena
|
|
2
|
+
# Specify indexes path.
|
|
3
|
+
# Storage should be sufficient in order to store all indexed data.
|
|
4
|
+
IndexesSavePath=File.join(RAILS_ROOT, 'tmp/ferret_indexes/')
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Which language should be used?
|
|
8
|
+
# English (:en), German (:de), French (:fr) and Spanish (:es) are currently supported
|
|
9
|
+
# English is chosen by default.
|
|
10
|
+
# If you'd like to use another language, you can find templates in #{RAILS_ROOT}/lang/ui,
|
|
11
|
+
# then add your own language in this directory, and modify this line:
|
|
12
|
+
Globalite.language = :en
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Is more than one language used in indexed documents?
|
|
16
|
+
# Picolena can try to recognise the language used, and save it in the index.
|
|
17
|
+
# It is then possible to look for documents according to their language.
|
|
18
|
+
#
|
|
19
|
+
# If every document is written in the same language, turning UseLanguageRecognition to false
|
|
20
|
+
# will speed up the indexing process
|
|
21
|
+
UseLanguageRecognition = true
|
|
22
|
+
|
|
23
|
+
# Specify which locale should be used by Ferret
|
|
24
|
+
Ferret.locale = "en_US.UTF-8"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Results per page
|
|
28
|
+
ResultsPerPage = 10
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Length of "probably unique id" 's
|
|
32
|
+
# Those id's are used to characterize every document, thus allowing tiny URLs in Controllers
|
|
33
|
+
# HashLength = 10
|
|
34
|
+
# Document.new("whatever.pdf").probably_unique_id => "bbuxhynait"
|
|
35
|
+
# HashLength = 20
|
|
36
|
+
# Document.new("whatever.pdf").probably_unique_id => "jfzjkyfkfkbbuxhynait"
|
|
37
|
+
# The more documents you have, the bigger HashLength should be in order to avoid collisions.
|
|
38
|
+
# It would not be wise (and specs won't pass) to specify HashLength smaller than 10.
|
|
39
|
+
HashLength = 10
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Specify the default Levenshtein distance when using FuzzyQuery
|
|
43
|
+
# see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
|
|
44
|
+
Ferret::Search::FuzzyQuery.default_min_similarity=0.6
|
|
45
|
+
Analyzer=Ferret::Analysis::StandardAnalyzer.new
|
|
46
|
+
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
video:
|
|
2
|
+
avi
|
|
3
|
+
wmv
|
|
4
|
+
mpg
|
|
5
|
+
mpeg
|
|
6
|
+
ogg:
|
|
7
|
+
mp3
|
|
8
|
+
ogg
|
|
9
|
+
wma
|
|
10
|
+
wav
|
|
11
|
+
wmv
|
|
12
|
+
tee
|
|
13
|
+
txt:
|
|
14
|
+
txt
|
|
15
|
+
text
|
|
16
|
+
tex
|
|
17
|
+
bib
|
|
18
|
+
log
|
|
19
|
+
ini
|
|
20
|
+
no_extension
|
|
21
|
+
doc:
|
|
22
|
+
doc
|
|
23
|
+
odt
|
|
24
|
+
rtf
|
|
25
|
+
dot
|
|
26
|
+
docx
|
|
27
|
+
dotx
|
|
28
|
+
insel:
|
|
29
|
+
ins
|
|
30
|
+
vee
|
|
31
|
+
ppt:
|
|
32
|
+
ppt
|
|
33
|
+
pps
|
|
34
|
+
pptx
|
|
35
|
+
odp
|
|
36
|
+
pdf:
|
|
37
|
+
pdf
|
|
38
|
+
package:
|
|
39
|
+
gz
|
|
40
|
+
rar
|
|
41
|
+
zip
|
|
42
|
+
bak
|
|
43
|
+
code:
|
|
44
|
+
for
|
|
45
|
+
cpp
|
|
46
|
+
c
|
|
47
|
+
rb
|
|
48
|
+
java
|
|
49
|
+
html:
|
|
50
|
+
html
|
|
51
|
+
htm
|
|
52
|
+
xls:
|
|
53
|
+
xls
|
|
54
|
+
xlsx
|
|
55
|
+
ods
|
|
56
|
+
picture:
|
|
57
|
+
psd
|
|
58
|
+
jpg
|
|
59
|
+
png
|
|
60
|
+
gif
|
|
61
|
+
eps
|
|
62
|
+
bmp
|
|
63
|
+
ico
|
|
64
|
+
cad:
|
|
65
|
+
dwg
|
|
66
|
+
dxf
|
|
67
|
+
exe:
|
|
68
|
+
exe
|
|
69
|
+
dll
|
|
@@ -5,6 +5,6 @@ development:
|
|
|
5
5
|
#alias path could be any smb, http, ftp or local directory that is available to the end-user.
|
|
6
6
|
<%= directories_to_index %>
|
|
7
7
|
test:
|
|
8
|
-
"spec/test_dirs/indexed": "http://picolena.devjavu.com/browser/trunk/spec/test_dirs/indexed"
|
|
8
|
+
"spec/test_dirs/indexed": "http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed"
|
|
9
9
|
production:
|
|
10
10
|
<%= directories_to_index %>
|
|
@@ -62,6 +62,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
|
|
|
62
62
|
m.file '../config/basic.rb', 'config/custom/picolena.rb'
|
|
63
63
|
m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
|
|
64
64
|
m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
|
|
65
|
+
m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
|
|
65
66
|
|
|
66
67
|
# README, License & Rakefile
|
|
67
68
|
m.file 'MIT-LICENSE', 'LICENSE'
|
|
@@ -135,7 +136,7 @@ EOS
|
|
|
135
136
|
doc
|
|
136
137
|
lang/ui
|
|
137
138
|
lib
|
|
138
|
-
lib/
|
|
139
|
+
lib/plain_text_extractors
|
|
139
140
|
lib/tasks
|
|
140
141
|
log
|
|
141
142
|
public
|
|
@@ -155,6 +156,7 @@ EOS
|
|
|
155
156
|
spec/test_dirs/indexed/basic
|
|
156
157
|
spec/test_dirs/indexed/different_encodings
|
|
157
158
|
spec/test_dirs/indexed/just_one_doc
|
|
159
|
+
spec/test_dirs/indexed/lang
|
|
158
160
|
spec/test_dirs/indexed/literature
|
|
159
161
|
spec/test_dirs/indexed/others
|
|
160
162
|
spec/test_dirs/indexed/others/nested
|
|
@@ -23,9 +23,9 @@ class ApplicationController < ActionController::Base
|
|
|
23
23
|
# Tries to match remote IP address with the white list defined in config/custom/white_list_ip.yml
|
|
24
24
|
# Redirects to :access_denied if the remote IP is not white listed.
|
|
25
25
|
def should_only_be_available_for_white_list_IPs
|
|
26
|
-
unless request.remote_ip =~ WhiteListIPs
|
|
26
|
+
unless request.remote_ip =~ Picolena::WhiteListIPs
|
|
27
27
|
redirect_to :controller => 'application', :action=>'access_denied'
|
|
28
28
|
return false
|
|
29
29
|
end
|
|
30
30
|
end
|
|
31
|
-
end
|
|
31
|
+
end
|
|
@@ -24,7 +24,7 @@ class DocumentsController < ApplicationController
|
|
|
24
24
|
page=params[:page]||1
|
|
25
25
|
finder=Finder.new(@query,page)
|
|
26
26
|
finder.execute!
|
|
27
|
-
pager=::Paginator.new(finder.total_hits, ResultsPerPage) do
|
|
27
|
+
pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
|
|
28
28
|
finder.matching_documents
|
|
29
29
|
end
|
|
30
30
|
@matching_documents=pager.page(page)
|
|
@@ -3,13 +3,13 @@ module DocumentsHelper
|
|
|
3
3
|
def nothing_found?
|
|
4
4
|
@matching_documents.nil? or @matching_documents.entries.empty?
|
|
5
5
|
end
|
|
6
|
-
|
|
6
|
+
|
|
7
7
|
# Very basic pagination.
|
|
8
8
|
# Provides links to Next, Prev and FirstPage when needed.
|
|
9
9
|
def should_paginate(page,query)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
[(link_to("←←", :action => :show, :id => query, :page => 1) if page.number>2),
|
|
11
|
+
(link_to("←", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
|
|
12
|
+
(link_to("→", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
# Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
|
|
@@ -30,7 +30,7 @@ module DocumentsHelper
|
|
|
30
30
|
content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
|
|
31
31
|
end
|
|
32
32
|
|
|
33
|
-
# When possible, highlights content of the document that
|
|
33
|
+
# When possible, highlights content of the document that matches the query.
|
|
34
34
|
def highlight_matching_content(document)
|
|
35
35
|
content_tag(:ul,document.matching_content.collect{|sentence|
|
|
36
36
|
content_tag(:li,h(sentence).gsub(/<<(.*?)>>/,'<strong>\1</strong>').gsub(/\v|\f/,''))
|
|
@@ -43,28 +43,9 @@ module DocumentsHelper
|
|
|
43
43
|
end
|
|
44
44
|
|
|
45
45
|
# Returns the location (if available) of the filetype icon.
|
|
46
|
-
# TODO: Move this hash to a .yml config file.
|
|
47
46
|
def icon_for(filetype)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
:doc=>%w{doc odt rtf dot docx dotx},
|
|
51
|
-
:pdf=>%w{pdf},
|
|
52
|
-
:txt=>%w{txt text tex bib log ini no_extension},
|
|
53
|
-
:ogg=>%w{mp3 ogg wma wav wmv tee},
|
|
54
|
-
:html=>%w{html htm},
|
|
55
|
-
:ppt=>%w{ppt pps pptx odp},
|
|
56
|
-
:package=>%w{gz rar zip bak},
|
|
57
|
-
:picture=>%w{psd jpg png gif eps bmp ico},
|
|
58
|
-
:cad=>%w{dwg dxf},
|
|
59
|
-
:exe=>%w{exe dll},
|
|
60
|
-
:video=>%w{avi wmv mpg mpeg},
|
|
61
|
-
:code=>%w{for cpp c rb java},
|
|
62
|
-
:insel=>%w{ins vee}
|
|
63
|
-
}
|
|
64
|
-
pic=pic_for_exts.find{|pic, extensions|
|
|
65
|
-
extensions.any? { |ext| filetype.sub(/\./,'').downcase==ext}
|
|
66
|
-
}
|
|
67
|
-
image_tag("icons/#{pic.first}.png") if pic
|
|
47
|
+
icon_symbol=FiletypeToIconSymbol[filetype.downcase.sub(/^\./,'')]
|
|
48
|
+
image_tag("icons/#{icon_symbol}.png") if icon_symbol
|
|
68
49
|
end
|
|
69
50
|
|
|
70
51
|
# Returns a link to a backup search engine that could maybe find more results for the same query.
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
# Document class retrieves information from filesystem and the index for any given document.
|
|
2
2
|
class Document
|
|
3
3
|
attr_reader :complete_path
|
|
4
|
-
|
|
4
|
+
attr_writer :index_id
|
|
5
|
+
attr_accessor :user, :score, :matching_content
|
|
5
6
|
|
|
6
7
|
def initialize(path)
|
|
7
8
|
#To ensure @complete_path is an absolute direction.
|
|
@@ -10,8 +11,6 @@ class Document
|
|
|
10
11
|
validate_in_indexed_directory
|
|
11
12
|
end
|
|
12
13
|
|
|
13
|
-
alias_method :to_param, :id
|
|
14
|
-
|
|
15
14
|
#Delegating properties to File::method_name(complete_path)
|
|
16
15
|
[:dirname, :basename, :extname, :size?, :file?, :read, :ext_as_sym].each{|method_name|
|
|
17
16
|
define_method(method_name){File.send(method_name,complete_path)}
|
|
@@ -38,7 +37,7 @@ class Document
|
|
|
38
37
|
# "http://www.mycompany.com/wiki/organigram.odp"
|
|
39
38
|
def alias_path
|
|
40
39
|
original_dir=indexed_directory
|
|
41
|
-
alias_dir=IndexedDirectories[original_dir]
|
|
40
|
+
alias_dir=Picolena::IndexedDirectories[original_dir]
|
|
42
41
|
dirname.sub(original_dir,alias_dir)
|
|
43
42
|
end
|
|
44
43
|
|
|
@@ -50,48 +49,67 @@ class Document
|
|
|
50
49
|
@probably_unique_id||=complete_path.base26_hash
|
|
51
50
|
end
|
|
52
51
|
|
|
53
|
-
# Returns true iff some
|
|
52
|
+
# Returns true iff some PlainTextExtractor has been defined to convert it to plain text.
|
|
54
53
|
# Document.new("presentation.pdf").supported? => true
|
|
55
54
|
# Document.new("presentation.some_weird_extension").supported? => false
|
|
56
55
|
def supported?
|
|
57
|
-
|
|
56
|
+
PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
|
|
58
57
|
end
|
|
59
58
|
|
|
60
59
|
# Retrieves content as it is *now*.
|
|
61
60
|
def content
|
|
62
|
-
|
|
61
|
+
PlainTextExtractor.extract_content_from(complete_path)
|
|
63
62
|
end
|
|
64
63
|
|
|
65
64
|
# Cache à la Google.
|
|
66
65
|
# Returns content as it was at the time it was indexed.
|
|
67
66
|
def cached
|
|
68
|
-
|
|
69
|
-
Finder.index[index_id][:content]
|
|
67
|
+
from_index[:content]
|
|
70
68
|
end
|
|
71
69
|
|
|
70
|
+
# FIXME: Not just date anymore.
|
|
72
71
|
# Returns the last modification date before the document got indexed.
|
|
73
72
|
# Useful to know how old a document is, and to which version the cache corresponds.
|
|
74
73
|
def date
|
|
75
|
-
|
|
76
|
-
|
|
74
|
+
from_index[:date].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def mtime
|
|
78
|
+
from_index[:date].to_i
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Returns language.
|
|
82
|
+
def lang
|
|
83
|
+
from_index[:lang]
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# Returns the id with which the document is indexed.
|
|
87
|
+
def index_id
|
|
88
|
+
@index_id ||= Document.find_by_complete_path(complete_path).index_id
|
|
77
89
|
end
|
|
78
90
|
|
|
79
91
|
private
|
|
80
92
|
|
|
81
|
-
|
|
82
|
-
|
|
93
|
+
# Retrieves the document from the index.
|
|
94
|
+
# Useful to get meta-info about it.
|
|
95
|
+
def from_index
|
|
96
|
+
IndexReader.new[index_id]
|
|
83
97
|
end
|
|
84
98
|
|
|
85
99
|
def self.find_by_unique_id(some_id)
|
|
86
100
|
Finder.new("probably_unique_id:"<<some_id).matching_document
|
|
87
101
|
end
|
|
88
102
|
|
|
103
|
+
def self.find_by_complete_path(complete_path)
|
|
104
|
+
Finder.new('complete_path:"'<<complete_path<<'"').matching_document
|
|
105
|
+
end
|
|
106
|
+
|
|
89
107
|
def in_indexed_directory?
|
|
90
108
|
!indexed_directory.nil?
|
|
91
109
|
end
|
|
92
110
|
|
|
93
111
|
def indexed_directory
|
|
94
|
-
IndexedDirectories.keys.find{|indexed_dir|
|
|
112
|
+
Picolena::IndexedDirectories.keys.find{|indexed_dir|
|
|
95
113
|
dirname.starts_with?(indexed_dir)
|
|
96
114
|
}
|
|
97
115
|
end
|
|
@@ -1,42 +1,36 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
class Finder
|
|
4
|
-
#FIXME: Should not use all those class methods to access index.
|
|
5
|
-
|
|
1
|
+
class Finder
|
|
6
2
|
attr_reader :query
|
|
7
3
|
|
|
8
|
-
def
|
|
4
|
+
def index
|
|
9
5
|
# caching index @@index ||=
|
|
10
6
|
# causes ferret-0.11.6/lib/ferret/index.rb:768: [BUG] Segmentation fault
|
|
11
|
-
|
|
7
|
+
IndexReader.new
|
|
12
8
|
end
|
|
13
9
|
|
|
14
|
-
def initialize(raw_query,page=1,results_per_page=ResultsPerPage)
|
|
15
|
-
|
|
16
|
-
@query = query_parser.parse(convert_to_english(raw_query))
|
|
10
|
+
def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
|
|
11
|
+
@query = Query.extract_from(raw_query)
|
|
17
12
|
@raw_query= raw_query
|
|
18
|
-
|
|
13
|
+
IndexReader.ensure_existence
|
|
19
14
|
@per_page=results_per_page
|
|
20
15
|
@offset=(page.to_i-1)*results_per_page
|
|
21
|
-
|
|
16
|
+
index.should_have_documents
|
|
22
17
|
end
|
|
23
18
|
|
|
24
19
|
def execute!
|
|
25
20
|
@matching_documents=[]
|
|
26
21
|
start=Time.now
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
@matching_documents<<found_doc
|
|
22
|
+
top_docs=index.search(query, :limit => @per_page, :offset=>@offset)
|
|
23
|
+
top_docs.hits.each{|hit|
|
|
24
|
+
index_id,score=hit.doc,hit.score
|
|
25
|
+
begin
|
|
26
|
+
found_doc=Document.new(index[index_id][:complete_path])
|
|
27
|
+
found_doc.matching_content=index.highlight(query, index_id,
|
|
28
|
+
:field => :content, :excerpt_length => 80,
|
|
29
|
+
:pre_tag => "<<", :post_tag => ">>"
|
|
30
|
+
) unless @raw_query=~/^\*+\.\w*$/
|
|
31
|
+
found_doc.score=score
|
|
32
|
+
found_doc.index_id=index_id
|
|
33
|
+
@matching_documents<<found_doc
|
|
40
34
|
rescue Errno::ENOENT
|
|
41
35
|
#"File has been moved/deleted!"
|
|
42
36
|
end
|
|
@@ -44,9 +38,6 @@ class Finder
|
|
|
44
38
|
@executed=true
|
|
45
39
|
@time_needed=Time.now-start
|
|
46
40
|
@total_hits=top_docs.total_hits
|
|
47
|
-
ensure
|
|
48
|
-
#index.close
|
|
49
|
-
end
|
|
50
41
|
end
|
|
51
42
|
|
|
52
43
|
# Returns true if it has been executed.
|
|
@@ -66,17 +57,7 @@ class Finder
|
|
|
66
57
|
}
|
|
67
58
|
}
|
|
68
59
|
|
|
69
|
-
# Returns
|
|
70
|
-
def self.has_index?
|
|
71
|
-
index_filename and File.exists?(index_filename)
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# Returns true if there's at least one document indexed.
|
|
75
|
-
def has_documents?
|
|
76
|
-
Finder.index.size>0
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
# Returns matching document for any given query, if only
|
|
60
|
+
# Returns matching document for any given query only if
|
|
80
61
|
# exactly one document is found.
|
|
81
62
|
# Raises otherwise.
|
|
82
63
|
def matching_document
|
|
@@ -89,42 +70,4 @@ class Finder
|
|
|
89
70
|
raise IndexError, "More than one document found"
|
|
90
71
|
end
|
|
91
72
|
end
|
|
92
|
-
|
|
93
|
-
private
|
|
94
|
-
|
|
95
|
-
# Convert query keywords to english so they can be parsed by Ferret.
|
|
96
|
-
def convert_to_english(query)
|
|
97
|
-
to_en={
|
|
98
|
-
/\b#{:AND.l}\b/=>'AND',
|
|
99
|
-
/\b#{:OR.l}\b/=>'OR',
|
|
100
|
-
/\b#{:NOT.l}\b/=>'NOT',
|
|
101
|
-
/(#{:filetype.l}):/=>'filetype:',
|
|
102
|
-
/#{:content.l}:/ => 'content:',
|
|
103
|
-
/#{:date.l}:/ => 'date:',
|
|
104
|
-
/\b#{:LIKE.l}\s+(\S+)/=>'\1~'
|
|
105
|
-
}
|
|
106
|
-
to_en.inject(query){|mem,non_english_to_english_keyword|
|
|
107
|
-
mem.gsub(*non_english_to_english_keyword)
|
|
108
|
-
}
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
def self.index_filename
|
|
112
|
-
Dir.glob(File.join(IndexSavePath,'*.cfs')).first
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
def self.ensure_that_index_exists_on_disk
|
|
116
|
-
force_index_creation unless has_index? or RAILS_ENV=="production"
|
|
117
|
-
end
|
|
118
|
-
|
|
119
|
-
def self.force_index_creation
|
|
120
|
-
create_index(IndexedDirectories.keys)
|
|
121
|
-
end
|
|
122
|
-
|
|
123
|
-
def self.delete_index
|
|
124
|
-
FileUtils.rm(Dir.glob(File.join(IndexSavePath,'*.cfs'))) if has_index?
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
def validate_that_index_has_documents
|
|
128
|
-
raise IndexError, "no document found" unless has_documents?
|
|
129
|
-
end
|
|
130
|
-
end
|
|
73
|
+
end
|