picolena 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. data/History.txt +8 -0
  2. data/Manifest.txt +28 -15
  3. data/README.txt +1 -1
  4. data/config/files_to_clean +2 -1
  5. data/config/hoe.rb +1 -1
  6. data/lib/picolena/config/basic.rb +46 -35
  7. data/lib/picolena/config/icons_and_filetypes.yml +69 -0
  8. data/lib/picolena/config/indexed_directories.yml +1 -1
  9. data/lib/picolena/picolena_generator.rb +3 -1
  10. data/lib/picolena/templates/app/controllers/application.rb +2 -2
  11. data/lib/picolena/templates/app/controllers/documents_controller.rb +1 -1
  12. data/lib/picolena/templates/app/helpers/documents_helper.rb +7 -26
  13. data/lib/picolena/templates/app/models/document.rb +32 -14
  14. data/lib/picolena/templates/app/models/finder.rb +21 -78
  15. data/lib/picolena/templates/app/models/index_reader.rb +56 -0
  16. data/lib/picolena/templates/app/models/index_writer.rb +36 -0
  17. data/lib/picolena/templates/app/models/indexer.rb +142 -0
  18. data/lib/picolena/templates/app/models/plain_text_extractor.rb +122 -0
  19. data/lib/picolena/templates/app/models/query.rb +31 -0
  20. data/lib/picolena/templates/app/views/documents/_document.html.haml +2 -2
  21. data/lib/picolena/templates/config/environment.rb +2 -2
  22. data/lib/picolena/templates/config/environments/development.rb +1 -1
  23. data/lib/picolena/templates/config/environments/production.rb +1 -1
  24. data/lib/picolena/templates/config/environments/test.rb +1 -1
  25. data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +2 -0
  26. data/lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb +3 -1
  27. data/lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb +6 -0
  28. data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb +2 -0
  29. data/lib/picolena/templates/config/initializers/006_load_icons.rb +8 -0
  30. data/lib/picolena/templates/lib/core_exts.rb +20 -1
  31. data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +72 -0
  32. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/adobe.pdf.rb +3 -3
  33. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/html.rb +2 -2
  34. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.excel.rb +4 -4
  35. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.powerpoint.rb +4 -4
  36. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.rtf.rb +2 -2
  37. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.word.rb +4 -4
  38. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.presentation.rb +2 -2
  39. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.spreadsheet.rb +2 -2
  40. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.text.rb +2 -2
  41. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/plain_text.rb +3 -3
  42. data/lib/picolena/templates/lib/tasks/index.rake +4 -6
  43. data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
  44. data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +5 -5
  45. data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +1 -1
  46. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -13
  47. data/lib/picolena/templates/spec/models/document_spec.rb +1 -1
  48. data/lib/picolena/templates/spec/models/finder_spec.rb +5 -70
  49. data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +6 -2
  50. data/lib/picolena/templates/spec/models/index_directories_spec.rb +4 -4
  51. data/lib/picolena/templates/spec/models/index_reader_spec.rb +7 -0
  52. data/lib/picolena/templates/spec/models/index_writer_spec.rb +7 -0
  53. data/lib/picolena/templates/spec/models/indexer_spec.rb +7 -0
  54. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +42 -0
  55. data/lib/picolena/templates/spec/models/query_spec.rb +56 -0
  56. data/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe +42 -0
  57. data/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo +83 -0
  58. data/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca +86 -0
  59. data/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare +90 -0
  60. data/lib/picolena/version.rb +1 -1
  61. data/tasks/hack.rake +2 -1
  62. data/website/index.html +2 -2
  63. data.tar.gz.sig +0 -0
  64. metadata +30 -17
  65. metadata.gz.sig +0 -0
  66. data/lib/picolena/templates/config/initializers/004_load_filters.rb +0 -6
  67. data/lib/picolena/templates/lib/ff.rb +0 -117
  68. data/lib/picolena/templates/lib/filter.rb +0 -75
  69. data/lib/picolena/templates/lib/filter_DSL.rb +0 -77
  70. data/lib/picolena/templates/spec/models/filters_spec.rb +0 -30
data/History.txt CHANGED
@@ -1,3 +1,11 @@
1
+ == 0.1.2 2008-04-20
2
+
3
+ * major enhancement:
4
+ * complete Indexer & Index rewrite
5
+ * new DSL syntax
6
+ * multi-threaded Indexer
7
+
8
+
1
9
  == 0.1.1 2008-04-12
2
10
 
3
11
  * major enhancement:
data/Manifest.txt CHANGED
@@ -9,6 +9,7 @@ config/hoe.rb
9
9
  config/requirements.rb
10
10
  lib/picolena/USAGE
11
11
  lib/picolena/config/basic.rb
12
+ lib/picolena/config/icons_and_filetypes.yml
12
13
  lib/picolena/config/indexed_directories.yml
13
14
  lib/picolena/config/title_and_names_and_links.yml
14
15
  lib/picolena/config/white_list_ip.yml
@@ -21,6 +22,11 @@ lib/picolena/templates/app/helpers/application_helper.rb
21
22
  lib/picolena/templates/app/helpers/documents_helper.rb
22
23
  lib/picolena/templates/app/models/document.rb
23
24
  lib/picolena/templates/app/models/finder.rb
25
+ lib/picolena/templates/app/models/index_reader.rb
26
+ lib/picolena/templates/app/models/index_writer.rb
27
+ lib/picolena/templates/app/models/indexer.rb
28
+ lib/picolena/templates/app/models/plain_text_extractor.rb
29
+ lib/picolena/templates/app/models/query.rb
24
30
  lib/picolena/templates/app/views/documents/_document.html.haml
25
31
  lib/picolena/templates/app/views/documents/cached.html.haml
26
32
  lib/picolena/templates/app/views/documents/content.html.haml
@@ -35,27 +41,26 @@ lib/picolena/templates/config/environments/test.rb
35
41
  lib/picolena/templates/config/initializers/001_load_custom_config.rb
36
42
  lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb
37
43
  lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
38
- lib/picolena/templates/config/initializers/004_load_filters.rb
44
+ lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
39
45
  lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
46
+ lib/picolena/templates/config/initializers/006_load_icons.rb
40
47
  lib/picolena/templates/config/routes.rb
41
48
  lib/picolena/templates/lang/ui/de.yml
42
49
  lib/picolena/templates/lang/ui/en.yml
43
50
  lib/picolena/templates/lang/ui/es.yml
44
51
  lib/picolena/templates/lang/ui/fr.yml
45
52
  lib/picolena/templates/lib/core_exts.rb
46
- lib/picolena/templates/lib/ff.rb
47
- lib/picolena/templates/lib/filter.rb
48
- lib/picolena/templates/lib/filter_DSL.rb
49
- lib/picolena/templates/lib/filters/adobe.pdf.rb
50
- lib/picolena/templates/lib/filters/html.rb
51
- lib/picolena/templates/lib/filters/ms.excel.rb
52
- lib/picolena/templates/lib/filters/ms.powerpoint.rb
53
- lib/picolena/templates/lib/filters/ms.rtf.rb
54
- lib/picolena/templates/lib/filters/ms.word.rb
55
- lib/picolena/templates/lib/filters/opendocument.presentation.rb
56
- lib/picolena/templates/lib/filters/opendocument.spreadsheet.rb
57
- lib/picolena/templates/lib/filters/opendocument.text.rb
58
- lib/picolena/templates/lib/filters/plain_text.rb
53
+ lib/picolena/templates/lib/plain_text_extractor_DSL.rb
54
+ lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
55
+ lib/picolena/templates/lib/plain_text_extractors/html.rb
56
+ lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
57
+ lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
58
+ lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb
59
+ lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
60
+ lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb
61
+ lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
62
+ lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
63
+ lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
59
64
  lib/picolena/templates/lib/tasks/annotations.rake
60
65
  lib/picolena/templates/lib/tasks/index.rake
61
66
  lib/picolena/templates/lib/tasks/install_dependencies.rake
@@ -112,10 +117,14 @@ lib/picolena/templates/spec/helpers/application_helper_spec.rb
112
117
  lib/picolena/templates/spec/helpers/documents_helper_spec.rb
113
118
  lib/picolena/templates/spec/models/basic_finder_spec.rb
114
119
  lib/picolena/templates/spec/models/document_spec.rb
115
- lib/picolena/templates/spec/models/filters_spec.rb
116
120
  lib/picolena/templates/spec/models/finder_spec.rb
117
121
  lib/picolena/templates/spec/models/host_indexing_system_spec.rb
118
122
  lib/picolena/templates/spec/models/index_directories_spec.rb
123
+ lib/picolena/templates/spec/models/index_reader_spec.rb
124
+ lib/picolena/templates/spec/models/index_writer_spec.rb
125
+ lib/picolena/templates/spec/models/indexer_spec.rb
126
+ lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
127
+ lib/picolena/templates/spec/models/query_spec.rb
119
128
  lib/picolena/templates/spec/rcov.opts
120
129
  lib/picolena/templates/spec/spec.opts
121
130
  lib/picolena/templates/spec/spec_helper.rb
@@ -140,6 +149,10 @@ lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-1.txt
140
149
  lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-15.txt
141
150
  lib/picolena/templates/spec/test_dirs/indexed/different_encodings/utf-8.txt
142
151
  lib/picolena/templates/spec/test_dirs/indexed/just_one_doc/for_test.txt
152
+ lib/picolena/templates/spec/test_dirs/indexed/lang/goethe
153
+ lib/picolena/templates/spec/test_dirs/indexed/lang/hugo
154
+ lib/picolena/templates/spec/test_dirs/indexed/lang/lorca
155
+ lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare
143
156
  lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf
144
157
  lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc
145
158
  lib/picolena/templates/spec/test_dirs/indexed/others/'weird'filename.txt
data/README.txt CHANGED
@@ -14,7 +14,7 @@ Picolena is a lightweight ferret-powered documents search engine written in Ruby
14
14
 
15
15
  Picolena has many advantages:
16
16
 
17
- * it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files will full text search, and offers a very easy way to add new filters to index other filetype.
17
+ * it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files with full-text search, and offers a very easy way to add new extractors to index other filetypes.
18
18
  * it is free as in free beer and as in free speech
19
19
  * thanks to Ferret, it is very fast
20
20
  * it keeps your data private. By default, only the computer on which it is installed can get access to the search engine. Other IP addresses can then be added to a white list.
@@ -4,9 +4,10 @@ lib/picolena/templates/config/custom/picolena.rb
4
4
  lib/picolena/templates/config/custom/indexed_directories.yml
5
5
  lib/picolena/templates/config/custom/white_list_ip.yml
6
6
  lib/picolena/templates/config/custom/title_and_names_and_links.yml
7
+ lib/picolena/templates/config/custom/icons_and_filetypes.yml
7
8
  lib/picolena/templates/log
8
9
  lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
9
10
  lib/picolena/templates/tmp
10
11
  lib/picolena/templates/vendor
11
12
  lib/picolena/templates/coverage
12
- lib/picolena/templates/doc
13
+ lib/picolena/templates/doc
data/config/hoe.rb CHANGED
@@ -61,7 +61,7 @@ hoe = Hoe.new(GEM_NAME, VERS) do |p|
61
61
  # == Optional
62
62
  p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
63
63
  p.extra_deps = [ ['rails', '>= 2.0.2'],
64
- # Ferret 0.11.6 is not yet available for win32
64
+ # Ferret 0.11.6 is not yet available for win32 (and will never be, it is a fix release for *nix)
65
65
  # FIXME: How to require 0.11.6 for *nix and 0.11.5 for win32?
66
66
  ['ferret', '>= 0.11.5'],
67
67
  ['haml', '>= 1.8.2'],
@@ -1,35 +1,46 @@
1
- # Specify indexes path.
2
- # Storage should be sufficient in order to store all indexed data.
3
- IndexesSavePath=File.join(RAILS_ROOT, 'tmp/ferret_indexes/')
4
-
5
-
6
- # Which language should be used?
7
- # English (:en), German (:de), French (:fr) and Spanish (:es) are currently supported
8
- # English is chosen by default.
9
- # If you'd like to use another language, you can find templates in #{RAILS_ROOT}/lang/ui,
10
- # then add your own language in this directory, and modify this line:
11
- Globalite.language = :en
12
-
13
-
14
- # Specify which locale should be used by Ferret
15
- Ferret.locale = "en_US.UTF-8"
16
-
17
-
18
- # Results per page
19
- ResultsPerPage = 10
20
-
21
-
22
- # Length of "probably unique id" 's
23
- # Those id's are used to characterize every document, thus allowing tiny URLs in Controllers
24
- # HashLength = 10
25
- # Document.new("whatever.pdf").probably_unique_id => "bbuxhynait"
26
- # HashLength = 20
27
- # Document.new("whatever.pdf").probably_unique_id => "jfzjkyfkfkbbuxhynait"
28
- # The more documents you have, the bigger HashLength should be in order to avoid collisions.
29
- # It would not be wise (and specs won't pass) to specify HashLength smaller than 10.
30
- HashLength = 10
31
-
32
-
33
- # Specify the default Levenshtein distance when using FuzzyQuery
34
- # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
35
- Ferret::Search::FuzzyQuery.default_min_similarity=0.6
1
+ module Picolena
2
+ # Specify indexes path.
3
+ # Storage should be sufficient in order to store all indexed data.
4
+ IndexesSavePath=File.join(RAILS_ROOT, 'tmp/ferret_indexes/')
5
+
6
+
7
+ # Which language should be used?
8
+ # English (:en), German (:de), French (:fr) and Spanish (:es) are currently supported
9
+ # English is chosen by default.
10
+ # If you'd like to use another language, you can find templates in #{RAILS_ROOT}/lang/ui,
11
+ # then add your own language in this directory, and modify this line:
12
+ Globalite.language = :en
13
+
14
+
15
+ # Is more than one language used in indexed documents?
16
+ # Picolena can try to recognise the language used, and save it in the index.
17
+ # It is then possible to look for documents according to their language.
18
+ #
19
+ # If every document is written in the same language, turning UseLanguageRecognition to false
20
+ # will speed up the indexing process
21
+ UseLanguageRecognition = true
22
+
23
+ # Specify which locale should be used by Ferret
24
+ Ferret.locale = "en_US.UTF-8"
25
+
26
+
27
+ # Results per page
28
+ ResultsPerPage = 10
29
+
30
+
31
+ # Length of "probably unique id" 's
32
+ # Those id's are used to characterize every document, thus allowing tiny URLs in Controllers
33
+ # HashLength = 10
34
+ # Document.new("whatever.pdf").probably_unique_id => "bbuxhynait"
35
+ # HashLength = 20
36
+ # Document.new("whatever.pdf").probably_unique_id => "jfzjkyfkfkbbuxhynait"
37
+ # The more documents you have, the bigger HashLength should be in order to avoid collisions.
38
+ # It would not be wise (and specs won't pass) to specify HashLength smaller than 10.
39
+ HashLength = 10
40
+
41
+
42
+ # Specify the default Levenshtein distance when using FuzzyQuery
43
+ # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
44
+ Ferret::Search::FuzzyQuery.default_min_similarity=0.6
45
+ Analyzer=Ferret::Analysis::StandardAnalyzer.new
46
+ end
@@ -0,0 +1,69 @@
1
+ video:
2
+ avi
3
+ wmv
4
+ mpg
5
+ mpeg
6
+ ogg:
7
+ mp3
8
+ ogg
9
+ wma
10
+ wav
11
+ wmv
12
+ tee
13
+ txt:
14
+ txt
15
+ text
16
+ tex
17
+ bib
18
+ log
19
+ ini
20
+ no_extension
21
+ doc:
22
+ doc
23
+ odt
24
+ rtf
25
+ dot
26
+ docx
27
+ dotx
28
+ insel:
29
+ ins
30
+ vee
31
+ ppt:
32
+ ppt
33
+ pps
34
+ pptx
35
+ odp
36
+ pdf:
37
+ pdf
38
+ package:
39
+ gz
40
+ rar
41
+ zip
42
+ bak
43
+ code:
44
+ for
45
+ cpp
46
+ c
47
+ rb
48
+ java
49
+ html:
50
+ html
51
+ htm
52
+ xls:
53
+ xls
54
+ xlsx
55
+ ods
56
+ picture:
57
+ psd
58
+ jpg
59
+ png
60
+ gif
61
+ eps
62
+ bmp
63
+ ico
64
+ cad:
65
+ dwg
66
+ dxf
67
+ exe:
68
+ exe
69
+ dll
@@ -5,6 +5,6 @@ development:
5
5
  #alias path could be any smb, http, ftp or local directory that is available to the end-user.
6
6
  <%= directories_to_index %>
7
7
  test:
8
- "spec/test_dirs/indexed": "http://picolena.devjavu.com/browser/trunk/spec/test_dirs/indexed"
8
+ "spec/test_dirs/indexed": "http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed"
9
9
  production:
10
10
  <%= directories_to_index %>
@@ -62,6 +62,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
62
62
  m.file '../config/basic.rb', 'config/custom/picolena.rb'
63
63
  m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
64
64
  m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
65
+ m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
65
66
 
66
67
  # README, License & Rakefile
67
68
  m.file 'MIT-LICENSE', 'LICENSE'
@@ -135,7 +136,7 @@ EOS
135
136
  doc
136
137
  lang/ui
137
138
  lib
138
- lib/filters
139
+ lib/plain_text_extractors
139
140
  lib/tasks
140
141
  log
141
142
  public
@@ -155,6 +156,7 @@ EOS
155
156
  spec/test_dirs/indexed/basic
156
157
  spec/test_dirs/indexed/different_encodings
157
158
  spec/test_dirs/indexed/just_one_doc
159
+ spec/test_dirs/indexed/lang
158
160
  spec/test_dirs/indexed/literature
159
161
  spec/test_dirs/indexed/others
160
162
  spec/test_dirs/indexed/others/nested
@@ -23,9 +23,9 @@ class ApplicationController < ActionController::Base
23
23
  # Tries to match remote IP address with the white list defined in config/custom/white_list_ip.yml
24
24
  # Redirects to :access_denied if the remote IP is not white listed.
25
25
  def should_only_be_available_for_white_list_IPs
26
- unless request.remote_ip =~ WhiteListIPs
26
+ unless request.remote_ip =~ Picolena::WhiteListIPs
27
27
  redirect_to :controller => 'application', :action=>'access_denied'
28
28
  return false
29
29
  end
30
30
  end
31
- end
31
+ end
@@ -24,7 +24,7 @@ class DocumentsController < ApplicationController
24
24
  page=params[:page]||1
25
25
  finder=Finder.new(@query,page)
26
26
  finder.execute!
27
- pager=::Paginator.new(finder.total_hits, ResultsPerPage) do
27
+ pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
28
28
  finder.matching_documents
29
29
  end
30
30
  @matching_documents=pager.page(page)
@@ -3,13 +3,13 @@ module DocumentsHelper
3
3
  def nothing_found?
4
4
  @matching_documents.nil? or @matching_documents.entries.empty?
5
5
  end
6
-
6
+
7
7
  # Very basic pagination.
8
8
  # Provides links to Next, Prev and FirstPage when needed.
9
9
  def should_paginate(page,query)
10
- [(link_to("&larr;&larr;", :action => :show, :id => query, :page => 1) if page.number>2),
11
- (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
12
- (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
10
+ [(link_to("&larr;&larr;", :action => :show, :id => query, :page => 1) if page.number>2),
11
+ (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
12
+ (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
13
13
  end
14
14
 
15
15
  # Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
@@ -30,7 +30,7 @@ module DocumentsHelper
30
30
  content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
31
31
  end
32
32
 
33
- # When possible, highlights content of the document that match the query.
33
+ # When possible, highlights content of the document that matches the query.
34
34
  def highlight_matching_content(document)
35
35
  content_tag(:ul,document.matching_content.collect{|sentence|
36
36
  content_tag(:li,h(sentence).gsub(/&lt;&lt;(.*?)&gt;&gt;/,'<strong>\1</strong>').gsub(/\v|\f/,''))
@@ -43,28 +43,9 @@ module DocumentsHelper
43
43
  end
44
44
 
45
45
  # Returns the location (if available) of the filetype icon.
46
- # TODO: Move this hash to a .yml config file.
47
46
  def icon_for(filetype)
48
- pic_for_exts={
49
- :xls=>%w{xls xlsx ods},
50
- :doc=>%w{doc odt rtf dot docx dotx},
51
- :pdf=>%w{pdf},
52
- :txt=>%w{txt text tex bib log ini no_extension},
53
- :ogg=>%w{mp3 ogg wma wav wmv tee},
54
- :html=>%w{html htm},
55
- :ppt=>%w{ppt pps pptx odp},
56
- :package=>%w{gz rar zip bak},
57
- :picture=>%w{psd jpg png gif eps bmp ico},
58
- :cad=>%w{dwg dxf},
59
- :exe=>%w{exe dll},
60
- :video=>%w{avi wmv mpg mpeg},
61
- :code=>%w{for cpp c rb java},
62
- :insel=>%w{ins vee}
63
- }
64
- pic=pic_for_exts.find{|pic, extensions|
65
- extensions.any? { |ext| filetype.sub(/\./,'').downcase==ext}
66
- }
67
- image_tag("icons/#{pic.first}.png") if pic
47
+ icon_symbol=FiletypeToIconSymbol[filetype.downcase.sub(/^\./,'')]
48
+ image_tag("icons/#{icon_symbol}.png") if icon_symbol
68
49
  end
69
50
 
70
51
  # Returns a link to a backup search engine that could maybe find more results for the same query.
@@ -1,7 +1,8 @@
1
1
  # Document class retrieves information from filesystem and the index for any given document.
2
2
  class Document
3
3
  attr_reader :complete_path
4
- attr_accessor :user, :score, :matching_content, :index_id
4
+ attr_writer :index_id
5
+ attr_accessor :user, :score, :matching_content
5
6
 
6
7
  def initialize(path)
7
8
  # To ensure @complete_path is an absolute path.
@@ -10,8 +11,6 @@ class Document
10
11
  validate_in_indexed_directory
11
12
  end
12
13
 
13
- alias_method :to_param, :id
14
-
15
14
  #Delegating properties to File::method_name(complete_path)
16
15
  [:dirname, :basename, :extname, :size?, :file?, :read, :ext_as_sym].each{|method_name|
17
16
  define_method(method_name){File.send(method_name,complete_path)}
@@ -38,7 +37,7 @@ class Document
38
37
  # "http://www.mycompany.com/wiki/organigram.odp"
39
38
  def alias_path
40
39
  original_dir=indexed_directory
41
- alias_dir=IndexedDirectories[original_dir]
40
+ alias_dir=Picolena::IndexedDirectories[original_dir]
42
41
  dirname.sub(original_dir,alias_dir)
43
42
  end
44
43
 
@@ -50,48 +49,67 @@ class Document
50
49
  @probably_unique_id||=complete_path.base26_hash
51
50
  end
52
51
 
53
- # Returns true iff some Filter has been defined to convert it to plain text.
52
+ # Returns true iff some PlainTextExtractor has been defined to convert it to plain text.
54
53
  # Document.new("presentation.pdf").supported? => true
55
54
  # Document.new("presentation.some_weird_extension").supported? => false
56
55
  def supported?
57
- PlainText.supported_extensions.include?(self.ext_as_sym)
56
+ PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
58
57
  end
59
58
 
60
59
  # Retrieves content as it is *now*.
61
60
  def content
62
- PlainText.extract_content_from(complete_path)
61
+ PlainTextExtractor.extract_content_from(complete_path)
63
62
  end
64
63
 
65
64
  # Cache à la Google.
66
65
  # Returns content as it was at the time it was indexed.
67
66
  def cached
68
- get_index_id! unless index_id
69
- Finder.index[index_id][:content]
67
+ from_index[:content]
70
68
  end
71
69
 
70
+ # FIXME: Not just date anymore.
72
71
  # Returns the last modification date before the document got indexed.
73
72
  # Useful to know how old a document is, and to which version the cache corresponds.
74
73
  def date
75
- get_index_id! unless index_id
76
- Finder.index[index_id][:date].sub(/(\d{4})(\d{2})(\d{2})/,'\1-\2-\3')
74
+ from_index[:date].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
75
+ end
76
+
77
+ def mtime
78
+ from_index[:date].to_i
79
+ end
80
+
81
+ # Returns language.
82
+ def lang
83
+ from_index[:lang]
84
+ end
85
+
86
+ # Returns the id with which the document is indexed.
87
+ def index_id
88
+ @index_id ||= Document.find_by_complete_path(complete_path).index_id
77
89
  end
78
90
 
79
91
  private
80
92
 
81
- def get_index_id!
82
- @index_id = Document.find_by_unique_id(probably_unique_id).index_id
93
+ # Retrieves the document from the index.
94
+ # Useful to get meta-info about it.
95
+ def from_index
96
+ IndexReader.new[index_id]
83
97
  end
84
98
 
85
99
  def self.find_by_unique_id(some_id)
86
100
  Finder.new("probably_unique_id:"<<some_id).matching_document
87
101
  end
88
102
 
103
+ def self.find_by_complete_path(complete_path)
104
+ Finder.new('complete_path:"'<<complete_path<<'"').matching_document
105
+ end
106
+
89
107
  def in_indexed_directory?
90
108
  !indexed_directory.nil?
91
109
  end
92
110
 
93
111
  def indexed_directory
94
- IndexedDirectories.keys.find{|indexed_dir|
112
+ Picolena::IndexedDirectories.keys.find{|indexed_dir|
95
113
  dirname.starts_with?(indexed_dir)
96
114
  }
97
115
  end
@@ -1,42 +1,36 @@
1
- require 'ff'
2
-
3
- class Finder
4
- #FIXME: Should not use all those class methods to access index.
5
-
1
+ class Finder
6
2
  attr_reader :query
7
3
 
8
- def self.index
4
+ def index
9
5
  # caching index @@index ||=
10
6
  # causes ferret-0.11.6/lib/ferret/index.rb:768: [BUG] Segmentation fault
11
- Ferret::Index::Index.new(:path => IndexSavePath, :analyzer=>Analyzer)
7
+ IndexReader.new
12
8
  end
13
9
 
14
- def initialize(raw_query,page=1,results_per_page=ResultsPerPage)
15
- query_parser = Ferret::QueryParser.new(:fields => [:content, :file, :basename, :filetype, :date], :or_default => false, :analyzer=>Analyzer)
16
- @query = query_parser.parse(convert_to_english(raw_query))
10
+ def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
11
+ @query = Query.extract_from(raw_query)
17
12
  @raw_query= raw_query
18
- Finder.ensure_that_index_exists_on_disk
13
+ IndexReader.ensure_existence
19
14
  @per_page=results_per_page
20
15
  @offset=(page.to_i-1)*results_per_page
21
- validate_that_index_has_documents
16
+ index.should_have_documents
22
17
  end
23
18
 
24
19
  def execute!
25
20
  @matching_documents=[]
26
21
  start=Time.now
27
- begin
28
- top_docs=Finder.index.search(query, :limit => @per_page, :offset=>@offset)
29
- top_docs.hits.each{|hit|
30
- index_id,score=hit.doc,hit.score
31
- begin
32
- found_doc=Document.new(Finder.index[index_id][:complete_path])
33
- found_doc.matching_content=Finder.index.highlight(query, index_id,
34
- :field => :content, :excerpt_length => 80,
35
- :pre_tag => "<<", :post_tag => ">>"
36
- ) unless @raw_query=~/^\*+\.\w*$/
37
- found_doc.score=score
38
- found_doc.index_id=index_id
39
- @matching_documents<<found_doc
22
+ top_docs=index.search(query, :limit => @per_page, :offset=>@offset)
23
+ top_docs.hits.each{|hit|
24
+ index_id,score=hit.doc,hit.score
25
+ begin
26
+ found_doc=Document.new(index[index_id][:complete_path])
27
+ found_doc.matching_content=index.highlight(query, index_id,
28
+ :field => :content, :excerpt_length => 80,
29
+ :pre_tag => "<<", :post_tag => ">>"
30
+ ) unless @raw_query=~/^\*+\.\w*$/
31
+ found_doc.score=score
32
+ found_doc.index_id=index_id
33
+ @matching_documents<<found_doc
40
34
  rescue Errno::ENOENT
41
35
  #"File has been moved/deleted!"
42
36
  end
@@ -44,9 +38,6 @@ class Finder
44
38
  @executed=true
45
39
  @time_needed=Time.now-start
46
40
  @total_hits=top_docs.total_hits
47
- ensure
48
- #index.close
49
- end
50
41
  end
51
42
 
52
43
  # Returns true if it has been executed.
@@ -66,17 +57,7 @@ class Finder
66
57
  }
67
58
  }
68
59
 
69
- # Returns true if index is existing.
70
- def self.has_index?
71
- index_filename and File.exists?(index_filename)
72
- end
73
-
74
- # Returns true if there's at least one document indexed.
75
- def has_documents?
76
- Finder.index.size>0
77
- end
78
-
79
- # Returns matching document for any given query, if only
60
+ # Returns matching document for any given query only if
80
61
  # exactly one document is found.
81
62
  # Raises otherwise.
82
63
  def matching_document
@@ -89,42 +70,4 @@ class Finder
89
70
  raise IndexError, "More than one document found"
90
71
  end
91
72
  end
92
-
93
- private
94
-
95
- # Convert query keywords to english so they can be parsed by Ferret.
96
- def convert_to_english(query)
97
- to_en={
98
- /\b#{:AND.l}\b/=>'AND',
99
- /\b#{:OR.l}\b/=>'OR',
100
- /\b#{:NOT.l}\b/=>'NOT',
101
- /(#{:filetype.l}):/=>'filetype:',
102
- /#{:content.l}:/ => 'content:',
103
- /#{:date.l}:/ => 'date:',
104
- /\b#{:LIKE.l}\s+(\S+)/=>'\1~'
105
- }
106
- to_en.inject(query){|mem,non_english_to_english_keyword|
107
- mem.gsub(*non_english_to_english_keyword)
108
- }
109
- end
110
-
111
- def self.index_filename
112
- Dir.glob(File.join(IndexSavePath,'*.cfs')).first
113
- end
114
-
115
- def self.ensure_that_index_exists_on_disk
116
- force_index_creation unless has_index? or RAILS_ENV=="production"
117
- end
118
-
119
- def self.force_index_creation
120
- create_index(IndexedDirectories.keys)
121
- end
122
-
123
- def self.delete_index
124
- FileUtils.rm(Dir.glob(File.join(IndexSavePath,'*.cfs'))) if has_index?
125
- end
126
-
127
- def validate_that_index_has_documents
128
- raise IndexError, "no document found" unless has_documents?
129
- end
130
- end
73
+ end