picolena 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. data/History.txt +8 -0
  2. data/Manifest.txt +28 -15
  3. data/README.txt +1 -1
  4. data/config/files_to_clean +2 -1
  5. data/config/hoe.rb +1 -1
  6. data/lib/picolena/config/basic.rb +46 -35
  7. data/lib/picolena/config/icons_and_filetypes.yml +69 -0
  8. data/lib/picolena/config/indexed_directories.yml +1 -1
  9. data/lib/picolena/picolena_generator.rb +3 -1
  10. data/lib/picolena/templates/app/controllers/application.rb +2 -2
  11. data/lib/picolena/templates/app/controllers/documents_controller.rb +1 -1
  12. data/lib/picolena/templates/app/helpers/documents_helper.rb +7 -26
  13. data/lib/picolena/templates/app/models/document.rb +32 -14
  14. data/lib/picolena/templates/app/models/finder.rb +21 -78
  15. data/lib/picolena/templates/app/models/index_reader.rb +56 -0
  16. data/lib/picolena/templates/app/models/index_writer.rb +36 -0
  17. data/lib/picolena/templates/app/models/indexer.rb +142 -0
  18. data/lib/picolena/templates/app/models/plain_text_extractor.rb +122 -0
  19. data/lib/picolena/templates/app/models/query.rb +31 -0
  20. data/lib/picolena/templates/app/views/documents/_document.html.haml +2 -2
  21. data/lib/picolena/templates/config/environment.rb +2 -2
  22. data/lib/picolena/templates/config/environments/development.rb +1 -1
  23. data/lib/picolena/templates/config/environments/production.rb +1 -1
  24. data/lib/picolena/templates/config/environments/test.rb +1 -1
  25. data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +2 -0
  26. data/lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb +3 -1
  27. data/lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb +6 -0
  28. data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb +2 -0
  29. data/lib/picolena/templates/config/initializers/006_load_icons.rb +8 -0
  30. data/lib/picolena/templates/lib/core_exts.rb +20 -1
  31. data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +72 -0
  32. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/adobe.pdf.rb +3 -3
  33. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/html.rb +2 -2
  34. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.excel.rb +4 -4
  35. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.powerpoint.rb +4 -4
  36. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.rtf.rb +2 -2
  37. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.word.rb +4 -4
  38. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.presentation.rb +2 -2
  39. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.spreadsheet.rb +2 -2
  40. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.text.rb +2 -2
  41. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/plain_text.rb +3 -3
  42. data/lib/picolena/templates/lib/tasks/index.rake +4 -6
  43. data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
  44. data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +5 -5
  45. data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +1 -1
  46. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -13
  47. data/lib/picolena/templates/spec/models/document_spec.rb +1 -1
  48. data/lib/picolena/templates/spec/models/finder_spec.rb +5 -70
  49. data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +6 -2
  50. data/lib/picolena/templates/spec/models/index_directories_spec.rb +4 -4
  51. data/lib/picolena/templates/spec/models/index_reader_spec.rb +7 -0
  52. data/lib/picolena/templates/spec/models/index_writer_spec.rb +7 -0
  53. data/lib/picolena/templates/spec/models/indexer_spec.rb +7 -0
  54. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +42 -0
  55. data/lib/picolena/templates/spec/models/query_spec.rb +56 -0
  56. data/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe +42 -0
  57. data/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo +83 -0
  58. data/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca +86 -0
  59. data/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare +90 -0
  60. data/lib/picolena/version.rb +1 -1
  61. data/tasks/hack.rake +2 -1
  62. data/website/index.html +2 -2
  63. data.tar.gz.sig +0 -0
  64. metadata +30 -17
  65. metadata.gz.sig +0 -0
  66. data/lib/picolena/templates/config/initializers/004_load_filters.rb +0 -6
  67. data/lib/picolena/templates/lib/ff.rb +0 -117
  68. data/lib/picolena/templates/lib/filter.rb +0 -75
  69. data/lib/picolena/templates/lib/filter_DSL.rb +0 -77
  70. data/lib/picolena/templates/spec/models/filters_spec.rb +0 -30
data/History.txt CHANGED
@@ -1,3 +1,11 @@
1
+ == 0.1.2 2008-04-20
2
+
3
+ * major enhancement:
4
+ * complete Indexer & Index rewrite
5
+ * new DSL syntax
6
+ * multi-threaded Indexer
7
+
8
+
1
9
  == 0.1.1 2008-04-12
2
10
 
3
11
  * major enhancement:
data/Manifest.txt CHANGED
@@ -9,6 +9,7 @@ config/hoe.rb
9
9
  config/requirements.rb
10
10
  lib/picolena/USAGE
11
11
  lib/picolena/config/basic.rb
12
+ lib/picolena/config/icons_and_filetypes.yml
12
13
  lib/picolena/config/indexed_directories.yml
13
14
  lib/picolena/config/title_and_names_and_links.yml
14
15
  lib/picolena/config/white_list_ip.yml
@@ -21,6 +22,11 @@ lib/picolena/templates/app/helpers/application_helper.rb
21
22
  lib/picolena/templates/app/helpers/documents_helper.rb
22
23
  lib/picolena/templates/app/models/document.rb
23
24
  lib/picolena/templates/app/models/finder.rb
25
+ lib/picolena/templates/app/models/index_reader.rb
26
+ lib/picolena/templates/app/models/index_writer.rb
27
+ lib/picolena/templates/app/models/indexer.rb
28
+ lib/picolena/templates/app/models/plain_text_extractor.rb
29
+ lib/picolena/templates/app/models/query.rb
24
30
  lib/picolena/templates/app/views/documents/_document.html.haml
25
31
  lib/picolena/templates/app/views/documents/cached.html.haml
26
32
  lib/picolena/templates/app/views/documents/content.html.haml
@@ -35,27 +41,26 @@ lib/picolena/templates/config/environments/test.rb
35
41
  lib/picolena/templates/config/initializers/001_load_custom_config.rb
36
42
  lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb
37
43
  lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
38
- lib/picolena/templates/config/initializers/004_load_filters.rb
44
+ lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
39
45
  lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
46
+ lib/picolena/templates/config/initializers/006_load_icons.rb
40
47
  lib/picolena/templates/config/routes.rb
41
48
  lib/picolena/templates/lang/ui/de.yml
42
49
  lib/picolena/templates/lang/ui/en.yml
43
50
  lib/picolena/templates/lang/ui/es.yml
44
51
  lib/picolena/templates/lang/ui/fr.yml
45
52
  lib/picolena/templates/lib/core_exts.rb
46
- lib/picolena/templates/lib/ff.rb
47
- lib/picolena/templates/lib/filter.rb
48
- lib/picolena/templates/lib/filter_DSL.rb
49
- lib/picolena/templates/lib/filters/adobe.pdf.rb
50
- lib/picolena/templates/lib/filters/html.rb
51
- lib/picolena/templates/lib/filters/ms.excel.rb
52
- lib/picolena/templates/lib/filters/ms.powerpoint.rb
53
- lib/picolena/templates/lib/filters/ms.rtf.rb
54
- lib/picolena/templates/lib/filters/ms.word.rb
55
- lib/picolena/templates/lib/filters/opendocument.presentation.rb
56
- lib/picolena/templates/lib/filters/opendocument.spreadsheet.rb
57
- lib/picolena/templates/lib/filters/opendocument.text.rb
58
- lib/picolena/templates/lib/filters/plain_text.rb
53
+ lib/picolena/templates/lib/plain_text_extractor_DSL.rb
54
+ lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
55
+ lib/picolena/templates/lib/plain_text_extractors/html.rb
56
+ lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
57
+ lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
58
+ lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb
59
+ lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
60
+ lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb
61
+ lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
62
+ lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
63
+ lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
59
64
  lib/picolena/templates/lib/tasks/annotations.rake
60
65
  lib/picolena/templates/lib/tasks/index.rake
61
66
  lib/picolena/templates/lib/tasks/install_dependencies.rake
@@ -112,10 +117,14 @@ lib/picolena/templates/spec/helpers/application_helper_spec.rb
112
117
  lib/picolena/templates/spec/helpers/documents_helper_spec.rb
113
118
  lib/picolena/templates/spec/models/basic_finder_spec.rb
114
119
  lib/picolena/templates/spec/models/document_spec.rb
115
- lib/picolena/templates/spec/models/filters_spec.rb
116
120
  lib/picolena/templates/spec/models/finder_spec.rb
117
121
  lib/picolena/templates/spec/models/host_indexing_system_spec.rb
118
122
  lib/picolena/templates/spec/models/index_directories_spec.rb
123
+ lib/picolena/templates/spec/models/index_reader_spec.rb
124
+ lib/picolena/templates/spec/models/index_writer_spec.rb
125
+ lib/picolena/templates/spec/models/indexer_spec.rb
126
+ lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
127
+ lib/picolena/templates/spec/models/query_spec.rb
119
128
  lib/picolena/templates/spec/rcov.opts
120
129
  lib/picolena/templates/spec/spec.opts
121
130
  lib/picolena/templates/spec/spec_helper.rb
@@ -140,6 +149,10 @@ lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-1.txt
140
149
  lib/picolena/templates/spec/test_dirs/indexed/different_encodings/iso-8859-15.txt
141
150
  lib/picolena/templates/spec/test_dirs/indexed/different_encodings/utf-8.txt
142
151
  lib/picolena/templates/spec/test_dirs/indexed/just_one_doc/for_test.txt
152
+ lib/picolena/templates/spec/test_dirs/indexed/lang/goethe
153
+ lib/picolena/templates/spec/test_dirs/indexed/lang/hugo
154
+ lib/picolena/templates/spec/test_dirs/indexed/lang/lorca
155
+ lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare
143
156
  lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf
144
157
  lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc
145
158
  lib/picolena/templates/spec/test_dirs/indexed/others/'weird'filename.txt
data/README.txt CHANGED
@@ -14,7 +14,7 @@ Picolena is a lightweight ferret-powered documents search engine written in Ruby
14
14
 
15
15
  Picolena has many advantages:
16
16
 
17
- * it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files will full text search, and offers a very easy way to add new filters to index other filetype.
17
+ * it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files will full text search, and offers a very easy way to add new extractors to index other filetype.
18
18
  * it is free as in free beer and as in free speech
19
19
  * thanks to Ferret, it is very fast
20
20
  * it keeps your data private. By default, only the computer on which it is installed can get access to the search engine. Other IP addresses can then be added to a white list.
data/config/files_to_clean CHANGED
@@ -4,9 +4,10 @@ lib/picolena/templates/config/custom/picolena.rb
4
4
  lib/picolena/templates/config/custom/indexed_directories.yml
5
5
  lib/picolena/templates/config/custom/white_list_ip.yml
6
6
  lib/picolena/templates/config/custom/title_and_names_and_links.yml
7
+ lib/picolena/templates/config/custom/icons_and_filetypes.yml
7
8
  lib/picolena/templates/log
8
9
  lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
9
10
  lib/picolena/templates/tmp
10
11
  lib/picolena/templates/vendor
11
12
  lib/picolena/templates/coverage
12
- lib/picolena/templates/doc
13
+ lib/picolena/templates/doc
data/config/hoe.rb CHANGED
@@ -61,7 +61,7 @@ hoe = Hoe.new(GEM_NAME, VERS) do |p|
61
61
  # == Optional
62
62
  p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
63
63
  p.extra_deps = [ ['rails', '>= 2.0.2'],
64
- # Ferret 0.11.6 is not yet available for win32
64
+ # Ferret 0.11.6 is not yet available for win32 (and will never be, it is a fix release for *nix)
65
65
  # FIXME: How to require 0.11.6 for *nix and 0.11.5 for win32?
66
66
  ['ferret', '>= 0.11.5'],
67
67
  ['haml', '>= 1.8.2'],
data/lib/picolena/config/basic.rb CHANGED
@@ -1,35 +1,46 @@
1
- # Specify indexes path.
2
- # Storage should be sufficient in order to store all indexed data.
3
- IndexesSavePath=File.join(RAILS_ROOT, 'tmp/ferret_indexes/')
4
-
5
-
6
- # Which language should be used?
7
- # English (:en), German (:de), French (:fr) and Spanish (:es) are currently supported
8
- # English is chosen by default.
9
- # If you'd like to use another language, you can find templates in #{RAILS_ROOT}/lang/ui,
10
- # then add your own language in this directory, and modify this line:
11
- Globalite.language = :en
12
-
13
-
14
- # Specify which locale should be used by Ferret
15
- Ferret.locale = "en_US.UTF-8"
16
-
17
-
18
- # Results per page
19
- ResultsPerPage = 10
20
-
21
-
22
- # Length of "probably unique id" 's
23
- # Those id's are used to characterize every document, thus allowing tiny URLs in Controllers
24
- # HashLength = 10
25
- # Document.new("whatever.pdf").probably_unique_id => "bbuxhynait"
26
- # HashLength = 20
27
- # Document.new("whatever.pdf").probably_unique_id => "jfzjkyfkfkbbuxhynait"
28
- # The more documents you have, the bigger HashLength should be in order to avoid collisions.
29
- # It would not be wise (and specs won't pass) to specify HashLength smaller than 10.
30
- HashLength = 10
31
-
32
-
33
- # Specify the default Levenshtein distance when using FuzzyQuery
34
- # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
35
- Ferret::Search::FuzzyQuery.default_min_similarity=0.6
1
+ module Picolena
2
+ # Specify indexes path.
3
+ # Storage should be sufficient in order to store all indexed data.
4
+ IndexesSavePath=File.join(RAILS_ROOT, 'tmp/ferret_indexes/')
5
+
6
+
7
+ # Which language should be used?
8
+ # English (:en), German (:de), French (:fr) and Spanish (:es) are currently supported
9
+ # English is chosen by default.
10
+ # If you'd like to use another language, you can find templates in #{RAILS_ROOT}/lang/ui,
11
+ # then add your own language in this directory, and modify this line:
12
+ Globalite.language = :en
13
+
14
+
15
+ # Is more than one language used in indexed documents?
16
+ # Picolena can try to recognise the language used, and save it in the index.
17
+ # It is then possible to look for documents according to their language.
18
+ #
19
+ # If every document is written in the same language, turning UseLanguageRecognition to false
20
+ # will speed up the indexing process
21
+ UseLanguageRecognition = true
22
+
23
+ # Specify which locale should be used by Ferret
24
+ Ferret.locale = "en_US.UTF-8"
25
+
26
+
27
+ # Results per page
28
+ ResultsPerPage = 10
29
+
30
+
31
+ # Length of "probably unique id" 's
32
+ # Those id's are used to characterize every document, thus allowing tiny URLs in Controllers
33
+ # HashLength = 10
34
+ # Document.new("whatever.pdf").probably_unique_id => "bbuxhynait"
35
+ # HashLength = 20
36
+ # Document.new("whatever.pdf").probably_unique_id => "jfzjkyfkfkbbuxhynait"
37
+ # The more documents you have, the bigger HashLength should be in order to avoid collisions.
38
+ # It would not be wise (and specs won't pass) to specify HashLength smaller than 10.
39
+ HashLength = 10
40
+
41
+
42
+ # Specify the default Levenshtein distance when using FuzzyQuery
43
+ # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
44
+ Ferret::Search::FuzzyQuery.default_min_similarity=0.6
45
+ Analyzer=Ferret::Analysis::StandardAnalyzer.new
46
+ end
data/lib/picolena/config/icons_and_filetypes.yml ADDED
@@ -0,0 +1,69 @@
1
+ video:
2
+ avi
3
+ wmv
4
+ mpg
5
+ mpeg
6
+ ogg:
7
+ mp3
8
+ ogg
9
+ wma
10
+ wav
11
+ wmv
12
+ tee
13
+ txt:
14
+ txt
15
+ text
16
+ tex
17
+ bib
18
+ log
19
+ ini
20
+ no_extension
21
+ doc:
22
+ doc
23
+ odt
24
+ rtf
25
+ dot
26
+ docx
27
+ dotx
28
+ insel:
29
+ ins
30
+ vee
31
+ ppt:
32
+ ppt
33
+ pps
34
+ pptx
35
+ odp
36
+ pdf:
37
+ pdf
38
+ package:
39
+ gz
40
+ rar
41
+ zip
42
+ bak
43
+ code:
44
+ for
45
+ cpp
46
+ c
47
+ rb
48
+ java
49
+ html:
50
+ html
51
+ htm
52
+ xls:
53
+ xls
54
+ xlsx
55
+ ods
56
+ picture:
57
+ psd
58
+ jpg
59
+ png
60
+ gif
61
+ eps
62
+ bmp
63
+ ico
64
+ cad:
65
+ dwg
66
+ dxf
67
+ exe:
68
+ exe
69
+ dll
data/lib/picolena/config/indexed_directories.yml CHANGED
@@ -5,6 +5,6 @@ development:
5
5
  #alias path could be any smb, http, ftp or local directory that is available to the end-user.
6
6
  <%= directories_to_index %>
7
7
  test:
8
- "spec/test_dirs/indexed": "http://picolena.devjavu.com/browser/trunk/spec/test_dirs/indexed"
8
+ "spec/test_dirs/indexed": "http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed"
9
9
  production:
10
10
  <%= directories_to_index %>
data/lib/picolena/picolena_generator.rb CHANGED
@@ -62,6 +62,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
62
62
  m.file '../config/basic.rb', 'config/custom/picolena.rb'
63
63
  m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
64
64
  m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
65
+ m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
65
66
 
66
67
  # README, License & Rakefile
67
68
  m.file 'MIT-LICENSE', 'LICENSE'
@@ -135,7 +136,7 @@ EOS
135
136
  doc
136
137
  lang/ui
137
138
  lib
138
- lib/filters
139
+ lib/plain_text_extractors
139
140
  lib/tasks
140
141
  log
141
142
  public
@@ -155,6 +156,7 @@ EOS
155
156
  spec/test_dirs/indexed/basic
156
157
  spec/test_dirs/indexed/different_encodings
157
158
  spec/test_dirs/indexed/just_one_doc
159
+ spec/test_dirs/indexed/lang
158
160
  spec/test_dirs/indexed/literature
159
161
  spec/test_dirs/indexed/others
160
162
  spec/test_dirs/indexed/others/nested
data/lib/picolena/templates/app/controllers/application.rb CHANGED
@@ -23,9 +23,9 @@ class ApplicationController < ActionController::Base
23
23
  # Tries to match remote IP address with the white list defined in config/custom/white_list_ip.yml
24
24
  # Redirects to :access_denied if the remote IP is not white listed.
25
25
  def should_only_be_available_for_white_list_IPs
26
- unless request.remote_ip =~ WhiteListIPs
26
+ unless request.remote_ip =~ Picolena::WhiteListIPs
27
27
  redirect_to :controller => 'application', :action=>'access_denied'
28
28
  return false
29
29
  end
30
30
  end
31
- end
31
+ end
data/lib/picolena/templates/app/controllers/documents_controller.rb CHANGED
@@ -24,7 +24,7 @@ class DocumentsController < ApplicationController
24
24
  page=params[:page]||1
25
25
  finder=Finder.new(@query,page)
26
26
  finder.execute!
27
- pager=::Paginator.new(finder.total_hits, ResultsPerPage) do
27
+ pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
28
28
  finder.matching_documents
29
29
  end
30
30
  @matching_documents=pager.page(page)
data/lib/picolena/templates/app/helpers/documents_helper.rb CHANGED
@@ -3,13 +3,13 @@ module DocumentsHelper
3
3
  def nothing_found?
4
4
  @matching_documents.nil? or @matching_documents.entries.empty?
5
5
  end
6
-
6
+
7
7
  # Very basic pagination.
8
8
  # Provides liks to Next, Prev and FirstPage when needed.
9
9
  def should_paginate(page,query)
10
- [(link_to("&larr;&larr;", :action => :show, :id => query, :page => 1) if page.number>2),
11
- (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
12
- (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
10
+ [(link_to("&larr;&larr;", :action => :show, :id => query, :page => 1) if page.number>2),
11
+ (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
12
+ (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
13
13
  end
14
14
 
15
15
  # Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
@@ -30,7 +30,7 @@ module DocumentsHelper
30
30
  content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
31
31
  end
32
32
 
33
- # When possible, highlights content of the document that match the query.
33
+ # When possible, highlights content of the document that matches the query.
34
34
  def highlight_matching_content(document)
35
35
  content_tag(:ul,document.matching_content.collect{|sentence|
36
36
  content_tag(:li,h(sentence).gsub(/&lt;&lt;(.*?)&gt;&gt;/,'<strong>\1</strong>').gsub(/\v|\f/,''))
@@ -43,28 +43,9 @@ module DocumentsHelper
43
43
  end
44
44
 
45
45
  # Returns the location (if avaible) of the filetype icon.
46
- # TODO: Move this hash to a .yml config file.
47
46
  def icon_for(filetype)
48
- pic_for_exts={
49
- :xls=>%w{xls xlsx ods},
50
- :doc=>%w{doc odt rtf dot docx dotx},
51
- :pdf=>%w{pdf},
52
- :txt=>%w{txt text tex bib log ini no_extension},
53
- :ogg=>%w{mp3 ogg wma wav wmv tee},
54
- :html=>%w{html htm},
55
- :ppt=>%w{ppt pps pptx odp},
56
- :package=>%w{gz rar zip bak},
57
- :picture=>%w{psd jpg png gif eps bmp ico},
58
- :cad=>%w{dwg dxf},
59
- :exe=>%w{exe dll},
60
- :video=>%w{avi wmv mpg mpeg},
61
- :code=>%w{for cpp c rb java},
62
- :insel=>%w{ins vee}
63
- }
64
- pic=pic_for_exts.find{|pic, extensions|
65
- extensions.any? { |ext| filetype.sub(/\./,'').downcase==ext}
66
- }
67
- image_tag("icons/#{pic.first}.png") if pic
47
+ icon_symbol=FiletypeToIconSymbol[filetype.downcase.sub(/^\./,'')]
48
+ image_tag("icons/#{icon_symbol}.png") if icon_symbol
68
49
  end
69
50
 
70
51
  # Returns a link to a backup search engine that could maybe find more results for the same query.
data/lib/picolena/templates/app/models/document.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  # Document class retrieves information from filesystem and the index for any given document.
2
2
  class Document
3
3
  attr_reader :complete_path
4
- attr_accessor :user, :score, :matching_content, :index_id
4
+ attr_writer :index_id
5
+ attr_accessor :user, :score, :matching_content
5
6
 
6
7
  def initialize(path)
7
8
  #To ensure @complete_path is an absolute direction.
@@ -10,8 +11,6 @@ class Document
10
11
  validate_in_indexed_directory
11
12
  end
12
13
 
13
- alias_method :to_param, :id
14
-
15
14
  #Delegating properties to File::method_name(complete_path)
16
15
  [:dirname, :basename, :extname, :size?, :file?, :read, :ext_as_sym].each{|method_name|
17
16
  define_method(method_name){File.send(method_name,complete_path)}
@@ -38,7 +37,7 @@ class Document
38
37
  # "http://www.mycompany.com/wiki/organigram.odp"
39
38
  def alias_path
40
39
  original_dir=indexed_directory
41
- alias_dir=IndexedDirectories[original_dir]
40
+ alias_dir=Picolena::IndexedDirectories[original_dir]
42
41
  dirname.sub(original_dir,alias_dir)
43
42
  end
44
43
 
@@ -50,48 +49,67 @@ class Document
50
49
  @probably_unique_id||=complete_path.base26_hash
51
50
  end
52
51
 
53
- # Returns true iff some Filter has been defined to convert it to plain text.
52
+ # Returns true iff some PlainTextExtractor has been defined to convert it to plain text.
54
53
  # Document.new("presentation.pdf").supported? => true
55
54
  # Document.new("presentation.some_weird_extension").supported? => false
56
55
  def supported?
57
- PlainText.supported_extensions.include?(self.ext_as_sym)
56
+ PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
58
57
  end
59
58
 
60
59
  # Retrieves content as it is *now*.
61
60
  def content
62
- PlainText.extract_content_from(complete_path)
61
+ PlainTextExtractor.extract_content_from(complete_path)
63
62
  end
64
63
 
65
64
  # Cache à la Google.
66
65
  # Returns content as it was at the time it was indexed.
67
66
  def cached
68
- get_index_id! unless index_id
69
- Finder.index[index_id][:content]
67
+ from_index[:content]
70
68
  end
71
69
 
70
+ # FIXME: Not just date anymore.
72
71
  # Returns the last modification date before the document got indexed.
73
72
  # Useful to know how old a document is, and to which version the cache corresponds.
74
73
  def date
75
- get_index_id! unless index_id
76
- Finder.index[index_id][:date].sub(/(\d{4})(\d{2})(\d{2})/,'\1-\2-\3')
74
+ from_index[:date].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
75
+ end
76
+
77
+ def mtime
78
+ from_index[:date].to_i
79
+ end
80
+
81
+ # Returns language.
82
+ def lang
83
+ from_index[:lang]
84
+ end
85
+
86
+ # Returns the id with which the document is indexed.
87
+ def index_id
88
+ @index_id ||= Document.find_by_complete_path(complete_path).index_id
77
89
  end
78
90
 
79
91
  private
80
92
 
81
- def get_index_id!
82
- @index_id = Document.find_by_unique_id(probably_unique_id).index_id
93
+ # Retrieves the document from the index.
94
+ # Useful to get meta-info about it.
95
+ def from_index
96
+ IndexReader.new[index_id]
83
97
  end
84
98
 
85
99
  def self.find_by_unique_id(some_id)
86
100
  Finder.new("probably_unique_id:"<<some_id).matching_document
87
101
  end
88
102
 
103
+ def self.find_by_complete_path(complete_path)
104
+ Finder.new('complete_path:"'<<complete_path<<'"').matching_document
105
+ end
106
+
89
107
  def in_indexed_directory?
90
108
  !indexed_directory.nil?
91
109
  end
92
110
 
93
111
  def indexed_directory
94
- IndexedDirectories.keys.find{|indexed_dir|
112
+ Picolena::IndexedDirectories.keys.find{|indexed_dir|
95
113
  dirname.starts_with?(indexed_dir)
96
114
  }
97
115
  end
data/lib/picolena/templates/app/models/finder.rb CHANGED
@@ -1,42 +1,36 @@
1
- require 'ff'
2
-
3
- class Finder
4
- #FIXME: Should not use all those class methods to access index.
5
-
1
+ class Finder
6
2
  attr_reader :query
7
3
 
8
- def self.index
4
+ def index
9
5
  # caching index @@index ||=
10
6
  # causes ferret-0.11.6/lib/ferret/index.rb:768: [BUG] Segmentation fault
11
- Ferret::Index::Index.new(:path => IndexSavePath, :analyzer=>Analyzer)
7
+ IndexReader.new
12
8
  end
13
9
 
14
- def initialize(raw_query,page=1,results_per_page=ResultsPerPage)
15
- query_parser = Ferret::QueryParser.new(:fields => [:content, :file, :basename, :filetype, :date], :or_default => false, :analyzer=>Analyzer)
16
- @query = query_parser.parse(convert_to_english(raw_query))
10
+ def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
11
+ @query = Query.extract_from(raw_query)
17
12
  @raw_query= raw_query
18
- Finder.ensure_that_index_exists_on_disk
13
+ IndexReader.ensure_existence
19
14
  @per_page=results_per_page
20
15
  @offset=(page.to_i-1)*results_per_page
21
- validate_that_index_has_documents
16
+ index.should_have_documents
22
17
  end
23
18
 
24
19
  def execute!
25
20
  @matching_documents=[]
26
21
  start=Time.now
27
- begin
28
- top_docs=Finder.index.search(query, :limit => @per_page, :offset=>@offset)
29
- top_docs.hits.each{|hit|
30
- index_id,score=hit.doc,hit.score
31
- begin
32
- found_doc=Document.new(Finder.index[index_id][:complete_path])
33
- found_doc.matching_content=Finder.index.highlight(query, index_id,
34
- :field => :content, :excerpt_length => 80,
35
- :pre_tag => "<<", :post_tag => ">>"
36
- ) unless @raw_query=~/^\*+\.\w*$/
37
- found_doc.score=score
38
- found_doc.index_id=index_id
39
- @matching_documents<<found_doc
22
+ top_docs=index.search(query, :limit => @per_page, :offset=>@offset)
23
+ top_docs.hits.each{|hit|
24
+ index_id,score=hit.doc,hit.score
25
+ begin
26
+ found_doc=Document.new(index[index_id][:complete_path])
27
+ found_doc.matching_content=index.highlight(query, index_id,
28
+ :field => :content, :excerpt_length => 80,
29
+ :pre_tag => "<<", :post_tag => ">>"
30
+ ) unless @raw_query=~/^\*+\.\w*$/
31
+ found_doc.score=score
32
+ found_doc.index_id=index_id
33
+ @matching_documents<<found_doc
40
34
  rescue Errno::ENOENT
41
35
  #"File has been moved/deleted!"
42
36
  end
@@ -44,9 +38,6 @@ class Finder
44
38
  @executed=true
45
39
  @time_needed=Time.now-start
46
40
  @total_hits=top_docs.total_hits
47
- ensure
48
- #index.close
49
- end
50
41
  end
51
42
 
52
43
  # Returns true if it has been executed.
@@ -66,17 +57,7 @@ class Finder
66
57
  }
67
58
  }
68
59
 
69
- # Returns true if index is existing.
70
- def self.has_index?
71
- index_filename and File.exists?(index_filename)
72
- end
73
-
74
- # Returns true if there's at least one document indexed.
75
- def has_documents?
76
- Finder.index.size>0
77
- end
78
-
79
- # Returns matching document for any given query, if only
60
+ # Returns matching document for any given query only if
80
61
  # exactly one document is found.
81
62
  # Raises otherwise.
82
63
  def matching_document
@@ -89,42 +70,4 @@ class Finder
89
70
  raise IndexError, "More than one document found"
90
71
  end
91
72
  end
92
-
93
- private
94
-
95
- # Convert query keywords to english so they can be parsed by Ferret.
96
- def convert_to_english(query)
97
- to_en={
98
- /\b#{:AND.l}\b/=>'AND',
99
- /\b#{:OR.l}\b/=>'OR',
100
- /\b#{:NOT.l}\b/=>'NOT',
101
- /(#{:filetype.l}):/=>'filetype:',
102
- /#{:content.l}:/ => 'content:',
103
- /#{:date.l}:/ => 'date:',
104
- /\b#{:LIKE.l}\s+(\S+)/=>'\1~'
105
- }
106
- to_en.inject(query){|mem,non_english_to_english_keyword|
107
- mem.gsub(*non_english_to_english_keyword)
108
- }
109
- end
110
-
111
- def self.index_filename
112
- Dir.glob(File.join(IndexSavePath,'*.cfs')).first
113
- end
114
-
115
- def self.ensure_that_index_exists_on_disk
116
- force_index_creation unless has_index? or RAILS_ENV=="production"
117
- end
118
-
119
- def self.force_index_creation
120
- create_index(IndexedDirectories.keys)
121
- end
122
-
123
- def self.delete_index
124
- FileUtils.rm(Dir.glob(File.join(IndexSavePath,'*.cfs'))) if has_index?
125
- end
126
-
127
- def validate_that_index_has_documents
128
- raise IndexError, "no document found" unless has_documents?
129
- end
130
- end
73
+ end