picolena 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +1 -0
  3. data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -3
  4. data/lib/picolena/templates/app/helpers/documents_helper.rb +5 -5
  5. data/lib/picolena/templates/app/models/document.rb +16 -3
  6. data/lib/picolena/templates/app/models/finder.rb +18 -7
  7. data/lib/picolena/templates/app/models/indexer.rb +92 -33
  8. data/lib/picolena/templates/app/models/query.rb +5 -0
  9. data/lib/picolena/templates/app/views/documents/_document.html.haml +9 -8
  10. data/lib/picolena/templates/config/environment.rb +0 -2
  11. data/lib/picolena/templates/config/environments/development.rb +0 -3
  12. data/lib/picolena/templates/config/environments/production.rb +0 -2
  13. data/lib/picolena/templates/config/environments/test.rb +0 -3
  14. data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +3 -0
  15. data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +1 -1
  16. data/lib/picolena/templates/lang/ui/de.yml +2 -2
  17. data/lib/picolena/templates/lang/ui/en.yml +1 -1
  18. data/lib/picolena/templates/lang/ui/es.yml +1 -1
  19. data/lib/picolena/templates/lang/ui/fr.yml +2 -2
  20. data/lib/picolena/templates/lib/core_exts.rb +32 -23
  21. data/lib/picolena/templates/lib/indexer_logger.rb +45 -0
  22. data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +0 -1
  23. data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb +3 -3
  24. data/lib/picolena/templates/lib/tasks/index.rake +6 -1
  25. data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +0 -2
  26. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -0
  27. data/lib/picolena/templates/spec/models/document_spec.rb +6 -0
  28. data/lib/picolena/templates/spec/models/finder_spec.rb +0 -1
  29. data/lib/picolena/templates/spec/models/indexer_spec.rb +9 -0
  30. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +5 -0
  31. data/lib/picolena/templates/spec/models/query_spec.rb +26 -2
  32. data/lib/picolena/version.rb +1 -1
  33. data/website/index.html +1 -1
  34. data.tar.gz.sig +0 -0
  35. metadata +3 -2
  36. metadata.gz.sig +0 -0
data/History.txt CHANGED
@@ -1,3 +1,13 @@
1
+ == 0.1.8 2008-05-08
2
+
3
+ * 2 minor enhancements:
4
+ * New IndexerLogger with basic statistics
5
+ * More specs & documentation.
6
+
7
+ * 2 bug fixes:
8
+ * Binary documents without extension are not considered supported anymore
9
+ * Ensure that index is locked system-wide by using lock file.
10
+
1
11
  == 0.1.7 2008-04-30
2
12
 
3
13
  * 5 minor enhancements:
data/Manifest.txt CHANGED
@@ -50,6 +50,7 @@ lib/picolena/templates/lang/ui/en.yml
50
50
  lib/picolena/templates/lang/ui/es.yml
51
51
  lib/picolena/templates/lang/ui/fr.yml
52
52
  lib/picolena/templates/lib/core_exts.rb
53
+ lib/picolena/templates/lib/indexer_logger.rb
53
54
  lib/picolena/templates/lib/plain_text_extractor_DSL.rb
54
55
  lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
55
56
  lib/picolena/templates/lib/plain_text_extractors/html.rb
@@ -22,9 +22,9 @@ class DocumentsController < ApplicationController
22
22
  def show
23
23
  start=Time.now
24
24
  @query=[params[:id],params.delete(:format)].compact.join('.')
25
- @sort=params[:sort]
25
+ @sort_by=params[:sort_by]
26
26
  page=params[:page]||1
27
- finder=Finder.new(@query,@sort,page)
27
+ finder=Finder.new(@query,@sort_by,page)
28
28
  finder.execute!
29
29
  pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
30
30
  finder.matching_documents
@@ -64,7 +64,7 @@ class DocumentsController < ApplicationController
64
64
 
65
65
  def ensure_index_is_created
66
66
  Indexer.ensure_index_existence
67
- while Indexer.do_not_disturb_while_indexing do
67
+ while Indexer.locked? do
68
68
  sleep 1
69
69
  end
70
70
  end
@@ -6,10 +6,10 @@ module DocumentsHelper
6
6
 
7
7
  # Very basic pagination.
8
8
  # Provides links to Next, Prev and FirstPage when needed.
9
- def should_paginate(page,query, sort)
10
- [(link_to("&larr;&larr;", :action => :show, :id => query, :sort=>sort) if page.number>2),
11
- (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort=>sort) if page.prev?),
12
- (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort=>sort) if page.next?)].compact.join(" | ")
9
+ def should_paginate(page,query, sort_by)
10
+ [(link_to("&larr;&larr;", :action => :show, :id => query, :sort_by=>sort_by) if page.number>2),
11
+ (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort_by=>sort_by) if page.prev?),
12
+ (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort_by=>sort_by) if page.next?)].compact.join(" | ")
13
13
  end
14
14
 
15
15
  # Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
@@ -81,7 +81,7 @@ module DocumentsHelper
81
81
  end
82
82
 
83
83
  def sort_by_date_or_relevance(query)
84
- [link_to_unless_current('By date', document_path(query, :sort=>'by_date')),
84
+ [link_to_unless_current('By date', document_path(query, :sort_by=>'date')),
85
85
  link_to_unless_current('By relevance', document_path(query))].join("&nbsp;")
86
86
  end
87
87
  end
@@ -11,10 +11,18 @@ class Document
11
11
  end
12
12
 
13
13
  #Delegating properties to File::method_name(complete_path)
14
- [:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
14
+ [:dirname, :basename, :extname, :ext_as_sym, :file?, :plain_text?, :size, :ext_as_sym].each{|method_name|
15
15
  define_method(method_name){File.send(method_name,complete_path)}
16
16
  }
17
17
  alias_method :filename, :basename
18
+ alias_method :to_s, :complete_path
19
+
20
+
21
+ def inspect
22
+ [self,("(#{pretty_score})" if @score),("(language:#{language})" if language)].compact.join(" ")
23
+ end
24
+
25
+
18
26
 
19
27
  # Returns filename without extension
20
28
  # "buildings.odt" => "buildings"
@@ -50,7 +58,7 @@ class Document
50
58
  # Document.new("presentation.pdf").supported? => true
51
59
  # Document.new("presentation.some_weird_extension").supported? => false
52
60
  def supported?
53
- PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
61
+ PlainTextExtractor.supported_extensions.include?(self.ext_as_sym) unless ext_as_sym==:no_extension and !plain_text?
54
62
  end
55
63
 
56
64
  # Retrieves content as it is *now*.
@@ -91,6 +99,10 @@ class Document
91
99
  from_index[:language]
92
100
  end
93
101
 
102
+ def pretty_score
103
+ "%3.1f%" % (@score*100)
104
+ end
105
+
94
106
  # Fields that are shared between every document.
95
107
  def self.default_fields_for(complete_path)
96
108
  {
@@ -103,6 +115,7 @@ class Document
103
115
  }
104
116
  end
105
117
 
118
+
106
119
  private
107
120
 
108
121
  # FIXME: Is there a way to easily retrieve doc_id for a given document?
@@ -138,4 +151,4 @@ class Document
138
151
  def validate_in_indexed_directory
139
152
  raise ArgumentError, "required document is not in indexed directory" unless in_indexed_directory?
140
153
  end
141
- end
154
+ end
@@ -2,23 +2,24 @@ class Finder
2
2
  attr_reader :query
3
3
 
4
4
  def index
5
- @@index ||= Indexer.index
5
+ @@index ||= Indexer.index
6
6
  end
7
7
 
8
- def initialize(raw_query,by_date=false, page=1,results_per_page=Picolena::ResultsPerPage)
8
+ def initialize(raw_query,sort_by='relevance', page=1,results_per_page=Picolena::ResultsPerPage)
9
9
  @query = Query.extract_from(raw_query)
10
10
  @raw_query= raw_query
11
11
  Indexer.ensure_index_existence
12
+ reload_index! if should_be_reloaded?
12
13
  @per_page=results_per_page
13
14
  @offset=(page.to_i-1)*results_per_page
14
- @by_date=by_date
15
+ @sort_by=sort_by
15
16
  index_should_have_documents
16
17
  end
17
18
 
18
19
  def execute!
19
20
  @matching_documents=[]
20
21
  start=Time.now
21
- @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @by_date)){|index_id, score|
22
+ @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @sort_by=='date')){|index_id, score|
22
23
  begin
23
24
  found_doc=Document.new(index[index_id][:complete_path])
24
25
  found_doc.matching_content=index.highlight(query, index_id,
@@ -52,11 +53,21 @@ class Finder
52
53
  }
53
54
  }
54
55
 
55
- def self.reload!
56
+ private
57
+
58
+ def reload_index!
59
+ Indexer.close
56
60
  @@index = nil
61
+ @@last_reload = Time.now
57
62
  end
58
63
 
59
- private
64
+ def should_be_reloaded?
65
+ Indexer.reload_file_mtime > last_reload
66
+ end
67
+
68
+ def last_reload
69
+ @@last_reload ||= Time.at(0)
70
+ end
60
71
 
61
72
  def sort_by_date
62
73
  Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
@@ -65,4 +76,4 @@ class Finder
65
76
  def index_should_have_documents
66
77
  raise IndexError, "no document found" unless index.size > 0
67
78
  end
68
- end
79
+ end
@@ -1,63 +1,74 @@
1
+ # Indexer is used to index (duh!) documents contained in IndexedDirectories
2
+ # It can create, update, delete and prune the index, and takes care that only
3
+ # one IndexWriter exists at any given time, even when used in a multi-threaded
4
+ # way.
5
+ require 'indexer_logger'
1
6
  class Indexer
2
7
  # This regexp defines which files should *not* be indexed.
3
8
  @@exclude = /(Thumbs\.db)/
4
9
  # Number of threads that will be used during indexing process
5
10
  @@threads_number = 8
6
-
7
- cattr_reader :do_not_disturb_while_indexing
8
11
 
9
12
  class << self
13
+ # Finds every document included in IndexedDirectories, parses them with
14
+ # PlainTextExtractor and adds them to the index.
15
+ #
16
+ # Updates the index unless remove_first parameter is set to true, in which
17
+ # case it removes the index first before re-creating it.
10
18
  def index_every_directory(remove_first=false)
11
- @@do_not_disturb_while_indexing=true
12
19
  clear! if remove_first
20
+ lock!
13
21
  @from_scratch = remove_first
14
- # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
15
- Finder.reload!
16
- log :debug => "Indexing every directory"
17
- start=Time.now
22
+ logger.start_indexing
18
23
  Picolena::IndexedDirectories.each{|dir, alias_dir|
19
24
  index_directory_with_multithreads(dir)
20
25
  }
21
- log :debug => "Now optimizing index"
26
+ logger.debug "Now optimizing index"
22
27
  index.optimize
23
- @@do_not_disturb_while_indexing=false
24
- log :debug => "Indexing done in #{Time.now-start} s."
28
+ index_time_dbm_file['last']=Time.now._dump
29
+ unlock!
30
+ logger.show_report
25
31
  end
26
32
 
33
+ # Indexes a given directory, using @@threads_number threads.
34
+ # To do so, it retrieves a list of every included document, cuts it in
35
+ # @@threads_number chunks, and creates a new indexing thread for every chunk.
27
36
  def index_directory_with_multithreads(dir)
28
- log :debug => "Indexing #{dir}, #{@@threads_number} threads"
29
-
37
+ logger.debug "Indexing #{dir}, #{@@threads_number} threads"
30
38
  indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
31
39
  File.file?(filename) && filename !~ @@exclude
32
40
  }
33
41
 
34
42
  indexing_list_chunks=indexing_list.in_transposed_slices(@@threads_number)
35
-
36
43
  prepare_multi_threads_environment
37
-
44
+
38
45
  indexing_list_chunks.each_with_thread{|chunk|
39
46
  chunk.each{|complete_path|
40
- last_itime=index_time_dbm_file[complete_path]
41
- if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
47
+ if should_index_this_document?(complete_path) then
42
48
  add_or_update_file(complete_path)
43
49
  else
44
- log :debug => "Identical : #{complete_path}"
50
+ logger.debug "Identical : #{complete_path}"
45
51
  end
46
52
  index_time_dbm_file[complete_path] = Time.now._dump
47
53
  }
48
54
  }
49
55
  end
50
56
 
57
+ # Retrieves content and language from a given document, and adds it to the index.
58
+ # Since Document#probably_unique_id is used as index :key, no document will be added
59
+ # twice to the index, and the old document will just get updated.
60
+ #
61
+ # If for some reason (no content found or no defined PlainTextExtractor), content cannot
62
+ # be found, some basic information about the document (mtime, filename, complete_path)
63
+ # gets indexed anyway.
51
64
  def add_or_update_file(complete_path)
52
- default_fields = Document.default_fields_for(complete_path)
65
+ document = Document.default_fields_for(complete_path)
53
66
  begin
54
- document = PlainTextExtractor.extract_content_and_language_from(complete_path)
67
+ document.merge! PlainTextExtractor.extract_content_and_language_from(complete_path)
55
68
  raise "empty document #{complete_path}" if document[:content].strip.empty?
56
- document.merge! default_fields
57
- log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join
69
+ logger.add_document document
58
70
  rescue => e
59
- log :debug => "\tindexing without content: #{e.message}"
60
- document = default_fields
71
+ logger.reject_document document, e
61
72
  end
62
73
  index << document
63
74
  end
@@ -73,11 +84,9 @@ class Indexer
73
84
  # ensures that a new Index is instantiated next time index is called.
74
85
  def close
75
86
  @@index.close rescue nil
76
- # Ferret will SEGFAULT otherwise.
77
87
  @@index = nil
78
88
  end
79
89
 
80
-
81
90
  # Checks for indexed files that are missing from filesystem
82
91
  # and removes them from index & dbm file.
83
92
  def prune_index
@@ -85,7 +94,7 @@ class Indexer
85
94
  missing_files.each{|filename, itime|
86
95
  index.writer.delete(:complete_path, filename)
87
96
  index_time_dbm_file.delete(filename)
88
- log :debug => "Removed : #{filename}"
97
+ logger.debug "Removed : #{filename}"
89
98
  }
90
99
  index.optimize
91
100
  end
@@ -97,6 +106,7 @@ class Indexer
97
106
  @@index ||= Ferret::Index::Index.new(default_index_params)
98
107
  end
99
108
 
109
+ # Creates the index unless it already exists.
100
110
  def ensure_index_existence
101
111
  index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
102
112
  end
@@ -106,11 +116,66 @@ class Indexer
106
116
  index.size
107
117
  end
108
118
 
119
+ # Returns the time at which the index was last created/updated.
120
+ # Returns "none" if it doesn't exist.
121
+ def last_update
122
+ Time._load(index_time_dbm_file['last']) rescue "none"
123
+ end
124
+
125
+ # Returns the time at which the reload file was last touched.
126
+ # Useful to know if other processes have modified the shared index,
127
+ # and if the Indexer should be reloaded.
128
+ def reload_file_mtime
129
+ touch_reload_file! unless File.exists?(reload_file)
130
+ File.mtime(reload_file)
131
+ end
132
+
133
+ # For a given document, it retrieves the time it was last indexed, compares it to
134
+ # its modification time and returns false unless the file has been
135
+ # modified after the last indexing process.
136
+ def should_index_this_document?(complete_path)
137
+ last_itime=index_time_dbm_file[complete_path]
138
+ @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime)
139
+ end
140
+
141
+ def locked?
142
+ File.exists?(lock_file)
143
+ end
144
+
109
145
  private
110
146
 
147
+ def touch_reload_file!
148
+ FileUtils.touch(reload_file)
149
+ # To ensure that every process can touch reload_file, even if Picolena
150
+ # is launched as a special user.
151
+ FileUtils.chmod(0666, reload_file)
152
+ end
153
+
154
+ def reload_file
155
+ File.join(Picolena::MetaIndexPath,'reload')
156
+ end
157
+
158
+ def lock!
159
+ FileUtils.touch(lock_file)
160
+ end
161
+
162
+ def unlock!
163
+ FileUtils.rm(lock_file)
164
+ # Forces Finder.index to be reloaded.
165
+ touch_reload_file!
166
+ end
167
+
168
+ def lock_file
169
+ File.join(Picolena::MetaIndexPath,'lock')
170
+ end
171
+
172
+ def logger
173
+ @@logger ||= IndexerLogger.new
174
+ end
175
+
111
176
  # Copied from Ferret book, By David Balmain
112
177
  def index_time_dbm_file
113
- @@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
178
+ @@dbm_file ||= DBM.open(File.join(Picolena::MetaIndexPath, 'added_at'))
114
179
  end
115
180
 
116
181
  def index_exists?
@@ -121,12 +186,6 @@ class Indexer
121
186
  Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
122
187
  end
123
188
 
124
- def log(hash)
125
- hash.each{|level,message|
126
- IndexerLogger.send(level,message)
127
- }
128
- end
129
-
130
189
  def default_index_params
131
190
  {
132
191
  :path => Picolena::IndexSavePath,
@@ -4,6 +4,11 @@ class Query
4
4
  def extract_from(raw_query)
5
5
  parser.parse(convert_to_english(raw_query))
6
6
  end
7
+
8
+ # Returns terms related to content. Useful for cache highlighting
9
+ def content_terms(raw_query)
10
+ Query.extract_from(raw_query).terms(Indexer.index.searcher).select{|term| term.field==:content}.collect{|term| term.text}.uniq
11
+ end
7
12
 
8
13
  private
9
14
 
@@ -3,14 +3,15 @@
3
3
  =language_icon_for(document)
4
4
  %small=number_to_percentage(document.score*100, :precision=>1)
5
5
  =highlight_matching_content(document)
6
- %p=link_to_containing_directory(document)
7
- -if document.supported?
8
- %p
6
+ %p
7
+ =link_to_containing_directory(document)
8
+ %br/
9
+ -if document.supported?
9
10
  =link_to_plain_text_content(document)
10
11
  &#45;
11
- =number_to_human_size(document.size)
12
- &#45;
13
- =document.pretty_date
14
- &#45;
15
12
  =link_to_cached_content(document,query)
16
- %hr/
13
+ &#45;
14
+ =number_to_human_size(document.size)
15
+ &#45;
16
+ =document.pretty_date
17
+ %hr/
@@ -7,8 +7,6 @@
7
7
  # Specifies gem version of Rails to use when vendor/rails is not present
8
8
  RAILS_GEM_VERSION = '2.0.2' unless defined? RAILS_GEM_VERSION
9
9
 
10
- IndexerLogger=Logger.new($stdout)
11
-
12
10
  # Bootstrap the Rails environment, frameworks, and default configuration
13
11
  require File.join(File.dirname(__FILE__), 'boot')
14
12
 
@@ -16,6 +16,3 @@ config.action_view.cache_template_extensions = false
16
16
 
17
17
  # Don't care if the mailer can't send
18
18
  config.action_mailer.raise_delivery_errors = false
19
-
20
-
21
- IndexerLogger.level = Logger::DEBUG
@@ -17,5 +17,3 @@ config.action_view.cache_template_loading = true
17
17
 
18
18
  # Disable delivery errors, bad email addresses will be ignored
19
19
  # config.action_mailer.raise_delivery_errors = false
20
-
21
- IndexerLogger.level = Logger::INFO
@@ -20,6 +20,3 @@ config.action_controller.allow_forgery_protection = false
20
20
  # The :test delivery method accumulates sent emails in the
21
21
  # ActionMailer::Base.deliveries array.
22
22
  config.action_mailer.delivery_method = :test
23
-
24
-
25
- IndexerLogger.level = Logger::WARN
@@ -7,4 +7,7 @@ module Picolena
7
7
  }
8
8
 
9
9
  IndexSavePath=File.join(IndexesSavePath,ENV["RAILS_ENV"] || "development")
10
+ FileUtils.mkpath IndexSavePath
11
+ MetaIndexPath= File.join(IndexSavePath,'meta')
12
+ FileUtils.mkpath MetaIndexPath
10
13
  end
@@ -3,4 +3,4 @@ module Picolena
3
3
  YAML.load_file('config/custom/indexing_performance.yml').each_pair{|param, value|
4
4
  IndexingConfiguration[param.to_sym]= value=~/^[\d_]+$/ ? value.to_i : value
5
5
  }
6
- end
6
+ end
@@ -22,5 +22,5 @@ LIKE: WIE
22
22
  filename: filename|file|datei
23
23
  filetype: erweiterung|ext
24
24
  content: inhalt
25
- modified: jahr|zeit|geändert
26
- language: lang|sprache
25
+ modified: jahr|zeit|geändert|geaendert|geandert
26
+ language: lang|sprache
@@ -20,7 +20,7 @@ LIKE: LIKE
20
20
 
21
21
  ## Fields
22
22
  filename: filename|file
23
- filetype: filetype|ext
23
+ filetype: filetype|ext|extension
24
24
  content: content
25
25
  modified: year|date|modified
26
26
  language: lang|language
@@ -20,7 +20,7 @@ LIKE: COMO
20
20
 
21
21
  ## Fields
22
22
  filename: filename|file|archivo
23
- filetype: extensión|ext
23
+ filetype: extensión|ext|extension
24
24
  content: contenido
25
25
  modified: fecha|año|anho|modificado
26
26
  language: lang|idioma
@@ -22,5 +22,5 @@ LIKE: COMME
22
22
  filename: filename|file|fichier
23
23
  filetype: extension|ext
24
24
  content: contenu
25
- modified: année|date|annee|modifie
26
- language: lang|langue
25
+ modified: année|date|annee|modifie|modifié
26
+ language: lang|langue
@@ -1,20 +1,3 @@
1
- class MimeType
2
- @@all=[]
3
- def self.all
4
- @@all
5
- end
6
-
7
- def self.add(exts,mime_name)
8
- all<<new(exts,mime_name)
9
- end
10
-
11
- attr_reader :exts, :name
12
-
13
- def initialize(exts,mime_name)
14
- @exts,@name=exts,mime_name
15
- end
16
- end
17
-
18
1
  class String
19
2
  # Creates a "probably unique" id with the desired length, composed only of lowercase letters.
20
3
  def base26_hash(length=Picolena::HashLength)
@@ -23,6 +6,9 @@ class String
23
6
  end
24
7
 
25
8
  module Enumerable
9
+ # Similar to Enumerable#each, but creates a new thread for each element.
10
+ # Used for the indexer to make it multi-threaded.
11
+ # It ensures that threads are joined together before returning.
26
12
  def each_with_thread(&block)
27
13
  tds=self.collect{|elem|
28
14
  Thread.new(elem) {|elem|
@@ -57,17 +43,31 @@ class Array
57
43
  end
58
44
  end
59
45
 
46
+ class Hash
47
+ def add(category)
48
+ self[category]||={:size=>0}
49
+ self[category][:size]+=1
50
+ end
51
+ end
52
+
60
53
  class File
54
+ # Returns the filetype of filename as a symbol.
55
+ # Returns :no_extension unless an extension is found
56
+ # >> File.ext_as_sym("test.pdf")
57
+ # => :pdf
58
+ # >> File.ext_as_sym("test.tar.gz")
59
+ # => :gz
60
+ # >> File.ext_as_sym("test")
61
+ # => :no_extension
61
62
  def self.ext_as_sym(filename)
62
63
  File.extname(filename).sub(/^\./,'').downcase.to_sym rescue :no_extension
63
64
  end
64
65
 
65
- def self.mime(filename)
66
- ext=ext_as_sym(filename)
67
- m=MimeType.all.find{|m| m.exts.include?(ext)}
68
- m ? m.name : 'application/octet-stream'
69
- end
70
-
66
+ # Returns a probable encoding for a given plain text file
67
+ # If source is a html file, it parses for metadata to retrieve encoding,
68
+ # and uses file -i otherwise.
69
+ # Returns iso-8859-15 instead of iso-8859-1, to be sure € char can be
70
+ # encoded
71
71
  def self.encoding(source)
72
72
  parse_for_charset="grep -io charset=[a-z0-9\\-]* | sed 's/charset=//i'"
73
73
  if File.extname(source)[0,4]==".htm" then
@@ -86,9 +86,18 @@ class File
86
86
  end
87
87
  end
88
88
 
89
+ # Returns the content of a file and removes it after.
90
+ # Could be used to read temporary output file written by a PlainTextExtractor.
89
91
  def self.read_and_remove(filename)
90
92
  content=read(filename)
91
93
  FileUtils.rm filename, :force=>true
92
94
  content
93
95
  end
96
+
97
+ # Returns nil unless filename is a plain text file.
98
+ # It requires file command.
99
+ # NOTE: What to use for Win32?
100
+ def self.plain_text?(filename)
101
+ %x{file -i "#{filename}"} =~ /: text\//
102
+ end
94
103
  end
@@ -0,0 +1,45 @@
1
+ class IndexerLogger<Logger
2
+ def initialize
3
+ super($stdout)
4
+ #FIXME: Should be defined in config/environments/*.rb
5
+ levels={
6
+ "development"=>Logger::DEBUG,
7
+ "production" =>Logger::INFO,
8
+ "test" =>Logger::WARN
9
+ }
10
+ @level=levels[RAILS_ENV]
11
+ @found_languages={}
12
+ @supported_filetypes={}
13
+ @unsupported_filetypes={}
14
+ end
15
+
16
+ def start_indexing
17
+ @start_time=Time.now
18
+ debug "Indexing every directory"
19
+ end
20
+
21
+ def add_document(document)
22
+ debug ["Added : #{document[:complete_path]}",document[:language] && " ("<<document[:language]<<")"].join
23
+ @found_languages.add(document[:language]) if document[:language]
24
+ @supported_filetypes.add(document[:filetype])
25
+ end
26
+
27
+ def reject_document(document, error)
28
+ @unsupported_filetypes.add(document[:filetype])
29
+ debug "Added without content (#{error.message}) : #{document[:complete_path]}"
30
+ end
31
+
32
+ def show_report
33
+ describe :found_languages, :supported_filetypes, :unsupported_filetypes
34
+ info "Time needed : #{Time.now-@start_time} s."
35
+ end
36
+
37
+ private
38
+
39
+ def describe(*instance_variable_names)
40
+ instance_variable_names.each{|var_name|
41
+ hash=instance_variable_get("@#{var_name}")
42
+ info var_name.to_s.humanize.ljust(25)<<": "<<hash.reject{|k,v| k.blank?}.sort_by{|k,v| v[:size]}.reverse.collect{|k,v| "#{k.downcase} (#{v[:size]})"}.join(", ") unless hash.empty?
43
+ }
44
+ end
45
+ end
@@ -16,7 +16,6 @@ module PlainTextExtractorDSL
16
16
  @content_and_file_examples=[]
17
17
  self.instance_eval(&block)
18
18
  PlainTextExtractor.add(self)
19
- MimeType.add(self.exts,self.mime_name)
20
19
  end
21
20
 
22
21
  def every(*exts)
@@ -3,12 +3,12 @@ PlainTextExtractor.new {
3
3
  as "application/plain"
4
4
  aka "plain text file"
5
5
  with {|source|
6
+ raise "binary file" unless File.plain_text?(source)
6
7
  encoding=File.encoding(source)
7
- #TODO: Return "binary file" if binary
8
8
  if encoding.empty? then
9
- File.read(source)
9
+ File.read(source)
10
10
  else
11
- %x{iconv -f #{encoding} -t utf8 "#{source}" 2>/dev/null}
11
+ %x{iconv -f #{encoding} -t utf8 "#{source}" 2>/dev/null}
12
12
  end
13
13
  }
14
14
  # for dependencies spec
@@ -25,10 +25,15 @@ namespace :index do
25
25
  puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
26
26
  end
27
27
 
28
+ desc 'Returns the last time the index was created/update'
29
+ task :last_update => :environment do
30
+ puts Indexer.last_update
31
+ end
32
+
28
33
  # Search index with query "some query" :
29
34
  # rake index:search query="some query"
30
35
  desc 'Search index'
31
36
  task :search => :environment do
32
- Finder.new(ENV["query"]).matching_documents.entries.each{|doc| puts doc.to_s}
37
+ puts Finder.new(ENV["query"]).matching_documents.entries.collect{|doc| doc.inspect}.join("\n"<<"#"*80<<"\n")
33
38
  end
34
39
  end
@@ -1,8 +1,6 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe DocumentsHelper do
4
- it "shouldn't raise if matching not in content field"
5
-
6
4
  PlainTextExtractor.supported_extensions.each{|ext|
7
5
  it "should have an icon for .#{ext} filetype" do
8
6
  icon_for(ext).should_not be_nil
@@ -7,10 +7,13 @@ describe "Finder without index on disk" do
7
7
  @original_indexed_dirs=Picolena::IndexedDirectories.dup
8
8
  @new_index_path=File.join(Dir::tmpdir,'ferret_tst')
9
9
  Picolena::IndexSavePath.replace(@new_index_path)
10
+ Picolena::MetaIndexPath.replace(File.join(@new_index_path,'meta'))
11
+ FileUtils.mkpath Picolena::MetaIndexPath
10
12
  end
11
13
 
12
14
  before(:each) do
13
15
  Indexer.clear!
16
+ Finder.send(:class_variable_set,'@@last_reload',nil)
14
17
  end
15
18
 
16
19
  it "should create index" do
@@ -29,6 +32,7 @@ describe "Finder without index on disk" do
29
32
  after(:all) do
30
33
  Picolena::IndexedDirectories.replace(@original_indexed_dirs)
31
34
  Picolena::IndexSavePath.replace(@original_index_path)
35
+ Picolena::MetaIndexPath.replace(File.join(@original_index_path,'meta'))
32
36
  end
33
37
  end
34
38
 
@@ -78,6 +78,12 @@ describe Document do
78
78
  @valid_document.should be_supported
79
79
  Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
80
80
  end
81
+
82
+ it "should not be considered supported if binary" do
83
+ Document.new("spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION").should_not be_supported
84
+ end
85
+
86
+
81
87
 
82
88
  it "should know its language when enough content is available" do
83
89
  Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
@@ -123,7 +123,6 @@ describe Finder do
123
123
  end
124
124
  end
125
125
 
126
- it "should not index content of binary files"
127
126
 
128
127
  # Ferret sometimes SEGFAULT crashed with '*.pdf' queries
129
128
  it "should not crash while looking for *.pdf" do
@@ -4,4 +4,13 @@ describe Indexer do
4
4
  it "should have at least 32MB memory allocated" do
5
5
  Indexer.index.writer.max_buffer_memory.should > 2**25-1
6
6
  end
7
+
8
+ it "should know the time it was updated" do
9
+ Indexer.should respond_to(:last_update)
10
+ begin
11
+ Indexer.last_update.should be_kind_of(Time)
12
+ rescue
13
+ Indexer.last_update.should == "none"
14
+ end
15
+ end
7
16
  end
@@ -27,4 +27,9 @@ describe "PlainTextExtractors" do
27
27
  end
28
28
  }
29
29
  }
30
+
31
+ it "should not extract content of binary files" do
32
+ bin_file="spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION"
33
+ lambda{PlainTextExtractor.extract_content_from(bin_file)}.should raise_error(RuntimeError, "binary file")
34
+ end
30
35
  end
@@ -37,7 +37,21 @@ describe Query do
37
37
  }
38
38
  end
39
39
 
40
- it "should accept field terms in different languages"
40
+ it "should accept field terms in different languages" do
41
+ Globalite.language = :en
42
+ english_query_with_french_words = Query.extract_from("absorption language:fr extension:pdf")
43
+ english_query_with_german_words = Query.extract_from("Unabhängigkeit modified:>2005 filename:job.txt")
44
+ Globalite.language = :de
45
+ Query.extract_from("absorption sprache:fr erweiterung:pdf").should == english_query_with_french_words
46
+ Query.extract_from("Unabhängigkeit geändert:>2005 datei:job.txt").should == english_query_with_german_words
47
+ Globalite.language = :fr
48
+ Query.extract_from("absorption langue:fr extension:pdf").should == english_query_with_french_words
49
+ Query.extract_from("Unabhängigkeit modifié:>2005 fichier:job.txt").should == english_query_with_german_words
50
+ Globalite.language = :es
51
+ Query.extract_from("absorption idioma:fr extensión:pdf").should == english_query_with_french_words
52
+ Query.extract_from("Unabhängigkeit modificado:>2005 archivo:job.txt").should == english_query_with_german_words
53
+
54
+ end
41
55
 
42
56
  it "should use AND as default boolean ops" do
43
57
  query_without_and = Query.extract_from("one AND two")
@@ -62,4 +76,14 @@ describe Query do
62
76
  Query.extract_from("test").should == Query.extract_from("tesT")
63
77
  Query.extract_from("test").should_not == Query.extract_from("tesTe")
64
78
  end
65
- end
79
+
80
+ it "should be able to extract search terms related to :content" do
81
+ Query.content_terms("plain text").should == %w(plain text)
82
+ Query.content_terms("plain text extension:pdf").should == %w(plain text)
83
+ Query.content_terms("plain AND text").should == %w(plain text)
84
+ Query.content_terms("absorption OR adsorption").should ==%w(absorption adsorption)
85
+ Query.content_terms("filename:plain_text").should be_empty
86
+ Globalite.language = :en
87
+ Query.content_terms("LIKE absorption").include?("adsorption").should be_true
88
+ end
89
+ end
@@ -2,7 +2,7 @@ module Picolena #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- TINY = 7
5
+ TINY = 8
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/website/index.html CHANGED
@@ -33,7 +33,7 @@
33
33
  <h1>Picolena</h1>
34
34
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
35
35
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.7</a>
36
+ <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.8</a>
37
37
  </div>
38
38
  <h1>&#x2192; &#8216;picolena&#8217;</h1>
39
39
 
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picolena
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
30
30
  qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-04-30 00:00:00 +02:00
33
+ date: 2008-05-08 00:00:00 +02:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -185,6 +185,7 @@ files:
185
185
  - lib/picolena/templates/lang/ui/es.yml
186
186
  - lib/picolena/templates/lang/ui/fr.yml
187
187
  - lib/picolena/templates/lib/core_exts.rb
188
+ - lib/picolena/templates/lib/indexer_logger.rb
188
189
  - lib/picolena/templates/lib/plain_text_extractor_DSL.rb
189
190
  - lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
190
191
  - lib/picolena/templates/lib/plain_text_extractors/html.rb
metadata.gz.sig CHANGED
Binary file