picolena 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. data/History.txt +10 -0
  2. data/Manifest.txt +1 -0
  3. data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -3
  4. data/lib/picolena/templates/app/helpers/documents_helper.rb +5 -5
  5. data/lib/picolena/templates/app/models/document.rb +16 -3
  6. data/lib/picolena/templates/app/models/finder.rb +18 -7
  7. data/lib/picolena/templates/app/models/indexer.rb +92 -33
  8. data/lib/picolena/templates/app/models/query.rb +5 -0
  9. data/lib/picolena/templates/app/views/documents/_document.html.haml +9 -8
  10. data/lib/picolena/templates/config/environment.rb +0 -2
  11. data/lib/picolena/templates/config/environments/development.rb +0 -3
  12. data/lib/picolena/templates/config/environments/production.rb +0 -2
  13. data/lib/picolena/templates/config/environments/test.rb +0 -3
  14. data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +3 -0
  15. data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +1 -1
  16. data/lib/picolena/templates/lang/ui/de.yml +2 -2
  17. data/lib/picolena/templates/lang/ui/en.yml +1 -1
  18. data/lib/picolena/templates/lang/ui/es.yml +1 -1
  19. data/lib/picolena/templates/lang/ui/fr.yml +2 -2
  20. data/lib/picolena/templates/lib/core_exts.rb +32 -23
  21. data/lib/picolena/templates/lib/indexer_logger.rb +45 -0
  22. data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +0 -1
  23. data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb +3 -3
  24. data/lib/picolena/templates/lib/tasks/index.rake +6 -1
  25. data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +0 -2
  26. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -0
  27. data/lib/picolena/templates/spec/models/document_spec.rb +6 -0
  28. data/lib/picolena/templates/spec/models/finder_spec.rb +0 -1
  29. data/lib/picolena/templates/spec/models/indexer_spec.rb +9 -0
  30. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +5 -0
  31. data/lib/picolena/templates/spec/models/query_spec.rb +26 -2
  32. data/lib/picolena/version.rb +1 -1
  33. data/website/index.html +1 -1
  34. data.tar.gz.sig +0 -0
  35. metadata +3 -2
  36. metadata.gz.sig +0 -0
data/History.txt CHANGED
@@ -1,3 +1,13 @@
1
+ == 0.1.8 2008-05-08
2
+
3
+ * 2 minor enhancements:
4
+ * New IndexerLogger with basic statistics
5
+ * More specs & documentation.
6
+
7
+ * 2 bug fixes:
8
+ * Binary documents without extension are not considered supported anymore
9
+ * Ensure that index is locked system-wide by using lock file.
10
+
1
11
  == 0.1.7 2008-04-30
2
12
 
3
13
  * 5 minor enhancements:
data/Manifest.txt CHANGED
@@ -50,6 +50,7 @@ lib/picolena/templates/lang/ui/en.yml
50
50
  lib/picolena/templates/lang/ui/es.yml
51
51
  lib/picolena/templates/lang/ui/fr.yml
52
52
  lib/picolena/templates/lib/core_exts.rb
53
+ lib/picolena/templates/lib/indexer_logger.rb
53
54
  lib/picolena/templates/lib/plain_text_extractor_DSL.rb
54
55
  lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
55
56
  lib/picolena/templates/lib/plain_text_extractors/html.rb
@@ -22,9 +22,9 @@ class DocumentsController < ApplicationController
22
22
  def show
23
23
  start=Time.now
24
24
  @query=[params[:id],params.delete(:format)].compact.join('.')
25
- @sort=params[:sort]
25
+ @sort_by=params[:sort_by]
26
26
  page=params[:page]||1
27
- finder=Finder.new(@query,@sort,page)
27
+ finder=Finder.new(@query,@sort_by,page)
28
28
  finder.execute!
29
29
  pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
30
30
  finder.matching_documents
@@ -64,7 +64,7 @@ class DocumentsController < ApplicationController
64
64
 
65
65
  def ensure_index_is_created
66
66
  Indexer.ensure_index_existence
67
- while Indexer.do_not_disturb_while_indexing do
67
+ while Indexer.locked? do
68
68
  sleep 1
69
69
  end
70
70
  end
@@ -6,10 +6,10 @@ module DocumentsHelper
6
6
 
7
7
  # Very basic pagination.
8
8
  # Provides links to Next, Prev and FirstPage when needed.
9
- def should_paginate(page,query, sort)
10
- [(link_to("&larr;&larr;", :action => :show, :id => query, :sort=>sort) if page.number>2),
11
- (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort=>sort) if page.prev?),
12
- (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort=>sort) if page.next?)].compact.join(" | ")
9
+ def should_paginate(page,query, sort_by)
10
+ [(link_to("&larr;&larr;", :action => :show, :id => query, :sort_by=>sort_by) if page.number>2),
11
+ (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort_by=>sort_by) if page.prev?),
12
+ (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort_by=>sort_by) if page.next?)].compact.join(" | ")
13
13
  end
14
14
 
15
15
  # Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
@@ -81,7 +81,7 @@ module DocumentsHelper
81
81
  end
82
82
 
83
83
  def sort_by_date_or_relevance(query)
84
- [link_to_unless_current('By date', document_path(query, :sort=>'by_date')),
84
+ [link_to_unless_current('By date', document_path(query, :sort_by=>'date')),
85
85
  link_to_unless_current('By relevance', document_path(query))].join("&nbsp;")
86
86
  end
87
87
  end
@@ -11,10 +11,18 @@ class Document
11
11
  end
12
12
 
13
13
  #Delegating properties to File::method_name(complete_path)
14
- [:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
14
+ [:dirname, :basename, :extname, :ext_as_sym, :file?, :plain_text?, :size, :ext_as_sym].each{|method_name|
15
15
  define_method(method_name){File.send(method_name,complete_path)}
16
16
  }
17
17
  alias_method :filename, :basename
18
+ alias_method :to_s, :complete_path
19
+
20
+
21
+ def inspect
22
+ [self,("(#{pretty_score})" if @score),("(language:#{language})" if language)].compact.join(" ")
23
+ end
24
+
25
+
18
26
 
19
27
  # Returns filename without extension
20
28
  # "buildings.odt" => "buildings"
@@ -50,7 +58,7 @@ class Document
50
58
  # Document.new("presentation.pdf").supported? => true
51
59
  # Document.new("presentation.some_weird_extension").supported? => false
52
60
  def supported?
53
- PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
61
+ PlainTextExtractor.supported_extensions.include?(self.ext_as_sym) unless ext_as_sym==:no_extension and !plain_text?
54
62
  end
55
63
 
56
64
  # Retrieves content as it is *now*.
@@ -91,6 +99,10 @@ class Document
91
99
  from_index[:language]
92
100
  end
93
101
 
102
+ def pretty_score
103
+ "%3.1f%" % (@score*100)
104
+ end
105
+
94
106
  # Fields that are shared between every document.
95
107
  def self.default_fields_for(complete_path)
96
108
  {
@@ -103,6 +115,7 @@ class Document
103
115
  }
104
116
  end
105
117
 
118
+
106
119
  private
107
120
 
108
121
  # FIXME: Is there a way to easily retrieve doc_id for a given document?
@@ -138,4 +151,4 @@ class Document
138
151
  def validate_in_indexed_directory
139
152
  raise ArgumentError, "required document is not in indexed directory" unless in_indexed_directory?
140
153
  end
141
- end
154
+ end
@@ -2,23 +2,24 @@ class Finder
2
2
  attr_reader :query
3
3
 
4
4
  def index
5
- @@index ||= Indexer.index
5
+ @@index ||= Indexer.index
6
6
  end
7
7
 
8
- def initialize(raw_query,by_date=false, page=1,results_per_page=Picolena::ResultsPerPage)
8
+ def initialize(raw_query,sort_by='relevance', page=1,results_per_page=Picolena::ResultsPerPage)
9
9
  @query = Query.extract_from(raw_query)
10
10
  @raw_query= raw_query
11
11
  Indexer.ensure_index_existence
12
+ reload_index! if should_be_reloaded?
12
13
  @per_page=results_per_page
13
14
  @offset=(page.to_i-1)*results_per_page
14
- @by_date=by_date
15
+ @sort_by=sort_by
15
16
  index_should_have_documents
16
17
  end
17
18
 
18
19
  def execute!
19
20
  @matching_documents=[]
20
21
  start=Time.now
21
- @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @by_date)){|index_id, score|
22
+ @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @sort_by=='date')){|index_id, score|
22
23
  begin
23
24
  found_doc=Document.new(index[index_id][:complete_path])
24
25
  found_doc.matching_content=index.highlight(query, index_id,
@@ -52,11 +53,21 @@ class Finder
52
53
  }
53
54
  }
54
55
 
55
- def self.reload!
56
+ private
57
+
58
+ def reload_index!
59
+ Indexer.close
56
60
  @@index = nil
61
+ @@last_reload = Time.now
57
62
  end
58
63
 
59
- private
64
+ def should_be_reloaded?
65
+ Indexer.reload_file_mtime > last_reload
66
+ end
67
+
68
+ def last_reload
69
+ @@last_reload ||= Time.at(0)
70
+ end
60
71
 
61
72
  def sort_by_date
62
73
  Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
@@ -65,4 +76,4 @@ class Finder
65
76
  def index_should_have_documents
66
77
  raise IndexError, "no document found" unless index.size > 0
67
78
  end
68
- end
79
+ end
@@ -1,63 +1,74 @@
1
+ # Indexer is used to index (duh!) documents contained in IndexedDirectories
2
+ # It can create, update, delete and prune the index, and takes care that only
3
+ # one IndexWriter exists at any given time, even when used in a multi-threaded
4
+ # way.
5
+ require 'indexer_logger'
1
6
  class Indexer
2
7
  # This regexp defines which files should *not* be indexed.
3
8
  @@exclude = /(Thumbs\.db)/
4
9
  # Number of threads that will be used during indexing process
5
10
  @@threads_number = 8
6
-
7
- cattr_reader :do_not_disturb_while_indexing
8
11
 
9
12
  class << self
13
+ # Finds every document included in IndexedDirectories, parses them with
14
+ # PlainTextExtractor and adds them to the index.
15
+ #
16
+ # Updates the index unless remove_first parameter is set to true, in which
17
+ # case it removes the index first before re-creating it.
10
18
  def index_every_directory(remove_first=false)
11
- @@do_not_disturb_while_indexing=true
12
19
  clear! if remove_first
20
+ lock!
13
21
  @from_scratch = remove_first
14
- # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
15
- Finder.reload!
16
- log :debug => "Indexing every directory"
17
- start=Time.now
22
+ logger.start_indexing
18
23
  Picolena::IndexedDirectories.each{|dir, alias_dir|
19
24
  index_directory_with_multithreads(dir)
20
25
  }
21
- log :debug => "Now optimizing index"
26
+ logger.debug "Now optimizing index"
22
27
  index.optimize
23
- @@do_not_disturb_while_indexing=false
24
- log :debug => "Indexing done in #{Time.now-start} s."
28
+ index_time_dbm_file['last']=Time.now._dump
29
+ unlock!
30
+ logger.show_report
25
31
  end
26
32
 
33
+ # Indexes a given directory, using @@threads_number threads.
34
+ # To do so, it retrieves a list of every included document, cuts it in
35
+ # @@threads_number chunks, and creates a new indexing thread for every chunk.
27
36
  def index_directory_with_multithreads(dir)
28
- log :debug => "Indexing #{dir}, #{@@threads_number} threads"
29
-
37
+ logger.debug "Indexing #{dir}, #{@@threads_number} threads"
30
38
  indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
31
39
  File.file?(filename) && filename !~ @@exclude
32
40
  }
33
41
 
34
42
  indexing_list_chunks=indexing_list.in_transposed_slices(@@threads_number)
35
-
36
43
  prepare_multi_threads_environment
37
-
44
+
38
45
  indexing_list_chunks.each_with_thread{|chunk|
39
46
  chunk.each{|complete_path|
40
- last_itime=index_time_dbm_file[complete_path]
41
- if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
47
+ if should_index_this_document?(complete_path) then
42
48
  add_or_update_file(complete_path)
43
49
  else
44
- log :debug => "Identical : #{complete_path}"
50
+ logger.debug "Identical : #{complete_path}"
45
51
  end
46
52
  index_time_dbm_file[complete_path] = Time.now._dump
47
53
  }
48
54
  }
49
55
  end
50
56
 
57
+ # Retrieves content and language from a given document, and adds it to the index.
58
+ # Since Document#probably_unique_id is used as index :key, no document will be added
59
+ # twice to the index, and the old document will just get updated.
60
+ #
61
+ # If for some reason (no content found or no defined PlainTextExtractor), content cannot
62
+ # be found, some basic information about the document (mtime, filename, complete_path)
63
+ # gets indexed anyway.
51
64
  def add_or_update_file(complete_path)
52
- default_fields = Document.default_fields_for(complete_path)
65
+ document = Document.default_fields_for(complete_path)
53
66
  begin
54
- document = PlainTextExtractor.extract_content_and_language_from(complete_path)
67
+ document.merge! PlainTextExtractor.extract_content_and_language_from(complete_path)
55
68
  raise "empty document #{complete_path}" if document[:content].strip.empty?
56
- document.merge! default_fields
57
- log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join
69
+ logger.add_document document
58
70
  rescue => e
59
- log :debug => "\tindexing without content: #{e.message}"
60
- document = default_fields
71
+ logger.reject_document document, e
61
72
  end
62
73
  index << document
63
74
  end
@@ -73,11 +84,9 @@ class Indexer
73
84
  # ensures that a new Index is instantiated next time index is called.
74
85
  def close
75
86
  @@index.close rescue nil
76
- # Ferret will SEGFAULT otherwise.
77
87
  @@index = nil
78
88
  end
79
89
 
80
-
81
90
  # Checks for indexed files that are missing from filesystem
82
91
  # and removes them from index & dbm file.
83
92
  def prune_index
@@ -85,7 +94,7 @@ class Indexer
85
94
  missing_files.each{|filename, itime|
86
95
  index.writer.delete(:complete_path, filename)
87
96
  index_time_dbm_file.delete(filename)
88
- log :debug => "Removed : #{filename}"
97
+ logger.debug "Removed : #{filename}"
89
98
  }
90
99
  index.optimize
91
100
  end
@@ -97,6 +106,7 @@ class Indexer
97
106
  @@index ||= Ferret::Index::Index.new(default_index_params)
98
107
  end
99
108
 
109
+ # Creates the index unless it already exists.
100
110
  def ensure_index_existence
101
111
  index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
102
112
  end
@@ -106,11 +116,66 @@ class Indexer
106
116
  index.size
107
117
  end
108
118
 
119
+ # Returns the time at which the index was last created/updated.
120
+ # Returns "none" if it doesn't exist.
121
+ def last_update
122
+ Time._load(index_time_dbm_file['last']) rescue "none"
123
+ end
124
+
125
+ # Returns the time at which the reload file was last touched.
126
+ # Useful to know if other processes have modified the shared index,
127
+ # and if the Indexer should be reloaded.
128
+ def reload_file_mtime
129
+ touch_reload_file! unless File.exists?(reload_file)
130
+ File.mtime(reload_file)
131
+ end
132
+
133
+ # For a given document, it retrieves the time it was last indexed, compares it to
134
+ # its modification time and returns false unless the file has been
135
+ # modified after the last indexing process.
136
+ def should_index_this_document?(complete_path)
137
+ last_itime=index_time_dbm_file[complete_path]
138
+ @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime)
139
+ end
140
+
141
+ def locked?
142
+ File.exists?(lock_file)
143
+ end
144
+
109
145
  private
110
146
 
147
+ def touch_reload_file!
148
+ FileUtils.touch(reload_file)
149
+ # To ensure that every process can touch reload_file, even if Picolena
150
+ # is launched as a special user.
151
+ FileUtils.chmod(0666, reload_file)
152
+ end
153
+
154
+ def reload_file
155
+ File.join(Picolena::MetaIndexPath,'reload')
156
+ end
157
+
158
+ def lock!
159
+ FileUtils.touch(lock_file)
160
+ end
161
+
162
+ def unlock!
163
+ FileUtils.rm(lock_file)
164
+ # Forces Finder.index to be reloaded.
165
+ touch_reload_file!
166
+ end
167
+
168
+ def lock_file
169
+ File.join(Picolena::MetaIndexPath,'lock')
170
+ end
171
+
172
+ def logger
173
+ @@logger ||= IndexerLogger.new
174
+ end
175
+
111
176
  # Copied from Ferret book, By David Balmain
112
177
  def index_time_dbm_file
113
- @@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
178
+ @@dbm_file ||= DBM.open(File.join(Picolena::MetaIndexPath, 'added_at'))
114
179
  end
115
180
 
116
181
  def index_exists?
@@ -121,12 +186,6 @@ class Indexer
121
186
  Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
122
187
  end
123
188
 
124
- def log(hash)
125
- hash.each{|level,message|
126
- IndexerLogger.send(level,message)
127
- }
128
- end
129
-
130
189
  def default_index_params
131
190
  {
132
191
  :path => Picolena::IndexSavePath,
@@ -4,6 +4,11 @@ class Query
4
4
  def extract_from(raw_query)
5
5
  parser.parse(convert_to_english(raw_query))
6
6
  end
7
+
8
+ # Returns terms related to content. Useful for cache highlighting
9
+ def content_terms(raw_query)
10
+ Query.extract_from(raw_query).terms(Indexer.index.searcher).select{|term| term.field==:content}.collect{|term| term.text}.uniq
11
+ end
7
12
 
8
13
  private
9
14
 
@@ -3,14 +3,15 @@
3
3
  =language_icon_for(document)
4
4
  %small=number_to_percentage(document.score*100, :precision=>1)
5
5
  =highlight_matching_content(document)
6
- %p=link_to_containing_directory(document)
7
- -if document.supported?
8
- %p
6
+ %p
7
+ =link_to_containing_directory(document)
8
+ %br/
9
+ -if document.supported?
9
10
  =link_to_plain_text_content(document)
10
11
  &#45;
11
- =number_to_human_size(document.size)
12
- &#45;
13
- =document.pretty_date
14
- &#45;
15
12
  =link_to_cached_content(document,query)
16
- %hr/
13
+ &#45;
14
+ =number_to_human_size(document.size)
15
+ &#45;
16
+ =document.pretty_date
17
+ %hr/
@@ -7,8 +7,6 @@
7
7
  # Specifies gem version of Rails to use when vendor/rails is not present
8
8
  RAILS_GEM_VERSION = '2.0.2' unless defined? RAILS_GEM_VERSION
9
9
 
10
- IndexerLogger=Logger.new($stdout)
11
-
12
10
  # Bootstrap the Rails environment, frameworks, and default configuration
13
11
  require File.join(File.dirname(__FILE__), 'boot')
14
12
 
@@ -16,6 +16,3 @@ config.action_view.cache_template_extensions = false
16
16
 
17
17
  # Don't care if the mailer can't send
18
18
  config.action_mailer.raise_delivery_errors = false
19
-
20
-
21
- IndexerLogger.level = Logger::DEBUG
@@ -17,5 +17,3 @@ config.action_view.cache_template_loading = true
17
17
 
18
18
  # Disable delivery errors, bad email addresses will be ignored
19
19
  # config.action_mailer.raise_delivery_errors = false
20
-
21
- IndexerLogger.level = Logger::INFO
@@ -20,6 +20,3 @@ config.action_controller.allow_forgery_protection = false
20
20
  # The :test delivery method accumulates sent emails in the
21
21
  # ActionMailer::Base.deliveries array.
22
22
  config.action_mailer.delivery_method = :test
23
-
24
-
25
- IndexerLogger.level = Logger::WARN
@@ -7,4 +7,7 @@ module Picolena
7
7
  }
8
8
 
9
9
  IndexSavePath=File.join(IndexesSavePath,ENV["RAILS_ENV"] || "development")
10
+ FileUtils.mkpath IndexSavePath
11
+ MetaIndexPath= File.join(IndexSavePath,'meta')
12
+ FileUtils.mkpath MetaIndexPath
10
13
  end
@@ -3,4 +3,4 @@ module Picolena
3
3
  YAML.load_file('config/custom/indexing_performance.yml').each_pair{|param, value|
4
4
  IndexingConfiguration[param.to_sym]= value=~/^[\d_]+$/ ? value.to_i : value
5
5
  }
6
- end
6
+ end
@@ -22,5 +22,5 @@ LIKE: WIE
22
22
  filename: filename|file|datei
23
23
  filetype: erweiterung|ext
24
24
  content: inhalt
25
- modified: jahr|zeit|geändert
26
- language: lang|sprache
25
+ modified: jahr|zeit|geändert|geaendert|geandert
26
+ language: lang|sprache
@@ -20,7 +20,7 @@ LIKE: LIKE
20
20
 
21
21
  ## Fields
22
22
  filename: filename|file
23
- filetype: filetype|ext
23
+ filetype: filetype|ext|extension
24
24
  content: content
25
25
  modified: year|date|modified
26
26
  language: lang|language
@@ -20,7 +20,7 @@ LIKE: COMO
20
20
 
21
21
  ## Fields
22
22
  filename: filename|file|archivo
23
- filetype: extensión|ext
23
+ filetype: extensión|ext|extension
24
24
  content: contenido
25
25
  modified: fecha|año|anho|modificado
26
26
  language: lang|idioma
@@ -22,5 +22,5 @@ LIKE: COMME
22
22
  filename: filename|file|fichier
23
23
  filetype: extension|ext
24
24
  content: contenu
25
- modified: année|date|annee|modifie
26
- language: lang|langue
25
+ modified: année|date|annee|modifie|modifié
26
+ language: lang|langue
@@ -1,20 +1,3 @@
1
- class MimeType
2
- @@all=[]
3
- def self.all
4
- @@all
5
- end
6
-
7
- def self.add(exts,mime_name)
8
- all<<new(exts,mime_name)
9
- end
10
-
11
- attr_reader :exts, :name
12
-
13
- def initialize(exts,mime_name)
14
- @exts,@name=exts,mime_name
15
- end
16
- end
17
-
18
1
  class String
19
2
  # Creates a "probably unique" id with the desired length, composed only of lowercase letters.
20
3
  def base26_hash(length=Picolena::HashLength)
@@ -23,6 +6,9 @@ class String
23
6
  end
24
7
 
25
8
  module Enumerable
9
+ # Similar to Enumerable#each, but creates a new thread for each element.
10
+ # Used for the indexer to make it multi-threaded.
11
+ # It ensures that threads are joined together before returning.
26
12
  def each_with_thread(&block)
27
13
  tds=self.collect{|elem|
28
14
  Thread.new(elem) {|elem|
@@ -57,17 +43,31 @@ class Array
57
43
  end
58
44
  end
59
45
 
46
+ class Hash
47
+ def add(category)
48
+ self[category]||={:size=>0}
49
+ self[category][:size]+=1
50
+ end
51
+ end
52
+
60
53
  class File
54
+ # Returns the filetype of filename as a symbol.
55
+ # Returns :no_extension unless an extension is found
56
+ # >> File.ext_as_sym("test.pdf")
57
+ # => :pdf
58
+ # >> File.ext_as_sym("test.tar.gz")
59
+ # => :gz
60
+ # >> File.ext_as_sym("test")
61
+ # => :no_extension
61
62
  def self.ext_as_sym(filename)
62
63
  File.extname(filename).sub(/^\./,'').downcase.to_sym rescue :no_extension
63
64
  end
64
65
 
65
- def self.mime(filename)
66
- ext=ext_as_sym(filename)
67
- m=MimeType.all.find{|m| m.exts.include?(ext)}
68
- m ? m.name : 'application/octet-stream'
69
- end
70
-
66
+ # Returns a probable encoding for a given plain text file
67
+ # If source is a html file, it parses for metadata to retrieve encoding,
68
+ # and uses file -i otherwise.
69
+ # Returns iso-8859-15 instead of iso-8859-1, to be sure € char can be
70
+ # encoded
71
71
  def self.encoding(source)
72
72
  parse_for_charset="grep -io charset=[a-z0-9\\-]* | sed 's/charset=//i'"
73
73
  if File.extname(source)[0,4]==".htm" then
@@ -86,9 +86,18 @@ class File
86
86
  end
87
87
  end
88
88
 
89
+ # Returns the content of a file and removes it after.
90
+ # Could be used to read temporary output file written by a PlainTextExtractor.
89
91
  def self.read_and_remove(filename)
90
92
  content=read(filename)
91
93
  FileUtils.rm filename, :force=>true
92
94
  content
93
95
  end
96
+
97
+ # Returns nil unless filename is a plain text file.
98
+ # It requires file command.
99
+ # NOTE: What to use for Win32?
100
+ def self.plain_text?(filename)
101
+ %x{file -i "#{filename}"} =~ /: text\//
102
+ end
94
103
  end
@@ -0,0 +1,45 @@
1
+ class IndexerLogger<Logger
2
+ def initialize
3
+ super($stdout)
4
+ #FIXME: Should be defined in config/environments/*.rb
5
+ levels={
6
+ "development"=>Logger::DEBUG,
7
+ "production" =>Logger::INFO,
8
+ "test" =>Logger::WARN
9
+ }
10
+ @level=levels[RAILS_ENV]
11
+ @found_languages={}
12
+ @supported_filetypes={}
13
+ @unsupported_filetypes={}
14
+ end
15
+
16
+ def start_indexing
17
+ @start_time=Time.now
18
+ debug "Indexing every directory"
19
+ end
20
+
21
+ def add_document(document)
22
+ debug ["Added : #{document[:complete_path]}",document[:language] && " ("<<document[:language]<<")"].join
23
+ @found_languages.add(document[:language]) if document[:language]
24
+ @supported_filetypes.add(document[:filetype])
25
+ end
26
+
27
+ def reject_document(document, error)
28
+ @unsupported_filetypes.add(document[:filetype])
29
+ debug "Added without content (#{error.message}) : #{document[:complete_path]}"
30
+ end
31
+
32
+ def show_report
33
+ describe :found_languages, :supported_filetypes, :unsupported_filetypes
34
+ info "Time needed : #{Time.now-@start_time} s."
35
+ end
36
+
37
+ private
38
+
39
+ def describe(*instance_variable_names)
40
+ instance_variable_names.each{|var_name|
41
+ hash=instance_variable_get("@#{var_name}")
42
+ info var_name.to_s.humanize.ljust(25)<<": "<<hash.reject{|k,v| k.blank?}.sort_by{|k,v| v[:size]}.reverse.collect{|k,v| "#{k.downcase} (#{v[:size]})"}.join(", ") unless hash.empty?
43
+ }
44
+ end
45
+ end
@@ -16,7 +16,6 @@ module PlainTextExtractorDSL
16
16
  @content_and_file_examples=[]
17
17
  self.instance_eval(&block)
18
18
  PlainTextExtractor.add(self)
19
- MimeType.add(self.exts,self.mime_name)
20
19
  end
21
20
 
22
21
  def every(*exts)
@@ -3,12 +3,12 @@ PlainTextExtractor.new {
3
3
  as "application/plain"
4
4
  aka "plain text file"
5
5
  with {|source|
6
+ raise "binary file" unless File.plain_text?(source)
6
7
  encoding=File.encoding(source)
7
- #TODO: Return "binary file" if binary
8
8
  if encoding.empty? then
9
- File.read(source)
9
+ File.read(source)
10
10
  else
11
- %x{iconv -f #{encoding} -t utf8 "#{source}" 2>/dev/null}
11
+ %x{iconv -f #{encoding} -t utf8 "#{source}" 2>/dev/null}
12
12
  end
13
13
  }
14
14
  # for dependencies spec
@@ -25,10 +25,15 @@ namespace :index do
25
25
  puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
26
26
  end
27
27
 
28
+ desc 'Returns the last time the index was created/update'
29
+ task :last_update => :environment do
30
+ puts Indexer.last_update
31
+ end
32
+
28
33
  # Search index with query "some query" :
29
34
  # rake index:search query="some query"
30
35
  desc 'Search index'
31
36
  task :search => :environment do
32
- Finder.new(ENV["query"]).matching_documents.entries.each{|doc| puts doc.to_s}
37
+ puts Finder.new(ENV["query"]).matching_documents.entries.collect{|doc| doc.inspect}.join("\n"<<"#"*80<<"\n")
33
38
  end
34
39
  end
@@ -1,8 +1,6 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe DocumentsHelper do
4
- it "shouldn't raise if matching not in content field"
5
-
6
4
  PlainTextExtractor.supported_extensions.each{|ext|
7
5
  it "should have an icon for .#{ext} filetype" do
8
6
  icon_for(ext).should_not be_nil
@@ -7,10 +7,13 @@ describe "Finder without index on disk" do
7
7
  @original_indexed_dirs=Picolena::IndexedDirectories.dup
8
8
  @new_index_path=File.join(Dir::tmpdir,'ferret_tst')
9
9
  Picolena::IndexSavePath.replace(@new_index_path)
10
+ Picolena::MetaIndexPath.replace(File.join(@new_index_path,'meta'))
11
+ FileUtils.mkpath Picolena::MetaIndexPath
10
12
  end
11
13
 
12
14
  before(:each) do
13
15
  Indexer.clear!
16
+ Finder.send(:class_variable_set,'@@last_reload',nil)
14
17
  end
15
18
 
16
19
  it "should create index" do
@@ -29,6 +32,7 @@ describe "Finder without index on disk" do
29
32
  after(:all) do
30
33
  Picolena::IndexedDirectories.replace(@original_indexed_dirs)
31
34
  Picolena::IndexSavePath.replace(@original_index_path)
35
+ Picolena::MetaIndexPath.replace(File.join(@original_index_path,'meta'))
32
36
  end
33
37
  end
34
38
 
@@ -78,6 +78,12 @@ describe Document do
78
78
  @valid_document.should be_supported
79
79
  Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
80
80
  end
81
+
82
+ it "should not be considered supported if binary" do
83
+ Document.new("spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION").should_not be_supported
84
+ end
85
+
86
+
81
87
 
82
88
  it "should know its language when enough content is available" do
83
89
  Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
@@ -123,7 +123,6 @@ describe Finder do
123
123
  end
124
124
  end
125
125
 
126
- it "should not index content of binary files"
127
126
 
128
127
  # Ferret sometimes SEGFAULT crashed with '*.pdf' queries
129
128
  it "should not crash while looking for *.pdf" do
@@ -4,4 +4,13 @@ describe Indexer do
4
4
  it "should have at least 32MB memory allocated" do
5
5
  Indexer.index.writer.max_buffer_memory.should > 2**25-1
6
6
  end
7
+
8
+ it "should know the time it was updated" do
9
+ Indexer.should respond_to(:last_update)
10
+ begin
11
+ Indexer.last_update.should be_kind_of(Time)
12
+ rescue
13
+ Indexer.last_update.should == "none"
14
+ end
15
+ end
7
16
  end
@@ -27,4 +27,9 @@ describe "PlainTextExtractors" do
27
27
  end
28
28
  }
29
29
  }
30
+
31
+ it "should not extract content of binary files" do
32
+ bin_file="spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION"
33
+ lambda{PlainTextExtractor.extract_content_from(bin_file)}.should raise_error(RuntimeError, "binary file")
34
+ end
30
35
  end
@@ -37,7 +37,21 @@ describe Query do
37
37
  }
38
38
  end
39
39
 
40
- it "should accept field terms in different languages"
40
+ it "should accept field terms in different languages" do
41
+ Globalite.language = :en
42
+ english_query_with_french_words = Query.extract_from("absorption language:fr extension:pdf")
43
+ english_query_with_german_words = Query.extract_from("Unabhängigkeit modified:>2005 filename:job.txt")
44
+ Globalite.language = :de
45
+ Query.extract_from("absorption sprache:fr erweiterung:pdf").should == english_query_with_french_words
46
+ Query.extract_from("Unabhängigkeit geändert:>2005 datei:job.txt").should == english_query_with_german_words
47
+ Globalite.language = :fr
48
+ Query.extract_from("absorption langue:fr extension:pdf").should == english_query_with_french_words
49
+ Query.extract_from("Unabhängigkeit modifié:>2005 fichier:job.txt").should == english_query_with_german_words
50
+ Globalite.language = :es
51
+ Query.extract_from("absorption idioma:fr extensión:pdf").should == english_query_with_french_words
52
+ Query.extract_from("Unabhängigkeit modificado:>2005 archivo:job.txt").should == english_query_with_german_words
53
+
54
+ end
41
55
 
42
56
  it "should use AND as default boolean ops" do
43
57
  query_without_and = Query.extract_from("one AND two")
@@ -62,4 +76,14 @@ describe Query do
62
76
  Query.extract_from("test").should == Query.extract_from("tesT")
63
77
  Query.extract_from("test").should_not == Query.extract_from("tesTe")
64
78
  end
65
- end
79
+
80
+ it "should be able to extract search terms related to :content" do
81
+ Query.content_terms("plain text").should == %w(plain text)
82
+ Query.content_terms("plain text extension:pdf").should == %w(plain text)
83
+ Query.content_terms("plain AND text").should == %w(plain text)
84
+ Query.content_terms("absorption OR adsorption").should ==%w(absorption adsorption)
85
+ Query.content_terms("filename:plain_text").should be_empty
86
+ Globalite.language = :en
87
+ Query.content_terms("LIKE absorption").include?("adsorption").should be_true
88
+ end
89
+ end
@@ -2,7 +2,7 @@ module Picolena #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- TINY = 7
5
+ TINY = 8
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/website/index.html CHANGED
@@ -33,7 +33,7 @@
33
33
  <h1>Picolena</h1>
34
34
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
35
35
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.7</a>
36
+ <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.8</a>
37
37
  </div>
38
38
  <h1>&#x2192; &#8216;picolena&#8217;</h1>
39
39
 
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picolena
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
30
30
  qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-04-30 00:00:00 +02:00
33
+ date: 2008-05-08 00:00:00 +02:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -185,6 +185,7 @@ files:
185
185
  - lib/picolena/templates/lang/ui/es.yml
186
186
  - lib/picolena/templates/lang/ui/fr.yml
187
187
  - lib/picolena/templates/lib/core_exts.rb
188
+ - lib/picolena/templates/lib/indexer_logger.rb
188
189
  - lib/picolena/templates/lib/plain_text_extractor_DSL.rb
189
190
  - lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
190
191
  - lib/picolena/templates/lib/plain_text_extractors/html.rb
metadata.gz.sig CHANGED
Binary file