picolena 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/History.txt +12 -3
  2. data/Manifest.txt +2 -0
  3. data/bin/picolena +1 -1
  4. data/config/files_to_clean +1 -0
  5. data/lib/picolena/config/basic.rb +6 -2
  6. data/lib/picolena/config/indexing_performance.yml +30 -0
  7. data/lib/picolena/picolena_generator.rb +9 -4
  8. data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -1
  9. data/lib/picolena/templates/app/helpers/documents_helper.rb +18 -9
  10. data/lib/picolena/templates/app/models/document.rb +20 -3
  11. data/lib/picolena/templates/app/models/finder.rb +19 -19
  12. data/lib/picolena/templates/app/models/indexer.rb +36 -9
  13. data/lib/picolena/templates/app/views/documents/_document.html.haml +7 -2
  14. data/lib/picolena/templates/app/views/documents/cached.html.haml +2 -2
  15. data/lib/picolena/templates/app/views/documents/show.html.haml +5 -2
  16. data/lib/picolena/templates/config/environment.rb +1 -1
  17. data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +6 -0
  18. data/lib/picolena/templates/lib/tasks/index.rake +6 -1
  19. data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
  20. data/lib/picolena/templates/public/stylesheets/style.css +17 -1
  21. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -4
  22. data/lib/picolena/templates/spec/models/document_spec.rb +65 -16
  23. data/lib/picolena/templates/spec/models/finder_spec.rb +4 -3
  24. data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +8 -0
  25. data/lib/picolena/templates/spec/models/indexer_spec.rb +2 -2
  26. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +0 -12
  27. data/lib/picolena/templates/spec/models/query_spec.rb +10 -1
  28. data/lib/picolena/version.rb +1 -1
  29. data/website/index.html +1 -1
  30. data/website/index.txt +0 -0
  31. data/website/index_devjavu +0 -0
  32. data/website/javascripts/rounded_corners_lite.inc.js +0 -0
  33. data/website/stylesheets/screen.css +0 -0
  34. data.tar.gz.sig +0 -0
  35. metadata +4 -2
  36. metadata.gz.sig +3 -1
data/History.txt CHANGED
@@ -1,10 +1,19 @@
1
+ == 0.1.7 2008-04-30
2
+
3
+ * 5 minor enhancements:
4
+ * added cache highlighting à la Google
5
+ * rake index:update implemented as described in Ferret book by David Balmain
6
+ * rake index:prune removes missing files from indexer
7
+ * possibility to sort results by relevance / by date
8
+ * one configuration file for performance tweaks
9
+
1
10
  == 0.1.6 2008-04-25
2
11
 
3
12
  * 1 minor enhancement:
4
13
  * replaced index key by Document#probably_unique_id
5
14
 
6
15
  * bug fixes:
7
- * Added forgotten public/images/flags to generator file.
16
+ * Added forgotten public/images/flags to generator file
8
17
 
9
18
  == 0.1.5 2008-04-25
10
19
 
@@ -24,7 +33,7 @@
24
33
  == 0.1.3 2008-04-20
25
34
 
26
35
  * 1 bug fix:
27
- * removed verbose debug info.
36
+ * removed verbose debug info
28
37
 
29
38
  == 0.1.2 2008-04-20
30
39
 
@@ -49,7 +58,7 @@
49
58
  * 3 minor enhancements:
50
59
  * can now be installed on win32 (doesn't pass every spec though)
51
60
  * moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
52
- * shorter and prettier base26_hash id for documents.
61
+ * shorter and prettier base26_hash id for documents
53
62
 
54
63
  == 0.0.99 2008-04-06
55
64
 
data/Manifest.txt CHANGED
@@ -11,6 +11,7 @@ lib/picolena/USAGE
11
11
  lib/picolena/config/basic.rb
12
12
  lib/picolena/config/icons_and_filetypes.yml
13
13
  lib/picolena/config/indexed_directories.yml
14
+ lib/picolena/config/indexing_performance.yml
14
15
  lib/picolena/config/title_and_names_and_links.yml
15
16
  lib/picolena/config/white_list_ip.yml
16
17
  lib/picolena/picolena_generator.rb
@@ -42,6 +43,7 @@ lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
42
43
  lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
43
44
  lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
44
45
  lib/picolena/templates/config/initializers/006_load_icons.rb
46
+ lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
45
47
  lib/picolena/templates/config/routes.rb
46
48
  lib/picolena/templates/lang/ui/de.yml
47
49
  lib/picolena/templates/lang/ui/en.yml
data/bin/picolena CHANGED
@@ -11,7 +11,7 @@ if %w(-v --version).include? ARGV.first
11
11
  exit(0)
12
12
  end
13
13
 
14
- action= ARGV.include?("--spec-only") ? "testing" : "installing"
14
+ action= ARGV.any?{|opt| opt[0,6]=="--spec"} ? "testing" : "installing"
15
15
 
16
16
  require 'rubigen/scripts/generate'
17
17
  source = RubiGen::PathSource.new(:application,
@@ -5,6 +5,7 @@ lib/picolena/templates/config/custom/indexed_directories.yml
5
5
  lib/picolena/templates/config/custom/white_list_ip.yml
6
6
  lib/picolena/templates/config/custom/title_and_names_and_links.yml
7
7
  lib/picolena/templates/config/custom/icons_and_filetypes.yml
8
+ lib/picolena/templates/config/custom/indexing_performance.yml
8
9
  lib/picolena/templates/log
9
10
  lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
10
11
  lib/picolena/templates/tmp
@@ -42,5 +42,9 @@ module Picolena
42
42
  # Specify the default Levenshtein distance when using FuzzyQuery
43
43
  # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
44
44
  Ferret::Search::FuzzyQuery.default_min_similarity=0.6
45
- Analyzer=Ferret::Analysis::StandardAnalyzer.new
46
- end
45
+
46
+ # PerFieldAnalyzer is used to prevent queries like "language:it" from being broken by StopFilter.
47
+ per_field_analyzer=Ferret::Analysis::PerFieldAnalyzer.new(Ferret::Analysis::StandardAnalyzer.new)
48
+ per_field_analyzer[:language]=Ferret::Analysis::WhiteSpaceAnalyzer.new
49
+ Analyzer=per_field_analyzer
50
+ end
@@ -0,0 +1,30 @@
1
+ # You probably shouldn't change those parameters
2
+ # if you don't know what they represent.
3
+ # For more information, refer to:
4
+ # http://ferret.davebalmain.com/api/classes/Ferret/Index/IndexWriter.html
5
+
6
+ ## Main performance parameters
7
+
8
+ # Allowed memory for indexing process.
9
+ # 128MB by default, or 2^27
10
+ max_buffer_memory: 134_217_728
11
+
12
+ # High value => fast indexing, slow searching
13
+ # Low value => slow indexing, fast searching
14
+ # 10 by default
15
+ merge_factor: 10
16
+
17
+ # Maximum number of extracted terms for any given document
18
+ max_field_length: 10_000
19
+
20
+
21
+ ## Other parameters
22
+ # 1MB by default, or 2**20
23
+ chunk_size: 1_048_576
24
+ max_buffered_docs: 10_000
25
+ # NOTE: Be extra careful with this parameter, setting it to -1 (infinite)
26
+ # multiplied indexing time by an order of magnitude.
27
+ # max_merge_docs: -1
28
+ use_compound_file: true
29
+ index_skip_interval: 128
30
+ doc_skip_interval: 16
@@ -16,10 +16,14 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
16
16
  usage if args.empty? and !options[:spec_only]
17
17
  @destination_root = options[:destination]
18
18
 
19
- @directories_to_index=ARGV.collect{|relative_path|
20
- abs_dir=Pathname.new(relative_path).realpath.to_s
21
- "\"#{abs_dir}\" : \"#{abs_dir}\""
22
- }.join("\n ")
19
+ @directories_to_index=if options[:spec_only] then
20
+ "/whatever : /whatever"
21
+ else
22
+ ARGV.collect{|relative_path|
23
+ abs_dir=Pathname.new(relative_path).realpath.to_s
24
+ "\"#{abs_dir}\" : \"#{abs_dir}\""
25
+ }.join("\n ")
26
+ end
23
27
 
24
28
  extract_options
25
29
  end
@@ -63,6 +67,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
63
67
  m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
64
68
  m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
65
69
  m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
70
+ m.file '../config/indexing_performance.yml', 'config/custom/indexing_performance.yml'
66
71
 
67
72
  # README, License & Rakefile
68
73
  m.file 'MIT-LICENSE', 'LICENSE'
@@ -22,8 +22,9 @@ class DocumentsController < ApplicationController
22
22
  def show
23
23
  start=Time.now
24
24
  @query=[params[:id],params.delete(:format)].compact.join('.')
25
+ @sort=params[:sort]
25
26
  page=params[:page]||1
26
- finder=Finder.new(@query,page)
27
+ finder=Finder.new(@query,@sort,page)
27
28
  finder.execute!
28
29
  pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
29
30
  finder.matching_documents
@@ -47,6 +48,7 @@ class DocumentsController < ApplicationController
47
48
  # Returns the content of the document identified by probably_unique_id, as it was at the time it was indexed.
48
49
  # similar to Google cache.
49
50
  def cached
51
+ @query=[params[:query],params.delete(:format)].compact.join('.')
50
52
  end
51
53
 
52
54
  private
@@ -3,15 +3,15 @@ module DocumentsHelper
3
3
  def nothing_found?
4
4
  @matching_documents.nil? or @matching_documents.entries.empty?
5
5
  end
6
-
6
+
7
7
  # Very basic pagination.
8
8
  # Provides links to Next, Prev and FirstPage when needed.
9
- def should_paginate(page,query)
10
- [(link_to("&larr;&larr;", :action => :show, :id => query, :page => 1) if page.number>2),
11
- (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
12
- (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
9
+ def should_paginate(page,query, sort)
10
+ [(link_to("&larr;&larr;", :action => :show, :id => query, :sort=>sort) if page.number>2),
11
+ (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort=>sort) if page.prev?),
12
+ (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort=>sort) if page.next?)].compact.join(" | ")
13
13
  end
14
-
14
+
15
15
  # Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
16
16
  # "Résultats 1-2 parmi 2 pour whatever (0.012s)"
17
17
  def describe_results(page, total_hits, dt, query)
@@ -24,7 +24,7 @@ module DocumentsHelper
24
24
  show_time_needed(dt)
25
25
  ].join(' ')
26
26
  end
27
-
27
+
28
28
  # Returns the time needed to treat the query and launch the search, with a ms precision : (0.472s)
29
29
  def show_time_needed(dt)
30
30
  content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
@@ -71,8 +71,17 @@ module DocumentsHelper
71
71
  end
72
72
 
73
73
  # For any indexed document, returns a link to show its cached content.
74
- def link_to_cached_content(document)
74
+ def link_to_cached_content(document, query)
75
75
  link_name="("<<content_tag(:small,:cached.l)<<")"
76
- link_to link_name, cached_document_path(document.probably_unique_id)
76
+ link_to link_name, cached_document_path(:id => document.probably_unique_id, :query => query)
77
+ end
78
+
79
+ def highlighted_cache(document, query)
80
+ h(document.highlighted_cache(query)).gsub(/\n/,'<br/>').gsub(/&lt;&lt;(.*?)&gt;&gt;/,content_tag(:span, '\1', :class=>"matching_content"))
81
+ end
82
+
83
+ def sort_by_date_or_relevance(query)
84
+ [link_to_unless_current('By date', document_path(query, :sort=>'by_date')),
85
+ link_to_unless_current('By relevance', document_path(query))].join("&nbsp;")
77
86
  end
78
87
  end
@@ -11,7 +11,7 @@ class Document
11
11
  end
12
12
 
13
13
  #Delegating properties to File::method_name(complete_path)
14
- [:dirname, :basename, :extname, :ext_as_sym, :file?, :ext_as_sym].each{|method_name|
14
+ [:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
15
15
  define_method(method_name){File.send(method_name,complete_path)}
16
16
  }
17
17
  alias_method :filename, :basename
@@ -63,11 +63,22 @@ class Document
63
63
  def cached
64
64
  from_index[:content]
65
65
  end
66
+
67
+ def highlighted_cache(raw_query)
68
+ #TODO: Report to Ferret. Highlight should accept :key and not only :doc_id.
69
+ Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
70
+ :field => :content, :excerpt_length => :all,
71
+ :pre_tag => "<<", :post_tag => ">>"
72
+ ).first
73
+ end
66
74
 
67
- # FIXME: Not just date anymore.
68
75
  # Returns the last modification date before the document got indexed.
69
76
  # Useful to know how old a document is, and to which version the cache corresponds.
70
- def date
77
+ def pretty_date
78
+ from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})\d{6}/,'\1-\2-\3')
79
+ end
80
+
81
+ def pretty_mtime
71
82
  from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
72
83
  end
73
84
 
@@ -93,6 +104,12 @@ class Document
93
104
  end
94
105
 
95
106
  private
107
+
108
+ # FIXME: Is there a way to easily retrieve doc_id for a given document?
109
+ # Better yet, fix Index#highlight to accept :probably_unique_id and stop using :doc_id.
110
+ def doc_id
111
+ Indexer.index.search(Ferret::Search::TermQuery.new(:probably_unique_id,probably_unique_id)).hits.first.doc
112
+ end
96
113
 
97
114
  # Retrieves the document from the index.
98
115
  # Useful to get meta-info about it.
@@ -5,36 +5,34 @@ class Finder
5
5
  @@index ||= Indexer.index
6
6
  end
7
7
 
8
- def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
8
+ def initialize(raw_query,by_date=false, page=1,results_per_page=Picolena::ResultsPerPage)
9
9
  @query = Query.extract_from(raw_query)
10
10
  @raw_query= raw_query
11
11
  Indexer.ensure_index_existence
12
12
  @per_page=results_per_page
13
13
  @offset=(page.to_i-1)*results_per_page
14
+ @by_date=by_date
14
15
  index_should_have_documents
15
16
  end
16
17
 
17
18
  def execute!
18
19
  @matching_documents=[]
19
20
  start=Time.now
20
- top_docs=index.search(query, :limit => @per_page, :offset=>@offset)
21
- top_docs.hits.each{|hit|
22
- index_id,score=hit.doc,hit.score
23
- begin
24
- found_doc=Document.new(index[index_id][:complete_path])
25
- found_doc.matching_content=index.highlight(query, index_id,
26
- :field => :content, :excerpt_length => 80,
27
- :pre_tag => "<<", :post_tag => ">>"
28
- ) unless @raw_query=~/^\*+\.\w*$/
29
- found_doc.score=score
30
- @matching_documents<<found_doc
31
- rescue Errno::ENOENT
32
- #"File has been moved/deleted!"
33
- end
21
+ @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @by_date)){|index_id, score|
22
+ begin
23
+ found_doc=Document.new(index[index_id][:complete_path])
24
+ found_doc.matching_content=index.highlight(query, index_id,
25
+ :field => :content, :excerpt_length => 80,
26
+ :pre_tag => "<<", :post_tag => ">>"
27
+ )
28
+ found_doc.score=score
29
+ @matching_documents<<found_doc
30
+ rescue Errno::ENOENT
31
+ #"File has been moved/deleted!"
32
+ end
34
33
  }
35
34
  @executed=true
36
- @time_needed=Time.now-start
37
- @total_hits=top_docs.total_hits
35
+ @time_needed=Time.now-start
38
36
  end
39
37
 
40
38
  # Returns true if it has been executed.
@@ -54,13 +52,15 @@ class Finder
54
52
  }
55
53
  }
56
54
 
57
-
58
-
59
55
  def self.reload!
60
56
  @@index = nil
61
57
  end
62
58
 
63
59
  private
60
+
61
+ def sort_by_date
62
+ Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
63
+ end
64
64
 
65
65
  def index_should_have_documents
66
66
  raise IndexError, "no document found" unless index.size > 0
@@ -10,6 +10,7 @@ class Indexer
10
10
  def index_every_directory(remove_first=false)
11
11
  @@do_not_disturb_while_indexing=true
12
12
  clear! if remove_first
13
+ @from_scratch = remove_first
13
14
  # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
14
15
  Finder.reload!
15
16
  log :debug => "Indexing every directory"
@@ -35,13 +36,19 @@ class Indexer
35
36
  prepare_multi_threads_environment
36
37
 
37
38
  indexing_list_chunks.each_with_thread{|chunk|
38
- chunk.each{|filename|
39
- add_file(filename)
39
+ chunk.each{|complete_path|
40
+ last_itime=index_time_dbm_file[complete_path]
41
+ if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
42
+ add_or_update_file(complete_path)
43
+ else
44
+ log :debug => "Identical : #{complete_path}"
45
+ end
46
+ index_time_dbm_file[complete_path] = Time.now._dump
40
47
  }
41
48
  }
42
49
  end
43
50
 
44
- def add_file(complete_path)
51
+ def add_or_update_file(complete_path)
45
52
  default_fields = Document.default_fields_for(complete_path)
46
53
  begin
47
54
  document = PlainTextExtractor.extract_content_and_language_from(complete_path)
@@ -69,6 +76,19 @@ class Indexer
69
76
  # Ferret will SEGFAULT otherwise.
70
77
  @@index = nil
71
78
  end
79
+
80
+
81
+ # Checks for indexed files that are missing from filesystem
82
+ # and removes them from index & dbm file.
83
+ def prune_index
84
+ missing_files=index_time_dbm_file.reject{|filename,itime| File.exists?(filename) && Picolena::IndexedDirectories.any?{|dir,alias_path| filename.starts_with?(dir)}}
85
+ missing_files.each{|filename, itime|
86
+ index.writer.delete(:complete_path, filename)
87
+ index_time_dbm_file.delete(filename)
88
+ log :debug => "Removed : #{filename}"
89
+ }
90
+ index.optimize
91
+ end
72
92
 
73
93
  # Only one IndexWriter should be instantiated.
74
94
  # If one index already exists, returns it.
@@ -81,11 +101,17 @@ class Indexer
81
101
  index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
82
102
  end
83
103
 
84
- def doc_count
85
- index.writer.doc_count
104
+ # Returns how many files are indexed.
105
+ def size
106
+ index.size
86
107
  end
87
108
 
88
109
  private
110
+
111
+ # Copied from Ferret book, By David Balmain
112
+ def index_time_dbm_file
113
+ @@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
114
+ end
89
115
 
90
116
  def index_exists?
91
117
  index_filename and File.exists?(index_filename)
@@ -108,7 +134,7 @@ class Indexer
108
134
  :field_infos => default_field_infos,
109
135
  # Great way to ensure that no file is indexed twice!
110
136
  :key => :probably_unique_id
111
- }
137
+ }.merge Picolena::IndexingConfiguration
112
138
  end
113
139
 
114
140
  def default_field_infos
@@ -120,7 +146,7 @@ class Indexer
120
146
  field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
121
147
  field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
122
148
  field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
123
- field_infos.add_field(:language, :store => :yes, :index => :yes)
149
+ field_infos.add_field(:language, :store => :yes, :index => :untokenized)
124
150
  end
125
151
  end
126
152
 
@@ -130,7 +156,8 @@ class Indexer
130
156
  # an IndexWriter at the same time, and get a
131
157
  # Ferret::Store::Lock::LockError
132
158
  index
133
- # NOTE: is it really necessary?
159
+ # Opens dbm file to dump indexing time.
160
+ index_time_dbm_file
134
161
  # ActiveSupport sometime raises
135
162
  # Expected Object is NOT missing constant
136
163
  # without.
@@ -140,4 +167,4 @@ class Indexer
140
167
  PlainTextExtractor
141
168
  end
142
169
  end
143
- end
170
+ end
@@ -7,5 +7,10 @@
7
7
  -if document.supported?
8
8
  %p
9
9
  =link_to_plain_text_content(document)
10
- =link_to_cached_content(document)
11
- %hr/
10
+ &#45;
11
+ =number_to_human_size(document.size)
12
+ &#45;
13
+ =document.pretty_date
14
+ &#45;
15
+ =link_to_cached_content(document,query)
16
+ %hr/
@@ -2,7 +2,7 @@
2
2
  =link_to icon_and_filename_for(@document), download_document_path(@probably_unique_id)
3
3
  (
4
4
  =:as_it_was_indexed_on.l
5
- =@document.date
5
+ =@document.pretty_date
6
6
  )
7
7
  %p=link_to_containing_directory(@document)
8
- %blockquote=h(@document.cached).gsub(/\n/,'<br/>')
8
+ %blockquote=highlighted_cache(@document, @query)
@@ -7,6 +7,9 @@
7
7
  %strong=h(@query)
8
8
  =show_time_needed(@time_needed)
9
9
  -else
10
- %span{:class=>'pagination'}=should_paginate(@matching_documents, @query)
10
+ %span{:class=>'pagination'}=should_paginate(@matching_documents, @query, @sort)
11
11
  =describe_results(@matching_documents, @total_hits, @time_needed, h(@query))
12
- = render :partial =>'document', :collection => @matching_documents
12
+ -unless nothing_found?
13
+ %p
14
+ %span{:class=>'sort_by'}=sort_by_date_or_relevance(@query)
15
+ = render :partial =>'document', :collection => @matching_documents, :locals => { :query => @query}
@@ -1,4 +1,4 @@
1
- %w(rubygems paginator fileutils pathname logger thread).each{|lib| require lib}
1
+ %w(rubygems paginator fileutils pathname logger thread dbm).each{|lib| require lib}
2
2
 
3
3
  # Uncomment below to force Rails into production mode when
4
4
  # you don't control web/app server and can't set it the proper way
@@ -0,0 +1,6 @@
1
+ module Picolena
2
+ IndexingConfiguration={}
3
+ YAML.load_file('config/custom/indexing_performance.yml').each_pair{|param, value|
4
+ IndexingConfiguration[param.to_sym]= value=~/^[\d_]+$/ ? value.to_i : value
5
+ }
6
+ end
@@ -14,10 +14,15 @@ namespace :index do
14
14
  task :update => :environment do
15
15
  Indexer.index_every_directory
16
16
  end
17
+
18
+ desc 'Remove unneeded files from index'
19
+ task :prune => :environment do
20
+ Indexer.prune_index
21
+ end
17
22
 
18
23
  desc 'Returns the number of indexed documents'
19
24
  task :size => :environment do
20
- puts "#{Indexer.doc_count} documents are currently indexed in #{Picolena::IndexSavePath}"
25
+ puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
21
26
  end
22
27
 
23
28
  # Search index with query "some query" :
@@ -30,7 +30,7 @@ namespace :install_dependencies do
30
30
  task :deb_packages do
31
31
  root_privileges_required!
32
32
  #TODO: Should load this list from defined PlainTextExtractor's
33
- packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser}.join(" ")
33
+ packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser libdbm-ruby1.8}.join(" ")
34
34
  puts "Installing "<<packages
35
35
  system("apt-get install "<<packages)
36
36
  end
@@ -82,6 +82,17 @@ h1, h2, h3, h4, h5, h6, p, form {
82
82
  text-decoration:none;
83
83
  }
84
84
 
85
+ .sort_by {
86
+ float:right;
87
+ font-size: 13px;
88
+ color:#000;
89
+ }
90
+
91
+ .sort_by a{
92
+ color: #EE8907;
93
+ text-decoration:none;
94
+ }
95
+
85
96
  #mainimg input.btn{
86
97
  margin-right: 10px;
87
98
  height: 20px;
@@ -116,7 +127,7 @@ width: 80%;
116
127
 
117
128
  #results {
118
129
  width:778px;
119
- padding-top: 25px;
130
+ padding-top: 15px;
120
131
  }
121
132
 
122
133
  #results h2 a{
@@ -137,6 +148,11 @@ width: 80%;
137
148
  padding:0px 20px;
138
149
  }
139
150
 
151
+ #results .matching_content{
152
+ background-color:#ffff66;
153
+ }
154
+
155
+
140
156
  #results a, #results small{
141
157
  font-family:"Trebuchet MS";
142
158
  font-size:11px;
@@ -50,16 +50,16 @@ describe "Basic Finder" do
50
50
  Indexer.index_every_directory(remove_first=true)
51
51
  end
52
52
 
53
- it "should accept one parameter as query, and 2 optionals for paginating" do
53
+ it "should accept one parameter as query, 1 optional for sorting results and 2 optionals for paginating" do
54
54
  lambda {Finder.new}.should raise_error(ArgumentError, "wrong number of arguments (0 for 1)")
55
55
  # show first page with 10 results per page
56
56
  lambda {Finder.new("a b")}.should_not raise_error
57
57
  # show second page
58
- lambda {Finder.new("a", 2)}.should_not raise_error
58
+ lambda {Finder.new("a", "by_date")}.should_not raise_error
59
59
  # show first page with 15 results
60
- lambda {Finder.new("a", 1, 15)}.should_not raise_error
60
+ lambda {Finder.new("a", "by_date", 1, 15)}.should_not raise_error
61
61
  # Too many parameters
62
- lambda {Finder.new("a", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (4 for 3)")
62
+ lambda {Finder.new("a", "by_date", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (5 for 4)")
63
63
  end
64
64
 
65
65
  it "should return matching documents if executed successfully" do
@@ -5,28 +5,30 @@ basic_pdf_attribute={
5
5
  :basename=>'basic',
6
6
  :complete_path=>File.join(RAILS_ROOT, '/spec/test_dirs/indexed/basic/basic.pdf'),
7
7
  :extname=>'.pdf',
8
- :filename=>'basic.pdf'
8
+ :ext_as_sym => :pdf,
9
+ :filename=>'basic.pdf',
10
+ :size => 9380
9
11
  }
10
12
 
11
13
  describe Document do
12
14
  before(:each) do
13
- @valid_random_doc=Document.find(:random) rescue Document.new("spec/test_dirs/indexed/basic/basic.pdf")
15
+ @valid_document=Document.new("spec/test_dirs/indexed/basic/basic.pdf")
14
16
  end
15
17
 
16
18
  it "should be an existing file" do
17
19
  lambda {Document.new("/patapouf.txt")}.should raise_error(Errno::ENOENT)
18
- lambda {@valid_random_doc}.should_not raise_error
20
+ lambda {@valid_document}.should_not raise_error
19
21
  lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should_not raise_error(Errno::ENOENT)
20
22
  end
21
23
 
22
24
  it "should belong to an indexed directory" do
23
- lambda {@valid_random_doc}.should_not raise_error
25
+ lambda {@valid_document}.should_not raise_error
24
26
  lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should raise_error(ArgumentError, "required document is not in indexed directory")
25
27
  end
26
28
 
27
29
  basic_pdf_attribute.each{|attribute,expected_value|
28
30
  it "should know its #{attribute}" do
29
- @valid_random_doc.should respond_to(attribute)
31
+ @valid_document.should respond_to(attribute)
30
32
  @basic_pdf=Document.new('spec/test_dirs/indexed/basic/basic.pdf')
31
33
  @basic_pdf.send(attribute).should == expected_value
32
34
  end
@@ -36,23 +38,70 @@ describe Document do
36
38
  another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
37
39
  another_doc.content.should == "just a content test\nin a txt file"
38
40
  end
41
+
42
+ it "should know its cached content" do
43
+ another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
44
+ another_doc.cached.should == "just a content test\nin a txt file"
45
+ end
46
+
47
+ it "should know its highlighted cached content for a given query" do
48
+ another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
49
+ another_doc.highlighted_cache('a content test').should == "just a <<content>> <<test>>\nin a txt file"
50
+ end
39
51
 
40
52
  it "should know its alias_path" do
41
- @valid_random_doc.should respond_to(:alias_path)
42
- @valid_random_doc.alias_path.starts_with?("http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed").should be_true
53
+ @valid_document.should respond_to(:alias_path)
54
+ @valid_document.alias_path.starts_with?("http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed").should be_true
55
+ end
56
+
57
+ it "should know its probably_unique_id" do
58
+ @valid_document.should respond_to(:probably_unique_id)
59
+ @valid_document.probably_unique_id.should =~/^[a-z]+$/
60
+ @valid_document.probably_unique_id.size.should == Picolena::HashLength
43
61
  end
62
+
63
+ it "should know its modification date" do
64
+ @valid_document.pretty_date.class.should == String
65
+ @valid_document.pretty_date.should =~/^\d{4}\-\d{2}\-\d{2}$/
66
+ end
67
+
68
+ it "should know its modification time and returns it in a pretty way" do
69
+ @valid_document.should respond_to(:mtime)
70
+ @valid_document.mtime.should be_kind_of(Integer)
71
+ @valid_document.should respond_to(:pretty_mtime)
72
+ @valid_document.pretty_mtime.class.should == String
73
+ @valid_document.pretty_mtime.should =~/^\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}$/
74
+ end
75
+
76
+ it "should know if its content can be extracted" do
77
+ @valid_document.should respond_to(:supported?)
78
+ @valid_document.should be_supported
79
+ Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
80
+ end
81
+
82
+ it "should know its language when enough content is available" do
83
+ Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
84
+ Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
85
+ Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
86
+ Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
87
+ end if Picolena::UseLanguageRecognition
88
+
89
+ it "should not try to guess language when file is too small" do
90
+ Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
91
+ Document.new("spec/test_dirs/indexed/README").language.should be_nil
92
+ end if Picolena::UseLanguageRecognition
44
93
 
45
94
  it "should let finder specify its score" do
46
- @valid_random_doc.should respond_to(:score)
47
- @valid_random_doc.score.should be_nil
48
- @valid_random_doc.score=25
49
- @valid_random_doc.score.should == 25
95
+ @valid_document.should respond_to(:score)
96
+ @valid_document.score.should be_nil
97
+ @valid_document.score=25
98
+ @valid_document.score.should == 25
50
99
  end
51
100
 
52
101
  it "should let finder specify its matching content" do
53
- @valid_random_doc.should respond_to(:matching_content)
54
- @valid_random_doc.matching_content.should be_nil
55
- @valid_random_doc.matching_content=["thermal cooling", "heat driven cooling"]
56
- @valid_random_doc.matching_content.should include("thermal cooling")
102
+ @valid_document.should respond_to(:matching_content)
103
+ @valid_document.matching_content.should be_nil
104
+ @valid_document.matching_content=["thermal cooling", "heat driven cooling"]
105
+ @valid_document.matching_content.should include("thermal cooling")
57
106
  end
58
- end
107
+ end
@@ -8,9 +8,9 @@ end
8
8
 
9
9
 
10
10
  def matching_document_for(query)
11
- # Returns matching document for any given query only if
12
- # exactly one document is found.
13
- # Specs don't pass otherwise.
11
+ # Returns matching document for any given query only if
12
+ # exactly one document is found.
13
+ # Specs don't pass otherwise.
14
14
  matching_documents=Finder.new(query).matching_documents
15
15
  matching_documents.size.should == 1
16
16
  matching_documents.first
@@ -19,6 +19,7 @@ end
19
19
 
20
20
  describe Finder do
21
21
  before(:all) do
22
+ Globalite.language = :en
22
23
  # SVN doesn't like non-ascii filenames.
23
24
  revert_changes!('spec/test_dirs/indexed/others/bäñüßé.txt',"just to know if files are indexed with utf8 filenames")
24
25
 
@@ -13,10 +13,18 @@ describe "Host indexing system" do
13
13
 
14
14
  it "should know which IP addresses are allowed (config/custom/white_list_ip.yml)" do
15
15
  File.should be_readable('config/custom/white_list_ip.yml')
16
+ ip_conf=YAML.load_file('config/custom/white_list_ip.yml')
17
+ ip_conf.class.should == Hash
18
+ ip_conf['Allow'].should_not be_nil
16
19
  end
17
20
 
18
21
  it "should know which directories are to be indexed (config/custom/indexed_directories.yml)" do
19
22
  File.should be_readable('config/custom/indexed_directories.yml')
23
+ dirs_conf=YAML.load_file('config/custom/indexed_directories.yml')
24
+ dirs_conf.class.should == Hash
25
+ %w(development test production).all?{|env|
26
+ dirs_conf[env].should_not be_nil
27
+ }
20
28
  end
21
29
 
22
30
  it "should be able to calculate base26 hash from strings" do
@@ -1,7 +1,7 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe Indexer do
4
- before(:each) do
5
- @indexer = Indexer.new
4
+ it "should have at least 32MB memory allocated" do
5
+ Indexer.index.writer.max_buffer_memory.should > 2**25-1
6
6
  end
7
7
  end
@@ -27,16 +27,4 @@ describe "PlainTextExtractors" do
27
27
  end
28
28
  }
29
29
  }
30
-
31
- it "should guess language when enough content is available" do
32
- Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
33
- Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
34
- Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
35
- Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
36
- end if Picolena::UseLanguageRecognition
37
-
38
- it "should not try to guess language when file is too small" do
39
- Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
40
- Document.new("spec/test_dirs/indexed/README").language.should be_nil
41
- end if Picolena::UseLanguageRecognition
42
30
  end
@@ -1,8 +1,16 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe Query do
4
- it "should return a BooleanQuery" do
4
+ it "should return a BooleanQuery, a TermQuery or a RangeQuery" do
5
5
  Query.extract_from("whatever").class.should == Ferret::Search::BooleanQuery
6
+ Query.extract_from("lang:de").class.should == Ferret::Search::TermQuery
7
+ Query.extract_from("date:<1990").class.should == Ferret::Search::RangeQuery
8
+ end
9
+
10
+ it "should not remove stop-words from TermQuery" do
11
+ # it means "Italian language", but also is a stop-word.
12
+ Query.extract_from("lang:it").class.should == Ferret::Search::TermQuery
13
+ Query.extract_from("lang:it").to_s.should == "language:it"
6
14
  end
7
15
 
8
16
  it "should translate LIKE, NOT, OR and AND boolean ops to English" do
@@ -12,6 +20,7 @@ describe Query do
12
20
  :fr=>["COMME","NON","OU","ET"]
13
21
  }
14
22
 
23
+ Globalite.language = :en
15
24
  english_query_with_like_and_not=Query.extract_from("LIKE something NOT something")
16
25
  english_query_with_or=Query.extract_from("test OR another")
17
26
  english_query_with_and=Query.extract_from("test AND another")
@@ -2,7 +2,7 @@ module Picolena #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- TINY = 6
5
+ TINY = 7
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/website/index.html CHANGED
@@ -33,7 +33,7 @@
33
33
  <h1>Picolena</h1>
34
34
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
35
35
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.6</a>
36
+ <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.7</a>
37
37
  </div>
38
38
  <h1>&#x2192; &#8216;picolena&#8217;</h1>
39
39
 
data/website/index.txt CHANGED
File without changes
File without changes
File without changes
File without changes
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picolena
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
30
30
  qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-04-25 00:00:00 +02:00
33
+ date: 2008-04-30 00:00:00 +02:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -146,6 +146,7 @@ files:
146
146
  - lib/picolena/config/basic.rb
147
147
  - lib/picolena/config/icons_and_filetypes.yml
148
148
  - lib/picolena/config/indexed_directories.yml
149
+ - lib/picolena/config/indexing_performance.yml
149
150
  - lib/picolena/config/title_and_names_and_links.yml
150
151
  - lib/picolena/config/white_list_ip.yml
151
152
  - lib/picolena/picolena_generator.rb
@@ -177,6 +178,7 @@ files:
177
178
  - lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
178
179
  - lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
179
180
  - lib/picolena/templates/config/initializers/006_load_icons.rb
181
+ - lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
180
182
  - lib/picolena/templates/config/routes.rb
181
183
  - lib/picolena/templates/lang/ui/de.yml
182
184
  - lib/picolena/templates/lang/ui/en.yml
metadata.gz.sig CHANGED
@@ -1 +1,2 @@
1
- ��"#m��EZY����v��>��lmLW A�ft �-�����<d�B��w]��T7��ꞅ�tR-i��X�W��%2e� 8f�]b�M�<IAF�ͯ7]krz�)w� ��
1
+ �;����U�=nƷ�8߿X�`>����B����2Ħ@,u!��~�u9>�Ӽq�J1� ֖i�T������-.q�^l*�`�>"��m�8��ɏP�cWk��y%����W�’:r=&����Ct aO;c
2
+ .&��}�e)�g(O�)0ة)!����s�
3
+ �"��Fm��>8���n���q�?I�P'����`|����`�\�>{\a4�Ӷ�JǮ}�&�?�d�UM{