picolena 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36)
  1. data/History.txt +12 -3
  2. data/Manifest.txt +2 -0
  3. data/bin/picolena +1 -1
  4. data/config/files_to_clean +1 -0
  5. data/lib/picolena/config/basic.rb +6 -2
  6. data/lib/picolena/config/indexing_performance.yml +30 -0
  7. data/lib/picolena/picolena_generator.rb +9 -4
  8. data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -1
  9. data/lib/picolena/templates/app/helpers/documents_helper.rb +18 -9
  10. data/lib/picolena/templates/app/models/document.rb +20 -3
  11. data/lib/picolena/templates/app/models/finder.rb +19 -19
  12. data/lib/picolena/templates/app/models/indexer.rb +36 -9
  13. data/lib/picolena/templates/app/views/documents/_document.html.haml +7 -2
  14. data/lib/picolena/templates/app/views/documents/cached.html.haml +2 -2
  15. data/lib/picolena/templates/app/views/documents/show.html.haml +5 -2
  16. data/lib/picolena/templates/config/environment.rb +1 -1
  17. data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +6 -0
  18. data/lib/picolena/templates/lib/tasks/index.rake +6 -1
  19. data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
  20. data/lib/picolena/templates/public/stylesheets/style.css +17 -1
  21. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -4
  22. data/lib/picolena/templates/spec/models/document_spec.rb +65 -16
  23. data/lib/picolena/templates/spec/models/finder_spec.rb +4 -3
  24. data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +8 -0
  25. data/lib/picolena/templates/spec/models/indexer_spec.rb +2 -2
  26. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +0 -12
  27. data/lib/picolena/templates/spec/models/query_spec.rb +10 -1
  28. data/lib/picolena/version.rb +1 -1
  29. data/website/index.html +1 -1
  30. data/website/index.txt +0 -0
  31. data/website/index_devjavu +0 -0
  32. data/website/javascripts/rounded_corners_lite.inc.js +0 -0
  33. data/website/stylesheets/screen.css +0 -0
  34. data.tar.gz.sig +0 -0
  35. metadata +4 -2
  36. metadata.gz.sig +3 -1
data/History.txt CHANGED
@@ -1,10 +1,19 @@
1
+ == 0.1.7 2008-04-30
2
+
3
+ * 5 minor enhancements:
4
+ * added cache highlighting à la Google
5
+ * rake index:update implemented as described in Ferret book by David Balmain
6
+ * rake index:prune removes missing files from indexer
7
+ * possibility to sort results by relevance / by date
8
+ * one configuration file for performance tweaks
9
+
1
10
  == 0.1.6 2008-04-25
2
11
 
3
12
  * 1 minor enhancement:
4
13
  * replaced index key by Document#probably_unique_id
5
14
 
6
15
  * bug fixes:
7
- * Added forgotten public/images/flags to generator file.
16
+ * Added forgotten public/images/flags to generator file
8
17
 
9
18
  == 0.1.5 2008-04-25
10
19
 
@@ -24,7 +33,7 @@
24
33
  == 0.1.3 2008-04-20
25
34
 
26
35
  * 1 bug fix:
27
- * removed verbose debug info.
36
+ * removed verbose debug info
28
37
 
29
38
  == 0.1.2 2008-04-20
30
39
 
@@ -49,7 +58,7 @@
49
58
  * 3 minor enhancements:
50
59
  * can now be installed on win32 (doesn't pass every spec though)
51
60
  * moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
52
- * shorter and prettier base26_hash id for documents.
61
+ * shorter and prettier base26_hash id for documents
53
62
 
54
63
  == 0.0.99 2008-04-06
55
64
 
data/Manifest.txt CHANGED
@@ -11,6 +11,7 @@ lib/picolena/USAGE
11
11
  lib/picolena/config/basic.rb
12
12
  lib/picolena/config/icons_and_filetypes.yml
13
13
  lib/picolena/config/indexed_directories.yml
14
+ lib/picolena/config/indexing_performance.yml
14
15
  lib/picolena/config/title_and_names_and_links.yml
15
16
  lib/picolena/config/white_list_ip.yml
16
17
  lib/picolena/picolena_generator.rb
@@ -42,6 +43,7 @@ lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
42
43
  lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
43
44
  lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
44
45
  lib/picolena/templates/config/initializers/006_load_icons.rb
46
+ lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
45
47
  lib/picolena/templates/config/routes.rb
46
48
  lib/picolena/templates/lang/ui/de.yml
47
49
  lib/picolena/templates/lang/ui/en.yml
data/bin/picolena CHANGED
@@ -11,7 +11,7 @@ if %w(-v --version).include? ARGV.first
11
11
  exit(0)
12
12
  end
13
13
 
14
- action= ARGV.include?("--spec-only") ? "testing" : "installing"
14
+ action= ARGV.any?{|opt| opt[0,6]=="--spec"} ? "testing" : "installing"
15
15
 
16
16
  require 'rubigen/scripts/generate'
17
17
  source = RubiGen::PathSource.new(:application,
@@ -5,6 +5,7 @@ lib/picolena/templates/config/custom/indexed_directories.yml
5
5
  lib/picolena/templates/config/custom/white_list_ip.yml
6
6
  lib/picolena/templates/config/custom/title_and_names_and_links.yml
7
7
  lib/picolena/templates/config/custom/icons_and_filetypes.yml
8
+ lib/picolena/templates/config/custom/indexing_performance.yml
8
9
  lib/picolena/templates/log
9
10
  lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
10
11
  lib/picolena/templates/tmp
@@ -42,5 +42,9 @@ module Picolena
42
42
  # Specify the default Levenshtein distance when using FuzzyQuery
43
43
  # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
44
44
  Ferret::Search::FuzzyQuery.default_min_similarity=0.6
45
- Analyzer=Ferret::Analysis::StandardAnalyzer.new
46
- end
45
+
46
+ # PerFieldAnalyzer is used to prevent queries like "language:it" from being broken by StopFilter.
47
+ per_field_analyzer=Ferret::Analysis::PerFieldAnalyzer.new(Ferret::Analysis::StandardAnalyzer.new)
48
+ per_field_analyzer[:language]=Ferret::Analysis::WhiteSpaceAnalyzer.new
49
+ Analyzer=per_field_analyzer
50
+ end
@@ -0,0 +1,30 @@
1
+ # You probably shouldn't change those parameters
2
+ # if you don't know what they represent.
3
+ # For more information, refer to:
4
+ # http://ferret.davebalmain.com/api/classes/Ferret/Index/IndexWriter.html
5
+
6
+ ## Main performance parameters
7
+
8
+ # Allowed memory for indexing process.
9
+ # 128MB by default, or 2^27
10
+ max_buffer_memory: 134_217_728
11
+
12
+ # High value => fast indexing, slow searching
13
+ # Low value => slow indexing, fast searching
14
+ # 10 by default
15
+ merge_factor: 10
16
+
17
+ # Maximum number of extracted terms for any given document
18
+ max_field_length: 10_000
19
+
20
+
21
+ ## Other parameters
22
+ # 1MB by default, or 2**20
23
+ chunk_size: 1_048_576
24
+ max_buffered_docs: 10_000
25
+ # NOTE: Be extra careful with this parameter, setting it to -1 (infinite)
26
+ # multiplied indexing time by an order of magnitude.
27
+ # max_merge_docs: -1
28
+ use_compound_file: true
29
+ index_skip_interval: 128
30
+ doc_skip_interval: 16
@@ -16,10 +16,14 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
16
16
  usage if args.empty? and !options[:spec_only]
17
17
  @destination_root = options[:destination]
18
18
 
19
- @directories_to_index=ARGV.collect{|relative_path|
20
- abs_dir=Pathname.new(relative_path).realpath.to_s
21
- "\"#{abs_dir}\" : \"#{abs_dir}\""
22
- }.join("\n ")
19
+ @directories_to_index=if options[:spec_only] then
20
+ "/whatever : /whatever"
21
+ else
22
+ ARGV.collect{|relative_path|
23
+ abs_dir=Pathname.new(relative_path).realpath.to_s
24
+ "\"#{abs_dir}\" : \"#{abs_dir}\""
25
+ }.join("\n ")
26
+ end
23
27
 
24
28
  extract_options
25
29
  end
@@ -63,6 +67,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
63
67
  m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
64
68
  m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
65
69
  m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
70
+ m.file '../config/indexing_performance.yml', 'config/custom/indexing_performance.yml'
66
71
 
67
72
  # README, License & Rakefile
68
73
  m.file 'MIT-LICENSE', 'LICENSE'
@@ -22,8 +22,9 @@ class DocumentsController < ApplicationController
22
22
  def show
23
23
  start=Time.now
24
24
  @query=[params[:id],params.delete(:format)].compact.join('.')
25
+ @sort=params[:sort]
25
26
  page=params[:page]||1
26
- finder=Finder.new(@query,page)
27
+ finder=Finder.new(@query,@sort,page)
27
28
  finder.execute!
28
29
  pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
29
30
  finder.matching_documents
@@ -47,6 +48,7 @@ class DocumentsController < ApplicationController
47
48
  # Returns the content of the document identified by probably_unique_id, as it was at the time it was indexed.
48
49
  # similar to Google cache.
49
50
  def cached
51
+ @query=[params[:query],params.delete(:format)].compact.join('.')
50
52
  end
51
53
 
52
54
  private
@@ -3,15 +3,15 @@ module DocumentsHelper
3
3
  def nothing_found?
4
4
  @matching_documents.nil? or @matching_documents.entries.empty?
5
5
  end
6
-
6
+
7
7
  # Very basic pagination.
8
8
  # Provides links to Next, Prev and FirstPage when needed.
9
- def should_paginate(page,query)
10
- [(link_to("&larr;&larr;", :action => :show, :id => query, :page => 1) if page.number>2),
11
- (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
12
- (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
9
+ def should_paginate(page,query, sort)
10
+ [(link_to("&larr;&larr;", :action => :show, :id => query, :sort=>sort) if page.number>2),
11
+ (link_to("&larr;", :action => :show, :id => query, :page => page.prev.number, :sort=>sort) if page.prev?),
12
+ (link_to("&rarr;", :action => :show, :id => query, :page => page.next.number, :sort=>sort) if page.next?)].compact.join(" | ")
13
13
  end
14
-
14
+
15
15
  # Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
16
16
  # "Résultats 1-2 parmi 2 pour whatever (0.012s)"
17
17
  def describe_results(page, total_hits, dt, query)
@@ -24,7 +24,7 @@ module DocumentsHelper
24
24
  show_time_needed(dt)
25
25
  ].join(' ')
26
26
  end
27
-
27
+
28
28
  # Returns the time needed to treat the query and launch the search, with a ms precision : (0.472s)
29
29
  def show_time_needed(dt)
30
30
  content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
@@ -71,8 +71,17 @@ module DocumentsHelper
71
71
  end
72
72
 
73
73
  # For any indexed document, returns a link to show its cached content.
74
- def link_to_cached_content(document)
74
+ def link_to_cached_content(document, query)
75
75
  link_name="("<<content_tag(:small,:cached.l)<<")"
76
- link_to link_name, cached_document_path(document.probably_unique_id)
76
+ link_to link_name, cached_document_path(:id => document.probably_unique_id, :query => query)
77
+ end
78
+
79
+ def highlighted_cache(document, query)
80
+ h(document.highlighted_cache(query)).gsub(/\n/,'<br/>').gsub(/&lt;&lt;(.*?)&gt;&gt;/,content_tag(:span, '\1', :class=>"matching_content"))
81
+ end
82
+
83
+ def sort_by_date_or_relevance(query)
84
+ [link_to_unless_current('By date', document_path(query, :sort=>'by_date')),
85
+ link_to_unless_current('By relevance', document_path(query))].join("&nbsp;")
77
86
  end
78
87
  end
@@ -11,7 +11,7 @@ class Document
11
11
  end
12
12
 
13
13
  #Delegating properties to File::method_name(complete_path)
14
- [:dirname, :basename, :extname, :ext_as_sym, :file?, :ext_as_sym].each{|method_name|
14
+ [:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
15
15
  define_method(method_name){File.send(method_name,complete_path)}
16
16
  }
17
17
  alias_method :filename, :basename
@@ -63,11 +63,22 @@ class Document
63
63
  def cached
64
64
  from_index[:content]
65
65
  end
66
+
67
+ def highlighted_cache(raw_query)
68
+ #TODO: Report to Ferret. Highlight should accept :key and not only :doc_id.
69
+ Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
70
+ :field => :content, :excerpt_length => :all,
71
+ :pre_tag => "<<", :post_tag => ">>"
72
+ ).first
73
+ end
66
74
 
67
- # FIXME: Not just date anymore.
68
75
  # Returns the last modification date before the document got indexed.
69
76
  # Useful to know how old a document is, and to which version the cache corresponds.
70
- def date
77
+ def pretty_date
78
+ from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})\d{6}/,'\1-\2-\3')
79
+ end
80
+
81
+ def pretty_mtime
71
82
  from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
72
83
  end
73
84
 
@@ -93,6 +104,12 @@ class Document
93
104
  end
94
105
 
95
106
  private
107
+
108
+ # FIXME: Is there a way to easily retrieve doc_id for a given document?
109
+ # Better yet, fix Index#highlight to accept :probably_unique_id and stop using :doc_id.
110
+ def doc_id
111
+ Indexer.index.search(Ferret::Search::TermQuery.new(:probably_unique_id,probably_unique_id)).hits.first.doc
112
+ end
96
113
 
97
114
  # Retrieves the document from the index.
98
115
  # Useful to get meta-info about it.
@@ -5,36 +5,34 @@ class Finder
5
5
  @@index ||= Indexer.index
6
6
  end
7
7
 
8
- def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
8
+ def initialize(raw_query,by_date=false, page=1,results_per_page=Picolena::ResultsPerPage)
9
9
  @query = Query.extract_from(raw_query)
10
10
  @raw_query= raw_query
11
11
  Indexer.ensure_index_existence
12
12
  @per_page=results_per_page
13
13
  @offset=(page.to_i-1)*results_per_page
14
+ @by_date=by_date
14
15
  index_should_have_documents
15
16
  end
16
17
 
17
18
  def execute!
18
19
  @matching_documents=[]
19
20
  start=Time.now
20
- top_docs=index.search(query, :limit => @per_page, :offset=>@offset)
21
- top_docs.hits.each{|hit|
22
- index_id,score=hit.doc,hit.score
23
- begin
24
- found_doc=Document.new(index[index_id][:complete_path])
25
- found_doc.matching_content=index.highlight(query, index_id,
26
- :field => :content, :excerpt_length => 80,
27
- :pre_tag => "<<", :post_tag => ">>"
28
- ) unless @raw_query=~/^\*+\.\w*$/
29
- found_doc.score=score
30
- @matching_documents<<found_doc
31
- rescue Errno::ENOENT
32
- #"File has been moved/deleted!"
33
- end
21
+ @total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @by_date)){|index_id, score|
22
+ begin
23
+ found_doc=Document.new(index[index_id][:complete_path])
24
+ found_doc.matching_content=index.highlight(query, index_id,
25
+ :field => :content, :excerpt_length => 80,
26
+ :pre_tag => "<<", :post_tag => ">>"
27
+ )
28
+ found_doc.score=score
29
+ @matching_documents<<found_doc
30
+ rescue Errno::ENOENT
31
+ #"File has been moved/deleted!"
32
+ end
34
33
  }
35
34
  @executed=true
36
- @time_needed=Time.now-start
37
- @total_hits=top_docs.total_hits
35
+ @time_needed=Time.now-start
38
36
  end
39
37
 
40
38
  # Returns true if it has been executed.
@@ -54,13 +52,15 @@ class Finder
54
52
  }
55
53
  }
56
54
 
57
-
58
-
59
55
  def self.reload!
60
56
  @@index = nil
61
57
  end
62
58
 
63
59
  private
60
+
61
+ def sort_by_date
62
+ Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
63
+ end
64
64
 
65
65
  def index_should_have_documents
66
66
  raise IndexError, "no document found" unless index.size > 0
@@ -10,6 +10,7 @@ class Indexer
10
10
  def index_every_directory(remove_first=false)
11
11
  @@do_not_disturb_while_indexing=true
12
12
  clear! if remove_first
13
+ @from_scratch = remove_first
13
14
  # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
14
15
  Finder.reload!
15
16
  log :debug => "Indexing every directory"
@@ -35,13 +36,19 @@ class Indexer
35
36
  prepare_multi_threads_environment
36
37
 
37
38
  indexing_list_chunks.each_with_thread{|chunk|
38
- chunk.each{|filename|
39
- add_file(filename)
39
+ chunk.each{|complete_path|
40
+ last_itime=index_time_dbm_file[complete_path]
41
+ if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
42
+ add_or_update_file(complete_path)
43
+ else
44
+ log :debug => "Identical : #{complete_path}"
45
+ end
46
+ index_time_dbm_file[complete_path] = Time.now._dump
40
47
  }
41
48
  }
42
49
  end
43
50
 
44
- def add_file(complete_path)
51
+ def add_or_update_file(complete_path)
45
52
  default_fields = Document.default_fields_for(complete_path)
46
53
  begin
47
54
  document = PlainTextExtractor.extract_content_and_language_from(complete_path)
@@ -69,6 +76,19 @@ class Indexer
69
76
  # Ferret will SEGFAULT otherwise.
70
77
  @@index = nil
71
78
  end
79
+
80
+
81
+ # Checks for indexed files that are missing from filesystem
82
+ # and removes them from index & dbm file.
83
+ def prune_index
84
+ missing_files=index_time_dbm_file.reject{|filename,itime| File.exists?(filename) && Picolena::IndexedDirectories.any?{|dir,alias_path| filename.starts_with?(dir)}}
85
+ missing_files.each{|filename, itime|
86
+ index.writer.delete(:complete_path, filename)
87
+ index_time_dbm_file.delete(filename)
88
+ log :debug => "Removed : #{filename}"
89
+ }
90
+ index.optimize
91
+ end
72
92
 
73
93
  # Only one IndexWriter should be instantiated.
74
94
  # If one index already exists, returns it.
@@ -81,11 +101,17 @@ class Indexer
81
101
  index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
82
102
  end
83
103
 
84
- def doc_count
85
- index.writer.doc_count
104
+ # Returns how many files are indexed.
105
+ def size
106
+ index.size
86
107
  end
87
108
 
88
109
  private
110
+
111
+ # Copied from Ferret book, By David Balmain
112
+ def index_time_dbm_file
113
+ @@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
114
+ end
89
115
 
90
116
  def index_exists?
91
117
  index_filename and File.exists?(index_filename)
@@ -108,7 +134,7 @@ class Indexer
108
134
  :field_infos => default_field_infos,
109
135
  # Great way to ensure that no file is indexed twice!
110
136
  :key => :probably_unique_id
111
- }
137
+ }.merge Picolena::IndexingConfiguration
112
138
  end
113
139
 
114
140
  def default_field_infos
@@ -120,7 +146,7 @@ class Indexer
120
146
  field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
121
147
  field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
122
148
  field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
123
- field_infos.add_field(:language, :store => :yes, :index => :yes)
149
+ field_infos.add_field(:language, :store => :yes, :index => :untokenized)
124
150
  end
125
151
  end
126
152
 
@@ -130,7 +156,8 @@ class Indexer
130
156
  # an IndexWriter at the same time, and get a
131
157
  # Ferret::Store::Lock::LockError
132
158
  index
133
- # NOTE: is it really necessary?
159
+ # Opens dbm file to dump indexing time.
160
+ index_time_dbm_file
134
161
  # ActiveSupport sometimes raises
135
162
  # Expected Object is NOT missing constant
136
163
  # without.
@@ -140,4 +167,4 @@ class Indexer
140
167
  PlainTextExtractor
141
168
  end
142
169
  end
143
- end
170
+ end
@@ -7,5 +7,10 @@
7
7
  -if document.supported?
8
8
  %p
9
9
  =link_to_plain_text_content(document)
10
- =link_to_cached_content(document)
11
- %hr/
10
+ &#45;
11
+ =number_to_human_size(document.size)
12
+ &#45;
13
+ =document.pretty_date
14
+ &#45;
15
+ =link_to_cached_content(document,query)
16
+ %hr/
@@ -2,7 +2,7 @@
2
2
  =link_to icon_and_filename_for(@document), download_document_path(@probably_unique_id)
3
3
  (
4
4
  =:as_it_was_indexed_on.l
5
- =@document.date
5
+ =@document.pretty_date
6
6
  )
7
7
  %p=link_to_containing_directory(@document)
8
- %blockquote=h(@document.cached).gsub(/\n/,'<br/>')
8
+ %blockquote=highlighted_cache(@document, @query)
@@ -7,6 +7,9 @@
7
7
  %strong=h(@query)
8
8
  =show_time_needed(@time_needed)
9
9
  -else
10
- %span{:class=>'pagination'}=should_paginate(@matching_documents, @query)
10
+ %span{:class=>'pagination'}=should_paginate(@matching_documents, @query, @sort)
11
11
  =describe_results(@matching_documents, @total_hits, @time_needed, h(@query))
12
- = render :partial =>'document', :collection => @matching_documents
12
+ -unless nothing_found?
13
+ %p
14
+ %span{:class=>'sort_by'}=sort_by_date_or_relevance(@query)
15
+ = render :partial =>'document', :collection => @matching_documents, :locals => { :query => @query}
@@ -1,4 +1,4 @@
1
- %w(rubygems paginator fileutils pathname logger thread).each{|lib| require lib}
1
+ %w(rubygems paginator fileutils pathname logger thread dbm).each{|lib| require lib}
2
2
 
3
3
  # Uncomment below to force Rails into production mode when
4
4
  # you don't control web/app server and can't set it the proper way
@@ -0,0 +1,6 @@
1
+ module Picolena
2
+ IndexingConfiguration={}
3
+ YAML.load_file('config/custom/indexing_performance.yml').each_pair{|param, value|
4
+ IndexingConfiguration[param.to_sym]= value=~/^[\d_]+$/ ? value.to_i : value
5
+ }
6
+ end
@@ -14,10 +14,15 @@ namespace :index do
14
14
  task :update => :environment do
15
15
  Indexer.index_every_directory
16
16
  end
17
+
18
+ desc 'Remove unneeded files from index'
19
+ task :prune => :environment do
20
+ Indexer.prune_index
21
+ end
17
22
 
18
23
  desc 'Returns the number of indexed documents'
19
24
  task :size => :environment do
20
- puts "#{Indexer.doc_count} documents are currently indexed in #{Picolena::IndexSavePath}"
25
+ puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
21
26
  end
22
27
 
23
28
  # Search index with query "some query" :
@@ -30,7 +30,7 @@ namespace :install_dependencies do
30
30
  task :deb_packages do
31
31
  root_privileges_required!
32
32
  #TODO: Should load this list from defined PlainTextExtractor's
33
- packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser}.join(" ")
33
+ packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser libdbm-ruby1.8}.join(" ")
34
34
  puts "Installing "<<packages
35
35
  system("apt-get install "<<packages)
36
36
  end
@@ -82,6 +82,17 @@ h1, h2, h3, h4, h5, h6, p, form {
82
82
  text-decoration:none;
83
83
  }
84
84
 
85
+ .sort_by {
86
+ float:right;
87
+ font-size: 13px;
88
+ color:#000;
89
+ }
90
+
91
+ .sort_by a{
92
+ color: #EE8907;
93
+ text-decoration:none;
94
+ }
95
+
85
96
  #mainimg input.btn{
86
97
  margin-right: 10px;
87
98
  height: 20px;
@@ -116,7 +127,7 @@ width: 80%;
116
127
 
117
128
  #results {
118
129
  width:778px;
119
- padding-top: 25px;
130
+ padding-top: 15px;
120
131
  }
121
132
 
122
133
  #results h2 a{
@@ -137,6 +148,11 @@ width: 80%;
137
148
  padding:0px 20px;
138
149
  }
139
150
 
151
+ #results .matching_content{
152
+ background-color:#ffff66;
153
+ }
154
+
155
+
140
156
  #results a, #results small{
141
157
  font-family:"Trebuchet MS";
142
158
  font-size:11px;
@@ -50,16 +50,16 @@ describe "Basic Finder" do
50
50
  Indexer.index_every_directory(remove_first=true)
51
51
  end
52
52
 
53
- it "should accept one parameter as query, and 2 optionals for paginating" do
53
+ it "should accept one parameter as query, 1 optional for sorting results and 2 optionals for paginating" do
54
54
  lambda {Finder.new}.should raise_error(ArgumentError, "wrong number of arguments (0 for 1)")
55
55
  # show first page with 10 results per page
56
56
  lambda {Finder.new("a b")}.should_not raise_error
57
57
  # sort results by date
58
- lambda {Finder.new("a", 2)}.should_not raise_error
58
+ lambda {Finder.new("a", "by_date")}.should_not raise_error
59
59
  # show first page with 15 results
60
- lambda {Finder.new("a", 1, 15)}.should_not raise_error
60
+ lambda {Finder.new("a", "by_date", 1, 15)}.should_not raise_error
61
61
  # Too many parameters
62
- lambda {Finder.new("a", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (4 for 3)")
62
+ lambda {Finder.new("a", "by_date", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (5 for 4)")
63
63
  end
64
64
 
65
65
  it "should return matching documents if executed successfully" do
@@ -5,28 +5,30 @@ basic_pdf_attribute={
5
5
  :basename=>'basic',
6
6
  :complete_path=>File.join(RAILS_ROOT, '/spec/test_dirs/indexed/basic/basic.pdf'),
7
7
  :extname=>'.pdf',
8
- :filename=>'basic.pdf'
8
+ :ext_as_sym => :pdf,
9
+ :filename=>'basic.pdf',
10
+ :size => 9380
9
11
  }
10
12
 
11
13
  describe Document do
12
14
  before(:each) do
13
- @valid_random_doc=Document.find(:random) rescue Document.new("spec/test_dirs/indexed/basic/basic.pdf")
15
+ @valid_document=Document.new("spec/test_dirs/indexed/basic/basic.pdf")
14
16
  end
15
17
 
16
18
  it "should be an existing file" do
17
19
  lambda {Document.new("/patapouf.txt")}.should raise_error(Errno::ENOENT)
18
- lambda {@valid_random_doc}.should_not raise_error
20
+ lambda {@valid_document}.should_not raise_error
19
21
  lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should_not raise_error(Errno::ENOENT)
20
22
  end
21
23
 
22
24
  it "should belong to an indexed directory" do
23
- lambda {@valid_random_doc}.should_not raise_error
25
+ lambda {@valid_document}.should_not raise_error
24
26
  lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should raise_error(ArgumentError, "required document is not in indexed directory")
25
27
  end
26
28
 
27
29
  basic_pdf_attribute.each{|attribute,expected_value|
28
30
  it "should know its #{attribute}" do
29
- @valid_random_doc.should respond_to(attribute)
31
+ @valid_document.should respond_to(attribute)
30
32
  @basic_pdf=Document.new('spec/test_dirs/indexed/basic/basic.pdf')
31
33
  @basic_pdf.send(attribute).should == expected_value
32
34
  end
@@ -36,23 +38,70 @@ describe Document do
36
38
  another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
37
39
  another_doc.content.should == "just a content test\nin a txt file"
38
40
  end
41
+
42
+ it "should know its cached content" do
43
+ another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
44
+ another_doc.cached.should == "just a content test\nin a txt file"
45
+ end
46
+
47
+ it "should know its highlighted cached content for a given query" do
48
+ another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
49
+ another_doc.highlighted_cache('a content test').should == "just a <<content>> <<test>>\nin a txt file"
50
+ end
39
51
 
40
52
  it "should know its alias_path" do
41
- @valid_random_doc.should respond_to(:alias_path)
42
- @valid_random_doc.alias_path.starts_with?("http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed").should be_true
53
+ @valid_document.should respond_to(:alias_path)
54
+ @valid_document.alias_path.starts_with?("http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed").should be_true
55
+ end
56
+
57
+ it "should know its probably_unique_id" do
58
+ @valid_document.should respond_to(:probably_unique_id)
59
+ @valid_document.probably_unique_id.should =~/^[a-z]+$/
60
+ @valid_document.probably_unique_id.size.should == Picolena::HashLength
43
61
  end
62
+
63
+ it "should know its modification date" do
64
+ @valid_document.pretty_date.class.should == String
65
+ @valid_document.pretty_date.should =~/^\d{4}\-\d{2}\-\d{2}$/
66
+ end
67
+
68
+ it "should know its modification time and returns it in a pretty way" do
69
+ @valid_document.should respond_to(:mtime)
70
+ @valid_document.mtime.should be_kind_of(Integer)
71
+ @valid_document.should respond_to(:pretty_mtime)
72
+ @valid_document.pretty_mtime.class.should == String
73
+ @valid_document.pretty_mtime.should =~/^\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}$/
74
+ end
75
+
76
+ it "should know if its content can be extracted" do
77
+ @valid_document.should respond_to(:supported?)
78
+ @valid_document.should be_supported
79
+ Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
80
+ end
81
+
82
+ it "should know its language when enough content is available" do
83
+ Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
84
+ Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
85
+ Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
86
+ Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
87
+ end if Picolena::UseLanguageRecognition
88
+
89
+ it "should not try to guess language when file is too small" do
90
+ Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
91
+ Document.new("spec/test_dirs/indexed/README").language.should be_nil
92
+ end if Picolena::UseLanguageRecognition
44
93
 
45
94
  it "should let finder specify its score" do
46
- @valid_random_doc.should respond_to(:score)
47
- @valid_random_doc.score.should be_nil
48
- @valid_random_doc.score=25
49
- @valid_random_doc.score.should == 25
95
+ @valid_document.should respond_to(:score)
96
+ @valid_document.score.should be_nil
97
+ @valid_document.score=25
98
+ @valid_document.score.should == 25
50
99
  end
51
100
 
52
101
  it "should let finder specify its matching content" do
53
- @valid_random_doc.should respond_to(:matching_content)
54
- @valid_random_doc.matching_content.should be_nil
55
- @valid_random_doc.matching_content=["thermal cooling", "heat driven cooling"]
56
- @valid_random_doc.matching_content.should include("thermal cooling")
102
+ @valid_document.should respond_to(:matching_content)
103
+ @valid_document.matching_content.should be_nil
104
+ @valid_document.matching_content=["thermal cooling", "heat driven cooling"]
105
+ @valid_document.matching_content.should include("thermal cooling")
57
106
  end
58
- end
107
+ end
@@ -8,9 +8,9 @@ end
8
8
 
9
9
 
10
10
  def matching_document_for(query)
11
- # Returns matching document for any given query only if
12
- # exactly one document is found.
13
- # Specs don't pass otherwise.
11
+ # Returns matching document for any given query only if
12
+ # exactly one document is found.
13
+ # Specs don't pass otherwise.
14
14
  matching_documents=Finder.new(query).matching_documents
15
15
  matching_documents.size.should == 1
16
16
  matching_documents.first
@@ -19,6 +19,7 @@ end
19
19
 
20
20
  describe Finder do
21
21
  before(:all) do
22
+ Globalite.language = :en
22
23
  # SVN doesn't like non-ascii filenames.
23
24
  revert_changes!('spec/test_dirs/indexed/others/bäñüßé.txt',"just to know if files are indexed with utf8 filenames")
24
25
 
@@ -13,10 +13,18 @@ describe "Host indexing system" do
13
13
 
14
14
  it "should know which IP addresses are allowed (config/custom/white_list_ip.yml)" do
15
15
  File.should be_readable('config/custom/white_list_ip.yml')
16
+ ip_conf=YAML.load_file('config/custom/white_list_ip.yml')
17
+ ip_conf.class.should == Hash
18
+ ip_conf['Allow'].should_not be_nil
16
19
  end
17
20
 
18
21
  it "should know which directories are to be indexed (config/custom/indexed_directories.yml)" do
19
22
  File.should be_readable('config/custom/indexed_directories.yml')
23
+ dirs_conf=YAML.load_file('config/custom/indexed_directories.yml')
24
+ dirs_conf.class.should == Hash
25
+ %w(development test production).all?{|env|
26
+ dirs_conf[env].should_not be_nil
27
+ }
20
28
  end
21
29
 
22
30
  it "should be able to calculate base26 hash from strings" do
@@ -1,7 +1,7 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe Indexer do
4
- before(:each) do
5
- @indexer = Indexer.new
4
+ it "should have at least 32MB memory allocated" do
5
+ Indexer.index.writer.max_buffer_memory.should > 2**25-1
6
6
  end
7
7
  end
@@ -27,16 +27,4 @@ describe "PlainTextExtractors" do
27
27
  end
28
28
  }
29
29
  }
30
-
31
- it "should guess language when enough content is available" do
32
- Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
33
- Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
34
- Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
35
- Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
36
- end if Picolena::UseLanguageRecognition
37
-
38
- it "should not try to guess language when file is too small" do
39
- Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
40
- Document.new("spec/test_dirs/indexed/README").language.should be_nil
41
- end if Picolena::UseLanguageRecognition
42
30
  end
@@ -1,8 +1,16 @@
1
1
  require File.dirname(__FILE__) + '/../spec_helper'
2
2
 
3
3
  describe Query do
4
- it "should return a BooleanQuery" do
4
+ it "should return a BooleanQuery, a TermQuery or a RangeQuery" do
5
5
  Query.extract_from("whatever").class.should == Ferret::Search::BooleanQuery
6
+ Query.extract_from("lang:de").class.should == Ferret::Search::TermQuery
7
+ Query.extract_from("date:<1990").class.should == Ferret::Search::RangeQuery
8
+ end
9
+
10
+ it "should not remove stop-words from TermQuery" do
11
+ # it means "Italian language", but also is a stop-word.
12
+ Query.extract_from("lang:it").class.should == Ferret::Search::TermQuery
13
+ Query.extract_from("lang:it").to_s.should == "language:it"
6
14
  end
7
15
 
8
16
  it "should translate LIKE, NOT, OR and AND boolean ops to English" do
@@ -12,6 +20,7 @@ describe Query do
12
20
  :fr=>["COMME","NON","OU","ET"]
13
21
  }
14
22
 
23
+ Globalite.language = :en
15
24
  english_query_with_like_and_not=Query.extract_from("LIKE something NOT something")
16
25
  english_query_with_or=Query.extract_from("test OR another")
17
26
  english_query_with_and=Query.extract_from("test AND another")
@@ -2,7 +2,7 @@ module Picolena #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- TINY = 6
5
+ TINY = 7
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/website/index.html CHANGED
@@ -33,7 +33,7 @@
33
33
  <h1>Picolena</h1>
34
34
  <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
35
35
  <p>Get Version</p>
36
- <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.6</a>
36
+ <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.7</a>
37
37
  </div>
38
38
  <h1>&#x2192; &#8216;picolena&#8217;</h1>
39
39
 
data/website/index.txt CHANGED
File without changes
File without changes
File without changes
File without changes
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picolena
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Duminil
@@ -30,7 +30,7 @@ cert_chain:
30
30
  qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-04-25 00:00:00 +02:00
33
+ date: 2008-04-30 00:00:00 +02:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -146,6 +146,7 @@ files:
146
146
  - lib/picolena/config/basic.rb
147
147
  - lib/picolena/config/icons_and_filetypes.yml
148
148
  - lib/picolena/config/indexed_directories.yml
149
+ - lib/picolena/config/indexing_performance.yml
149
150
  - lib/picolena/config/title_and_names_and_links.yml
150
151
  - lib/picolena/config/white_list_ip.yml
151
152
  - lib/picolena/picolena_generator.rb
@@ -177,6 +178,7 @@ files:
177
178
  - lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
178
179
  - lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
179
180
  - lib/picolena/templates/config/initializers/006_load_icons.rb
181
+ - lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
180
182
  - lib/picolena/templates/config/routes.rb
181
183
  - lib/picolena/templates/lang/ui/de.yml
182
184
  - lib/picolena/templates/lang/ui/en.yml
metadata.gz.sig CHANGED
@@ -1 +1,2 @@
1
- ��"#m��EZY����v��>��lmLW A�ft �-�����<d�B��w]��T7��ꞅ�tR-i��X�W��%2e� 8f�]b�M�<IAF�ͯ7]krz�)w� ��
1
+ �;����U�=nƷ�8߿X�`>����B����2Ħ@,u!��~�u9>�Ӽq�J1� ֖i�T������-.q�^l*�`�>"��m�8��ɏP�cWk��y%����W�’:r=&����Ct aO;c
2
+ .&��}�e)�g(O�)0ة)!����s�
3
+ �"��Fm��>8���n���q�?I�P'����`|����`�\�>{\a4�Ӷ�JǮ}�&�?�d�UM{