picolena 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +12 -3
- data/Manifest.txt +2 -0
- data/bin/picolena +1 -1
- data/config/files_to_clean +1 -0
- data/lib/picolena/config/basic.rb +6 -2
- data/lib/picolena/config/indexing_performance.yml +30 -0
- data/lib/picolena/picolena_generator.rb +9 -4
- data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +18 -9
- data/lib/picolena/templates/app/models/document.rb +20 -3
- data/lib/picolena/templates/app/models/finder.rb +19 -19
- data/lib/picolena/templates/app/models/indexer.rb +36 -9
- data/lib/picolena/templates/app/views/documents/_document.html.haml +7 -2
- data/lib/picolena/templates/app/views/documents/cached.html.haml +2 -2
- data/lib/picolena/templates/app/views/documents/show.html.haml +5 -2
- data/lib/picolena/templates/config/environment.rb +1 -1
- data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +6 -0
- data/lib/picolena/templates/lib/tasks/index.rake +6 -1
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/public/stylesheets/style.css +17 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -4
- data/lib/picolena/templates/spec/models/document_spec.rb +65 -16
- data/lib/picolena/templates/spec/models/finder_spec.rb +4 -3
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +8 -0
- data/lib/picolena/templates/spec/models/indexer_spec.rb +2 -2
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +0 -12
- data/lib/picolena/templates/spec/models/query_spec.rb +10 -1
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- data/website/index.txt +0 -0
- data/website/index_devjavu +0 -0
- data/website/javascripts/rounded_corners_lite.inc.js +0 -0
- data/website/stylesheets/screen.css +0 -0
- data.tar.gz.sig +0 -0
- metadata +4 -2
- metadata.gz.sig +3 -1
data/History.txt
CHANGED
@@ -1,10 +1,19 @@
|
|
1
|
+
== 0.1.7 2008-04-30
|
2
|
+
|
3
|
+
* 5 minor enhancements:
|
4
|
+
* added cache highlighting à la Google
|
5
|
+
* rake index:update implemented as described in Ferret book by David Balmain
|
6
|
+
* rake index:prune removes missing files from indexer
|
7
|
+
* possibility to sort results by relevance / by date
|
8
|
+
* one configuration file for performance tweaks
|
9
|
+
|
1
10
|
== 0.1.6 2008-04-25
|
2
11
|
|
3
12
|
* 1 minor enhancement:
|
4
13
|
* replaced index key by Document#probably_unique_id
|
5
14
|
|
6
15
|
* bug fixes:
|
7
|
-
* Added forgotten public/images/flags to generator file
|
16
|
+
* Added forgotten public/images/flags to generator file
|
8
17
|
|
9
18
|
== 0.1.5 2008-04-25
|
10
19
|
|
@@ -24,7 +33,7 @@
|
|
24
33
|
== 0.1.3 2008-04-20
|
25
34
|
|
26
35
|
* 1 bug fix:
|
27
|
-
* removed verbose debug info
|
36
|
+
* removed verbose debug info
|
28
37
|
|
29
38
|
== 0.1.2 2008-04-20
|
30
39
|
|
@@ -49,7 +58,7 @@
|
|
49
58
|
* 3 minor enhancements:
|
50
59
|
* can now be installed on win32 (doesn't pass every spec though)
|
51
60
|
* moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
|
52
|
-
* shorter and prettier base26_hash id for documents
|
61
|
+
* shorter and prettier base26_hash id for documents
|
53
62
|
|
54
63
|
== 0.0.99 2008-04-06
|
55
64
|
|
data/Manifest.txt
CHANGED
@@ -11,6 +11,7 @@ lib/picolena/USAGE
|
|
11
11
|
lib/picolena/config/basic.rb
|
12
12
|
lib/picolena/config/icons_and_filetypes.yml
|
13
13
|
lib/picolena/config/indexed_directories.yml
|
14
|
+
lib/picolena/config/indexing_performance.yml
|
14
15
|
lib/picolena/config/title_and_names_and_links.yml
|
15
16
|
lib/picolena/config/white_list_ip.yml
|
16
17
|
lib/picolena/picolena_generator.rb
|
@@ -42,6 +43,7 @@ lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
|
|
42
43
|
lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
|
43
44
|
lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
|
44
45
|
lib/picolena/templates/config/initializers/006_load_icons.rb
|
46
|
+
lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
|
45
47
|
lib/picolena/templates/config/routes.rb
|
46
48
|
lib/picolena/templates/lang/ui/de.yml
|
47
49
|
lib/picolena/templates/lang/ui/en.yml
|
data/bin/picolena
CHANGED
@@ -11,7 +11,7 @@ if %w(-v --version).include? ARGV.first
|
|
11
11
|
exit(0)
|
12
12
|
end
|
13
13
|
|
14
|
-
action= ARGV.
|
14
|
+
action= ARGV.any?{|opt| opt[0,6]=="--spec"} ? "testing" : "installing"
|
15
15
|
|
16
16
|
require 'rubigen/scripts/generate'
|
17
17
|
source = RubiGen::PathSource.new(:application,
|
data/config/files_to_clean
CHANGED
@@ -5,6 +5,7 @@ lib/picolena/templates/config/custom/indexed_directories.yml
|
|
5
5
|
lib/picolena/templates/config/custom/white_list_ip.yml
|
6
6
|
lib/picolena/templates/config/custom/title_and_names_and_links.yml
|
7
7
|
lib/picolena/templates/config/custom/icons_and_filetypes.yml
|
8
|
+
lib/picolena/templates/config/custom/indexing_performance.yml
|
8
9
|
lib/picolena/templates/log
|
9
10
|
lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
|
10
11
|
lib/picolena/templates/tmp
|
@@ -42,5 +42,9 @@ module Picolena
|
|
42
42
|
# Specify the default Levenshtein distance when using FuzzyQuery
|
43
43
|
# see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
|
44
44
|
Ferret::Search::FuzzyQuery.default_min_similarity=0.6
|
45
|
-
|
46
|
-
|
45
|
+
|
46
|
+
# PerFieldAnalyzer is used to prevent queries like "language:it" from being broken by StopFilter.
|
47
|
+
per_field_analyzer=Ferret::Analysis::PerFieldAnalyzer.new(Ferret::Analysis::StandardAnalyzer.new)
|
48
|
+
per_field_analyzer[:language]=Ferret::Analysis::WhiteSpaceAnalyzer.new
|
49
|
+
Analyzer=per_field_analyzer
|
50
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# You probably shouldn't change those parameters
|
2
|
+
# if you don't know what they represent.
|
3
|
+
# For more information, refer to:
|
4
|
+
# http://ferret.davebalmain.com/api/classes/Ferret/Index/IndexWriter.html
|
5
|
+
|
6
|
+
## Main performance parameters
|
7
|
+
|
8
|
+
# Allowed memory for indexing process.
|
9
|
+
# 128MB by default, or 2^27
|
10
|
+
max_buffer_memory: 134_217_728
|
11
|
+
|
12
|
+
# High value => fast indexing, slow searching
|
13
|
+
# Low value => slow indexing, fast searching
|
14
|
+
# 10 by default
|
15
|
+
merge_factor: 10
|
16
|
+
|
17
|
+
# Maximum number of extracted terms for any given document
|
18
|
+
max_field_length: 10_000
|
19
|
+
|
20
|
+
|
21
|
+
## Other parameters
|
22
|
+
# 1MB by default, or 2**20
|
23
|
+
chunk_size: 1_048_576
|
24
|
+
max_buffered_docs: 10_000
|
25
|
+
# NOTE: Be extra careful with this parameter, setting it to -1 (infinite)
|
26
|
+
# multiplied indexing time by an order of magnitude.
|
27
|
+
# max_merge_docs: -1
|
28
|
+
use_compound_file: true
|
29
|
+
index_skip_interval: 128
|
30
|
+
doc_skip_interval: 16
|
@@ -16,10 +16,14 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
|
|
16
16
|
usage if args.empty? and !options[:spec_only]
|
17
17
|
@destination_root = options[:destination]
|
18
18
|
|
19
|
-
@directories_to_index=
|
20
|
-
|
21
|
-
|
22
|
-
|
19
|
+
@directories_to_index=if options[:spec_only] then
|
20
|
+
"/whatever : /whatever"
|
21
|
+
else
|
22
|
+
ARGV.collect{|relative_path|
|
23
|
+
abs_dir=Pathname.new(relative_path).realpath.to_s
|
24
|
+
"\"#{abs_dir}\" : \"#{abs_dir}\""
|
25
|
+
}.join("\n ")
|
26
|
+
end
|
23
27
|
|
24
28
|
extract_options
|
25
29
|
end
|
@@ -63,6 +67,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
|
|
63
67
|
m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
|
64
68
|
m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
|
65
69
|
m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
|
70
|
+
m.file '../config/indexing_performance.yml', 'config/custom/indexing_performance.yml'
|
66
71
|
|
67
72
|
# README, License & Rakefile
|
68
73
|
m.file 'MIT-LICENSE', 'LICENSE'
|
@@ -22,8 +22,9 @@ class DocumentsController < ApplicationController
|
|
22
22
|
def show
|
23
23
|
start=Time.now
|
24
24
|
@query=[params[:id],params.delete(:format)].compact.join('.')
|
25
|
+
@sort=params[:sort]
|
25
26
|
page=params[:page]||1
|
26
|
-
finder=Finder.new(@query,page)
|
27
|
+
finder=Finder.new(@query,@sort,page)
|
27
28
|
finder.execute!
|
28
29
|
pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
|
29
30
|
finder.matching_documents
|
@@ -47,6 +48,7 @@ class DocumentsController < ApplicationController
|
|
47
48
|
# Returns the content of the document identified by probably_unique_id, as it was at the time it was indexed.
|
48
49
|
# similar to Google cache.
|
49
50
|
def cached
|
51
|
+
@query=[params[:query],params.delete(:format)].compact.join('.')
|
50
52
|
end
|
51
53
|
|
52
54
|
private
|
@@ -3,15 +3,15 @@ module DocumentsHelper
|
|
3
3
|
def nothing_found?
|
4
4
|
@matching_documents.nil? or @matching_documents.entries.empty?
|
5
5
|
end
|
6
|
-
|
6
|
+
|
7
7
|
# Very basic pagination.
|
8
8
|
# Provides links to Next, Prev and FirstPage when needed.
|
9
|
-
def should_paginate(page,query)
|
10
|
-
[(link_to("←←", :action => :show, :id => query, :
|
11
|
-
(link_to("←", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
|
12
|
-
(link_to("→", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
|
9
|
+
def should_paginate(page,query, sort)
|
10
|
+
[(link_to("←←", :action => :show, :id => query, :sort=>sort) if page.number>2),
|
11
|
+
(link_to("←", :action => :show, :id => query, :page => page.prev.number, :sort=>sort) if page.prev?),
|
12
|
+
(link_to("→", :action => :show, :id => query, :page => page.next.number, :sort=>sort) if page.next?)].compact.join(" | ")
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
# Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
|
16
16
|
# "Résultats 1-2 parmi 2 pour whatever (0.012s)"
|
17
17
|
def describe_results(page, total_hits, dt, query)
|
@@ -24,7 +24,7 @@ module DocumentsHelper
|
|
24
24
|
show_time_needed(dt)
|
25
25
|
].join(' ')
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
# Returns the time needed to treat the query and launch the search, with a ms precision : (0.472s)
|
29
29
|
def show_time_needed(dt)
|
30
30
|
content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
|
@@ -71,8 +71,17 @@ module DocumentsHelper
|
|
71
71
|
end
|
72
72
|
|
73
73
|
# For any indexed document, returns a link to show its cached content.
|
74
|
-
def link_to_cached_content(document)
|
74
|
+
def link_to_cached_content(document, query)
|
75
75
|
link_name="("<<content_tag(:small,:cached.l)<<")"
|
76
|
-
link_to link_name, cached_document_path(document.probably_unique_id)
|
76
|
+
link_to link_name, cached_document_path(:id => document.probably_unique_id, :query => query)
|
77
|
+
end
|
78
|
+
|
79
|
+
def highlighted_cache(document, query)
|
80
|
+
h(document.highlighted_cache(query)).gsub(/\n/,'<br/>').gsub(/<<(.*?)>>/,content_tag(:span, '\1', :class=>"matching_content"))
|
81
|
+
end
|
82
|
+
|
83
|
+
def sort_by_date_or_relevance(query)
|
84
|
+
[link_to_unless_current('By date', document_path(query, :sort=>'by_date')),
|
85
|
+
link_to_unless_current('By relevance', document_path(query))].join(" ")
|
77
86
|
end
|
78
87
|
end
|
@@ -11,7 +11,7 @@ class Document
|
|
11
11
|
end
|
12
12
|
|
13
13
|
#Delegating properties to File::method_name(complete_path)
|
14
|
-
[:dirname, :basename, :extname, :ext_as_sym, :file?, :ext_as_sym].each{|method_name|
|
14
|
+
[:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
|
15
15
|
define_method(method_name){File.send(method_name,complete_path)}
|
16
16
|
}
|
17
17
|
alias_method :filename, :basename
|
@@ -63,11 +63,22 @@ class Document
|
|
63
63
|
def cached
|
64
64
|
from_index[:content]
|
65
65
|
end
|
66
|
+
|
67
|
+
def highlighted_cache(raw_query)
|
68
|
+
#TODO: Report to Ferret. Highlight should accept :key and not only :doc_id.
|
69
|
+
Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
|
70
|
+
:field => :content, :excerpt_length => :all,
|
71
|
+
:pre_tag => "<<", :post_tag => ">>"
|
72
|
+
).first
|
73
|
+
end
|
66
74
|
|
67
|
-
# FIXME: Not just date anymore.
|
68
75
|
# Returns the last modification date before the document got indexed.
|
69
76
|
# Useful to know how old a document is, and to which version the cache corresponds.
|
70
|
-
def
|
77
|
+
def pretty_date
|
78
|
+
from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})\d{6}/,'\1-\2-\3')
|
79
|
+
end
|
80
|
+
|
81
|
+
def pretty_mtime
|
71
82
|
from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
|
72
83
|
end
|
73
84
|
|
@@ -93,6 +104,12 @@ class Document
|
|
93
104
|
end
|
94
105
|
|
95
106
|
private
|
107
|
+
|
108
|
+
# FIXME: Is there a way to easily retrieve doc_id for a given document?
|
109
|
+
# Better yet, fix Index#highlight to accept :probably_unique_id and stop using :doc_id.
|
110
|
+
def doc_id
|
111
|
+
Indexer.index.search(Ferret::Search::TermQuery.new(:probably_unique_id,probably_unique_id)).hits.first.doc
|
112
|
+
end
|
96
113
|
|
97
114
|
# Retrieves the document from the index.
|
98
115
|
# Useful to get meta-info about it.
|
@@ -5,36 +5,34 @@ class Finder
|
|
5
5
|
@@index ||= Indexer.index
|
6
6
|
end
|
7
7
|
|
8
|
-
def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
|
8
|
+
def initialize(raw_query,by_date=false, page=1,results_per_page=Picolena::ResultsPerPage)
|
9
9
|
@query = Query.extract_from(raw_query)
|
10
10
|
@raw_query= raw_query
|
11
11
|
Indexer.ensure_index_existence
|
12
12
|
@per_page=results_per_page
|
13
13
|
@offset=(page.to_i-1)*results_per_page
|
14
|
+
@by_date=by_date
|
14
15
|
index_should_have_documents
|
15
16
|
end
|
16
17
|
|
17
18
|
def execute!
|
18
19
|
@matching_documents=[]
|
19
20
|
start=Time.now
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
#"File has been moved/deleted!"
|
33
|
-
end
|
21
|
+
@total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @by_date)){|index_id, score|
|
22
|
+
begin
|
23
|
+
found_doc=Document.new(index[index_id][:complete_path])
|
24
|
+
found_doc.matching_content=index.highlight(query, index_id,
|
25
|
+
:field => :content, :excerpt_length => 80,
|
26
|
+
:pre_tag => "<<", :post_tag => ">>"
|
27
|
+
)
|
28
|
+
found_doc.score=score
|
29
|
+
@matching_documents<<found_doc
|
30
|
+
rescue Errno::ENOENT
|
31
|
+
#"File has been moved/deleted!"
|
32
|
+
end
|
34
33
|
}
|
35
34
|
@executed=true
|
36
|
-
|
37
|
-
@total_hits=top_docs.total_hits
|
35
|
+
@time_needed=Time.now-start
|
38
36
|
end
|
39
37
|
|
40
38
|
# Returns true if it has been executed.
|
@@ -54,13 +52,15 @@ class Finder
|
|
54
52
|
}
|
55
53
|
}
|
56
54
|
|
57
|
-
|
58
|
-
|
59
55
|
def self.reload!
|
60
56
|
@@index = nil
|
61
57
|
end
|
62
58
|
|
63
59
|
private
|
60
|
+
|
61
|
+
def sort_by_date
|
62
|
+
Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
|
63
|
+
end
|
64
64
|
|
65
65
|
def index_should_have_documents
|
66
66
|
raise IndexError, "no document found" unless index.size > 0
|
@@ -10,6 +10,7 @@ class Indexer
|
|
10
10
|
def index_every_directory(remove_first=false)
|
11
11
|
@@do_not_disturb_while_indexing=true
|
12
12
|
clear! if remove_first
|
13
|
+
@from_scratch = remove_first
|
13
14
|
# Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
|
14
15
|
Finder.reload!
|
15
16
|
log :debug => "Indexing every directory"
|
@@ -35,13 +36,19 @@ class Indexer
|
|
35
36
|
prepare_multi_threads_environment
|
36
37
|
|
37
38
|
indexing_list_chunks.each_with_thread{|chunk|
|
38
|
-
chunk.each{|
|
39
|
-
|
39
|
+
chunk.each{|complete_path|
|
40
|
+
last_itime=index_time_dbm_file[complete_path]
|
41
|
+
if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
|
42
|
+
add_or_update_file(complete_path)
|
43
|
+
else
|
44
|
+
log :debug => "Identical : #{complete_path}"
|
45
|
+
end
|
46
|
+
index_time_dbm_file[complete_path] = Time.now._dump
|
40
47
|
}
|
41
48
|
}
|
42
49
|
end
|
43
50
|
|
44
|
-
def
|
51
|
+
def add_or_update_file(complete_path)
|
45
52
|
default_fields = Document.default_fields_for(complete_path)
|
46
53
|
begin
|
47
54
|
document = PlainTextExtractor.extract_content_and_language_from(complete_path)
|
@@ -69,6 +76,19 @@ class Indexer
|
|
69
76
|
# Ferret will SEGFAULT otherwise.
|
70
77
|
@@index = nil
|
71
78
|
end
|
79
|
+
|
80
|
+
|
81
|
+
# Checks for indexed files that are missing from filesystem
|
82
|
+
# and removes them from index & dbm file.
|
83
|
+
def prune_index
|
84
|
+
missing_files=index_time_dbm_file.reject{|filename,itime| File.exists?(filename) && Picolena::IndexedDirectories.any?{|dir,alias_path| filename.starts_with?(dir)}}
|
85
|
+
missing_files.each{|filename, itime|
|
86
|
+
index.writer.delete(:complete_path, filename)
|
87
|
+
index_time_dbm_file.delete(filename)
|
88
|
+
log :debug => "Removed : #{filename}"
|
89
|
+
}
|
90
|
+
index.optimize
|
91
|
+
end
|
72
92
|
|
73
93
|
# Only one IndexWriter should be instantiated.
|
74
94
|
# If one index already exists, returns it.
|
@@ -81,11 +101,17 @@ class Indexer
|
|
81
101
|
index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
|
82
102
|
end
|
83
103
|
|
84
|
-
|
85
|
-
|
104
|
+
# Returns how many files are indexed.
|
105
|
+
def size
|
106
|
+
index.size
|
86
107
|
end
|
87
108
|
|
88
109
|
private
|
110
|
+
|
111
|
+
# Copied from Ferret book, By David Balmain
|
112
|
+
def index_time_dbm_file
|
113
|
+
@@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
|
114
|
+
end
|
89
115
|
|
90
116
|
def index_exists?
|
91
117
|
index_filename and File.exists?(index_filename)
|
@@ -108,7 +134,7 @@ class Indexer
|
|
108
134
|
:field_infos => default_field_infos,
|
109
135
|
# Great way to ensure that no file is indexed twice!
|
110
136
|
:key => :probably_unique_id
|
111
|
-
}
|
137
|
+
}.merge Picolena::IndexingConfiguration
|
112
138
|
end
|
113
139
|
|
114
140
|
def default_field_infos
|
@@ -120,7 +146,7 @@ class Indexer
|
|
120
146
|
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
121
147
|
field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
|
122
148
|
field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
|
123
|
-
field_infos.add_field(:language, :store => :yes, :index => :
|
149
|
+
field_infos.add_field(:language, :store => :yes, :index => :untokenized)
|
124
150
|
end
|
125
151
|
end
|
126
152
|
|
@@ -130,7 +156,8 @@ class Indexer
|
|
130
156
|
# an IndexWriter at the same time, and get a
|
131
157
|
# Ferret::Store::Lock::LockError
|
132
158
|
index
|
133
|
-
#
|
159
|
+
# Opens dbm file to dump indexing time.
|
160
|
+
index_time_dbm_file
|
134
161
|
# ActiveSupport sometime raises
|
135
162
|
# Expected Object is NOT missing constant
|
136
163
|
# without.
|
@@ -140,4 +167,4 @@ class Indexer
|
|
140
167
|
PlainTextExtractor
|
141
168
|
end
|
142
169
|
end
|
143
|
-
end
|
170
|
+
end
|
@@ -2,7 +2,7 @@
|
|
2
2
|
=link_to icon_and_filename_for(@document), download_document_path(@probably_unique_id)
|
3
3
|
(
|
4
4
|
=:as_it_was_indexed_on.l
|
5
|
-
=@document.
|
5
|
+
=@document.pretty_date
|
6
6
|
)
|
7
7
|
%p=link_to_containing_directory(@document)
|
8
|
-
%blockquote=
|
8
|
+
%blockquote=highlighted_cache(@document, @query)
|
@@ -7,6 +7,9 @@
|
|
7
7
|
%strong=h(@query)
|
8
8
|
=show_time_needed(@time_needed)
|
9
9
|
-else
|
10
|
-
%span{:class=>'pagination'}=should_paginate(@matching_documents, @query)
|
10
|
+
%span{:class=>'pagination'}=should_paginate(@matching_documents, @query, @sort)
|
11
11
|
=describe_results(@matching_documents, @total_hits, @time_needed, h(@query))
|
12
|
-
|
12
|
+
-unless nothing_found?
|
13
|
+
%p
|
14
|
+
%span{:class=>'sort_by'}=sort_by_date_or_relevance(@query)
|
15
|
+
= render :partial =>'document', :collection => @matching_documents, :locals => { :query => @query}
|
@@ -1,4 +1,4 @@
|
|
1
|
-
%w(rubygems paginator fileutils pathname logger thread).each{|lib| require lib}
|
1
|
+
%w(rubygems paginator fileutils pathname logger thread dbm).each{|lib| require lib}
|
2
2
|
|
3
3
|
# Uncomment below to force Rails into production mode when
|
4
4
|
# you don't control web/app server and can't set it the proper way
|
@@ -14,10 +14,15 @@ namespace :index do
|
|
14
14
|
task :update => :environment do
|
15
15
|
Indexer.index_every_directory
|
16
16
|
end
|
17
|
+
|
18
|
+
desc 'Remove unneeded files from index'
|
19
|
+
task :prune => :environment do
|
20
|
+
Indexer.prune_index
|
21
|
+
end
|
17
22
|
|
18
23
|
desc 'Returns the number of indexed documents'
|
19
24
|
task :size => :environment do
|
20
|
-
puts "#{Indexer.
|
25
|
+
puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
|
21
26
|
end
|
22
27
|
|
23
28
|
# Search index with query "some query" :
|
@@ -30,7 +30,7 @@ namespace :install_dependencies do
|
|
30
30
|
task :deb_packages do
|
31
31
|
root_privileges_required!
|
32
32
|
#TODO: Should load this list from defined PlainTextExtractor's
|
33
|
-
packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser}.join(" ")
|
33
|
+
packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser libdbm-ruby1.8}.join(" ")
|
34
34
|
puts "Installing "<<packages
|
35
35
|
system("apt-get install "<<packages)
|
36
36
|
end
|
@@ -82,6 +82,17 @@ h1, h2, h3, h4, h5, h6, p, form {
|
|
82
82
|
text-decoration:none;
|
83
83
|
}
|
84
84
|
|
85
|
+
.sort_by {
|
86
|
+
float:right;
|
87
|
+
font-size: 13px;
|
88
|
+
color:#000;
|
89
|
+
}
|
90
|
+
|
91
|
+
.sort_by a{
|
92
|
+
color: #EE8907;
|
93
|
+
text-decoration:none;
|
94
|
+
}
|
95
|
+
|
85
96
|
#mainimg input.btn{
|
86
97
|
margin-right: 10px;
|
87
98
|
height: 20px;
|
@@ -116,7 +127,7 @@ width: 80%;
|
|
116
127
|
|
117
128
|
#results {
|
118
129
|
width:778px;
|
119
|
-
padding-top:
|
130
|
+
padding-top: 15px;
|
120
131
|
}
|
121
132
|
|
122
133
|
#results h2 a{
|
@@ -137,6 +148,11 @@ width: 80%;
|
|
137
148
|
padding:0px 20px;
|
138
149
|
}
|
139
150
|
|
151
|
+
#results .matching_content{
|
152
|
+
background-color:#ffff66;
|
153
|
+
}
|
154
|
+
|
155
|
+
|
140
156
|
#results a, #results small{
|
141
157
|
font-family:"Trebuchet MS";
|
142
158
|
font-size:11px;
|
@@ -50,16 +50,16 @@ describe "Basic Finder" do
|
|
50
50
|
Indexer.index_every_directory(remove_first=true)
|
51
51
|
end
|
52
52
|
|
53
|
-
it "should accept one parameter as query, and 2 optionals for paginating" do
|
53
|
+
it "should accept one parameter as query, 1 optional for sorting results and 2 optionals for paginating" do
|
54
54
|
lambda {Finder.new}.should raise_error(ArgumentError, "wrong number of arguments (0 for 1)")
|
55
55
|
# show first page with 10 results per page
|
56
56
|
lambda {Finder.new("a b")}.should_not raise_error
|
57
57
|
# show second page
|
58
|
-
lambda {Finder.new("a",
|
58
|
+
lambda {Finder.new("a", "by_date")}.should_not raise_error
|
59
59
|
# show first page with 15 results
|
60
|
-
lambda {Finder.new("a", 1, 15)}.should_not raise_error
|
60
|
+
lambda {Finder.new("a", "by_date", 1, 15)}.should_not raise_error
|
61
61
|
# Too many parameters
|
62
|
-
lambda {Finder.new("a", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (
|
62
|
+
lambda {Finder.new("a", "by_date", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (5 for 4)")
|
63
63
|
end
|
64
64
|
|
65
65
|
it "should return matching documents if executed successfully" do
|
@@ -5,28 +5,30 @@ basic_pdf_attribute={
|
|
5
5
|
:basename=>'basic',
|
6
6
|
:complete_path=>File.join(RAILS_ROOT, '/spec/test_dirs/indexed/basic/basic.pdf'),
|
7
7
|
:extname=>'.pdf',
|
8
|
-
:
|
8
|
+
:ext_as_sym => :pdf,
|
9
|
+
:filename=>'basic.pdf',
|
10
|
+
:size => 9380
|
9
11
|
}
|
10
12
|
|
11
13
|
describe Document do
|
12
14
|
before(:each) do
|
13
|
-
@
|
15
|
+
@valid_document=Document.new("spec/test_dirs/indexed/basic/basic.pdf")
|
14
16
|
end
|
15
17
|
|
16
18
|
it "should be an existing file" do
|
17
19
|
lambda {Document.new("/patapouf.txt")}.should raise_error(Errno::ENOENT)
|
18
|
-
lambda {@
|
20
|
+
lambda {@valid_document}.should_not raise_error
|
19
21
|
lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should_not raise_error(Errno::ENOENT)
|
20
22
|
end
|
21
23
|
|
22
24
|
it "should belong to an indexed directory" do
|
23
|
-
lambda {@
|
25
|
+
lambda {@valid_document}.should_not raise_error
|
24
26
|
lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should raise_error(ArgumentError, "required document is not in indexed directory")
|
25
27
|
end
|
26
28
|
|
27
29
|
basic_pdf_attribute.each{|attribute,expected_value|
|
28
30
|
it "should know its #{attribute}" do
|
29
|
-
@
|
31
|
+
@valid_document.should respond_to(attribute)
|
30
32
|
@basic_pdf=Document.new('spec/test_dirs/indexed/basic/basic.pdf')
|
31
33
|
@basic_pdf.send(attribute).should == expected_value
|
32
34
|
end
|
@@ -36,23 +38,70 @@ describe Document do
|
|
36
38
|
another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
|
37
39
|
another_doc.content.should == "just a content test\nin a txt file"
|
38
40
|
end
|
41
|
+
|
42
|
+
it "should know its cached content" do
|
43
|
+
another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
|
44
|
+
another_doc.cached.should == "just a content test\nin a txt file"
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should know its highlighted cached content for a given query" do
|
48
|
+
another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
|
49
|
+
another_doc.highlighted_cache('a content test').should == "just a <<content>> <<test>>\nin a txt file"
|
50
|
+
end
|
39
51
|
|
40
52
|
it "should know its alias_path" do
|
41
|
-
@
|
42
|
-
@
|
53
|
+
@valid_document.should respond_to(:alias_path)
|
54
|
+
@valid_document.alias_path.starts_with?("http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed").should be_true
|
55
|
+
end
|
56
|
+
|
57
|
+
it "should know its probably_unique_id" do
|
58
|
+
@valid_document.should respond_to(:probably_unique_id)
|
59
|
+
@valid_document.probably_unique_id.should =~/^[a-z]+$/
|
60
|
+
@valid_document.probably_unique_id.size.should == Picolena::HashLength
|
43
61
|
end
|
62
|
+
|
63
|
+
it "should know its modification date" do
|
64
|
+
@valid_document.pretty_date.class.should == String
|
65
|
+
@valid_document.pretty_date.should =~/^\d{4}\-\d{2}\-\d{2}$/
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should know its modification time and returns it in a pretty way" do
|
69
|
+
@valid_document.should respond_to(:mtime)
|
70
|
+
@valid_document.mtime.should be_kind_of(Integer)
|
71
|
+
@valid_document.should respond_to(:pretty_mtime)
|
72
|
+
@valid_document.pretty_mtime.class.should == String
|
73
|
+
@valid_document.pretty_mtime.should =~/^\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}$/
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should know if its content can be extracted" do
|
77
|
+
@valid_document.should respond_to(:supported?)
|
78
|
+
@valid_document.should be_supported
|
79
|
+
Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should know its language when enough content is available" do
|
83
|
+
Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
|
84
|
+
Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
|
85
|
+
Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
|
86
|
+
Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
|
87
|
+
end if Picolena::UseLanguageRecognition
|
88
|
+
|
89
|
+
it "should not try to guess language when file is too small" do
|
90
|
+
Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
|
91
|
+
Document.new("spec/test_dirs/indexed/README").language.should be_nil
|
92
|
+
end if Picolena::UseLanguageRecognition
|
44
93
|
|
45
94
|
it "should let finder specify its score" do
|
46
|
-
@
|
47
|
-
@
|
48
|
-
@
|
49
|
-
@
|
95
|
+
@valid_document.should respond_to(:score)
|
96
|
+
@valid_document.score.should be_nil
|
97
|
+
@valid_document.score=25
|
98
|
+
@valid_document.score.should == 25
|
50
99
|
end
|
51
100
|
|
52
101
|
it "should let finder specify its matching content" do
|
53
|
-
@
|
54
|
-
@
|
55
|
-
@
|
56
|
-
@
|
102
|
+
@valid_document.should respond_to(:matching_content)
|
103
|
+
@valid_document.matching_content.should be_nil
|
104
|
+
@valid_document.matching_content=["thermal cooling", "heat driven cooling"]
|
105
|
+
@valid_document.matching_content.should include("thermal cooling")
|
57
106
|
end
|
58
|
-
end
|
107
|
+
end
|
@@ -8,9 +8,9 @@ end
|
|
8
8
|
|
9
9
|
|
10
10
|
def matching_document_for(query)
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
# Returns matching document for any given query only if
|
12
|
+
# exactly one document is found.
|
13
|
+
# Specs don't pass otherwise.
|
14
14
|
matching_documents=Finder.new(query).matching_documents
|
15
15
|
matching_documents.size.should == 1
|
16
16
|
matching_documents.first
|
@@ -19,6 +19,7 @@ end
|
|
19
19
|
|
20
20
|
describe Finder do
|
21
21
|
before(:all) do
|
22
|
+
Globalite.language = :en
|
22
23
|
# SVN doesn't like non-ascii filenames.
|
23
24
|
revert_changes!('spec/test_dirs/indexed/others/bäñüßé.txt',"just to know if files are indexed with utf8 filenames")
|
24
25
|
|
@@ -13,10 +13,18 @@ describe "Host indexing system" do
|
|
13
13
|
|
14
14
|
it "should know which IP addresses are allowed (config/custom/white_list_ip.yml)" do
|
15
15
|
File.should be_readable('config/custom/white_list_ip.yml')
|
16
|
+
ip_conf=YAML.load_file('config/custom/white_list_ip.yml')
|
17
|
+
ip_conf.class.should == Hash
|
18
|
+
ip_conf['Allow'].should_not be_nil
|
16
19
|
end
|
17
20
|
|
18
21
|
it "should know which directories are to be indexed (config/custom/indexed_directories.yml)" do
|
19
22
|
File.should be_readable('config/custom/indexed_directories.yml')
|
23
|
+
dirs_conf=YAML.load_file('config/custom/indexed_directories.yml')
|
24
|
+
dirs_conf.class.should == Hash
|
25
|
+
%w(development test production).all?{|env|
|
26
|
+
dirs_conf[env].should_not be_nil
|
27
|
+
}
|
20
28
|
end
|
21
29
|
|
22
30
|
it "should be able to calculate base26 hash from strings" do
|
@@ -27,16 +27,4 @@ describe "PlainTextExtractors" do
|
|
27
27
|
end
|
28
28
|
}
|
29
29
|
}
|
30
|
-
|
31
|
-
it "should guess language when enough content is available" do
|
32
|
-
Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
|
33
|
-
Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
|
34
|
-
Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
|
35
|
-
Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
|
36
|
-
end if Picolena::UseLanguageRecognition
|
37
|
-
|
38
|
-
it "should not try to guess language when file is too small" do
|
39
|
-
Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
|
40
|
-
Document.new("spec/test_dirs/indexed/README").language.should be_nil
|
41
|
-
end if Picolena::UseLanguageRecognition
|
42
30
|
end
|
@@ -1,8 +1,16 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../spec_helper'
|
2
2
|
|
3
3
|
describe Query do
|
4
|
-
it "should return a BooleanQuery" do
|
4
|
+
it "should return a BooleanQuery, a TermQuery or a RangeQuery" do
|
5
5
|
Query.extract_from("whatever").class.should == Ferret::Search::BooleanQuery
|
6
|
+
Query.extract_from("lang:de").class.should == Ferret::Search::TermQuery
|
7
|
+
Query.extract_from("date:<1990").class.should == Ferret::Search::RangeQuery
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should not remove stop-words from TermQuery" do
|
11
|
+
# it means "Italian language", but also is a stop-word.
|
12
|
+
Query.extract_from("lang:it").class.should == Ferret::Search::TermQuery
|
13
|
+
Query.extract_from("lang:it").to_s.should == "language:it"
|
6
14
|
end
|
7
15
|
|
8
16
|
it "should translate LIKE, NOT, OR and AND boolean ops to English" do
|
@@ -12,6 +20,7 @@ describe Query do
|
|
12
20
|
:fr=>["COMME","NON","OU","ET"]
|
13
21
|
}
|
14
22
|
|
23
|
+
Globalite.language = :en
|
15
24
|
english_query_with_like_and_not=Query.extract_from("LIKE something NOT something")
|
16
25
|
english_query_with_or=Query.extract_from("test OR another")
|
17
26
|
english_query_with_and=Query.extract_from("test AND another")
|
data/lib/picolena/version.rb
CHANGED
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.7</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
data/website/index.txt
CHANGED
File without changes
|
data/website/index_devjavu
CHANGED
File without changes
|
File without changes
|
File without changes
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-04-
|
33
|
+
date: 2008-04-30 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -146,6 +146,7 @@ files:
|
|
146
146
|
- lib/picolena/config/basic.rb
|
147
147
|
- lib/picolena/config/icons_and_filetypes.yml
|
148
148
|
- lib/picolena/config/indexed_directories.yml
|
149
|
+
- lib/picolena/config/indexing_performance.yml
|
149
150
|
- lib/picolena/config/title_and_names_and_links.yml
|
150
151
|
- lib/picolena/config/white_list_ip.yml
|
151
152
|
- lib/picolena/picolena_generator.rb
|
@@ -177,6 +178,7 @@ files:
|
|
177
178
|
- lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
|
178
179
|
- lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
|
179
180
|
- lib/picolena/templates/config/initializers/006_load_icons.rb
|
181
|
+
- lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
|
180
182
|
- lib/picolena/templates/config/routes.rb
|
181
183
|
- lib/picolena/templates/lang/ui/de.yml
|
182
184
|
- lib/picolena/templates/lang/ui/en.yml
|
metadata.gz.sig
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
|
1
|
+
�;����U�=nƷ�8߿X�`>����B����2Ħ@,u!��~�u�9>�Ӽq�J1� ֖i�T������-.q�^l*�`�>"��m�8��ɏP�cWk��y%����W�:r=&����CtaO;c
|
2
|
+
.&��}�e)�g(O�)0ة)!����s�
|
3
|
+
�"��Fm��>8���n���q�?I�P'����`|����`�\�>{\a4�Ӷ�JǮ}�&�?�d�UM{
|