picolena 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +12 -3
- data/Manifest.txt +2 -0
- data/bin/picolena +1 -1
- data/config/files_to_clean +1 -0
- data/lib/picolena/config/basic.rb +6 -2
- data/lib/picolena/config/indexing_performance.yml +30 -0
- data/lib/picolena/picolena_generator.rb +9 -4
- data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +18 -9
- data/lib/picolena/templates/app/models/document.rb +20 -3
- data/lib/picolena/templates/app/models/finder.rb +19 -19
- data/lib/picolena/templates/app/models/indexer.rb +36 -9
- data/lib/picolena/templates/app/views/documents/_document.html.haml +7 -2
- data/lib/picolena/templates/app/views/documents/cached.html.haml +2 -2
- data/lib/picolena/templates/app/views/documents/show.html.haml +5 -2
- data/lib/picolena/templates/config/environment.rb +1 -1
- data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +6 -0
- data/lib/picolena/templates/lib/tasks/index.rake +6 -1
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/public/stylesheets/style.css +17 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -4
- data/lib/picolena/templates/spec/models/document_spec.rb +65 -16
- data/lib/picolena/templates/spec/models/finder_spec.rb +4 -3
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +8 -0
- data/lib/picolena/templates/spec/models/indexer_spec.rb +2 -2
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +0 -12
- data/lib/picolena/templates/spec/models/query_spec.rb +10 -1
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- data/website/index.txt +0 -0
- data/website/index_devjavu +0 -0
- data/website/javascripts/rounded_corners_lite.inc.js +0 -0
- data/website/stylesheets/screen.css +0 -0
- data.tar.gz.sig +0 -0
- metadata +4 -2
- metadata.gz.sig +3 -1
data/History.txt
CHANGED
@@ -1,10 +1,19 @@
|
|
1
|
+
== 0.1.7 2008-04-30
|
2
|
+
|
3
|
+
* 5 minor enhancements:
|
4
|
+
* added cache highlighting à la Google
|
5
|
+
* rake index:update implemented as described in Ferret book by David Balmain
|
6
|
+
* rake index:prune removes missing files from indexer
|
7
|
+
* possibility to sort results by relevance / by date
|
8
|
+
* one configuration file for performance tweaks
|
9
|
+
|
1
10
|
== 0.1.6 2008-04-25
|
2
11
|
|
3
12
|
* 1 minor enhancement:
|
4
13
|
* replaced index key by Document#probably_unique_id
|
5
14
|
|
6
15
|
* bug fixes:
|
7
|
-
* Added forgotten public/images/flags to generator file
|
16
|
+
* Added forgotten public/images/flags to generator file
|
8
17
|
|
9
18
|
== 0.1.5 2008-04-25
|
10
19
|
|
@@ -24,7 +33,7 @@
|
|
24
33
|
== 0.1.3 2008-04-20
|
25
34
|
|
26
35
|
* 1 bug fix:
|
27
|
-
* removed verbose debug info
|
36
|
+
* removed verbose debug info
|
28
37
|
|
29
38
|
== 0.1.2 2008-04-20
|
30
39
|
|
@@ -49,7 +58,7 @@
|
|
49
58
|
* 3 minor enhancements:
|
50
59
|
* can now be installed on win32 (doesn't pass every spec though)
|
51
60
|
* moved rails_plugins away from lib/ so that they don't get parsed by rdoc/ri
|
52
|
-
* shorter and prettier base26_hash id for documents
|
61
|
+
* shorter and prettier base26_hash id for documents
|
53
62
|
|
54
63
|
== 0.0.99 2008-04-06
|
55
64
|
|
data/Manifest.txt
CHANGED
@@ -11,6 +11,7 @@ lib/picolena/USAGE
|
|
11
11
|
lib/picolena/config/basic.rb
|
12
12
|
lib/picolena/config/icons_and_filetypes.yml
|
13
13
|
lib/picolena/config/indexed_directories.yml
|
14
|
+
lib/picolena/config/indexing_performance.yml
|
14
15
|
lib/picolena/config/title_and_names_and_links.yml
|
15
16
|
lib/picolena/config/white_list_ip.yml
|
16
17
|
lib/picolena/picolena_generator.rb
|
@@ -42,6 +43,7 @@ lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb
|
|
42
43
|
lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
|
43
44
|
lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
|
44
45
|
lib/picolena/templates/config/initializers/006_load_icons.rb
|
46
|
+
lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
|
45
47
|
lib/picolena/templates/config/routes.rb
|
46
48
|
lib/picolena/templates/lang/ui/de.yml
|
47
49
|
lib/picolena/templates/lang/ui/en.yml
|
data/bin/picolena
CHANGED
@@ -11,7 +11,7 @@ if %w(-v --version).include? ARGV.first
|
|
11
11
|
exit(0)
|
12
12
|
end
|
13
13
|
|
14
|
-
action= ARGV.
|
14
|
+
action= ARGV.any?{|opt| opt[0,6]=="--spec"} ? "testing" : "installing"
|
15
15
|
|
16
16
|
require 'rubigen/scripts/generate'
|
17
17
|
source = RubiGen::PathSource.new(:application,
|
data/config/files_to_clean
CHANGED
@@ -5,6 +5,7 @@ lib/picolena/templates/config/custom/indexed_directories.yml
|
|
5
5
|
lib/picolena/templates/config/custom/white_list_ip.yml
|
6
6
|
lib/picolena/templates/config/custom/title_and_names_and_links.yml
|
7
7
|
lib/picolena/templates/config/custom/icons_and_filetypes.yml
|
8
|
+
lib/picolena/templates/config/custom/indexing_performance.yml
|
8
9
|
lib/picolena/templates/log
|
9
10
|
lib/picolena/templates/spec/test_dirs/indexed/others/bäñüßé.txt
|
10
11
|
lib/picolena/templates/tmp
|
@@ -42,5 +42,9 @@ module Picolena
|
|
42
42
|
# Specify the default Levenshtein distance when using FuzzyQuery
|
43
43
|
# see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.
|
44
44
|
Ferret::Search::FuzzyQuery.default_min_similarity=0.6
|
45
|
-
|
46
|
-
|
45
|
+
|
46
|
+
# PerFieldAnalyzer is used to prevent queries like "language:it" from being broken by StopFilter.
|
47
|
+
per_field_analyzer=Ferret::Analysis::PerFieldAnalyzer.new(Ferret::Analysis::StandardAnalyzer.new)
|
48
|
+
per_field_analyzer[:language]=Ferret::Analysis::WhiteSpaceAnalyzer.new
|
49
|
+
Analyzer=per_field_analyzer
|
50
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# You probably shouldn't change those parameters
|
2
|
+
# if you don't know what they represent.
|
3
|
+
# For more information, refer to:
|
4
|
+
# http://ferret.davebalmain.com/api/classes/Ferret/Index/IndexWriter.html
|
5
|
+
|
6
|
+
## Main performance parameters
|
7
|
+
|
8
|
+
# Allowed memory for indexing process.
|
9
|
+
# 128MB by default, or 2^27
|
10
|
+
max_buffer_memory: 134_217_728
|
11
|
+
|
12
|
+
# High value => fast indexing, slow searching
|
13
|
+
# Low value => slow indexing, fast searching
|
14
|
+
# 10 by default
|
15
|
+
merge_factor: 10
|
16
|
+
|
17
|
+
# Maximum number of extracted terms for any given document
|
18
|
+
max_field_length: 10_000
|
19
|
+
|
20
|
+
|
21
|
+
## Other parameters
|
22
|
+
# 1MB by default, or 2**20
|
23
|
+
chunk_size: 1_048_576
|
24
|
+
max_buffered_docs: 10_000
|
25
|
+
# NOTE: Be extra careful with this parameter: setting it to -1 (infinite)
|
26
|
+
# multiplied indexing time by an order of magnitude.
|
27
|
+
# max_merge_docs: -1
|
28
|
+
use_compound_file: true
|
29
|
+
index_skip_interval: 128
|
30
|
+
doc_skip_interval: 16
|
@@ -16,10 +16,14 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
|
|
16
16
|
usage if args.empty? and !options[:spec_only]
|
17
17
|
@destination_root = options[:destination]
|
18
18
|
|
19
|
-
@directories_to_index=
|
20
|
-
|
21
|
-
|
22
|
-
|
19
|
+
@directories_to_index=if options[:spec_only] then
|
20
|
+
"/whatever : /whatever"
|
21
|
+
else
|
22
|
+
ARGV.collect{|relative_path|
|
23
|
+
abs_dir=Pathname.new(relative_path).realpath.to_s
|
24
|
+
"\"#{abs_dir}\" : \"#{abs_dir}\""
|
25
|
+
}.join("\n ")
|
26
|
+
end
|
23
27
|
|
24
28
|
extract_options
|
25
29
|
end
|
@@ -63,6 +67,7 @@ class PicolenaGenerator < RubiGen::Base #:nodoc:
|
|
63
67
|
m.template '../config/indexed_directories.yml', 'config/custom/indexed_directories.yml', :assigns => {:directories_to_index => @directories_to_index}
|
64
68
|
m.template '../config/title_and_names_and_links.yml', 'config/custom/title_and_names_and_links.yml', :assigns => {:version => Picolena::VERSION::STRING}
|
65
69
|
m.file '../config/icons_and_filetypes.yml', 'config/custom/icons_and_filetypes.yml'
|
70
|
+
m.file '../config/indexing_performance.yml', 'config/custom/indexing_performance.yml'
|
66
71
|
|
67
72
|
# README, License & Rakefile
|
68
73
|
m.file 'MIT-LICENSE', 'LICENSE'
|
@@ -22,8 +22,9 @@ class DocumentsController < ApplicationController
|
|
22
22
|
def show
|
23
23
|
start=Time.now
|
24
24
|
@query=[params[:id],params.delete(:format)].compact.join('.')
|
25
|
+
@sort=params[:sort]
|
25
26
|
page=params[:page]||1
|
26
|
-
finder=Finder.new(@query,page)
|
27
|
+
finder=Finder.new(@query,@sort,page)
|
27
28
|
finder.execute!
|
28
29
|
pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
|
29
30
|
finder.matching_documents
|
@@ -47,6 +48,7 @@ class DocumentsController < ApplicationController
|
|
47
48
|
# Returns the content of the document identified by probably_unique_id, as it was at the time it was indexed.
|
48
49
|
# similar to Google cache.
|
49
50
|
def cached
|
51
|
+
@query=[params[:query],params.delete(:format)].compact.join('.')
|
50
52
|
end
|
51
53
|
|
52
54
|
private
|
@@ -3,15 +3,15 @@ module DocumentsHelper
|
|
3
3
|
def nothing_found?
|
4
4
|
@matching_documents.nil? or @matching_documents.entries.empty?
|
5
5
|
end
|
6
|
-
|
6
|
+
|
7
7
|
# Very basic pagination.
|
8
8
|
# Provides links to Next, Prev and FirstPage when needed.
|
9
|
-
def should_paginate(page,query)
|
10
|
-
[(link_to("←←", :action => :show, :id => query, :
|
11
|
-
(link_to("←", :action => :show, :id => query, :page => page.prev.number) if page.prev?),
|
12
|
-
(link_to("→", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ")
|
9
|
+
def should_paginate(page,query, sort)
|
10
|
+
[(link_to("←←", :action => :show, :id => query, :sort=>sort) if page.number>2),
|
11
|
+
(link_to("←", :action => :show, :id => query, :page => page.prev.number, :sort=>sort) if page.prev?),
|
12
|
+
(link_to("→", :action => :show, :id => query, :page => page.next.number, :sort=>sort) if page.next?)].compact.join(" | ")
|
13
13
|
end
|
14
|
-
|
14
|
+
|
15
15
|
# Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
|
16
16
|
# "Résultats 1-2 parmi 2 pour whatever (0.012s)"
|
17
17
|
def describe_results(page, total_hits, dt, query)
|
@@ -24,7 +24,7 @@ module DocumentsHelper
|
|
24
24
|
show_time_needed(dt)
|
25
25
|
].join(' ')
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
# Returns the time needed to process the query and launch the search, with millisecond precision, e.g. (0.472s)
|
29
29
|
def show_time_needed(dt)
|
30
30
|
content_tag(:small,'('<<number_with_precision(dt,3)<<'s)')
|
@@ -71,8 +71,17 @@ module DocumentsHelper
|
|
71
71
|
end
|
72
72
|
|
73
73
|
# For any indexed document, returns a link to show its cached content.
|
74
|
-
def link_to_cached_content(document)
|
74
|
+
def link_to_cached_content(document, query)
|
75
75
|
link_name="("<<content_tag(:small,:cached.l)<<")"
|
76
|
-
link_to link_name, cached_document_path(document.probably_unique_id)
|
76
|
+
link_to link_name, cached_document_path(:id => document.probably_unique_id, :query => query)
|
77
|
+
end
|
78
|
+
|
79
|
+
def highlighted_cache(document, query)
|
80
|
+
h(document.highlighted_cache(query)).gsub(/\n/,'<br/>').gsub(/<<(.*?)>>/,content_tag(:span, '\1', :class=>"matching_content"))
|
81
|
+
end
|
82
|
+
|
83
|
+
def sort_by_date_or_relevance(query)
|
84
|
+
[link_to_unless_current('By date', document_path(query, :sort=>'by_date')),
|
85
|
+
link_to_unless_current('By relevance', document_path(query))].join(" ")
|
77
86
|
end
|
78
87
|
end
|
@@ -11,7 +11,7 @@ class Document
|
|
11
11
|
end
|
12
12
|
|
13
13
|
#Delegating properties to File::method_name(complete_path)
|
14
|
-
[:dirname, :basename, :extname, :ext_as_sym, :file?, :ext_as_sym].each{|method_name|
|
14
|
+
[:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
|
15
15
|
define_method(method_name){File.send(method_name,complete_path)}
|
16
16
|
}
|
17
17
|
alias_method :filename, :basename
|
@@ -63,11 +63,22 @@ class Document
|
|
63
63
|
def cached
|
64
64
|
from_index[:content]
|
65
65
|
end
|
66
|
+
|
67
|
+
def highlighted_cache(raw_query)
|
68
|
+
#TODO: Report to Ferret. Highlight should accept :key and not only :doc_id.
|
69
|
+
Indexer.index.highlight(Query.extract_from(raw_query), doc_id,
|
70
|
+
:field => :content, :excerpt_length => :all,
|
71
|
+
:pre_tag => "<<", :post_tag => ">>"
|
72
|
+
).first
|
73
|
+
end
|
66
74
|
|
67
|
-
# FIXME: Not just date anymore.
|
68
75
|
# Returns the last modification date before the document got indexed.
|
69
76
|
# Useful to know how old a document is, and to which version the cache corresponds.
|
70
|
-
def
|
77
|
+
def pretty_date
|
78
|
+
from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})\d{6}/,'\1-\2-\3')
|
79
|
+
end
|
80
|
+
|
81
|
+
def pretty_mtime
|
71
82
|
from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
|
72
83
|
end
|
73
84
|
|
@@ -93,6 +104,12 @@ class Document
|
|
93
104
|
end
|
94
105
|
|
95
106
|
private
|
107
|
+
|
108
|
+
# FIXME: Is there a way to easily retrieve doc_id for a given document?
|
109
|
+
# Better yet, fix Index#highlight to accept :probably_unique_id and stop using :doc_id.
|
110
|
+
def doc_id
|
111
|
+
Indexer.index.search(Ferret::Search::TermQuery.new(:probably_unique_id,probably_unique_id)).hits.first.doc
|
112
|
+
end
|
96
113
|
|
97
114
|
# Retrieves the document from the index.
|
98
115
|
# Useful to get meta-info about it.
|
@@ -5,36 +5,34 @@ class Finder
|
|
5
5
|
@@index ||= Indexer.index
|
6
6
|
end
|
7
7
|
|
8
|
-
def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
|
8
|
+
def initialize(raw_query,by_date=false, page=1,results_per_page=Picolena::ResultsPerPage)
|
9
9
|
@query = Query.extract_from(raw_query)
|
10
10
|
@raw_query= raw_query
|
11
11
|
Indexer.ensure_index_existence
|
12
12
|
@per_page=results_per_page
|
13
13
|
@offset=(page.to_i-1)*results_per_page
|
14
|
+
@by_date=by_date
|
14
15
|
index_should_have_documents
|
15
16
|
end
|
16
17
|
|
17
18
|
def execute!
|
18
19
|
@matching_documents=[]
|
19
20
|
start=Time.now
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
#"File has been moved/deleted!"
|
33
|
-
end
|
21
|
+
@total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @by_date)){|index_id, score|
|
22
|
+
begin
|
23
|
+
found_doc=Document.new(index[index_id][:complete_path])
|
24
|
+
found_doc.matching_content=index.highlight(query, index_id,
|
25
|
+
:field => :content, :excerpt_length => 80,
|
26
|
+
:pre_tag => "<<", :post_tag => ">>"
|
27
|
+
)
|
28
|
+
found_doc.score=score
|
29
|
+
@matching_documents<<found_doc
|
30
|
+
rescue Errno::ENOENT
|
31
|
+
#"File has been moved/deleted!"
|
32
|
+
end
|
34
33
|
}
|
35
34
|
@executed=true
|
36
|
-
|
37
|
-
@total_hits=top_docs.total_hits
|
35
|
+
@time_needed=Time.now-start
|
38
36
|
end
|
39
37
|
|
40
38
|
# Returns true if it has been executed.
|
@@ -54,13 +52,15 @@ class Finder
|
|
54
52
|
}
|
55
53
|
}
|
56
54
|
|
57
|
-
|
58
|
-
|
59
55
|
def self.reload!
|
60
56
|
@@index = nil
|
61
57
|
end
|
62
58
|
|
63
59
|
private
|
60
|
+
|
61
|
+
def sort_by_date
|
62
|
+
Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
|
63
|
+
end
|
64
64
|
|
65
65
|
def index_should_have_documents
|
66
66
|
raise IndexError, "no document found" unless index.size > 0
|
@@ -10,6 +10,7 @@ class Indexer
|
|
10
10
|
def index_every_directory(remove_first=false)
|
11
11
|
@@do_not_disturb_while_indexing=true
|
12
12
|
clear! if remove_first
|
13
|
+
@from_scratch = remove_first
|
13
14
|
# Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
|
14
15
|
Finder.reload!
|
15
16
|
log :debug => "Indexing every directory"
|
@@ -35,13 +36,19 @@ class Indexer
|
|
35
36
|
prepare_multi_threads_environment
|
36
37
|
|
37
38
|
indexing_list_chunks.each_with_thread{|chunk|
|
38
|
-
chunk.each{|
|
39
|
-
|
39
|
+
chunk.each{|complete_path|
|
40
|
+
last_itime=index_time_dbm_file[complete_path]
|
41
|
+
if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
|
42
|
+
add_or_update_file(complete_path)
|
43
|
+
else
|
44
|
+
log :debug => "Identical : #{complete_path}"
|
45
|
+
end
|
46
|
+
index_time_dbm_file[complete_path] = Time.now._dump
|
40
47
|
}
|
41
48
|
}
|
42
49
|
end
|
43
50
|
|
44
|
-
def
|
51
|
+
def add_or_update_file(complete_path)
|
45
52
|
default_fields = Document.default_fields_for(complete_path)
|
46
53
|
begin
|
47
54
|
document = PlainTextExtractor.extract_content_and_language_from(complete_path)
|
@@ -69,6 +76,19 @@ class Indexer
|
|
69
76
|
# Ferret will SEGFAULT otherwise.
|
70
77
|
@@index = nil
|
71
78
|
end
|
79
|
+
|
80
|
+
|
81
|
+
# Checks for indexed files that are missing from the filesystem
|
82
|
+
# and removes them from index & dbm file.
|
83
|
+
def prune_index
|
84
|
+
missing_files=index_time_dbm_file.reject{|filename,itime| File.exists?(filename) && Picolena::IndexedDirectories.any?{|dir,alias_path| filename.starts_with?(dir)}}
|
85
|
+
missing_files.each{|filename, itime|
|
86
|
+
index.writer.delete(:complete_path, filename)
|
87
|
+
index_time_dbm_file.delete(filename)
|
88
|
+
log :debug => "Removed : #{filename}"
|
89
|
+
}
|
90
|
+
index.optimize
|
91
|
+
end
|
72
92
|
|
73
93
|
# Only one IndexWriter should be instantiated.
|
74
94
|
# If one index already exists, returns it.
|
@@ -81,11 +101,17 @@ class Indexer
|
|
81
101
|
index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
|
82
102
|
end
|
83
103
|
|
84
|
-
|
85
|
-
|
104
|
+
# Returns how many files are indexed.
|
105
|
+
def size
|
106
|
+
index.size
|
86
107
|
end
|
87
108
|
|
88
109
|
private
|
110
|
+
|
111
|
+
# Copied from Ferret book, By David Balmain
|
112
|
+
def index_time_dbm_file
|
113
|
+
@@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
|
114
|
+
end
|
89
115
|
|
90
116
|
def index_exists?
|
91
117
|
index_filename and File.exists?(index_filename)
|
@@ -108,7 +134,7 @@ class Indexer
|
|
108
134
|
:field_infos => default_field_infos,
|
109
135
|
# Great way to ensure that no file is indexed twice!
|
110
136
|
:key => :probably_unique_id
|
111
|
-
}
|
137
|
+
}.merge Picolena::IndexingConfiguration
|
112
138
|
end
|
113
139
|
|
114
140
|
def default_field_infos
|
@@ -120,7 +146,7 @@ class Indexer
|
|
120
146
|
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
121
147
|
field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
|
122
148
|
field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
|
123
|
-
field_infos.add_field(:language, :store => :yes, :index => :
|
149
|
+
field_infos.add_field(:language, :store => :yes, :index => :untokenized)
|
124
150
|
end
|
125
151
|
end
|
126
152
|
|
@@ -130,7 +156,8 @@ class Indexer
|
|
130
156
|
# an IndexWriter at the same time, and get a
|
131
157
|
# Ferret::Store::Lock::LockError
|
132
158
|
index
|
133
|
-
#
|
159
|
+
# Opens dbm file to dump indexing time.
|
160
|
+
index_time_dbm_file
|
134
161
|
# ActiveSupport sometimes raises
|
135
162
|
# Expected Object is NOT missing constant
|
136
163
|
# without.
|
@@ -140,4 +167,4 @@ class Indexer
|
|
140
167
|
PlainTextExtractor
|
141
168
|
end
|
142
169
|
end
|
143
|
-
end
|
170
|
+
end
|
@@ -2,7 +2,7 @@
|
|
2
2
|
=link_to icon_and_filename_for(@document), download_document_path(@probably_unique_id)
|
3
3
|
(
|
4
4
|
=:as_it_was_indexed_on.l
|
5
|
-
=@document.
|
5
|
+
=@document.pretty_date
|
6
6
|
)
|
7
7
|
%p=link_to_containing_directory(@document)
|
8
|
-
%blockquote=
|
8
|
+
%blockquote=highlighted_cache(@document, @query)
|
@@ -7,6 +7,9 @@
|
|
7
7
|
%strong=h(@query)
|
8
8
|
=show_time_needed(@time_needed)
|
9
9
|
-else
|
10
|
-
%span{:class=>'pagination'}=should_paginate(@matching_documents, @query)
|
10
|
+
%span{:class=>'pagination'}=should_paginate(@matching_documents, @query, @sort)
|
11
11
|
=describe_results(@matching_documents, @total_hits, @time_needed, h(@query))
|
12
|
-
|
12
|
+
-unless nothing_found?
|
13
|
+
%p
|
14
|
+
%span{:class=>'sort_by'}=sort_by_date_or_relevance(@query)
|
15
|
+
= render :partial =>'document', :collection => @matching_documents, :locals => { :query => @query}
|
@@ -1,4 +1,4 @@
|
|
1
|
-
%w(rubygems paginator fileutils pathname logger thread).each{|lib| require lib}
|
1
|
+
%w(rubygems paginator fileutils pathname logger thread dbm).each{|lib| require lib}
|
2
2
|
|
3
3
|
# Uncomment below to force Rails into production mode when
|
4
4
|
# you don't control web/app server and can't set it the proper way
|
@@ -14,10 +14,15 @@ namespace :index do
|
|
14
14
|
task :update => :environment do
|
15
15
|
Indexer.index_every_directory
|
16
16
|
end
|
17
|
+
|
18
|
+
desc 'Remove unneeded files from index'
|
19
|
+
task :prune => :environment do
|
20
|
+
Indexer.prune_index
|
21
|
+
end
|
17
22
|
|
18
23
|
desc 'Returns the number of indexed documents'
|
19
24
|
task :size => :environment do
|
20
|
-
puts "#{Indexer.
|
25
|
+
puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
|
21
26
|
end
|
22
27
|
|
23
28
|
# Search index with query "some query" :
|
@@ -30,7 +30,7 @@ namespace :install_dependencies do
|
|
30
30
|
task :deb_packages do
|
31
31
|
root_privileges_required!
|
32
32
|
#TODO: Should load this list from defined PlainTextExtractor's
|
33
|
-
packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser}.join(" ")
|
33
|
+
packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf mguesser libdbm-ruby1.8}.join(" ")
|
34
34
|
puts "Installing "<<packages
|
35
35
|
system("apt-get install "<<packages)
|
36
36
|
end
|
@@ -82,6 +82,17 @@ h1, h2, h3, h4, h5, h6, p, form {
|
|
82
82
|
text-decoration:none;
|
83
83
|
}
|
84
84
|
|
85
|
+
.sort_by {
|
86
|
+
float:right;
|
87
|
+
font-size: 13px;
|
88
|
+
color:#000;
|
89
|
+
}
|
90
|
+
|
91
|
+
.sort_by a{
|
92
|
+
color: #EE8907;
|
93
|
+
text-decoration:none;
|
94
|
+
}
|
95
|
+
|
85
96
|
#mainimg input.btn{
|
86
97
|
margin-right: 10px;
|
87
98
|
height: 20px;
|
@@ -116,7 +127,7 @@ width: 80%;
|
|
116
127
|
|
117
128
|
#results {
|
118
129
|
width:778px;
|
119
|
-
padding-top:
|
130
|
+
padding-top: 15px;
|
120
131
|
}
|
121
132
|
|
122
133
|
#results h2 a{
|
@@ -137,6 +148,11 @@ width: 80%;
|
|
137
148
|
padding:0px 20px;
|
138
149
|
}
|
139
150
|
|
151
|
+
#results .matching_content{
|
152
|
+
background-color:#ffff66;
|
153
|
+
}
|
154
|
+
|
155
|
+
|
140
156
|
#results a, #results small{
|
141
157
|
font-family:"Trebuchet MS";
|
142
158
|
font-size:11px;
|
@@ -50,16 +50,16 @@ describe "Basic Finder" do
|
|
50
50
|
Indexer.index_every_directory(remove_first=true)
|
51
51
|
end
|
52
52
|
|
53
|
-
it "should accept one parameter as query, and 2 optionals for paginating" do
|
53
|
+
it "should accept one parameter as query, 1 optional for sorting results and 2 optionals for paginating" do
|
54
54
|
lambda {Finder.new}.should raise_error(ArgumentError, "wrong number of arguments (0 for 1)")
|
55
55
|
# show first page with 10 results per page
|
56
56
|
lambda {Finder.new("a b")}.should_not raise_error
|
57
57
|
# show second page
|
58
|
-
lambda {Finder.new("a",
|
58
|
+
lambda {Finder.new("a", "by_date")}.should_not raise_error
|
59
59
|
# show first page with 15 results
|
60
|
-
lambda {Finder.new("a", 1, 15)}.should_not raise_error
|
60
|
+
lambda {Finder.new("a", "by_date", 1, 15)}.should_not raise_error
|
61
61
|
# Too many parameters
|
62
|
-
lambda {Finder.new("a", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (
|
62
|
+
lambda {Finder.new("a", "by_date", 10, 20, 30)}.should raise_error(ArgumentError, "wrong number of arguments (5 for 4)")
|
63
63
|
end
|
64
64
|
|
65
65
|
it "should return matching documents if executed successfully" do
|
@@ -5,28 +5,30 @@ basic_pdf_attribute={
|
|
5
5
|
:basename=>'basic',
|
6
6
|
:complete_path=>File.join(RAILS_ROOT, '/spec/test_dirs/indexed/basic/basic.pdf'),
|
7
7
|
:extname=>'.pdf',
|
8
|
-
:
|
8
|
+
:ext_as_sym => :pdf,
|
9
|
+
:filename=>'basic.pdf',
|
10
|
+
:size => 9380
|
9
11
|
}
|
10
12
|
|
11
13
|
describe Document do
|
12
14
|
before(:each) do
|
13
|
-
@
|
15
|
+
@valid_document=Document.new("spec/test_dirs/indexed/basic/basic.pdf")
|
14
16
|
end
|
15
17
|
|
16
18
|
it "should be an existing file" do
|
17
19
|
lambda {Document.new("/patapouf.txt")}.should raise_error(Errno::ENOENT)
|
18
|
-
lambda {@
|
20
|
+
lambda {@valid_document}.should_not raise_error
|
19
21
|
lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should_not raise_error(Errno::ENOENT)
|
20
22
|
end
|
21
23
|
|
22
24
|
it "should belong to an indexed directory" do
|
23
|
-
lambda {@
|
25
|
+
lambda {@valid_document}.should_not raise_error
|
24
26
|
lambda {Document.new("spec/test_dirs/not_indexed/Rakefile")}.should raise_error(ArgumentError, "required document is not in indexed directory")
|
25
27
|
end
|
26
28
|
|
27
29
|
basic_pdf_attribute.each{|attribute,expected_value|
|
28
30
|
it "should know its #{attribute}" do
|
29
|
-
@
|
31
|
+
@valid_document.should respond_to(attribute)
|
30
32
|
@basic_pdf=Document.new('spec/test_dirs/indexed/basic/basic.pdf')
|
31
33
|
@basic_pdf.send(attribute).should == expected_value
|
32
34
|
end
|
@@ -36,23 +38,70 @@ describe Document do
|
|
36
38
|
another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
|
37
39
|
another_doc.content.should == "just a content test\nin a txt file"
|
38
40
|
end
|
41
|
+
|
42
|
+
it "should know its cached content" do
|
43
|
+
another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
|
44
|
+
another_doc.cached.should == "just a content test\nin a txt file"
|
45
|
+
end
|
46
|
+
|
47
|
+
it "should know its highlighted cached content for a given query" do
|
48
|
+
another_doc=Document.new("spec/test_dirs/indexed/basic/plain.txt")
|
49
|
+
another_doc.highlighted_cache('a content test').should == "just a <<content>> <<test>>\nin a txt file"
|
50
|
+
end
|
39
51
|
|
40
52
|
it "should know its alias_path" do
|
41
|
-
@
|
42
|
-
@
|
53
|
+
@valid_document.should respond_to(:alias_path)
|
54
|
+
@valid_document.alias_path.starts_with?("http://picolena.devjavu.com/browser/trunk/lib/picolena/templates/spec/test_dirs/indexed").should be_true
|
55
|
+
end
|
56
|
+
|
57
|
+
it "should know its probably_unique_id" do
|
58
|
+
@valid_document.should respond_to(:probably_unique_id)
|
59
|
+
@valid_document.probably_unique_id.should =~/^[a-z]+$/
|
60
|
+
@valid_document.probably_unique_id.size.should == Picolena::HashLength
|
43
61
|
end
|
62
|
+
|
63
|
+
it "should know its modification date" do
|
64
|
+
@valid_document.pretty_date.class.should == String
|
65
|
+
@valid_document.pretty_date.should =~/^\d{4}\-\d{2}\-\d{2}$/
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should know its modification time and returns it in a pretty way" do
|
69
|
+
@valid_document.should respond_to(:mtime)
|
70
|
+
@valid_document.mtime.should be_kind_of(Integer)
|
71
|
+
@valid_document.should respond_to(:pretty_mtime)
|
72
|
+
@valid_document.pretty_mtime.class.should == String
|
73
|
+
@valid_document.pretty_mtime.should =~/^\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}$/
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should know if its content can be extracted" do
|
77
|
+
@valid_document.should respond_to(:supported?)
|
78
|
+
@valid_document.should be_supported
|
79
|
+
Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should know its language when enough content is available" do
|
83
|
+
Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
|
84
|
+
Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
|
85
|
+
Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
|
86
|
+
Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
|
87
|
+
end if Picolena::UseLanguageRecognition
|
88
|
+
|
89
|
+
it "should not try to guess language when file is too small" do
|
90
|
+
Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
|
91
|
+
Document.new("spec/test_dirs/indexed/README").language.should be_nil
|
92
|
+
end if Picolena::UseLanguageRecognition
|
44
93
|
|
45
94
|
it "should let finder specify its score" do
|
46
|
-
@
|
47
|
-
@
|
48
|
-
@
|
49
|
-
@
|
95
|
+
@valid_document.should respond_to(:score)
|
96
|
+
@valid_document.score.should be_nil
|
97
|
+
@valid_document.score=25
|
98
|
+
@valid_document.score.should == 25
|
50
99
|
end
|
51
100
|
|
52
101
|
it "should let finder specify its matching content" do
|
53
|
-
@
|
54
|
-
@
|
55
|
-
@
|
56
|
-
@
|
102
|
+
@valid_document.should respond_to(:matching_content)
|
103
|
+
@valid_document.matching_content.should be_nil
|
104
|
+
@valid_document.matching_content=["thermal cooling", "heat driven cooling"]
|
105
|
+
@valid_document.matching_content.should include("thermal cooling")
|
57
106
|
end
|
58
|
-
end
|
107
|
+
end
|
@@ -8,9 +8,9 @@ end
|
|
8
8
|
|
9
9
|
|
10
10
|
def matching_document_for(query)
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
# Returns matching document for any given query only if
|
12
|
+
# exactly one document is found.
|
13
|
+
# Specs don't pass otherwise.
|
14
14
|
matching_documents=Finder.new(query).matching_documents
|
15
15
|
matching_documents.size.should == 1
|
16
16
|
matching_documents.first
|
@@ -19,6 +19,7 @@ end
|
|
19
19
|
|
20
20
|
describe Finder do
|
21
21
|
before(:all) do
|
22
|
+
Globalite.language = :en
|
22
23
|
# SVN doesn't like non-ascii filenames.
|
23
24
|
revert_changes!('spec/test_dirs/indexed/others/bäñüßé.txt',"just to know if files are indexed with utf8 filenames")
|
24
25
|
|
@@ -13,10 +13,18 @@ describe "Host indexing system" do
|
|
13
13
|
|
14
14
|
it "should know which IP addresses are allowed (config/custom/white_list_ip.yml)" do
|
15
15
|
File.should be_readable('config/custom/white_list_ip.yml')
|
16
|
+
ip_conf=YAML.load_file('config/custom/white_list_ip.yml')
|
17
|
+
ip_conf.class.should == Hash
|
18
|
+
ip_conf['Allow'].should_not be_nil
|
16
19
|
end
|
17
20
|
|
18
21
|
it "should know which directories are to be indexed (config/custom/indexed_directories.yml)" do
|
19
22
|
File.should be_readable('config/custom/indexed_directories.yml')
|
23
|
+
dirs_conf=YAML.load_file('config/custom/indexed_directories.yml')
|
24
|
+
dirs_conf.class.should == Hash
|
25
|
+
%w(development test production).all?{|env|
|
26
|
+
dirs_conf[env].should_not be_nil
|
27
|
+
}
|
20
28
|
end
|
21
29
|
|
22
30
|
it "should be able to calculate base26 hash from strings" do
|
@@ -27,16 +27,4 @@ describe "PlainTextExtractors" do
|
|
27
27
|
end
|
28
28
|
}
|
29
29
|
}
|
30
|
-
|
31
|
-
it "should guess language when enough content is available" do
|
32
|
-
Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
|
33
|
-
Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
|
34
|
-
Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
|
35
|
-
Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
|
36
|
-
end if Picolena::UseLanguageRecognition
|
37
|
-
|
38
|
-
it "should not try to guess language when file is too small" do
|
39
|
-
Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
|
40
|
-
Document.new("spec/test_dirs/indexed/README").language.should be_nil
|
41
|
-
end if Picolena::UseLanguageRecognition
|
42
30
|
end
|
@@ -1,8 +1,16 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../spec_helper'
|
2
2
|
|
3
3
|
describe Query do
|
4
|
-
it "should return a BooleanQuery" do
|
4
|
+
it "should return a BooleanQuery, a TermQuery or a RangeQuery" do
|
5
5
|
Query.extract_from("whatever").class.should == Ferret::Search::BooleanQuery
|
6
|
+
Query.extract_from("lang:de").class.should == Ferret::Search::TermQuery
|
7
|
+
Query.extract_from("date:<1990").class.should == Ferret::Search::RangeQuery
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should not remove stop-words from TermQuery" do
|
11
|
+
# "it" means "Italian language", but it is also a stop-word.
|
12
|
+
Query.extract_from("lang:it").class.should == Ferret::Search::TermQuery
|
13
|
+
Query.extract_from("lang:it").to_s.should == "language:it"
|
6
14
|
end
|
7
15
|
|
8
16
|
it "should translate LIKE, NOT, OR and AND boolean ops to English" do
|
@@ -12,6 +20,7 @@ describe Query do
|
|
12
20
|
:fr=>["COMME","NON","OU","ET"]
|
13
21
|
}
|
14
22
|
|
23
|
+
Globalite.language = :en
|
15
24
|
english_query_with_like_and_not=Query.extract_from("LIKE something NOT something")
|
16
25
|
english_query_with_or=Query.extract_from("test OR another")
|
17
26
|
english_query_with_and=Query.extract_from("test AND another")
|
data/lib/picolena/version.rb
CHANGED
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.7</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
data/website/index.txt
CHANGED
File without changes
|
data/website/index_devjavu
CHANGED
File without changes
|
File without changes
|
File without changes
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-04-
|
33
|
+
date: 2008-04-30 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -146,6 +146,7 @@ files:
|
|
146
146
|
- lib/picolena/config/basic.rb
|
147
147
|
- lib/picolena/config/icons_and_filetypes.yml
|
148
148
|
- lib/picolena/config/indexed_directories.yml
|
149
|
+
- lib/picolena/config/indexing_performance.yml
|
149
150
|
- lib/picolena/config/title_and_names_and_links.yml
|
150
151
|
- lib/picolena/config/white_list_ip.yml
|
151
152
|
- lib/picolena/picolena_generator.rb
|
@@ -177,6 +178,7 @@ files:
|
|
177
178
|
- lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb
|
178
179
|
- lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
|
179
180
|
- lib/picolena/templates/config/initializers/006_load_icons.rb
|
181
|
+
- lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb
|
180
182
|
- lib/picolena/templates/config/routes.rb
|
181
183
|
- lib/picolena/templates/lang/ui/de.yml
|
182
184
|
- lib/picolena/templates/lang/ui/en.yml
|
metadata.gz.sig
CHANGED
@@ -1 +1,2 @@
|
|
1
|
-
|
1
|
+
�;����U�=nƷ�8߿X�`>����B����2Ħ@,u!��~�u�9>�Ӽq�J1� ֖i�T������-.q�^l*�`�>"��m�8��ɏP�cWk��y%����W�:r=&����CtaO;c
|
2
|
+
.&��}�e)�g(O�)0ة)!����s�
|
3
|
+
�"��Fm��>8���n���q�?I�P'����`|����`�\�>{\a4�Ӷ�JǮ}�&�?�d�UM{
|