picolena 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +10 -0
- data/Manifest.txt +1 -0
- data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -3
- data/lib/picolena/templates/app/helpers/documents_helper.rb +5 -5
- data/lib/picolena/templates/app/models/document.rb +16 -3
- data/lib/picolena/templates/app/models/finder.rb +18 -7
- data/lib/picolena/templates/app/models/indexer.rb +92 -33
- data/lib/picolena/templates/app/models/query.rb +5 -0
- data/lib/picolena/templates/app/views/documents/_document.html.haml +9 -8
- data/lib/picolena/templates/config/environment.rb +0 -2
- data/lib/picolena/templates/config/environments/development.rb +0 -3
- data/lib/picolena/templates/config/environments/production.rb +0 -2
- data/lib/picolena/templates/config/environments/test.rb +0 -3
- data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +3 -0
- data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +1 -1
- data/lib/picolena/templates/lang/ui/de.yml +2 -2
- data/lib/picolena/templates/lang/ui/en.yml +1 -1
- data/lib/picolena/templates/lang/ui/es.yml +1 -1
- data/lib/picolena/templates/lang/ui/fr.yml +2 -2
- data/lib/picolena/templates/lib/core_exts.rb +32 -23
- data/lib/picolena/templates/lib/indexer_logger.rb +45 -0
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +0 -1
- data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb +3 -3
- data/lib/picolena/templates/lib/tasks/index.rake +6 -1
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +0 -2
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -0
- data/lib/picolena/templates/spec/models/document_spec.rb +6 -0
- data/lib/picolena/templates/spec/models/finder_spec.rb +0 -1
- data/lib/picolena/templates/spec/models/indexer_spec.rb +9 -0
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +5 -0
- data/lib/picolena/templates/spec/models/query_spec.rb +26 -2
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- data.tar.gz.sig +0 -0
- metadata +3 -2
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
== 0.1.8 2008-05-08
|
2
|
+
|
3
|
+
* 2 minor enhancements:
|
4
|
+
* New IndexerLogger with basic statistics
|
5
|
+
* More specs & documentation.
|
6
|
+
|
7
|
+
* 2 bug fixes:
|
8
|
+
* Binary documents without extension are not considered supported anymore
|
9
|
+
* Ensure that index is locked system-wide by using lock file.
|
10
|
+
|
1
11
|
== 0.1.7 2008-04-30
|
2
12
|
|
3
13
|
* 5 minor enhancements:
|
data/Manifest.txt
CHANGED
@@ -50,6 +50,7 @@ lib/picolena/templates/lang/ui/en.yml
|
|
50
50
|
lib/picolena/templates/lang/ui/es.yml
|
51
51
|
lib/picolena/templates/lang/ui/fr.yml
|
52
52
|
lib/picolena/templates/lib/core_exts.rb
|
53
|
+
lib/picolena/templates/lib/indexer_logger.rb
|
53
54
|
lib/picolena/templates/lib/plain_text_extractor_DSL.rb
|
54
55
|
lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
55
56
|
lib/picolena/templates/lib/plain_text_extractors/html.rb
|
@@ -22,9 +22,9 @@ class DocumentsController < ApplicationController
|
|
22
22
|
def show
|
23
23
|
start=Time.now
|
24
24
|
@query=[params[:id],params.delete(:format)].compact.join('.')
|
25
|
-
@
|
25
|
+
@sort_by=params[:sort_by]
|
26
26
|
page=params[:page]||1
|
27
|
-
finder=Finder.new(@query,@
|
27
|
+
finder=Finder.new(@query,@sort_by,page)
|
28
28
|
finder.execute!
|
29
29
|
pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
|
30
30
|
finder.matching_documents
|
@@ -64,7 +64,7 @@ class DocumentsController < ApplicationController
|
|
64
64
|
|
65
65
|
def ensure_index_is_created
|
66
66
|
Indexer.ensure_index_existence
|
67
|
-
while Indexer.
|
67
|
+
while Indexer.locked? do
|
68
68
|
sleep 1
|
69
69
|
end
|
70
70
|
end
|
@@ -6,10 +6,10 @@ module DocumentsHelper
|
|
6
6
|
|
7
7
|
# Very basic pagination.
|
8
8
|
# Provides links to Next, Prev and FirstPage when needed.
|
9
|
-
def should_paginate(page,query,
|
10
|
-
[(link_to("←←", :action => :show, :id => query, :
|
11
|
-
(link_to("←", :action => :show, :id => query, :page => page.prev.number, :
|
12
|
-
(link_to("→", :action => :show, :id => query, :page => page.next.number, :
|
9
|
+
def should_paginate(page,query, sort_by)
|
10
|
+
[(link_to("←←", :action => :show, :id => query, :sort_by=>sort_by) if page.number>2),
|
11
|
+
(link_to("←", :action => :show, :id => query, :page => page.prev.number, :sort_by=>sort_by) if page.prev?),
|
12
|
+
(link_to("→", :action => :show, :id => query, :page => page.next.number, :sort_by=>sort_by) if page.next?)].compact.join(" | ")
|
13
13
|
end
|
14
14
|
|
15
15
|
# Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
|
@@ -81,7 +81,7 @@ module DocumentsHelper
|
|
81
81
|
end
|
82
82
|
|
83
83
|
def sort_by_date_or_relevance(query)
|
84
|
-
[link_to_unless_current('By date', document_path(query, :
|
84
|
+
[link_to_unless_current('By date', document_path(query, :sort_by=>'date')),
|
85
85
|
link_to_unless_current('By relevance', document_path(query))].join(" ")
|
86
86
|
end
|
87
87
|
end
|
@@ -11,10 +11,18 @@ class Document
|
|
11
11
|
end
|
12
12
|
|
13
13
|
#Delegating properties to File::method_name(complete_path)
|
14
|
-
[:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
|
14
|
+
[:dirname, :basename, :extname, :ext_as_sym, :file?, :plain_text?, :size, :ext_as_sym].each{|method_name|
|
15
15
|
define_method(method_name){File.send(method_name,complete_path)}
|
16
16
|
}
|
17
17
|
alias_method :filename, :basename
|
18
|
+
alias_method :to_s, :complete_path
|
19
|
+
|
20
|
+
|
21
|
+
def inspect
|
22
|
+
[self,("(#{pretty_score})" if @score),("(language:#{language})" if language)].compact.join(" ")
|
23
|
+
end
|
24
|
+
|
25
|
+
|
18
26
|
|
19
27
|
# Returns filename without extension
|
20
28
|
# "buildings.odt" => "buildings"
|
@@ -50,7 +58,7 @@ class Document
|
|
50
58
|
# Document.new("presentation.pdf").supported? => true
|
51
59
|
# Document.new("presentation.some_weird_extension").supported? => false
|
52
60
|
def supported?
|
53
|
-
PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
|
61
|
+
PlainTextExtractor.supported_extensions.include?(self.ext_as_sym) unless ext_as_sym==:no_extension and !plain_text?
|
54
62
|
end
|
55
63
|
|
56
64
|
# Retrieves content as it is *now*.
|
@@ -91,6 +99,10 @@ class Document
|
|
91
99
|
from_index[:language]
|
92
100
|
end
|
93
101
|
|
102
|
+
def pretty_score
|
103
|
+
"%3.1f%" % (@score*100)
|
104
|
+
end
|
105
|
+
|
94
106
|
# Fields that are shared between every document.
|
95
107
|
def self.default_fields_for(complete_path)
|
96
108
|
{
|
@@ -103,6 +115,7 @@ class Document
|
|
103
115
|
}
|
104
116
|
end
|
105
117
|
|
118
|
+
|
106
119
|
private
|
107
120
|
|
108
121
|
# FIXME: Is there a way to easily retrieve doc_id for a given document?
|
@@ -138,4 +151,4 @@ class Document
|
|
138
151
|
def validate_in_indexed_directory
|
139
152
|
raise ArgumentError, "required document is not in indexed directory" unless in_indexed_directory?
|
140
153
|
end
|
141
|
-
end
|
154
|
+
end
|
@@ -2,23 +2,24 @@ class Finder
|
|
2
2
|
attr_reader :query
|
3
3
|
|
4
4
|
def index
|
5
|
-
@@index ||= Indexer.index
|
5
|
+
@@index ||= Indexer.index
|
6
6
|
end
|
7
7
|
|
8
|
-
def initialize(raw_query,
|
8
|
+
def initialize(raw_query,sort_by='relevance', page=1,results_per_page=Picolena::ResultsPerPage)
|
9
9
|
@query = Query.extract_from(raw_query)
|
10
10
|
@raw_query= raw_query
|
11
11
|
Indexer.ensure_index_existence
|
12
|
+
reload_index! if should_be_reloaded?
|
12
13
|
@per_page=results_per_page
|
13
14
|
@offset=(page.to_i-1)*results_per_page
|
14
|
-
@
|
15
|
+
@sort_by=sort_by
|
15
16
|
index_should_have_documents
|
16
17
|
end
|
17
18
|
|
18
19
|
def execute!
|
19
20
|
@matching_documents=[]
|
20
21
|
start=Time.now
|
21
|
-
@total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @
|
22
|
+
@total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @sort_by=='date')){|index_id, score|
|
22
23
|
begin
|
23
24
|
found_doc=Document.new(index[index_id][:complete_path])
|
24
25
|
found_doc.matching_content=index.highlight(query, index_id,
|
@@ -52,11 +53,21 @@ class Finder
|
|
52
53
|
}
|
53
54
|
}
|
54
55
|
|
55
|
-
|
56
|
+
private
|
57
|
+
|
58
|
+
def reload_index!
|
59
|
+
Indexer.close
|
56
60
|
@@index = nil
|
61
|
+
@@last_reload = Time.now
|
57
62
|
end
|
58
63
|
|
59
|
-
|
64
|
+
def should_be_reloaded?
|
65
|
+
Indexer.reload_file_mtime > last_reload
|
66
|
+
end
|
67
|
+
|
68
|
+
def last_reload
|
69
|
+
@@last_reload ||= Time.at(0)
|
70
|
+
end
|
60
71
|
|
61
72
|
def sort_by_date
|
62
73
|
Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
|
@@ -65,4 +76,4 @@ class Finder
|
|
65
76
|
def index_should_have_documents
|
66
77
|
raise IndexError, "no document found" unless index.size > 0
|
67
78
|
end
|
68
|
-
end
|
79
|
+
end
|
@@ -1,63 +1,74 @@
|
|
1
|
+
# Indexer is used to index (duh!) documents contained in IndexedDirectories
|
2
|
+
# It can create, update, delete and prune the index, and takes care that only
|
3
|
+
# one IndexWriter exists at any given time, even when used in a multi-threaded
|
4
|
+
# way.
|
5
|
+
require 'indexer_logger'
|
1
6
|
class Indexer
|
2
7
|
# This regexp defines which files should *not* be indexed.
|
3
8
|
@@exclude = /(Thumbs\.db)/
|
4
9
|
# Number of threads that will be used during indexing process
|
5
10
|
@@threads_number = 8
|
6
|
-
|
7
|
-
cattr_reader :do_not_disturb_while_indexing
|
8
11
|
|
9
12
|
class << self
|
13
|
+
# Finds every document included in IndexedDirectories, parses them with
|
14
|
+
# PlainTextExtractor and adds them to the index.
|
15
|
+
#
|
16
|
+
# Updates the index unless remove_first parameter is set to true, in which
|
17
|
+
# case it removes the index first before re-creating it.
|
10
18
|
def index_every_directory(remove_first=false)
|
11
|
-
@@do_not_disturb_while_indexing=true
|
12
19
|
clear! if remove_first
|
20
|
+
lock!
|
13
21
|
@from_scratch = remove_first
|
14
|
-
|
15
|
-
Finder.reload!
|
16
|
-
log :debug => "Indexing every directory"
|
17
|
-
start=Time.now
|
22
|
+
logger.start_indexing
|
18
23
|
Picolena::IndexedDirectories.each{|dir, alias_dir|
|
19
24
|
index_directory_with_multithreads(dir)
|
20
25
|
}
|
21
|
-
|
26
|
+
logger.debug "Now optimizing index"
|
22
27
|
index.optimize
|
23
|
-
|
24
|
-
|
28
|
+
index_time_dbm_file['last']=Time.now._dump
|
29
|
+
unlock!
|
30
|
+
logger.show_report
|
25
31
|
end
|
26
32
|
|
33
|
+
# Indexes a given directory, using @@threads_number threads.
|
34
|
+
# To do so, it retrieves a list of every included document, cuts it in
|
35
|
+
# @@threads_number chunks, and creates a new indexing thread for every chunk.
|
27
36
|
def index_directory_with_multithreads(dir)
|
28
|
-
|
29
|
-
|
37
|
+
logger.debug "Indexing #{dir}, #{@@threads_number} threads"
|
30
38
|
indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
|
31
39
|
File.file?(filename) && filename !~ @@exclude
|
32
40
|
}
|
33
41
|
|
34
42
|
indexing_list_chunks=indexing_list.in_transposed_slices(@@threads_number)
|
35
|
-
|
36
43
|
prepare_multi_threads_environment
|
37
|
-
|
44
|
+
|
38
45
|
indexing_list_chunks.each_with_thread{|chunk|
|
39
46
|
chunk.each{|complete_path|
|
40
|
-
|
41
|
-
if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
|
47
|
+
if should_index_this_document?(complete_path) then
|
42
48
|
add_or_update_file(complete_path)
|
43
49
|
else
|
44
|
-
|
50
|
+
logger.debug "Identical : #{complete_path}"
|
45
51
|
end
|
46
52
|
index_time_dbm_file[complete_path] = Time.now._dump
|
47
53
|
}
|
48
54
|
}
|
49
55
|
end
|
50
56
|
|
57
|
+
# Retrieves content and language from a given document, and adds it to the index.
|
58
|
+
# Since Document#probably_unique_id is used as index :key, no document will be added
|
59
|
+
# twice to the index, and the old document will just get updated.
|
60
|
+
#
|
61
|
+
# If for some reason (no content found or no defined PlainTextExtractor), content cannot
|
62
|
+
# be found, some basic information about the document (mtime, filename, complete_path)
|
63
|
+
# gets indexed anyway.
|
51
64
|
def add_or_update_file(complete_path)
|
52
|
-
|
65
|
+
document = Document.default_fields_for(complete_path)
|
53
66
|
begin
|
54
|
-
document
|
67
|
+
document.merge! PlainTextExtractor.extract_content_and_language_from(complete_path)
|
55
68
|
raise "empty document #{complete_path}" if document[:content].strip.empty?
|
56
|
-
|
57
|
-
log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join
|
69
|
+
logger.add_document document
|
58
70
|
rescue => e
|
59
|
-
|
60
|
-
document = default_fields
|
71
|
+
logger.reject_document document, e
|
61
72
|
end
|
62
73
|
index << document
|
63
74
|
end
|
@@ -73,11 +84,9 @@ class Indexer
|
|
73
84
|
# ensures that a new Index is instantiated next time index is called.
|
74
85
|
def close
|
75
86
|
@@index.close rescue nil
|
76
|
-
# Ferret will SEGFAULT otherwise.
|
77
87
|
@@index = nil
|
78
88
|
end
|
79
89
|
|
80
|
-
|
81
90
|
# Checks for indexed files that are missing from filesystem
|
82
91
|
# and removes them from index & dbm file.
|
83
92
|
def prune_index
|
@@ -85,7 +94,7 @@ class Indexer
|
|
85
94
|
missing_files.each{|filename, itime|
|
86
95
|
index.writer.delete(:complete_path, filename)
|
87
96
|
index_time_dbm_file.delete(filename)
|
88
|
-
|
97
|
+
logger.debug "Removed : #{filename}"
|
89
98
|
}
|
90
99
|
index.optimize
|
91
100
|
end
|
@@ -97,6 +106,7 @@ class Indexer
|
|
97
106
|
@@index ||= Ferret::Index::Index.new(default_index_params)
|
98
107
|
end
|
99
108
|
|
109
|
+
# Creates the index unless it already exists.
|
100
110
|
def ensure_index_existence
|
101
111
|
index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
|
102
112
|
end
|
@@ -106,11 +116,66 @@ class Indexer
|
|
106
116
|
index.size
|
107
117
|
end
|
108
118
|
|
119
|
+
# Returns the time at which the index was last created/updated.
|
120
|
+
# Returns "none" if it doesn't exist.
|
121
|
+
def last_update
|
122
|
+
Time._load(index_time_dbm_file['last']) rescue "none"
|
123
|
+
end
|
124
|
+
|
125
|
+
# Returns the time at which the reload file was last touched.
|
126
|
+
# Useful to know if other processes have modified the shared index,
|
127
|
+
# and if the Indexer should be reloaded.
|
128
|
+
def reload_file_mtime
|
129
|
+
touch_reload_file! unless File.exists?(reload_file)
|
130
|
+
File.mtime(reload_file)
|
131
|
+
end
|
132
|
+
|
133
|
+
# For a given document, it retrieves the time it was last indexed, compares it to
|
134
|
+
# its modification time and returns false unless the file has been
|
135
|
+
# modified after the last indexing process.
|
136
|
+
def should_index_this_document?(complete_path)
|
137
|
+
last_itime=index_time_dbm_file[complete_path]
|
138
|
+
@from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime)
|
139
|
+
end
|
140
|
+
|
141
|
+
def locked?
|
142
|
+
File.exists?(lock_file)
|
143
|
+
end
|
144
|
+
|
109
145
|
private
|
110
146
|
|
147
|
+
def touch_reload_file!
|
148
|
+
FileUtils.touch(reload_file)
|
149
|
+
# To ensure that every process can touch reload_file, even if Picolena
|
150
|
+
# is launched as a special user.
|
151
|
+
FileUtils.chmod(0666, reload_file)
|
152
|
+
end
|
153
|
+
|
154
|
+
def reload_file
|
155
|
+
File.join(Picolena::MetaIndexPath,'reload')
|
156
|
+
end
|
157
|
+
|
158
|
+
def lock!
|
159
|
+
FileUtils.touch(lock_file)
|
160
|
+
end
|
161
|
+
|
162
|
+
def unlock!
|
163
|
+
FileUtils.rm(lock_file)
|
164
|
+
# Forces Finder.index to be reloaded.
|
165
|
+
touch_reload_file!
|
166
|
+
end
|
167
|
+
|
168
|
+
def lock_file
|
169
|
+
File.join(Picolena::MetaIndexPath,'lock')
|
170
|
+
end
|
171
|
+
|
172
|
+
def logger
|
173
|
+
@@logger ||= IndexerLogger.new
|
174
|
+
end
|
175
|
+
|
111
176
|
# Copied from Ferret book, By David Balmain
|
112
177
|
def index_time_dbm_file
|
113
|
-
@@dbm_file ||= DBM.open(File.join(Picolena::
|
178
|
+
@@dbm_file ||= DBM.open(File.join(Picolena::MetaIndexPath, 'added_at'))
|
114
179
|
end
|
115
180
|
|
116
181
|
def index_exists?
|
@@ -121,12 +186,6 @@ class Indexer
|
|
121
186
|
Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
|
122
187
|
end
|
123
188
|
|
124
|
-
def log(hash)
|
125
|
-
hash.each{|level,message|
|
126
|
-
IndexerLogger.send(level,message)
|
127
|
-
}
|
128
|
-
end
|
129
|
-
|
130
189
|
def default_index_params
|
131
190
|
{
|
132
191
|
:path => Picolena::IndexSavePath,
|
@@ -4,6 +4,11 @@ class Query
|
|
4
4
|
def extract_from(raw_query)
|
5
5
|
parser.parse(convert_to_english(raw_query))
|
6
6
|
end
|
7
|
+
|
8
|
+
# Returns terms related to content. Useful for cache highlighting
|
9
|
+
def content_terms(raw_query)
|
10
|
+
Query.extract_from(raw_query).terms(Indexer.index.searcher).select{|term| term.field==:content}.collect{|term| term.text}.uniq
|
11
|
+
end
|
7
12
|
|
8
13
|
private
|
9
14
|
|
@@ -3,14 +3,15 @@
|
|
3
3
|
=language_icon_for(document)
|
4
4
|
%small=number_to_percentage(document.score*100, :precision=>1)
|
5
5
|
=highlight_matching_content(document)
|
6
|
-
%p
|
7
|
-
|
8
|
-
%
|
6
|
+
%p
|
7
|
+
=link_to_containing_directory(document)
|
8
|
+
%br/
|
9
|
+
-if document.supported?
|
9
10
|
=link_to_plain_text_content(document)
|
10
11
|
-
|
11
|
-
=number_to_human_size(document.size)
|
12
|
-
-
|
13
|
-
=document.pretty_date
|
14
|
-
-
|
15
12
|
=link_to_cached_content(document,query)
|
16
|
-
|
13
|
+
-
|
14
|
+
=number_to_human_size(document.size)
|
15
|
+
-
|
16
|
+
=document.pretty_date
|
17
|
+
%hr/
|
@@ -7,8 +7,6 @@
|
|
7
7
|
# Specifies gem version of Rails to use when vendor/rails is not present
|
8
8
|
RAILS_GEM_VERSION = '2.0.2' unless defined? RAILS_GEM_VERSION
|
9
9
|
|
10
|
-
IndexerLogger=Logger.new($stdout)
|
11
|
-
|
12
10
|
# Bootstrap the Rails environment, frameworks, and default configuration
|
13
11
|
require File.join(File.dirname(__FILE__), 'boot')
|
14
12
|
|
@@ -1,20 +1,3 @@
|
|
1
|
-
class MimeType
|
2
|
-
@@all=[]
|
3
|
-
def self.all
|
4
|
-
@@all
|
5
|
-
end
|
6
|
-
|
7
|
-
def self.add(exts,mime_name)
|
8
|
-
all<<new(exts,mime_name)
|
9
|
-
end
|
10
|
-
|
11
|
-
attr_reader :exts, :name
|
12
|
-
|
13
|
-
def initialize(exts,mime_name)
|
14
|
-
@exts,@name=exts,mime_name
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
1
|
class String
|
19
2
|
# Creates a "probably unique" id with the desired length, composed only of lowercase letters.
|
20
3
|
def base26_hash(length=Picolena::HashLength)
|
@@ -23,6 +6,9 @@ class String
|
|
23
6
|
end
|
24
7
|
|
25
8
|
module Enumerable
|
9
|
+
# Similar to Enumerable#each, but creates a new thread for each element.
|
10
|
+
# Used for the indexer to make it multi-threaded.
|
11
|
+
# It ensures that threads are joined together before returning.
|
26
12
|
def each_with_thread(&block)
|
27
13
|
tds=self.collect{|elem|
|
28
14
|
Thread.new(elem) {|elem|
|
@@ -57,17 +43,31 @@ class Array
|
|
57
43
|
end
|
58
44
|
end
|
59
45
|
|
46
|
+
class Hash
|
47
|
+
def add(category)
|
48
|
+
self[category]||={:size=>0}
|
49
|
+
self[category][:size]+=1
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
60
53
|
class File
|
54
|
+
# Returns the filetype of filename as a symbol.
|
55
|
+
# Returns :no_extension unless an extension is found
|
56
|
+
# >> File.ext_as_sym("test.pdf")
|
57
|
+
# => :pdf
|
58
|
+
# >> File.ext_as_sym("test.tar.gz")
|
59
|
+
# => :gz
|
60
|
+
# >> File.ext_as_sym("test")
|
61
|
+
# => :no_extension
|
61
62
|
def self.ext_as_sym(filename)
|
62
63
|
File.extname(filename).sub(/^\./,'').downcase.to_sym rescue :no_extension
|
63
64
|
end
|
64
65
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
66
|
+
# Returns a probable encoding for a given plain text file
|
67
|
+
# If source is a html file, it parses for metadata to retrieve encoding,
|
68
|
+
# and uses file -i otherwise.
|
69
|
+
# Returns iso-8859-15 instead of iso-8859-1, to be sure € char can be
|
70
|
+
# encoded
|
71
71
|
def self.encoding(source)
|
72
72
|
parse_for_charset="grep -io charset=[a-z0-9\\-]* | sed 's/charset=//i'"
|
73
73
|
if File.extname(source)[0,4]==".htm" then
|
@@ -86,9 +86,18 @@ class File
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
+
# Returns the content of a file and removes it after.
|
90
|
+
# Could be used to read temporary output file written by a PlainTextExtractor.
|
89
91
|
def self.read_and_remove(filename)
|
90
92
|
content=read(filename)
|
91
93
|
FileUtils.rm filename, :force=>true
|
92
94
|
content
|
93
95
|
end
|
96
|
+
|
97
|
+
# Returns nil unless filename is a plain text file.
|
98
|
+
# It requires file command.
|
99
|
+
# NOTE: What to use for Win32?
|
100
|
+
def self.plain_text?(filename)
|
101
|
+
%x{file -i "#{filename}"} =~ /: text\//
|
102
|
+
end
|
94
103
|
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
class IndexerLogger<Logger
|
2
|
+
def initialize
|
3
|
+
super($stdout)
|
4
|
+
#FIXME: Should be defined in config/environments/*.rb
|
5
|
+
levels={
|
6
|
+
"development"=>Logger::DEBUG,
|
7
|
+
"production" =>Logger::INFO,
|
8
|
+
"test" =>Logger::WARN
|
9
|
+
}
|
10
|
+
@level=levels[RAILS_ENV]
|
11
|
+
@found_languages={}
|
12
|
+
@supported_filetypes={}
|
13
|
+
@unsupported_filetypes={}
|
14
|
+
end
|
15
|
+
|
16
|
+
def start_indexing
|
17
|
+
@start_time=Time.now
|
18
|
+
debug "Indexing every directory"
|
19
|
+
end
|
20
|
+
|
21
|
+
def add_document(document)
|
22
|
+
debug ["Added : #{document[:complete_path]}",document[:language] && " ("<<document[:language]<<")"].join
|
23
|
+
@found_languages.add(document[:language]) if document[:language]
|
24
|
+
@supported_filetypes.add(document[:filetype])
|
25
|
+
end
|
26
|
+
|
27
|
+
def reject_document(document, error)
|
28
|
+
@unsupported_filetypes.add(document[:filetype])
|
29
|
+
debug "Added without content (#{error.message}) : #{document[:complete_path]}"
|
30
|
+
end
|
31
|
+
|
32
|
+
def show_report
|
33
|
+
describe :found_languages, :supported_filetypes, :unsupported_filetypes
|
34
|
+
info "Time needed : #{Time.now-@start_time} s."
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def describe(*instance_variable_names)
|
40
|
+
instance_variable_names.each{|var_name|
|
41
|
+
hash=instance_variable_get("@#{var_name}")
|
42
|
+
info var_name.to_s.humanize.ljust(25)<<": "<<hash.reject{|k,v| k.blank?}.sort_by{|k,v| v[:size]}.reverse.collect{|k,v| "#{k.downcase} (#{v[:size]})"}.join(", ") unless hash.empty?
|
43
|
+
}
|
44
|
+
end
|
45
|
+
end
|
@@ -3,12 +3,12 @@ PlainTextExtractor.new {
|
|
3
3
|
as "application/plain"
|
4
4
|
aka "plain text file"
|
5
5
|
with {|source|
|
6
|
+
raise "binary file" unless File.plain_text?(source)
|
6
7
|
encoding=File.encoding(source)
|
7
|
-
#TODO: Return "binary file" if binary
|
8
8
|
if encoding.empty? then
|
9
|
-
|
9
|
+
File.read(source)
|
10
10
|
else
|
11
|
-
|
11
|
+
%x{iconv -f #{encoding} -t utf8 "#{source}" 2>/dev/null}
|
12
12
|
end
|
13
13
|
}
|
14
14
|
# for dependencies spec
|
@@ -25,10 +25,15 @@ namespace :index do
|
|
25
25
|
puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
|
26
26
|
end
|
27
27
|
|
28
|
+
desc 'Returns the last time the index was created/update'
|
29
|
+
task :last_update => :environment do
|
30
|
+
puts Indexer.last_update
|
31
|
+
end
|
32
|
+
|
28
33
|
# Search index with query "some query" :
|
29
34
|
# rake index:search query="some query"
|
30
35
|
desc 'Search index'
|
31
36
|
task :search => :environment do
|
32
|
-
Finder.new(ENV["query"]).matching_documents.entries.
|
37
|
+
puts Finder.new(ENV["query"]).matching_documents.entries.collect{|doc| doc.inspect}.join("\n"<<"#"*80<<"\n")
|
33
38
|
end
|
34
39
|
end
|
@@ -1,8 +1,6 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../spec_helper'
|
2
2
|
|
3
3
|
describe DocumentsHelper do
|
4
|
-
it "shouldn't raise if matching not in content field"
|
5
|
-
|
6
4
|
PlainTextExtractor.supported_extensions.each{|ext|
|
7
5
|
it "should have an icon for .#{ext} filetype" do
|
8
6
|
icon_for(ext).should_not be_nil
|
@@ -7,10 +7,13 @@ describe "Finder without index on disk" do
|
|
7
7
|
@original_indexed_dirs=Picolena::IndexedDirectories.dup
|
8
8
|
@new_index_path=File.join(Dir::tmpdir,'ferret_tst')
|
9
9
|
Picolena::IndexSavePath.replace(@new_index_path)
|
10
|
+
Picolena::MetaIndexPath.replace(File.join(@new_index_path,'meta'))
|
11
|
+
FileUtils.mkpath Picolena::MetaIndexPath
|
10
12
|
end
|
11
13
|
|
12
14
|
before(:each) do
|
13
15
|
Indexer.clear!
|
16
|
+
Finder.send(:class_variable_set,'@@last_reload',nil)
|
14
17
|
end
|
15
18
|
|
16
19
|
it "should create index" do
|
@@ -29,6 +32,7 @@ describe "Finder without index on disk" do
|
|
29
32
|
after(:all) do
|
30
33
|
Picolena::IndexedDirectories.replace(@original_indexed_dirs)
|
31
34
|
Picolena::IndexSavePath.replace(@original_index_path)
|
35
|
+
Picolena::MetaIndexPath.replace(File.join(@original_index_path,'meta'))
|
32
36
|
end
|
33
37
|
end
|
34
38
|
|
@@ -78,6 +78,12 @@ describe Document do
|
|
78
78
|
@valid_document.should be_supported
|
79
79
|
Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
|
80
80
|
end
|
81
|
+
|
82
|
+
it "should not be considered supported if binary" do
|
83
|
+
Document.new("spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION").should_not be_supported
|
84
|
+
end
|
85
|
+
|
86
|
+
|
81
87
|
|
82
88
|
it "should know its language when enough content is available" do
|
83
89
|
Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
|
@@ -4,4 +4,13 @@ describe Indexer do
|
|
4
4
|
it "should have at least 32MB memory allocated" do
|
5
5
|
Indexer.index.writer.max_buffer_memory.should > 2**25-1
|
6
6
|
end
|
7
|
+
|
8
|
+
it "should know the time it was updated" do
|
9
|
+
Indexer.should respond_to(:last_update)
|
10
|
+
begin
|
11
|
+
Indexer.last_update.should be_kind_of(Time)
|
12
|
+
rescue
|
13
|
+
Indexer.last_update.should == "none"
|
14
|
+
end
|
15
|
+
end
|
7
16
|
end
|
@@ -27,4 +27,9 @@ describe "PlainTextExtractors" do
|
|
27
27
|
end
|
28
28
|
}
|
29
29
|
}
|
30
|
+
|
31
|
+
it "should not extract content of binary files" do
|
32
|
+
bin_file="spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION"
|
33
|
+
lambda{PlainTextExtractor.extract_content_from(bin_file)}.should raise_error(RuntimeError, "binary file")
|
34
|
+
end
|
30
35
|
end
|
@@ -37,7 +37,21 @@ describe Query do
|
|
37
37
|
}
|
38
38
|
end
|
39
39
|
|
40
|
-
it "should accept field terms in different languages"
|
40
|
+
it "should accept field terms in different languages" do
|
41
|
+
Globalite.language = :en
|
42
|
+
english_query_with_french_words = Query.extract_from("absorption language:fr extension:pdf")
|
43
|
+
english_query_with_german_words = Query.extract_from("Unabhängigkeit modified:>2005 filename:job.txt")
|
44
|
+
Globalite.language = :de
|
45
|
+
Query.extract_from("absorption sprache:fr erweiterung:pdf").should == english_query_with_french_words
|
46
|
+
Query.extract_from("Unabhängigkeit geändert:>2005 datei:job.txt").should == english_query_with_german_words
|
47
|
+
Globalite.language = :fr
|
48
|
+
Query.extract_from("absorption langue:fr extension:pdf").should == english_query_with_french_words
|
49
|
+
Query.extract_from("Unabhängigkeit modifié:>2005 fichier:job.txt").should == english_query_with_german_words
|
50
|
+
Globalite.language = :es
|
51
|
+
Query.extract_from("absorption idioma:fr extensión:pdf").should == english_query_with_french_words
|
52
|
+
Query.extract_from("Unabhängigkeit modificado:>2005 archivo:job.txt").should == english_query_with_german_words
|
53
|
+
|
54
|
+
end
|
41
55
|
|
42
56
|
it "should use AND as default boolean ops" do
|
43
57
|
query_without_and = Query.extract_from("one AND two")
|
@@ -62,4 +76,14 @@ describe Query do
|
|
62
76
|
Query.extract_from("test").should == Query.extract_from("tesT")
|
63
77
|
Query.extract_from("test").should_not == Query.extract_from("tesTe")
|
64
78
|
end
|
65
|
-
|
79
|
+
|
80
|
+
it "should be able to extract search terms related to :content" do
|
81
|
+
Query.content_terms("plain text").should == %w(plain text)
|
82
|
+
Query.content_terms("plain text extension:pdf").should == %w(plain text)
|
83
|
+
Query.content_terms("plain AND text").should == %w(plain text)
|
84
|
+
Query.content_terms("absorption OR adsorption").should ==%w(absorption adsorption)
|
85
|
+
Query.content_terms("filename:plain_text").should be_empty
|
86
|
+
Globalite.language = :en
|
87
|
+
Query.content_terms("LIKE absorption").include?("adsorption").should be_true
|
88
|
+
end
|
89
|
+
end
|
data/lib/picolena/version.rb
CHANGED
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.8</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-
|
33
|
+
date: 2008-05-08 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -185,6 +185,7 @@ files:
|
|
185
185
|
- lib/picolena/templates/lang/ui/es.yml
|
186
186
|
- lib/picolena/templates/lang/ui/fr.yml
|
187
187
|
- lib/picolena/templates/lib/core_exts.rb
|
188
|
+
- lib/picolena/templates/lib/indexer_logger.rb
|
188
189
|
- lib/picolena/templates/lib/plain_text_extractor_DSL.rb
|
189
190
|
- lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
190
191
|
- lib/picolena/templates/lib/plain_text_extractors/html.rb
|
metadata.gz.sig
CHANGED
Binary file
|