picolena 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -0
- data/Manifest.txt +1 -0
- data/lib/picolena/templates/app/controllers/documents_controller.rb +3 -3
- data/lib/picolena/templates/app/helpers/documents_helper.rb +5 -5
- data/lib/picolena/templates/app/models/document.rb +16 -3
- data/lib/picolena/templates/app/models/finder.rb +18 -7
- data/lib/picolena/templates/app/models/indexer.rb +92 -33
- data/lib/picolena/templates/app/models/query.rb +5 -0
- data/lib/picolena/templates/app/views/documents/_document.html.haml +9 -8
- data/lib/picolena/templates/config/environment.rb +0 -2
- data/lib/picolena/templates/config/environments/development.rb +0 -3
- data/lib/picolena/templates/config/environments/production.rb +0 -2
- data/lib/picolena/templates/config/environments/test.rb +0 -3
- data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +3 -0
- data/lib/picolena/templates/config/initializers/007_load_performance_tweaks.rb +1 -1
- data/lib/picolena/templates/lang/ui/de.yml +2 -2
- data/lib/picolena/templates/lang/ui/en.yml +1 -1
- data/lib/picolena/templates/lang/ui/es.yml +1 -1
- data/lib/picolena/templates/lang/ui/fr.yml +2 -2
- data/lib/picolena/templates/lib/core_exts.rb +32 -23
- data/lib/picolena/templates/lib/indexer_logger.rb +45 -0
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +0 -1
- data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb +3 -3
- data/lib/picolena/templates/lib/tasks/index.rake +6 -1
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +0 -2
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +4 -0
- data/lib/picolena/templates/spec/models/document_spec.rb +6 -0
- data/lib/picolena/templates/spec/models/finder_spec.rb +0 -1
- data/lib/picolena/templates/spec/models/indexer_spec.rb +9 -0
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +5 -0
- data/lib/picolena/templates/spec/models/query_spec.rb +26 -2
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- data.tar.gz.sig +0 -0
- metadata +3 -2
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
== 0.1.8 2008-05-08
|
2
|
+
|
3
|
+
* 2 minor enhancements:
|
4
|
+
* New IndexerLogger with basic statistics
|
5
|
+
* More specs & documentation.
|
6
|
+
|
7
|
+
* 2 bug fixes:
|
8
|
+
* Binary documents without extension are not considered supported anymore
|
9
|
+
* Ensure that index is locked system-wide by using lock file.
|
10
|
+
|
1
11
|
== 0.1.7 2008-04-30
|
2
12
|
|
3
13
|
* 5 minor enhancements:
|
data/Manifest.txt
CHANGED
@@ -50,6 +50,7 @@ lib/picolena/templates/lang/ui/en.yml
|
|
50
50
|
lib/picolena/templates/lang/ui/es.yml
|
51
51
|
lib/picolena/templates/lang/ui/fr.yml
|
52
52
|
lib/picolena/templates/lib/core_exts.rb
|
53
|
+
lib/picolena/templates/lib/indexer_logger.rb
|
53
54
|
lib/picolena/templates/lib/plain_text_extractor_DSL.rb
|
54
55
|
lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
55
56
|
lib/picolena/templates/lib/plain_text_extractors/html.rb
|
@@ -22,9 +22,9 @@ class DocumentsController < ApplicationController
|
|
22
22
|
def show
|
23
23
|
start=Time.now
|
24
24
|
@query=[params[:id],params.delete(:format)].compact.join('.')
|
25
|
-
@
|
25
|
+
@sort_by=params[:sort_by]
|
26
26
|
page=params[:page]||1
|
27
|
-
finder=Finder.new(@query,@
|
27
|
+
finder=Finder.new(@query,@sort_by,page)
|
28
28
|
finder.execute!
|
29
29
|
pager=::Paginator.new(finder.total_hits, Picolena::ResultsPerPage) do
|
30
30
|
finder.matching_documents
|
@@ -64,7 +64,7 @@ class DocumentsController < ApplicationController
|
|
64
64
|
|
65
65
|
def ensure_index_is_created
|
66
66
|
Indexer.ensure_index_existence
|
67
|
-
while Indexer.
|
67
|
+
while Indexer.locked? do
|
68
68
|
sleep 1
|
69
69
|
end
|
70
70
|
end
|
@@ -6,10 +6,10 @@ module DocumentsHelper
|
|
6
6
|
|
7
7
|
# Very basic pagination.
|
8
8
|
# Provides links to Next, Prev and FirstPage when needed.
|
9
|
-
def should_paginate(page,query,
|
10
|
-
[(link_to("←←", :action => :show, :id => query, :
|
11
|
-
(link_to("←", :action => :show, :id => query, :page => page.prev.number, :
|
12
|
-
(link_to("→", :action => :show, :id => query, :page => page.next.number, :
|
9
|
+
def should_paginate(page,query, sort_by)
|
10
|
+
[(link_to("←←", :action => :show, :id => query, :sort_by=>sort_by) if page.number>2),
|
11
|
+
(link_to("←", :action => :show, :id => query, :page => page.prev.number, :sort_by=>sort_by) if page.prev?),
|
12
|
+
(link_to("→", :action => :show, :id => query, :page => page.next.number, :sort_by=>sort_by) if page.next?)].compact.join(" | ")
|
13
13
|
end
|
14
14
|
|
15
15
|
# Returns a localized sentence like "Results 1-10 of 12 for Zimbabwe (0.472s)" or
|
@@ -81,7 +81,7 @@ module DocumentsHelper
|
|
81
81
|
end
|
82
82
|
|
83
83
|
def sort_by_date_or_relevance(query)
|
84
|
-
[link_to_unless_current('By date', document_path(query, :
|
84
|
+
[link_to_unless_current('By date', document_path(query, :sort_by=>'date')),
|
85
85
|
link_to_unless_current('By relevance', document_path(query))].join(" ")
|
86
86
|
end
|
87
87
|
end
|
@@ -11,10 +11,18 @@ class Document
|
|
11
11
|
end
|
12
12
|
|
13
13
|
#Delegating properties to File::method_name(complete_path)
|
14
|
-
[:dirname, :basename, :extname, :ext_as_sym, :file?, :size, :ext_as_sym].each{|method_name|
|
14
|
+
[:dirname, :basename, :extname, :ext_as_sym, :file?, :plain_text?, :size, :ext_as_sym].each{|method_name|
|
15
15
|
define_method(method_name){File.send(method_name,complete_path)}
|
16
16
|
}
|
17
17
|
alias_method :filename, :basename
|
18
|
+
alias_method :to_s, :complete_path
|
19
|
+
|
20
|
+
|
21
|
+
def inspect
|
22
|
+
[self,("(#{pretty_score})" if @score),("(language:#{language})" if language)].compact.join(" ")
|
23
|
+
end
|
24
|
+
|
25
|
+
|
18
26
|
|
19
27
|
# Returns filename without extension
|
20
28
|
# "buildings.odt" => "buildings"
|
@@ -50,7 +58,7 @@ class Document
|
|
50
58
|
# Document.new("presentation.pdf").supported? => true
|
51
59
|
# Document.new("presentation.some_weird_extension").supported? => false
|
52
60
|
def supported?
|
53
|
-
PlainTextExtractor.supported_extensions.include?(self.ext_as_sym)
|
61
|
+
PlainTextExtractor.supported_extensions.include?(self.ext_as_sym) unless ext_as_sym==:no_extension and !plain_text?
|
54
62
|
end
|
55
63
|
|
56
64
|
# Retrieves content as it is *now*.
|
@@ -91,6 +99,10 @@ class Document
|
|
91
99
|
from_index[:language]
|
92
100
|
end
|
93
101
|
|
102
|
+
def pretty_score
|
103
|
+
"%3.1f%" % (@score*100)
|
104
|
+
end
|
105
|
+
|
94
106
|
# Fields that are shared between every document.
|
95
107
|
def self.default_fields_for(complete_path)
|
96
108
|
{
|
@@ -103,6 +115,7 @@ class Document
|
|
103
115
|
}
|
104
116
|
end
|
105
117
|
|
118
|
+
|
106
119
|
private
|
107
120
|
|
108
121
|
# FIXME: Is there a way to easily retrieve doc_id for a given document?
|
@@ -138,4 +151,4 @@ class Document
|
|
138
151
|
def validate_in_indexed_directory
|
139
152
|
raise ArgumentError, "required document is not in indexed directory" unless in_indexed_directory?
|
140
153
|
end
|
141
|
-
end
|
154
|
+
end
|
@@ -2,23 +2,24 @@ class Finder
|
|
2
2
|
attr_reader :query
|
3
3
|
|
4
4
|
def index
|
5
|
-
@@index ||= Indexer.index
|
5
|
+
@@index ||= Indexer.index
|
6
6
|
end
|
7
7
|
|
8
|
-
def initialize(raw_query,
|
8
|
+
def initialize(raw_query,sort_by='relevance', page=1,results_per_page=Picolena::ResultsPerPage)
|
9
9
|
@query = Query.extract_from(raw_query)
|
10
10
|
@raw_query= raw_query
|
11
11
|
Indexer.ensure_index_existence
|
12
|
+
reload_index! if should_be_reloaded?
|
12
13
|
@per_page=results_per_page
|
13
14
|
@offset=(page.to_i-1)*results_per_page
|
14
|
-
@
|
15
|
+
@sort_by=sort_by
|
15
16
|
index_should_have_documents
|
16
17
|
end
|
17
18
|
|
18
19
|
def execute!
|
19
20
|
@matching_documents=[]
|
20
21
|
start=Time.now
|
21
|
-
@total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @
|
22
|
+
@total_hits = index.search_each(query, :limit => @per_page, :offset=>@offset, :sort => (sort_by_date if @sort_by=='date')){|index_id, score|
|
22
23
|
begin
|
23
24
|
found_doc=Document.new(index[index_id][:complete_path])
|
24
25
|
found_doc.matching_content=index.highlight(query, index_id,
|
@@ -52,11 +53,21 @@ class Finder
|
|
52
53
|
}
|
53
54
|
}
|
54
55
|
|
55
|
-
|
56
|
+
private
|
57
|
+
|
58
|
+
def reload_index!
|
59
|
+
Indexer.close
|
56
60
|
@@index = nil
|
61
|
+
@@last_reload = Time.now
|
57
62
|
end
|
58
63
|
|
59
|
-
|
64
|
+
def should_be_reloaded?
|
65
|
+
Indexer.reload_file_mtime > last_reload
|
66
|
+
end
|
67
|
+
|
68
|
+
def last_reload
|
69
|
+
@@last_reload ||= Time.at(0)
|
70
|
+
end
|
60
71
|
|
61
72
|
def sort_by_date
|
62
73
|
Ferret::Search::SortField.new(:modified, :type => :byte, :reverse => true)
|
@@ -65,4 +76,4 @@ class Finder
|
|
65
76
|
def index_should_have_documents
|
66
77
|
raise IndexError, "no document found" unless index.size > 0
|
67
78
|
end
|
68
|
-
end
|
79
|
+
end
|
@@ -1,63 +1,74 @@
|
|
1
|
+
# Indexer is used to index (duh!) documents contained in IndexedDirectories
|
2
|
+
# It can create, update, delete and prune the index, and takes care that only
|
3
|
+
# one IndexWriter exists at any given time, even when used in a multi-threaded
|
4
|
+
# way.
|
5
|
+
require 'indexer_logger'
|
1
6
|
class Indexer
|
2
7
|
# This regexp defines which files should *not* be indexed.
|
3
8
|
@@exclude = /(Thumbs\.db)/
|
4
9
|
# Number of threads that will be used during indexing process
|
5
10
|
@@threads_number = 8
|
6
|
-
|
7
|
-
cattr_reader :do_not_disturb_while_indexing
|
8
11
|
|
9
12
|
class << self
|
13
|
+
# Finds every document included in IndexedDirectories, parses them with
|
14
|
+
# PlainTextExtractor and adds them to the index.
|
15
|
+
#
|
16
|
+
# Updates the index unless remove_first parameter is set to true, in which
|
17
|
+
# case it removes the index first before re-creating it.
|
10
18
|
def index_every_directory(remove_first=false)
|
11
|
-
@@do_not_disturb_while_indexing=true
|
12
19
|
clear! if remove_first
|
20
|
+
lock!
|
13
21
|
@from_scratch = remove_first
|
14
|
-
|
15
|
-
Finder.reload!
|
16
|
-
log :debug => "Indexing every directory"
|
17
|
-
start=Time.now
|
22
|
+
logger.start_indexing
|
18
23
|
Picolena::IndexedDirectories.each{|dir, alias_dir|
|
19
24
|
index_directory_with_multithreads(dir)
|
20
25
|
}
|
21
|
-
|
26
|
+
logger.debug "Now optimizing index"
|
22
27
|
index.optimize
|
23
|
-
|
24
|
-
|
28
|
+
index_time_dbm_file['last']=Time.now._dump
|
29
|
+
unlock!
|
30
|
+
logger.show_report
|
25
31
|
end
|
26
32
|
|
33
|
+
# Indexes a given directory, using @@threads_number threads.
|
34
|
+
# To do so, it retrieves a list of every included document, cuts it in
|
35
|
+
# @@threads_number chunks, and creates a new indexing thread for every chunk.
|
27
36
|
def index_directory_with_multithreads(dir)
|
28
|
-
|
29
|
-
|
37
|
+
logger.debug "Indexing #{dir}, #{@@threads_number} threads"
|
30
38
|
indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
|
31
39
|
File.file?(filename) && filename !~ @@exclude
|
32
40
|
}
|
33
41
|
|
34
42
|
indexing_list_chunks=indexing_list.in_transposed_slices(@@threads_number)
|
35
|
-
|
36
43
|
prepare_multi_threads_environment
|
37
|
-
|
44
|
+
|
38
45
|
indexing_list_chunks.each_with_thread{|chunk|
|
39
46
|
chunk.each{|complete_path|
|
40
|
-
|
41
|
-
if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
|
47
|
+
if should_index_this_document?(complete_path) then
|
42
48
|
add_or_update_file(complete_path)
|
43
49
|
else
|
44
|
-
|
50
|
+
logger.debug "Identical : #{complete_path}"
|
45
51
|
end
|
46
52
|
index_time_dbm_file[complete_path] = Time.now._dump
|
47
53
|
}
|
48
54
|
}
|
49
55
|
end
|
50
56
|
|
57
|
+
# Retrieves content and language from a given document, and adds it to the index.
|
58
|
+
# Since Document#probably_unique_id is used as index :key, no document will be added
|
59
|
+
# twice to the index, and the old document will just get updated.
|
60
|
+
#
|
61
|
+
# If for some reason (no content found or no defined PlainTextExtractor), content cannot
|
62
|
+
# be found, some basic information about the document (mtime, filename, complete_path)
|
63
|
+
# gets indexed anyway.
|
51
64
|
def add_or_update_file(complete_path)
|
52
|
-
|
65
|
+
document = Document.default_fields_for(complete_path)
|
53
66
|
begin
|
54
|
-
document
|
67
|
+
document.merge! PlainTextExtractor.extract_content_and_language_from(complete_path)
|
55
68
|
raise "empty document #{complete_path}" if document[:content].strip.empty?
|
56
|
-
|
57
|
-
log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join
|
69
|
+
logger.add_document document
|
58
70
|
rescue => e
|
59
|
-
|
60
|
-
document = default_fields
|
71
|
+
logger.reject_document document, e
|
61
72
|
end
|
62
73
|
index << document
|
63
74
|
end
|
@@ -73,11 +84,9 @@ class Indexer
|
|
73
84
|
# ensures that a new Index is instantiated next time index is called.
|
74
85
|
def close
|
75
86
|
@@index.close rescue nil
|
76
|
-
# Ferret will SEGFAULT otherwise.
|
77
87
|
@@index = nil
|
78
88
|
end
|
79
89
|
|
80
|
-
|
81
90
|
# Checks for indexed files that are missing from filesystem
|
82
91
|
# and removes them from index & dbm file.
|
83
92
|
def prune_index
|
@@ -85,7 +94,7 @@ class Indexer
|
|
85
94
|
missing_files.each{|filename, itime|
|
86
95
|
index.writer.delete(:complete_path, filename)
|
87
96
|
index_time_dbm_file.delete(filename)
|
88
|
-
|
97
|
+
logger.debug "Removed : #{filename}"
|
89
98
|
}
|
90
99
|
index.optimize
|
91
100
|
end
|
@@ -97,6 +106,7 @@ class Indexer
|
|
97
106
|
@@index ||= Ferret::Index::Index.new(default_index_params)
|
98
107
|
end
|
99
108
|
|
109
|
+
# Creates the index unless it already exists.
|
100
110
|
def ensure_index_existence
|
101
111
|
index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
|
102
112
|
end
|
@@ -106,11 +116,66 @@ class Indexer
|
|
106
116
|
index.size
|
107
117
|
end
|
108
118
|
|
119
|
+
# Returns the time at which the index was last created/updated.
|
120
|
+
# Returns "none" if it doesn't exist.
|
121
|
+
def last_update
|
122
|
+
Time._load(index_time_dbm_file['last']) rescue "none"
|
123
|
+
end
|
124
|
+
|
125
|
+
# Returns the time at which the reload file was last touched.
|
126
|
+
# Useful to know if other processes have modified the shared index,
|
127
|
+
# and if the Indexer should be reloaded.
|
128
|
+
def reload_file_mtime
|
129
|
+
touch_reload_file! unless File.exists?(reload_file)
|
130
|
+
File.mtime(reload_file)
|
131
|
+
end
|
132
|
+
|
133
|
+
# For a given document, it retrieves the time it was last indexed, compares it to
|
134
|
+
# its modification time and returns false unless the file has been
|
135
|
+
# modified after the last indexing process.
|
136
|
+
def should_index_this_document?(complete_path)
|
137
|
+
last_itime=index_time_dbm_file[complete_path]
|
138
|
+
@from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime)
|
139
|
+
end
|
140
|
+
|
141
|
+
def locked?
|
142
|
+
File.exists?(lock_file)
|
143
|
+
end
|
144
|
+
|
109
145
|
private
|
110
146
|
|
147
|
+
def touch_reload_file!
|
148
|
+
FileUtils.touch(reload_file)
|
149
|
+
# To ensure that every process can touch reload_file, even if Picolena
|
150
|
+
# is launched as a special user.
|
151
|
+
FileUtils.chmod(0666, reload_file)
|
152
|
+
end
|
153
|
+
|
154
|
+
def reload_file
|
155
|
+
File.join(Picolena::MetaIndexPath,'reload')
|
156
|
+
end
|
157
|
+
|
158
|
+
def lock!
|
159
|
+
FileUtils.touch(lock_file)
|
160
|
+
end
|
161
|
+
|
162
|
+
def unlock!
|
163
|
+
FileUtils.rm(lock_file)
|
164
|
+
# Forces Finder.index to be reloaded.
|
165
|
+
touch_reload_file!
|
166
|
+
end
|
167
|
+
|
168
|
+
def lock_file
|
169
|
+
File.join(Picolena::MetaIndexPath,'lock')
|
170
|
+
end
|
171
|
+
|
172
|
+
def logger
|
173
|
+
@@logger ||= IndexerLogger.new
|
174
|
+
end
|
175
|
+
|
111
176
|
# Copied from Ferret book, By David Balmain
|
112
177
|
def index_time_dbm_file
|
113
|
-
@@dbm_file ||= DBM.open(File.join(Picolena::
|
178
|
+
@@dbm_file ||= DBM.open(File.join(Picolena::MetaIndexPath, 'added_at'))
|
114
179
|
end
|
115
180
|
|
116
181
|
def index_exists?
|
@@ -121,12 +186,6 @@ class Indexer
|
|
121
186
|
Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
|
122
187
|
end
|
123
188
|
|
124
|
-
def log(hash)
|
125
|
-
hash.each{|level,message|
|
126
|
-
IndexerLogger.send(level,message)
|
127
|
-
}
|
128
|
-
end
|
129
|
-
|
130
189
|
def default_index_params
|
131
190
|
{
|
132
191
|
:path => Picolena::IndexSavePath,
|
@@ -4,6 +4,11 @@ class Query
|
|
4
4
|
def extract_from(raw_query)
|
5
5
|
parser.parse(convert_to_english(raw_query))
|
6
6
|
end
|
7
|
+
|
8
|
+
# Returns terms related to content. Useful for cache highlighting
|
9
|
+
def content_terms(raw_query)
|
10
|
+
Query.extract_from(raw_query).terms(Indexer.index.searcher).select{|term| term.field==:content}.collect{|term| term.text}.uniq
|
11
|
+
end
|
7
12
|
|
8
13
|
private
|
9
14
|
|
@@ -3,14 +3,15 @@
|
|
3
3
|
=language_icon_for(document)
|
4
4
|
%small=number_to_percentage(document.score*100, :precision=>1)
|
5
5
|
=highlight_matching_content(document)
|
6
|
-
%p
|
7
|
-
|
8
|
-
%
|
6
|
+
%p
|
7
|
+
=link_to_containing_directory(document)
|
8
|
+
%br/
|
9
|
+
-if document.supported?
|
9
10
|
=link_to_plain_text_content(document)
|
10
11
|
-
|
11
|
-
=number_to_human_size(document.size)
|
12
|
-
-
|
13
|
-
=document.pretty_date
|
14
|
-
-
|
15
12
|
=link_to_cached_content(document,query)
|
16
|
-
|
13
|
+
-
|
14
|
+
=number_to_human_size(document.size)
|
15
|
+
-
|
16
|
+
=document.pretty_date
|
17
|
+
%hr/
|
@@ -7,8 +7,6 @@
|
|
7
7
|
# Specifies gem version of Rails to use when vendor/rails is not present
|
8
8
|
RAILS_GEM_VERSION = '2.0.2' unless defined? RAILS_GEM_VERSION
|
9
9
|
|
10
|
-
IndexerLogger=Logger.new($stdout)
|
11
|
-
|
12
10
|
# Bootstrap the Rails environment, frameworks, and default configuration
|
13
11
|
require File.join(File.dirname(__FILE__), 'boot')
|
14
12
|
|
@@ -1,20 +1,3 @@
|
|
1
|
-
class MimeType
|
2
|
-
@@all=[]
|
3
|
-
def self.all
|
4
|
-
@@all
|
5
|
-
end
|
6
|
-
|
7
|
-
def self.add(exts,mime_name)
|
8
|
-
all<<new(exts,mime_name)
|
9
|
-
end
|
10
|
-
|
11
|
-
attr_reader :exts, :name
|
12
|
-
|
13
|
-
def initialize(exts,mime_name)
|
14
|
-
@exts,@name=exts,mime_name
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
1
|
class String
|
19
2
|
# Creates a "probably unique" id with the desired length, composed only of lowercase letters.
|
20
3
|
def base26_hash(length=Picolena::HashLength)
|
@@ -23,6 +6,9 @@ class String
|
|
23
6
|
end
|
24
7
|
|
25
8
|
module Enumerable
|
9
|
+
# Similar to Enumerable#each, but creates a new thread for each element.
|
10
|
+
# Used for the indexer to make it multi-threaded.
|
11
|
+
# It ensures that threads are joined together before returning.
|
26
12
|
def each_with_thread(&block)
|
27
13
|
tds=self.collect{|elem|
|
28
14
|
Thread.new(elem) {|elem|
|
@@ -57,17 +43,31 @@ class Array
|
|
57
43
|
end
|
58
44
|
end
|
59
45
|
|
46
|
+
class Hash
|
47
|
+
def add(category)
|
48
|
+
self[category]||={:size=>0}
|
49
|
+
self[category][:size]+=1
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
60
53
|
class File
|
54
|
+
# Returns the filetype of filename as a symbol.
|
55
|
+
# Returns :no_extension unless an extension is found
|
56
|
+
# >> File.ext_as_sym("test.pdf")
|
57
|
+
# => :pdf
|
58
|
+
# >> File.ext_as_sym("test.tar.gz")
|
59
|
+
# => :gz
|
60
|
+
# >> File.ext_as_sym("test")
|
61
|
+
# => :no_extension
|
61
62
|
def self.ext_as_sym(filename)
|
62
63
|
File.extname(filename).sub(/^\./,'').downcase.to_sym rescue :no_extension
|
63
64
|
end
|
64
65
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
66
|
+
# Returns a probable encoding for a given plain text file
|
67
|
+
# If source is a html file, it parses for metadata to retrieve encoding,
|
68
|
+
# and uses file -i otherwise.
|
69
|
+
# Returns iso-8859-15 instead of iso-8859-1, to be sure € char can be
|
70
|
+
# encoded
|
71
71
|
def self.encoding(source)
|
72
72
|
parse_for_charset="grep -io charset=[a-z0-9\\-]* | sed 's/charset=//i'"
|
73
73
|
if File.extname(source)[0,4]==".htm" then
|
@@ -86,9 +86,18 @@ class File
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
+
# Returns the content of a file and removes it after.
|
90
|
+
# Could be used to read temporary output file written by a PlainTextExtractor.
|
89
91
|
def self.read_and_remove(filename)
|
90
92
|
content=read(filename)
|
91
93
|
FileUtils.rm filename, :force=>true
|
92
94
|
content
|
93
95
|
end
|
96
|
+
|
97
|
+
# Returns nil unless filename is a plain text file.
|
98
|
+
# It requires file command.
|
99
|
+
# NOTE: What to use for Win32?
|
100
|
+
def self.plain_text?(filename)
|
101
|
+
%x{file -i "#{filename}"} =~ /: text\//
|
102
|
+
end
|
94
103
|
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
class IndexerLogger<Logger
|
2
|
+
def initialize
|
3
|
+
super($stdout)
|
4
|
+
#FIXME: Should be defined in config/environments/*.rb
|
5
|
+
levels={
|
6
|
+
"development"=>Logger::DEBUG,
|
7
|
+
"production" =>Logger::INFO,
|
8
|
+
"test" =>Logger::WARN
|
9
|
+
}
|
10
|
+
@level=levels[RAILS_ENV]
|
11
|
+
@found_languages={}
|
12
|
+
@supported_filetypes={}
|
13
|
+
@unsupported_filetypes={}
|
14
|
+
end
|
15
|
+
|
16
|
+
def start_indexing
|
17
|
+
@start_time=Time.now
|
18
|
+
debug "Indexing every directory"
|
19
|
+
end
|
20
|
+
|
21
|
+
def add_document(document)
|
22
|
+
debug ["Added : #{document[:complete_path]}",document[:language] && " ("<<document[:language]<<")"].join
|
23
|
+
@found_languages.add(document[:language]) if document[:language]
|
24
|
+
@supported_filetypes.add(document[:filetype])
|
25
|
+
end
|
26
|
+
|
27
|
+
def reject_document(document, error)
|
28
|
+
@unsupported_filetypes.add(document[:filetype])
|
29
|
+
debug "Added without content (#{error.message}) : #{document[:complete_path]}"
|
30
|
+
end
|
31
|
+
|
32
|
+
def show_report
|
33
|
+
describe :found_languages, :supported_filetypes, :unsupported_filetypes
|
34
|
+
info "Time needed : #{Time.now-@start_time} s."
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def describe(*instance_variable_names)
|
40
|
+
instance_variable_names.each{|var_name|
|
41
|
+
hash=instance_variable_get("@#{var_name}")
|
42
|
+
info var_name.to_s.humanize.ljust(25)<<": "<<hash.reject{|k,v| k.blank?}.sort_by{|k,v| v[:size]}.reverse.collect{|k,v| "#{k.downcase} (#{v[:size]})"}.join(", ") unless hash.empty?
|
43
|
+
}
|
44
|
+
end
|
45
|
+
end
|
@@ -3,12 +3,12 @@ PlainTextExtractor.new {
|
|
3
3
|
as "application/plain"
|
4
4
|
aka "plain text file"
|
5
5
|
with {|source|
|
6
|
+
raise "binary file" unless File.plain_text?(source)
|
6
7
|
encoding=File.encoding(source)
|
7
|
-
#TODO: Return "binary file" if binary
|
8
8
|
if encoding.empty? then
|
9
|
-
|
9
|
+
File.read(source)
|
10
10
|
else
|
11
|
-
|
11
|
+
%x{iconv -f #{encoding} -t utf8 "#{source}" 2>/dev/null}
|
12
12
|
end
|
13
13
|
}
|
14
14
|
# for dependencies spec
|
@@ -25,10 +25,15 @@ namespace :index do
|
|
25
25
|
puts "#{Indexer.size} documents are currently indexed in #{Picolena::IndexSavePath}"
|
26
26
|
end
|
27
27
|
|
28
|
+
desc 'Returns the last time the index was created/update'
|
29
|
+
task :last_update => :environment do
|
30
|
+
puts Indexer.last_update
|
31
|
+
end
|
32
|
+
|
28
33
|
# Search index with query "some query" :
|
29
34
|
# rake index:search query="some query"
|
30
35
|
desc 'Search index'
|
31
36
|
task :search => :environment do
|
32
|
-
Finder.new(ENV["query"]).matching_documents.entries.
|
37
|
+
puts Finder.new(ENV["query"]).matching_documents.entries.collect{|doc| doc.inspect}.join("\n"<<"#"*80<<"\n")
|
33
38
|
end
|
34
39
|
end
|
@@ -1,8 +1,6 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/../spec_helper'
|
2
2
|
|
3
3
|
describe DocumentsHelper do
|
4
|
-
it "shouldn't raise if matching not in content field"
|
5
|
-
|
6
4
|
PlainTextExtractor.supported_extensions.each{|ext|
|
7
5
|
it "should have an icon for .#{ext} filetype" do
|
8
6
|
icon_for(ext).should_not be_nil
|
@@ -7,10 +7,13 @@ describe "Finder without index on disk" do
|
|
7
7
|
@original_indexed_dirs=Picolena::IndexedDirectories.dup
|
8
8
|
@new_index_path=File.join(Dir::tmpdir,'ferret_tst')
|
9
9
|
Picolena::IndexSavePath.replace(@new_index_path)
|
10
|
+
Picolena::MetaIndexPath.replace(File.join(@new_index_path,'meta'))
|
11
|
+
FileUtils.mkpath Picolena::MetaIndexPath
|
10
12
|
end
|
11
13
|
|
12
14
|
before(:each) do
|
13
15
|
Indexer.clear!
|
16
|
+
Finder.send(:class_variable_set,'@@last_reload',nil)
|
14
17
|
end
|
15
18
|
|
16
19
|
it "should create index" do
|
@@ -29,6 +32,7 @@ describe "Finder without index on disk" do
|
|
29
32
|
after(:all) do
|
30
33
|
Picolena::IndexedDirectories.replace(@original_indexed_dirs)
|
31
34
|
Picolena::IndexSavePath.replace(@original_index_path)
|
35
|
+
Picolena::MetaIndexPath.replace(File.join(@original_index_path,'meta'))
|
32
36
|
end
|
33
37
|
end
|
34
38
|
|
@@ -78,6 +78,12 @@ describe Document do
|
|
78
78
|
@valid_document.should be_supported
|
79
79
|
Document.new("spec/test_dirs/indexed/others/ghjopdfg.xyz").should_not be_supported
|
80
80
|
end
|
81
|
+
|
82
|
+
it "should not be considered supported if binary" do
|
83
|
+
Document.new("spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION").should_not be_supported
|
84
|
+
end
|
85
|
+
|
86
|
+
|
81
87
|
|
82
88
|
it "should know its language when enough content is available" do
|
83
89
|
Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
|
@@ -4,4 +4,13 @@ describe Indexer do
|
|
4
4
|
it "should have at least 32MB memory allocated" do
|
5
5
|
Indexer.index.writer.max_buffer_memory.should > 2**25-1
|
6
6
|
end
|
7
|
+
|
8
|
+
it "should know the time it was updated" do
|
9
|
+
Indexer.should respond_to(:last_update)
|
10
|
+
begin
|
11
|
+
Indexer.last_update.should be_kind_of(Time)
|
12
|
+
rescue
|
13
|
+
Indexer.last_update.should == "none"
|
14
|
+
end
|
15
|
+
end
|
7
16
|
end
|
@@ -27,4 +27,9 @@ describe "PlainTextExtractors" do
|
|
27
27
|
end
|
28
28
|
}
|
29
29
|
}
|
30
|
+
|
31
|
+
it "should not extract content of binary files" do
|
32
|
+
bin_file="spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION"
|
33
|
+
lambda{PlainTextExtractor.extract_content_from(bin_file)}.should raise_error(RuntimeError, "binary file")
|
34
|
+
end
|
30
35
|
end
|
@@ -37,7 +37,21 @@ describe Query do
|
|
37
37
|
}
|
38
38
|
end
|
39
39
|
|
40
|
-
it "should accept field terms in different languages"
|
40
|
+
it "should accept field terms in different languages" do
|
41
|
+
Globalite.language = :en
|
42
|
+
english_query_with_french_words = Query.extract_from("absorption language:fr extension:pdf")
|
43
|
+
english_query_with_german_words = Query.extract_from("Unabhängigkeit modified:>2005 filename:job.txt")
|
44
|
+
Globalite.language = :de
|
45
|
+
Query.extract_from("absorption sprache:fr erweiterung:pdf").should == english_query_with_french_words
|
46
|
+
Query.extract_from("Unabhängigkeit geändert:>2005 datei:job.txt").should == english_query_with_german_words
|
47
|
+
Globalite.language = :fr
|
48
|
+
Query.extract_from("absorption langue:fr extension:pdf").should == english_query_with_french_words
|
49
|
+
Query.extract_from("Unabhängigkeit modifié:>2005 fichier:job.txt").should == english_query_with_german_words
|
50
|
+
Globalite.language = :es
|
51
|
+
Query.extract_from("absorption idioma:fr extensión:pdf").should == english_query_with_french_words
|
52
|
+
Query.extract_from("Unabhängigkeit modificado:>2005 archivo:job.txt").should == english_query_with_german_words
|
53
|
+
|
54
|
+
end
|
41
55
|
|
42
56
|
it "should use AND as default boolean ops" do
|
43
57
|
query_without_and = Query.extract_from("one AND two")
|
@@ -62,4 +76,14 @@ describe Query do
|
|
62
76
|
Query.extract_from("test").should == Query.extract_from("tesT")
|
63
77
|
Query.extract_from("test").should_not == Query.extract_from("tesTe")
|
64
78
|
end
|
65
|
-
|
79
|
+
|
80
|
+
it "should be able to extract search terms related to :content" do
|
81
|
+
Query.content_terms("plain text").should == %w(plain text)
|
82
|
+
Query.content_terms("plain text extension:pdf").should == %w(plain text)
|
83
|
+
Query.content_terms("plain AND text").should == %w(plain text)
|
84
|
+
Query.content_terms("absorption OR adsorption").should ==%w(absorption adsorption)
|
85
|
+
Query.content_terms("filename:plain_text").should be_empty
|
86
|
+
Globalite.language = :en
|
87
|
+
Query.content_terms("LIKE absorption").include?("adsorption").should be_true
|
88
|
+
end
|
89
|
+
end
|
data/lib/picolena/version.rb
CHANGED
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.8</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-
|
33
|
+
date: 2008-05-08 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -185,6 +185,7 @@ files:
|
|
185
185
|
- lib/picolena/templates/lang/ui/es.yml
|
186
186
|
- lib/picolena/templates/lang/ui/fr.yml
|
187
187
|
- lib/picolena/templates/lib/core_exts.rb
|
188
|
+
- lib/picolena/templates/lib/indexer_logger.rb
|
188
189
|
- lib/picolena/templates/lib/plain_text_extractor_DSL.rb
|
189
190
|
- lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
|
190
191
|
- lib/picolena/templates/lib/plain_text_extractors/html.rb
|
metadata.gz.sig
CHANGED
Binary file
|