picolena 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +11 -0
- data/Manifest.txt +16 -4
- data/lib/picolena/picolena_generator.rb +0 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +4 -0
- data/lib/picolena/templates/app/models/document.rb +21 -12
- data/lib/picolena/templates/app/models/finder.rb +38 -18
- data/lib/picolena/templates/app/models/indexer.rb +69 -89
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +7 -7
- data/lib/picolena/templates/app/models/query.rb +4 -2
- data/lib/picolena/templates/app/views/documents/_document.html.haml +1 -0
- data/lib/picolena/templates/lang/ui/de.yml +3 -1
- data/lib/picolena/templates/lang/ui/en.yml +3 -1
- data/lib/picolena/templates/lang/ui/es.yml +3 -1
- data/lib/picolena/templates/lang/ui/fr.yml +3 -1
- data/lib/picolena/templates/lib/tasks/index.rake +3 -3
- data/lib/picolena/templates/public/images/flags/ar.png +0 -0
- data/lib/picolena/templates/public/images/flags/be.png +0 -0
- data/lib/picolena/templates/public/images/flags/ca.png +0 -0
- data/lib/picolena/templates/public/images/flags/de.png +0 -0
- data/lib/picolena/templates/public/images/flags/el.png +0 -0
- data/lib/picolena/templates/public/images/flags/en.png +0 -0
- data/lib/picolena/templates/public/images/flags/es.png +0 -0
- data/lib/picolena/templates/public/images/flags/fr.png +0 -0
- data/lib/picolena/templates/public/images/flags/ga.png +0 -0
- data/lib/picolena/templates/public/images/flags/hr.png +0 -0
- data/lib/picolena/templates/public/images/flags/it.png +0 -0
- data/lib/picolena/templates/public/images/flags/nl.png +0 -0
- data/lib/picolena/templates/public/images/flags/pl.png +0 -0
- data/lib/picolena/templates/public/images/flags/pt-br.png +0 -0
- data/lib/picolena/templates/public/images/flags/pt-pt.png +0 -0
- data/lib/picolena/templates/public/images/flags/readme.txt +9 -0
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -10
- data/lib/picolena/templates/spec/models/finder_spec.rb +5 -5
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +7 -7
- data/lib/picolena/version.rb +1 -1
- data/tasks/hack.rake +2 -2
- data/website/index.html +1 -1
- data.tar.gz.sig +1 -2
- metadata +19 -6
- metadata.gz.sig +0 -0
- data/lib/picolena/templates/app/models/index_reader.rb +0 -54
- data/lib/picolena/templates/app/models/index_writer.rb +0 -33
- data/lib/picolena/templates/spec/models/index_reader_spec.rb +0 -7
- data/lib/picolena/templates/spec/models/index_writer_spec.rb +0 -7
data/History.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
== 0.1.5 2008-04-25
|
2
|
+
|
3
|
+
* 1 major enhancement:
|
4
|
+
* yet another Indexer & Index rewrite
|
5
|
+
|
6
|
+
* 1 minor enhancement:
|
7
|
+
* flags to indicate found language
|
8
|
+
|
9
|
+
* bug fixes:
|
10
|
+
* No more (or just less?) index lock errors
|
11
|
+
|
1
12
|
== 0.1.4 2008-04-23
|
2
13
|
* 1 minor enhancement:
|
3
14
|
* minimal MacOS support
|
data/Manifest.txt
CHANGED
@@ -22,8 +22,6 @@ lib/picolena/templates/app/helpers/application_helper.rb
|
|
22
22
|
lib/picolena/templates/app/helpers/documents_helper.rb
|
23
23
|
lib/picolena/templates/app/models/document.rb
|
24
24
|
lib/picolena/templates/app/models/finder.rb
|
25
|
-
lib/picolena/templates/app/models/index_reader.rb
|
26
|
-
lib/picolena/templates/app/models/index_writer.rb
|
27
25
|
lib/picolena/templates/app/models/indexer.rb
|
28
26
|
lib/picolena/templates/app/models/plain_text_extractor.rb
|
29
27
|
lib/picolena/templates/app/models/query.rb
|
@@ -75,6 +73,22 @@ lib/picolena/templates/public/favicon.ico
|
|
75
73
|
lib/picolena/templates/public/help/PicolenaHowTo-de.pdf
|
76
74
|
lib/picolena/templates/public/help/PicolenaHowTo-de.tex
|
77
75
|
lib/picolena/templates/public/images/bg.gif
|
76
|
+
lib/picolena/templates/public/images/flags/ar.png
|
77
|
+
lib/picolena/templates/public/images/flags/be.png
|
78
|
+
lib/picolena/templates/public/images/flags/ca.png
|
79
|
+
lib/picolena/templates/public/images/flags/de.png
|
80
|
+
lib/picolena/templates/public/images/flags/el.png
|
81
|
+
lib/picolena/templates/public/images/flags/en.png
|
82
|
+
lib/picolena/templates/public/images/flags/es.png
|
83
|
+
lib/picolena/templates/public/images/flags/fr.png
|
84
|
+
lib/picolena/templates/public/images/flags/ga.png
|
85
|
+
lib/picolena/templates/public/images/flags/hr.png
|
86
|
+
lib/picolena/templates/public/images/flags/it.png
|
87
|
+
lib/picolena/templates/public/images/flags/nl.png
|
88
|
+
lib/picolena/templates/public/images/flags/pl.png
|
89
|
+
lib/picolena/templates/public/images/flags/pt-br.png
|
90
|
+
lib/picolena/templates/public/images/flags/pt-pt.png
|
91
|
+
lib/picolena/templates/public/images/flags/readme.txt
|
78
92
|
lib/picolena/templates/public/images/icons/cad.png
|
79
93
|
lib/picolena/templates/public/images/icons/code.png
|
80
94
|
lib/picolena/templates/public/images/icons/doc.png
|
@@ -120,8 +134,6 @@ lib/picolena/templates/spec/models/document_spec.rb
|
|
120
134
|
lib/picolena/templates/spec/models/finder_spec.rb
|
121
135
|
lib/picolena/templates/spec/models/host_indexing_system_spec.rb
|
122
136
|
lib/picolena/templates/spec/models/index_directories_spec.rb
|
123
|
-
lib/picolena/templates/spec/models/index_reader_spec.rb
|
124
|
-
lib/picolena/templates/spec/models/index_writer_spec.rb
|
125
137
|
lib/picolena/templates/spec/models/indexer_spec.rb
|
126
138
|
lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
|
127
139
|
lib/picolena/templates/spec/models/query_spec.rb
|
@@ -37,6 +37,10 @@ module DocumentsHelper
|
|
37
37
|
}) if document.matching_content
|
38
38
|
end
|
39
39
|
|
40
|
+
def language_icon_for(document)
|
41
|
+
(lang=document.language) && image_tag("flags/#{lang}.png")
|
42
|
+
end
|
43
|
+
|
40
44
|
# Returns icon and filename for any given document.
|
41
45
|
def icon_and_filename_for(document)
|
42
46
|
[icon_for(document.extname),document.filename].join(" ")
|
@@ -71,21 +71,33 @@ class Document
|
|
71
71
|
# Returns the last modification date before the document got indexed.
|
72
72
|
# Useful to know how old a document is, and to which version the cache corresponds.
|
73
73
|
def date
|
74
|
-
from_index[:
|
74
|
+
from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
|
75
75
|
end
|
76
76
|
|
77
77
|
def mtime
|
78
|
-
from_index[:date].to_i
|
78
|
+
from_index[:modified].to_i
|
79
79
|
end
|
80
80
|
|
81
81
|
# Returns language.
|
82
|
-
def
|
83
|
-
from_index[:lang]
|
82
|
+
def language
|
83
|
+
from_index[:language]
|
84
84
|
end
|
85
85
|
|
86
86
|
# Returns the id with which the document is indexed.
|
87
87
|
def index_id
|
88
|
-
@index_id ||=
|
88
|
+
@index_id ||= Finder.term_search(:complete_path, complete_path).doc
|
89
|
+
end
|
90
|
+
|
91
|
+
# Fields that are shared between every document.
|
92
|
+
def self.default_fields_for(complete_path)
|
93
|
+
{
|
94
|
+
:complete_path => complete_path,
|
95
|
+
:probably_unique_id => complete_path.base26_hash,
|
96
|
+
:filename => File.basename(complete_path),
|
97
|
+
:basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '),
|
98
|
+
:filetype => File.extname(complete_path),
|
99
|
+
:modified => File.mtime(complete_path).strftime("%Y%m%d%H%M%S")
|
100
|
+
}
|
89
101
|
end
|
90
102
|
|
91
103
|
private
|
@@ -93,17 +105,14 @@ class Document
|
|
93
105
|
# Retrieves the document from the index.
|
94
106
|
# Useful to get meta-info about it.
|
95
107
|
def from_index
|
96
|
-
|
108
|
+
Indexer.index[index_id]
|
97
109
|
end
|
98
110
|
|
99
111
|
def self.find_by_unique_id(some_id)
|
100
|
-
Finder.
|
112
|
+
doc_id=Finder.term_search(:probably_unique_id, some_id).doc
|
113
|
+
new(Indexer.index[doc_id][:complete_path])
|
101
114
|
end
|
102
|
-
|
103
|
-
def self.find_by_complete_path(complete_path)
|
104
|
-
Finder.new('complete_path:"'<<complete_path<<'"').matching_document
|
105
|
-
end
|
106
|
-
|
115
|
+
|
107
116
|
def in_indexed_directory?
|
108
117
|
!indexed_directory.nil?
|
109
118
|
end
|
@@ -2,18 +2,16 @@ class Finder
|
|
2
2
|
attr_reader :query
|
3
3
|
|
4
4
|
def index
|
5
|
-
|
6
|
-
# causes ferret-0.11.6/lib/ferret/index.rb:768: [BUG] Segmentation fault
|
7
|
-
IndexReader.new
|
5
|
+
@@index ||= Indexer.index
|
8
6
|
end
|
9
7
|
|
10
8
|
def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
|
11
9
|
@query = Query.extract_from(raw_query)
|
12
10
|
@raw_query= raw_query
|
13
|
-
|
11
|
+
Indexer.ensure_index_existence
|
14
12
|
@per_page=results_per_page
|
15
13
|
@offset=(page.to_i-1)*results_per_page
|
16
|
-
|
14
|
+
index_should_have_documents
|
17
15
|
end
|
18
16
|
|
19
17
|
def execute!
|
@@ -31,9 +29,9 @@ class Finder
|
|
31
29
|
found_doc.score=score
|
32
30
|
found_doc.index_id=index_id
|
33
31
|
@matching_documents<<found_doc
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
rescue Errno::ENOENT
|
33
|
+
#"File has been moved/deleted!"
|
34
|
+
end
|
37
35
|
}
|
38
36
|
@executed=true
|
39
37
|
@time_needed=Time.now-start
|
@@ -60,14 +58,36 @@ class Finder
|
|
60
58
|
# Returns matching document for any given query only if
|
61
59
|
# exactly one document is found.
|
62
60
|
# Raises otherwise.
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
61
|
+
def matching_document
|
62
|
+
case matching_documents.size
|
63
|
+
when 0
|
64
|
+
raise IndexError, "No document found"
|
65
|
+
when 1
|
66
|
+
matching_documents.first
|
67
|
+
else
|
68
|
+
raise IndexError, "More than one document found"
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
class<<self
|
73
|
+
def searcher
|
74
|
+
@@searcher ||= Ferret::Search::Searcher.new(Picolena::IndexSavePath)
|
75
|
+
end
|
76
|
+
|
77
|
+
def term_search(field,term)
|
78
|
+
query = Ferret::Search::TermQuery.new(field,term)
|
79
|
+
searcher.search(query).hits.first
|
80
|
+
end
|
81
|
+
|
82
|
+
def reload!
|
83
|
+
@@searcher = nil
|
84
|
+
@@index = nil
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
def index_should_have_documents
|
91
|
+
raise IndexError, "no document found" unless index.size > 0
|
92
|
+
end
|
73
93
|
end
|
@@ -5,54 +5,22 @@ class Indexer
|
|
5
5
|
@@max_threads_number = 8
|
6
6
|
|
7
7
|
class << self
|
8
|
-
def
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
:file => File.basename(complete_path),
|
13
|
-
:basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '),
|
14
|
-
:filetype => File.extname(complete_path),
|
15
|
-
:date => File.mtime(complete_path).strftime("%Y%m%d%H%M%S")
|
16
|
-
}
|
17
|
-
end
|
18
|
-
|
19
|
-
def index_every_directory(update=true)
|
8
|
+
def index_every_directory(remove_first=false)
|
9
|
+
clear! if remove_first
|
10
|
+
# Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
|
11
|
+
Finder.reload!
|
20
12
|
log :debug => "Indexing every directory"
|
21
|
-
|
22
|
-
|
23
13
|
start=Time.now
|
24
|
-
@update = update
|
25
|
-
reset! unless update
|
26
|
-
|
27
14
|
Picolena::IndexedDirectories.each{|dir, alias_dir|
|
28
15
|
index_directory_with_multithreads(dir)
|
29
16
|
}
|
30
|
-
|
17
|
+
log :debug => "Now optimizing index"
|
31
18
|
writer.optimize
|
32
|
-
writer.close
|
33
|
-
# launching Indexer.index_every_directory twice in a row
|
34
|
-
# would raise a SEGFAULT:
|
35
|
-
# picolena/lib/picolena/templates/app/models/indexer.rb:27: [BUG] Segmentation fault
|
36
|
-
# ruby 1.8.6 (2007-06-07) [i486-linux]
|
37
|
-
#
|
38
|
-
# Aborted (core dumped)
|
39
|
-
#
|
40
|
-
# But without those 2 lines, specs don't pass anymore.
|
41
|
-
#
|
42
19
|
log :debug => "Indexing done in #{Time.now-start} s."
|
43
20
|
end
|
44
21
|
|
45
22
|
def index_directory_with_multithreads(dir)
|
46
|
-
|
47
|
-
# indexer raises:
|
48
|
-
#
|
49
|
-
# current thread not owner
|
50
|
-
# /usr/lib/ruby/1.8/monitor.rb:278:in `mon_check_owner'
|
51
|
-
# /home/www/picolena/lib/picolena/templates/lib/core_exts.rb:32:in `join'
|
52
|
-
# ...
|
53
|
-
#
|
54
|
-
# So Index creation is multithreaded, Index update is monothreaded.
|
55
|
-
threads_number = @update ? 1 : @@max_threads_number
|
23
|
+
threads_number = @@max_threads_number
|
56
24
|
log :debug => "Indexing #{dir}, #{threads_number} thread(s)"
|
57
25
|
|
58
26
|
indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
|
@@ -61,82 +29,94 @@ class Indexer
|
|
61
29
|
|
62
30
|
indexing_list_chunks=indexing_list.in_transposed_slices(threads_number)
|
63
31
|
|
32
|
+
# It initializes an IndexWriter before launching multithreaded
|
33
|
+
# indexing. Otherwise, two threads could try to instantiate
|
34
|
+
# an IndexWriter at the same time, and get a
|
35
|
+
# Ferret::Store::Lock::LockError
|
36
|
+
writer
|
37
|
+
|
64
38
|
indexing_list_chunks.each_with_thread{|chunk|
|
65
39
|
chunk.each{|filename|
|
66
|
-
|
40
|
+
add_file(filename)
|
67
41
|
}
|
68
42
|
}
|
69
43
|
end
|
70
44
|
|
71
|
-
def add_or_update_file(complete_path)
|
72
|
-
should_be_added = true
|
73
|
-
if @update then
|
74
|
-
log :debug => "What to do with #{complete_path} ?"
|
75
|
-
occurences = reader.occurences_number(complete_path)
|
76
|
-
log :debug => "\tappears #{occurences} times in the index"
|
77
|
-
case occurences
|
78
|
-
when 0
|
79
|
-
#Nothing to do here, the file will be added.
|
80
|
-
when 1
|
81
|
-
d=Document.find_by_complete_path(complete_path)
|
82
|
-
if File.mtime(complete_path).strftime("%Y%m%d%H%M%S").to_i > d.mtime then
|
83
|
-
log :debug => "\thas been modified"
|
84
|
-
delete_file(complete_path)
|
85
|
-
else
|
86
|
-
should_be_added = false
|
87
|
-
log :debug => "\thas not been modified. leaving it"
|
88
|
-
end
|
89
|
-
else
|
90
|
-
delete_file(complete_path)
|
91
|
-
end
|
92
|
-
end
|
93
|
-
add_file(complete_path) if should_be_added
|
94
|
-
end
|
95
|
-
|
96
45
|
def add_file(complete_path)
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
raise "\tempty document #{complete_path}" if text.strip.empty?
|
104
|
-
fields[:content] = text
|
105
|
-
log :debug => "language found: #{lang}" if lang
|
106
|
-
fields[:lang] = lang
|
46
|
+
default_fields = Document.default_fields_for(complete_path)
|
47
|
+
begin
|
48
|
+
document = PlainTextExtractor.extract_content_and_language_from(complete_path)
|
49
|
+
raise "empty document #{complete_path}" if document[:content].strip.empty?
|
50
|
+
document.merge! default_fields
|
51
|
+
log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join
|
107
52
|
rescue => e
|
108
53
|
log :debug => "\tindexing without content: #{e.message}"
|
54
|
+
document = default_fields
|
109
55
|
end
|
110
|
-
|
111
|
-
writer << fields
|
56
|
+
writer << document
|
112
57
|
end
|
113
58
|
|
114
|
-
|
115
|
-
|
59
|
+
# Ensures writer is closed, and removes every index file for RAILS_ENV.
|
60
|
+
def clear!(all=false)
|
61
|
+
close
|
62
|
+
to_remove=all ? Picolena::IndexesSavePath : Picolena::IndexSavePath
|
63
|
+
Dir.glob(File.join(to_remove,'**/*')).each{|f| FileUtils.rm(f) if File.file?(f)}
|
116
64
|
end
|
117
65
|
|
118
|
-
|
119
|
-
|
66
|
+
# Closes the writer and
|
67
|
+
# ensures that a new IndexWriter is instantiated next time writer is called.
|
68
|
+
def close
|
69
|
+
@@writer.close rescue nil
|
70
|
+
# Ferret will SEGFAULT otherwise.
|
71
|
+
@@writer = nil
|
120
72
|
end
|
121
73
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
IndexWriter.
|
74
|
+
# Only one IndexWriter should be instantiated.
|
75
|
+
# If one already exists, returns it.
|
76
|
+
# Creates it otherwise.
|
77
|
+
def writer
|
78
|
+
@@writer ||= Ferret::Index::IndexWriter.new(default_index_params)
|
127
79
|
end
|
128
80
|
|
129
|
-
def
|
130
|
-
|
131
|
-
|
81
|
+
def index
|
82
|
+
Ferret::Index::Index.new(default_index_params)
|
83
|
+
end
|
84
|
+
|
85
|
+
def ensure_index_existence
|
86
|
+
index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
|
132
87
|
end
|
133
88
|
|
134
89
|
private
|
135
90
|
|
91
|
+
def index_exists?
|
92
|
+
index_filename and File.exists?(index_filename)
|
93
|
+
end
|
94
|
+
|
95
|
+
def index_filename
|
96
|
+
Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
|
97
|
+
end
|
98
|
+
|
136
99
|
def log(hash)
|
137
100
|
hash.each{|level,message|
|
138
101
|
IndexerLogger.send(level,message)
|
139
102
|
}
|
140
|
-
end
|
103
|
+
end
|
104
|
+
|
105
|
+
def default_index_params
|
106
|
+
{:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer, :field_infos => default_field_infos}
|
107
|
+
end
|
108
|
+
|
109
|
+
def default_field_infos
|
110
|
+
returning Ferret::Index::FieldInfos.new do |field_infos|
|
111
|
+
field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized)
|
112
|
+
field_infos.add_field(:content, :store => :yes, :index => :yes)
|
113
|
+
field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
|
114
|
+
field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5)
|
115
|
+
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
116
|
+
field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
|
117
|
+
field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes)
|
118
|
+
field_infos.add_field(:language, :store => :yes, :index => :yes)
|
119
|
+
end
|
120
|
+
end
|
141
121
|
end
|
142
122
|
end
|
@@ -109,12 +109,12 @@ class PlainTextExtractor
|
|
109
109
|
# and if probability score is higher than 90%.
|
110
110
|
def extract_content_and_language
|
111
111
|
content=extract_content
|
112
|
-
return
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
112
|
+
return {:content => content} unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
|
113
|
+
Picolena::UseLanguageRecognition,
|
114
|
+
# Is a language guesser already installed?
|
115
|
+
PlainTextExtractor.language_guesser,
|
116
|
+
# Language recognition is too unreliable for small files.
|
117
|
+
content.size > 500].all?
|
118
118
|
language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
|
119
119
|
lang_guesser.write content
|
120
120
|
lang_guesser.close_write
|
@@ -125,6 +125,6 @@ class PlainTextExtractor
|
|
125
125
|
lang unless score<0.9
|
126
126
|
end
|
127
127
|
}
|
128
|
-
|
128
|
+
{:content => content, :language => language}
|
129
129
|
end
|
130
130
|
end
|
@@ -13,9 +13,11 @@ class Query
|
|
13
13
|
/\b#{:AND.l}\b/=>'AND',
|
14
14
|
/\b#{:OR.l}\b/=>'OR',
|
15
15
|
/\b#{:NOT.l}\b/=>'NOT',
|
16
|
+
/(#{:filename.l}):/=>'filename:',
|
16
17
|
/(#{:filetype.l}):/=>'filetype:',
|
17
18
|
/#{:content.l}:/ => 'content:',
|
18
|
-
|
19
|
+
/(#{:modified.l}):/ => 'modified:',
|
20
|
+
/(#{:language.l}):/ => 'language:',
|
19
21
|
/\b#{:LIKE.l}\s+(\S+)/=>'\1~'
|
20
22
|
}
|
21
23
|
to_en.inject(raw_query){|mem,non_english_to_english_keyword|
|
@@ -25,7 +27,7 @@ class Query
|
|
25
27
|
|
26
28
|
# Instantiates a QueryParser once, and keeps it in cache.
|
27
29
|
def parser
|
28
|
-
@@parser ||= Ferret::QueryParser.new(:fields => [:content, :
|
30
|
+
@@parser ||= Ferret::QueryParser.new(:fields => [:content, :filename, :basename, :filetype, :modified], :or_default => false, :analyzer=>Picolena::Analyzer)
|
29
31
|
end
|
30
32
|
end
|
31
33
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
%h2
|
2
2
|
=link_to icon_and_filename_for(document), download_document_path(document.probably_unique_id)
|
3
|
+
=language_icon_for(document)
|
3
4
|
%small=number_to_percentage(document.score*100, :precision=>1)
|
4
5
|
=highlight_matching_content(document)
|
5
6
|
%p=link_to_containing_directory(document)
|
@@ -2,17 +2,17 @@ desc 'Ferret index maintenance tasks'
|
|
2
2
|
namespace :index do
|
3
3
|
desc 'Clear indexes'
|
4
4
|
task :clear => :environment do
|
5
|
-
|
5
|
+
Indexer.clear! :all
|
6
6
|
end
|
7
7
|
|
8
8
|
desc 'Create index'
|
9
9
|
task :create => :environment do
|
10
|
-
Indexer.index_every_directory(update=false)
|
10
|
+
Indexer.index_every_directory(remove_first=true)
|
11
11
|
end
|
12
12
|
|
13
13
|
desc 'Update index'
|
14
14
|
task :update => :environment do
|
15
|
-
Indexer.index_every_directory
|
15
|
+
Indexer.index_every_directory
|
16
16
|
end
|
17
17
|
|
18
18
|
# Search index with query "some query" :
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
Flag icons - http://www.famfamfam.com
|
2
|
+
|
3
|
+
These icons are public domain, and as such are free for any use (attribution appreciated but not required).
|
4
|
+
|
5
|
+
Note that these flags are named using the ISO3166-1 alpha-2 country codes where appropriate. A list of codes can be found at http://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
|
6
|
+
|
7
|
+
If you find these icons useful, please donate via paypal to mjames@gmail.com (or click the donate button available at http://www.famfamfam.com/lab/icons/silk)
|
8
|
+
|
9
|
+
Contact: mjames@gmail.com
|
@@ -10,14 +10,14 @@ describe "Finder without index on disk" do
|
|
10
10
|
end
|
11
11
|
|
12
12
|
before(:each) do
|
13
|
-
|
13
|
+
Indexer.clear!
|
14
14
|
end
|
15
15
|
|
16
16
|
it "should create index" do
|
17
17
|
Picolena::IndexedDirectories.replace({'spec/test_dirs/indexed/just_one_doc'=>'//justonedoc/'})
|
18
|
-
lambda {@finder_with_new_index=Finder.new("test moi")}.should change(
|
18
|
+
lambda {@finder_with_new_index=Finder.new("test moi")}.should change(Indexer, :index_exists?).from(false).to(true)
|
19
19
|
File.exists?(File.join(@new_index_path,'_0.cfs')).should be_true
|
20
|
-
|
20
|
+
Indexer.index.size.should >0
|
21
21
|
end
|
22
22
|
|
23
23
|
it "should raise if index is still empty after trying to create it" do
|
@@ -35,16 +35,19 @@ end
|
|
35
35
|
|
36
36
|
fields={
|
37
37
|
# description => key
|
38
|
-
:content
|
39
|
-
:
|
40
|
-
:
|
41
|
-
:
|
42
|
-
:
|
38
|
+
:content => :content,
|
39
|
+
:complete_path => :complete_path,
|
40
|
+
:basename => :basename,
|
41
|
+
:filename => :filename,
|
42
|
+
:extension => :filetype,
|
43
|
+
:modification_time => :modified,
|
44
|
+
:probably_unique_id => :probably_unique_id,
|
45
|
+
:language => :language
|
43
46
|
}
|
44
47
|
|
45
48
|
describe "Basic Finder" do
|
46
49
|
before(:all) do
|
47
|
-
Indexer.index_every_directory(
|
50
|
+
Indexer.index_every_directory(remove_first=true)
|
48
51
|
end
|
49
52
|
|
50
53
|
it "should accept one parameter as query, and 2 optionals for paginating" do
|
@@ -82,7 +85,7 @@ describe "Basic Finder" do
|
|
82
85
|
|
83
86
|
fields.each_pair do |description,field_name|
|
84
87
|
it "should index #{description} as :#{field_name}" do
|
85
|
-
|
88
|
+
Indexer.index.field_infos[field_name].should be_an_instance_of(Ferret::Index::FieldInfo)
|
86
89
|
end
|
87
90
|
end
|
88
91
|
|
@@ -21,7 +21,7 @@ describe Finder do
|
|
21
21
|
File.utime(0, once_upon_a_time, 'spec/test_dirs/indexed/basic/basic.pdf')
|
22
22
|
File.utime(0, a_bit_later, 'spec/test_dirs/indexed/yet_another_dir/office2003-word-template.dot')
|
23
23
|
File.utime(0, nineties, 'spec/test_dirs/indexed/others/placeholder.txt')
|
24
|
-
Indexer.index_every_directory(
|
24
|
+
Indexer.index_every_directory(remove_first=true)
|
25
25
|
end
|
26
26
|
|
27
27
|
it "should find documents according to their basename when specified with basename:query" do
|
@@ -30,8 +30,8 @@ describe Finder do
|
|
30
30
|
matching_documents_filename.should include("crossed.text")
|
31
31
|
end
|
32
32
|
|
33
|
-
it "should find documents according to their filename when specified with file:query" do
|
34
|
-
Finder.new("
|
33
|
+
it "should find documents according to their filename when specified with file:query or filename:query" do
|
34
|
+
Finder.new("filename:crossed.text").matching_documents.collect{|d| d.content}.should include("txt inside!")
|
35
35
|
Finder.new("file:crossed.txt").matching_documents.collect{|d| d.content}.should include("text inside!")
|
36
36
|
end
|
37
37
|
|
@@ -47,9 +47,9 @@ describe Finder do
|
|
47
47
|
end
|
48
48
|
|
49
49
|
it "should give a boost to basename, filename and filetype in index" do
|
50
|
-
index=
|
50
|
+
index=Indexer.index
|
51
51
|
index.field_infos[:basename].boost.should > 1.0
|
52
|
-
index.field_infos[:
|
52
|
+
index.field_infos[:filename].boost.should > 1.0
|
53
53
|
index.field_infos[:filetype].boost.should > 1.0
|
54
54
|
end
|
55
55
|
|
@@ -2,7 +2,7 @@ require File.dirname(__FILE__) + '/../spec_helper'
|
|
2
2
|
|
3
3
|
describe "PlainTextExtractors" do
|
4
4
|
before(:all) do
|
5
|
-
|
5
|
+
Indexer.ensure_index_existence
|
6
6
|
end
|
7
7
|
|
8
8
|
PlainTextExtractor.all.each{|extractor|
|
@@ -29,14 +29,14 @@ describe "PlainTextExtractors" do
|
|
29
29
|
}
|
30
30
|
|
31
31
|
it "should guess language when enough content is available" do
|
32
|
-
Document.new("spec/test_dirs/indexed/lang/goethe").
|
33
|
-
Document.new("spec/test_dirs/indexed/lang/shakespeare").
|
34
|
-
Document.new("spec/test_dirs/indexed/lang/lorca").
|
35
|
-
Document.new("spec/test_dirs/indexed/lang/hugo").
|
32
|
+
Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
|
33
|
+
Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
|
34
|
+
Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
|
35
|
+
Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
|
36
36
|
end
|
37
37
|
|
38
38
|
it "should not try to guess language when file is too small" do
|
39
|
-
Document.new("spec/test_dirs/indexed/basic/hello.rb").
|
40
|
-
Document.new("spec/test_dirs/indexed/README").
|
39
|
+
Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
|
40
|
+
Document.new("spec/test_dirs/indexed/README").language.should be_nil
|
41
41
|
end
|
42
42
|
end
|
data/lib/picolena/version.rb
CHANGED
data/tasks/hack.rake
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
desc 'Create development picolena structure inside lib/picolena/templates'
|
2
|
-
task :lets_hack do
|
2
|
+
task :lets_hack => :clean do
|
3
3
|
picolena_root=File.join(File.dirname(__FILE__),'..')
|
4
4
|
Dir.chdir(picolena_root){
|
5
5
|
# Doesn't overwrite any file, Doesn't create any index, Doesn't launch any spec.
|
6
|
-
system("ruby bin/picolena lib/picolena/templates/spec/test_dirs/indexed --
|
6
|
+
system("ruby bin/picolena lib/picolena/templates/spec/test_dirs/indexed --no-index --no-spec --destination=lib/picolena/templates")
|
7
7
|
}
|
8
8
|
puts <<-EXPLAIN
|
9
9
|
|
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.4</a>
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.5</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
data.tar.gz.sig
CHANGED
@@ -1,2 +1 @@
|
|
1
|
-
|
2
|
-
��V���[;#̧KM���$�;=X�~�>���� wYI7��3ksv��A߶� ��0�GZTi7$�����>@
|
1
|
+
B�8Ǣ�����ԝ�ŗFA�sέ�%l�ѵ�Aw�k>�6�w���|ĝW^9>]���k��i����I٤�e�Z7٭Px���UK��+r�>P��al�<�T+eL@�HD�!�@��X�nV鐎wa<��b臋�g����,q���m�{i��2����#�m�=�܈ϲH'W�má؝=cm��ݔ�^��㩫҃L=�ˁ�"�r�L�{7�{�R'4�������k����hkx�����=��6�j
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-04-23 00:00:00 +02:00
|
33
|
+
date: 2008-04-25 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -107,6 +107,7 @@ extra_rdoc_files:
|
|
107
107
|
- History.txt
|
108
108
|
- Manifest.txt
|
109
109
|
- README.txt
|
110
|
+
- lib/picolena/templates/public/images/flags/readme.txt
|
110
111
|
- lib/picolena/templates/public/robots.txt
|
111
112
|
- lib/picolena/templates/spec/test_dirs/indexed/basic/crossed.txt
|
112
113
|
- lib/picolena/templates/spec/test_dirs/indexed/basic/plain.txt
|
@@ -156,8 +157,6 @@ files:
|
|
156
157
|
- lib/picolena/templates/app/helpers/documents_helper.rb
|
157
158
|
- lib/picolena/templates/app/models/document.rb
|
158
159
|
- lib/picolena/templates/app/models/finder.rb
|
159
|
-
- lib/picolena/templates/app/models/index_reader.rb
|
160
|
-
- lib/picolena/templates/app/models/index_writer.rb
|
161
160
|
- lib/picolena/templates/app/models/indexer.rb
|
162
161
|
- lib/picolena/templates/app/models/plain_text_extractor.rb
|
163
162
|
- lib/picolena/templates/app/models/query.rb
|
@@ -209,6 +208,22 @@ files:
|
|
209
208
|
- lib/picolena/templates/public/help/PicolenaHowTo-de.pdf
|
210
209
|
- lib/picolena/templates/public/help/PicolenaHowTo-de.tex
|
211
210
|
- lib/picolena/templates/public/images/bg.gif
|
211
|
+
- lib/picolena/templates/public/images/flags/ar.png
|
212
|
+
- lib/picolena/templates/public/images/flags/be.png
|
213
|
+
- lib/picolena/templates/public/images/flags/ca.png
|
214
|
+
- lib/picolena/templates/public/images/flags/de.png
|
215
|
+
- lib/picolena/templates/public/images/flags/el.png
|
216
|
+
- lib/picolena/templates/public/images/flags/en.png
|
217
|
+
- lib/picolena/templates/public/images/flags/es.png
|
218
|
+
- lib/picolena/templates/public/images/flags/fr.png
|
219
|
+
- lib/picolena/templates/public/images/flags/ga.png
|
220
|
+
- lib/picolena/templates/public/images/flags/hr.png
|
221
|
+
- lib/picolena/templates/public/images/flags/it.png
|
222
|
+
- lib/picolena/templates/public/images/flags/nl.png
|
223
|
+
- lib/picolena/templates/public/images/flags/pl.png
|
224
|
+
- lib/picolena/templates/public/images/flags/pt-br.png
|
225
|
+
- lib/picolena/templates/public/images/flags/pt-pt.png
|
226
|
+
- lib/picolena/templates/public/images/flags/readme.txt
|
212
227
|
- lib/picolena/templates/public/images/icons/cad.png
|
213
228
|
- lib/picolena/templates/public/images/icons/code.png
|
214
229
|
- lib/picolena/templates/public/images/icons/doc.png
|
@@ -254,8 +269,6 @@ files:
|
|
254
269
|
- lib/picolena/templates/spec/models/finder_spec.rb
|
255
270
|
- lib/picolena/templates/spec/models/host_indexing_system_spec.rb
|
256
271
|
- lib/picolena/templates/spec/models/index_directories_spec.rb
|
257
|
-
- lib/picolena/templates/spec/models/index_reader_spec.rb
|
258
|
-
- lib/picolena/templates/spec/models/index_writer_spec.rb
|
259
272
|
- lib/picolena/templates/spec/models/indexer_spec.rb
|
260
273
|
- lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
|
261
274
|
- lib/picolena/templates/spec/models/query_spec.rb
|
metadata.gz.sig
CHANGED
Binary file
|
@@ -1,54 +0,0 @@
|
|
1
|
-
class IndexReader < Ferret::Index::Index
|
2
|
-
def initialize(params={})
|
3
|
-
# Add needed parameters
|
4
|
-
params.merge!(:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer)
|
5
|
-
# Creates the IndexReader
|
6
|
-
super(params)
|
7
|
-
end
|
8
|
-
|
9
|
-
# Returns the number of times a file is present in the index.
|
10
|
-
# index_reader.doc_freq(field, term) → integer
|
11
|
-
# Return the number of documents in which the term term appears in the field field.
|
12
|
-
def occurences_number(complete_path)
|
13
|
-
# complete_path_query = Ferret::Search::TermQuery.new(:complete_path, complete_path)
|
14
|
-
search_by_complete_path(complete_path).total_hits
|
15
|
-
end
|
16
|
-
|
17
|
-
def search_by_complete_path(complete_path)
|
18
|
-
search('complete_path:"'<<complete_path<<'"')
|
19
|
-
end
|
20
|
-
|
21
|
-
def delete_by_complete_path(complete_path)
|
22
|
-
search_by_complete_path(complete_path).hits.each{|hit|
|
23
|
-
delete(hit.doc)
|
24
|
-
}
|
25
|
-
close
|
26
|
-
end
|
27
|
-
|
28
|
-
|
29
|
-
# Validation methods.
|
30
|
-
|
31
|
-
def should_have_documents
|
32
|
-
raise IndexError, "no document found" unless has_documents?
|
33
|
-
end
|
34
|
-
|
35
|
-
# Returns true if there's at least one document indexed.
|
36
|
-
def has_documents?
|
37
|
-
size>0
|
38
|
-
end
|
39
|
-
|
40
|
-
class<<self
|
41
|
-
|
42
|
-
def ensure_existence
|
43
|
-
Indexer.index_every_directory(update=false) unless exists? or RAILS_ENV=="production"
|
44
|
-
end
|
45
|
-
|
46
|
-
def exists?
|
47
|
-
filename and File.exists?(filename)
|
48
|
-
end
|
49
|
-
|
50
|
-
def filename
|
51
|
-
Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
@@ -1,33 +0,0 @@
|
|
1
|
-
class IndexWriter < Ferret::Index::IndexWriter
|
2
|
-
def initialize(params={})
|
3
|
-
# Add needed parameters
|
4
|
-
params.merge!(:create_if_missing => true,
|
5
|
-
:path => Picolena::IndexSavePath,
|
6
|
-
:analyzer => Picolena::Analyzer
|
7
|
-
# huge performance impact?
|
8
|
-
# :auto_flush => true
|
9
|
-
)
|
10
|
-
# Creates the IndexWriter
|
11
|
-
super(params)
|
12
|
-
# Add required fields (content, filetype, probably_unique_id, ...)
|
13
|
-
add_fields!
|
14
|
-
end
|
15
|
-
|
16
|
-
def self.remove
|
17
|
-
Dir.glob(File.join(Picolena::IndexSavePath,'*')).each{|f| FileUtils.rm(f) if File.file?(f)}
|
18
|
-
end
|
19
|
-
|
20
|
-
private
|
21
|
-
def add_fields!
|
22
|
-
# No need to re-create any field.
|
23
|
-
return unless field_infos.fields.empty?
|
24
|
-
field_infos.add_field(:complete_path, :store => :yes, :index => :yes)
|
25
|
-
field_infos.add_field(:content, :store => :yes, :index => :yes)
|
26
|
-
field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
|
27
|
-
field_infos.add_field(:file, :store => :no, :index => :yes, :boost => 1.5)
|
28
|
-
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
29
|
-
field_infos.add_field(:date, :store => :yes, :index => :yes)
|
30
|
-
field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes)
|
31
|
-
field_infos.add_field(:lang, :store => :yes, :index => :yes)
|
32
|
-
end
|
33
|
-
end
|