picolena 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +11 -0
- data/Manifest.txt +16 -4
- data/lib/picolena/picolena_generator.rb +0 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +4 -0
- data/lib/picolena/templates/app/models/document.rb +21 -12
- data/lib/picolena/templates/app/models/finder.rb +38 -18
- data/lib/picolena/templates/app/models/indexer.rb +69 -89
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +7 -7
- data/lib/picolena/templates/app/models/query.rb +4 -2
- data/lib/picolena/templates/app/views/documents/_document.html.haml +1 -0
- data/lib/picolena/templates/lang/ui/de.yml +3 -1
- data/lib/picolena/templates/lang/ui/en.yml +3 -1
- data/lib/picolena/templates/lang/ui/es.yml +3 -1
- data/lib/picolena/templates/lang/ui/fr.yml +3 -1
- data/lib/picolena/templates/lib/tasks/index.rake +3 -3
- data/lib/picolena/templates/public/images/flags/ar.png +0 -0
- data/lib/picolena/templates/public/images/flags/be.png +0 -0
- data/lib/picolena/templates/public/images/flags/ca.png +0 -0
- data/lib/picolena/templates/public/images/flags/de.png +0 -0
- data/lib/picolena/templates/public/images/flags/el.png +0 -0
- data/lib/picolena/templates/public/images/flags/en.png +0 -0
- data/lib/picolena/templates/public/images/flags/es.png +0 -0
- data/lib/picolena/templates/public/images/flags/fr.png +0 -0
- data/lib/picolena/templates/public/images/flags/ga.png +0 -0
- data/lib/picolena/templates/public/images/flags/hr.png +0 -0
- data/lib/picolena/templates/public/images/flags/it.png +0 -0
- data/lib/picolena/templates/public/images/flags/nl.png +0 -0
- data/lib/picolena/templates/public/images/flags/pl.png +0 -0
- data/lib/picolena/templates/public/images/flags/pt-br.png +0 -0
- data/lib/picolena/templates/public/images/flags/pt-pt.png +0 -0
- data/lib/picolena/templates/public/images/flags/readme.txt +9 -0
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -10
- data/lib/picolena/templates/spec/models/finder_spec.rb +5 -5
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +7 -7
- data/lib/picolena/version.rb +1 -1
- data/tasks/hack.rake +2 -2
- data/website/index.html +1 -1
- data.tar.gz.sig +1 -2
- metadata +19 -6
- metadata.gz.sig +0 -0
- data/lib/picolena/templates/app/models/index_reader.rb +0 -54
- data/lib/picolena/templates/app/models/index_writer.rb +0 -33
- data/lib/picolena/templates/spec/models/index_reader_spec.rb +0 -7
- data/lib/picolena/templates/spec/models/index_writer_spec.rb +0 -7
data/History.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
== 0.1.5 2008-04-
|
2
|
+
|
3
|
+
* 1 major enhancement:
|
4
|
+
* yet another Indexer & Index rewrite
|
5
|
+
|
6
|
+
* 1 minor enhancement:
|
7
|
+
* flags to indicate found language
|
8
|
+
|
9
|
+
* bug fixes:
|
10
|
+
* No more (or just less?) index lock errors
|
11
|
+
|
1
12
|
== 0.1.4 2008-04-23
|
2
13
|
* 1 minor enhancement:
|
3
14
|
* minimal MacOS support
|
data/Manifest.txt
CHANGED
@@ -22,8 +22,6 @@ lib/picolena/templates/app/helpers/application_helper.rb
|
|
22
22
|
lib/picolena/templates/app/helpers/documents_helper.rb
|
23
23
|
lib/picolena/templates/app/models/document.rb
|
24
24
|
lib/picolena/templates/app/models/finder.rb
|
25
|
-
lib/picolena/templates/app/models/index_reader.rb
|
26
|
-
lib/picolena/templates/app/models/index_writer.rb
|
27
25
|
lib/picolena/templates/app/models/indexer.rb
|
28
26
|
lib/picolena/templates/app/models/plain_text_extractor.rb
|
29
27
|
lib/picolena/templates/app/models/query.rb
|
@@ -75,6 +73,22 @@ lib/picolena/templates/public/favicon.ico
|
|
75
73
|
lib/picolena/templates/public/help/PicolenaHowTo-de.pdf
|
76
74
|
lib/picolena/templates/public/help/PicolenaHowTo-de.tex
|
77
75
|
lib/picolena/templates/public/images/bg.gif
|
76
|
+
lib/picolena/templates/public/images/flags/ar.png
|
77
|
+
lib/picolena/templates/public/images/flags/be.png
|
78
|
+
lib/picolena/templates/public/images/flags/ca.png
|
79
|
+
lib/picolena/templates/public/images/flags/de.png
|
80
|
+
lib/picolena/templates/public/images/flags/el.png
|
81
|
+
lib/picolena/templates/public/images/flags/en.png
|
82
|
+
lib/picolena/templates/public/images/flags/es.png
|
83
|
+
lib/picolena/templates/public/images/flags/fr.png
|
84
|
+
lib/picolena/templates/public/images/flags/ga.png
|
85
|
+
lib/picolena/templates/public/images/flags/hr.png
|
86
|
+
lib/picolena/templates/public/images/flags/it.png
|
87
|
+
lib/picolena/templates/public/images/flags/nl.png
|
88
|
+
lib/picolena/templates/public/images/flags/pl.png
|
89
|
+
lib/picolena/templates/public/images/flags/pt-br.png
|
90
|
+
lib/picolena/templates/public/images/flags/pt-pt.png
|
91
|
+
lib/picolena/templates/public/images/flags/readme.txt
|
78
92
|
lib/picolena/templates/public/images/icons/cad.png
|
79
93
|
lib/picolena/templates/public/images/icons/code.png
|
80
94
|
lib/picolena/templates/public/images/icons/doc.png
|
@@ -120,8 +134,6 @@ lib/picolena/templates/spec/models/document_spec.rb
|
|
120
134
|
lib/picolena/templates/spec/models/finder_spec.rb
|
121
135
|
lib/picolena/templates/spec/models/host_indexing_system_spec.rb
|
122
136
|
lib/picolena/templates/spec/models/index_directories_spec.rb
|
123
|
-
lib/picolena/templates/spec/models/index_reader_spec.rb
|
124
|
-
lib/picolena/templates/spec/models/index_writer_spec.rb
|
125
137
|
lib/picolena/templates/spec/models/indexer_spec.rb
|
126
138
|
lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
|
127
139
|
lib/picolena/templates/spec/models/query_spec.rb
|
@@ -37,6 +37,10 @@ module DocumentsHelper
|
|
37
37
|
}) if document.matching_content
|
38
38
|
end
|
39
39
|
|
40
|
+
def language_icon_for(document)
|
41
|
+
(lang=document.language) && image_tag("flags/#{lang}.png")
|
42
|
+
end
|
43
|
+
|
40
44
|
# Returns icon and filename for any given document.
|
41
45
|
def icon_and_filename_for(document)
|
42
46
|
[icon_for(document.extname),document.filename].join(" ")
|
@@ -71,21 +71,33 @@ class Document
|
|
71
71
|
# Returns the last modification date before the document got indexed.
|
72
72
|
# Useful to know how old a document is, and to which version the cache corresponds.
|
73
73
|
def date
|
74
|
-
from_index[:
|
74
|
+
from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')
|
75
75
|
end
|
76
76
|
|
77
77
|
def mtime
|
78
|
-
from_index[:
|
78
|
+
from_index[:modified].to_i
|
79
79
|
end
|
80
80
|
|
81
81
|
# Returns language.
|
82
|
-
def
|
83
|
-
from_index[:
|
82
|
+
def language
|
83
|
+
from_index[:language]
|
84
84
|
end
|
85
85
|
|
86
86
|
# Returns the id with which the document is indexed.
|
87
87
|
def index_id
|
88
|
-
@index_id ||=
|
88
|
+
@index_id ||= Finder.term_search(:complete_path, complete_path).doc
|
89
|
+
end
|
90
|
+
|
91
|
+
# Fields that are shared between every document.
|
92
|
+
def self.default_fields_for(complete_path)
|
93
|
+
{
|
94
|
+
:complete_path => complete_path,
|
95
|
+
:probably_unique_id => complete_path.base26_hash,
|
96
|
+
:filename => File.basename(complete_path),
|
97
|
+
:basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '),
|
98
|
+
:filetype => File.extname(complete_path),
|
99
|
+
:modified => File.mtime(complete_path).strftime("%Y%m%d%H%M%S")
|
100
|
+
}
|
89
101
|
end
|
90
102
|
|
91
103
|
private
|
@@ -93,17 +105,14 @@ class Document
|
|
93
105
|
# Retrieves the document from the index.
|
94
106
|
# Useful to get meta-info about it.
|
95
107
|
def from_index
|
96
|
-
|
108
|
+
Indexer.index[index_id]
|
97
109
|
end
|
98
110
|
|
99
111
|
def self.find_by_unique_id(some_id)
|
100
|
-
Finder.
|
112
|
+
doc_id=Finder.term_search(:probably_unique_id, some_id).doc
|
113
|
+
new(Indexer.index[doc_id][:complete_path])
|
101
114
|
end
|
102
|
-
|
103
|
-
def self.find_by_complete_path(complete_path)
|
104
|
-
Finder.new('complete_path:"'<<complete_path<<'"').matching_document
|
105
|
-
end
|
106
|
-
|
115
|
+
|
107
116
|
def in_indexed_directory?
|
108
117
|
!indexed_directory.nil?
|
109
118
|
end
|
@@ -2,18 +2,16 @@ class Finder
|
|
2
2
|
attr_reader :query
|
3
3
|
|
4
4
|
def index
|
5
|
-
|
6
|
-
# causes ferret-0.11.6/lib/ferret/index.rb:768: [BUG] Segmentation fault
|
7
|
-
IndexReader.new
|
5
|
+
@@index ||= Indexer.index
|
8
6
|
end
|
9
7
|
|
10
8
|
def initialize(raw_query,page=1,results_per_page=Picolena::ResultsPerPage)
|
11
9
|
@query = Query.extract_from(raw_query)
|
12
10
|
@raw_query= raw_query
|
13
|
-
|
11
|
+
Indexer.ensure_index_existence
|
14
12
|
@per_page=results_per_page
|
15
13
|
@offset=(page.to_i-1)*results_per_page
|
16
|
-
|
14
|
+
index_should_have_documents
|
17
15
|
end
|
18
16
|
|
19
17
|
def execute!
|
@@ -31,9 +29,9 @@ class Finder
|
|
31
29
|
found_doc.score=score
|
32
30
|
found_doc.index_id=index_id
|
33
31
|
@matching_documents<<found_doc
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
rescue Errno::ENOENT
|
33
|
+
#"File has been moved/deleted!"
|
34
|
+
end
|
37
35
|
}
|
38
36
|
@executed=true
|
39
37
|
@time_needed=Time.now-start
|
@@ -60,14 +58,36 @@ class Finder
|
|
60
58
|
# Returns matching document for any given query only if
|
61
59
|
# exactly one document is found.
|
62
60
|
# Raises otherwise.
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
61
|
+
def matching_document
|
62
|
+
case matching_documents.size
|
63
|
+
when 0
|
64
|
+
raise IndexError, "No document found"
|
65
|
+
when 1
|
66
|
+
matching_documents.first
|
67
|
+
else
|
68
|
+
raise IndexError, "More than one document found"
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
class<<self
|
73
|
+
def searcher
|
74
|
+
@@searcher ||= Ferret::Search::Searcher.new(Picolena::IndexSavePath)
|
75
|
+
end
|
76
|
+
|
77
|
+
def term_search(field,term)
|
78
|
+
query = Ferret::Search::TermQuery.new(field,term)
|
79
|
+
searcher.search(query).hits.first
|
80
|
+
end
|
81
|
+
|
82
|
+
def reload!
|
83
|
+
@@searcher = nil
|
84
|
+
@@index = nil
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
private
|
89
|
+
|
90
|
+
def index_should_have_documents
|
91
|
+
raise IndexError, "no document found" unless index.size > 0
|
92
|
+
end
|
73
93
|
end
|
@@ -5,54 +5,22 @@ class Indexer
|
|
5
5
|
@@max_threads_number = 8
|
6
6
|
|
7
7
|
class << self
|
8
|
-
def
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
:file => File.basename(complete_path),
|
13
|
-
:basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '),
|
14
|
-
:filetype => File.extname(complete_path),
|
15
|
-
:date => File.mtime(complete_path).strftime("%Y%m%d%H%M%S")
|
16
|
-
}
|
17
|
-
end
|
18
|
-
|
19
|
-
def index_every_directory(update=true)
|
8
|
+
def index_every_directory(remove_first=false)
|
9
|
+
clear! if remove_first
|
10
|
+
# Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
|
11
|
+
Finder.reload!
|
20
12
|
log :debug => "Indexing every directory"
|
21
|
-
|
22
|
-
|
23
13
|
start=Time.now
|
24
|
-
@update = update
|
25
|
-
reset! unless update
|
26
|
-
|
27
14
|
Picolena::IndexedDirectories.each{|dir, alias_dir|
|
28
15
|
index_directory_with_multithreads(dir)
|
29
16
|
}
|
30
|
-
|
17
|
+
log :debug => "Now optimizing index"
|
31
18
|
writer.optimize
|
32
|
-
writer.close
|
33
|
-
# launching Indexer.index_every_directory twice in a row
|
34
|
-
# would raise a SEGFAULT:
|
35
|
-
# picolena/lib/picolena/templates/app/models/indexer.rb:27: [BUG] Segmentation fault
|
36
|
-
# ruby 1.8.6 (2007-06-07) [i486-linux]
|
37
|
-
#
|
38
|
-
# Aborted (core dumped)
|
39
|
-
#
|
40
|
-
# But without those 2 lines, specs don't pass anymore.
|
41
|
-
#
|
42
19
|
log :debug => "Indexing done in #{Time.now-start} s."
|
43
20
|
end
|
44
21
|
|
45
22
|
def index_directory_with_multithreads(dir)
|
46
|
-
|
47
|
-
# indexer raises:
|
48
|
-
#
|
49
|
-
# current thread not owner
|
50
|
-
# /usr/lib/ruby/1.8/monitor.rb:278:in `mon_check_owner'
|
51
|
-
# /home/www/picolena/lib/picolena/templates/lib/core_exts.rb:32:in `join'
|
52
|
-
# ...
|
53
|
-
#
|
54
|
-
# So Index creation is multithreaded, Index update is monothreaded.
|
55
|
-
threads_number = @update ? 1 : @@max_threads_number
|
23
|
+
threads_number = @@max_threads_number
|
56
24
|
log :debug => "Indexing #{dir}, #{threads_number} thread(s)"
|
57
25
|
|
58
26
|
indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
|
@@ -61,82 +29,94 @@ class Indexer
|
|
61
29
|
|
62
30
|
indexing_list_chunks=indexing_list.in_transposed_slices(threads_number)
|
63
31
|
|
32
|
+
# It initializes an IndexWriter before launching multithreaded
|
33
|
+
# indexing. Otherwise, two threads could try to instantiate
|
34
|
+
# an IndexWriter at the same time, and get a
|
35
|
+
# Ferret::Store::Lock::LockError
|
36
|
+
writer
|
37
|
+
|
64
38
|
indexing_list_chunks.each_with_thread{|chunk|
|
65
39
|
chunk.each{|filename|
|
66
|
-
|
40
|
+
add_file(filename)
|
67
41
|
}
|
68
42
|
}
|
69
43
|
end
|
70
44
|
|
71
|
-
def add_or_update_file(complete_path)
|
72
|
-
should_be_added = true
|
73
|
-
if @update then
|
74
|
-
log :debug => "What to do with #{complete_path} ?"
|
75
|
-
occurences = reader.occurences_number(complete_path)
|
76
|
-
log :debug => "\tappears #{occurences} times in the index"
|
77
|
-
case occurences
|
78
|
-
when 0
|
79
|
-
#Nothing to do here, the file will be added.
|
80
|
-
when 1
|
81
|
-
d=Document.find_by_complete_path(complete_path)
|
82
|
-
if File.mtime(complete_path).strftime("%Y%m%d%H%M%S").to_i > d.mtime then
|
83
|
-
log :debug => "\thas been modified"
|
84
|
-
delete_file(complete_path)
|
85
|
-
else
|
86
|
-
should_be_added = false
|
87
|
-
log :debug => "\thas not been modified. leaving it"
|
88
|
-
end
|
89
|
-
else
|
90
|
-
delete_file(complete_path)
|
91
|
-
end
|
92
|
-
end
|
93
|
-
add_file(complete_path) if should_be_added
|
94
|
-
end
|
95
|
-
|
96
45
|
def add_file(complete_path)
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
raise "\tempty document #{complete_path}" if text.strip.empty?
|
104
|
-
fields[:content] = text
|
105
|
-
log :debug => "language found: #{lang}" if lang
|
106
|
-
fields[:lang] = lang
|
46
|
+
default_fields = Document.default_fields_for(complete_path)
|
47
|
+
begin
|
48
|
+
document = PlainTextExtractor.extract_content_and_language_from(complete_path)
|
49
|
+
raise "empty document #{complete_path}" if document[:content].strip.empty?
|
50
|
+
document.merge! default_fields
|
51
|
+
log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join
|
107
52
|
rescue => e
|
108
53
|
log :debug => "\tindexing without content: #{e.message}"
|
54
|
+
document = default_fields
|
109
55
|
end
|
110
|
-
|
111
|
-
writer << fields
|
56
|
+
writer << document
|
112
57
|
end
|
113
58
|
|
114
|
-
|
115
|
-
|
59
|
+
# Ensures writer is closed, and removes every index file for RAILS_ENV.
|
60
|
+
def clear!(all=false)
|
61
|
+
close
|
62
|
+
to_remove=all ? Picolena::IndexesSavePath : Picolena::IndexSavePath
|
63
|
+
Dir.glob(File.join(to_remove,'**/*')).each{|f| FileUtils.rm(f) if File.file?(f)}
|
116
64
|
end
|
117
65
|
|
118
|
-
|
119
|
-
|
66
|
+
# Closes the writer and
|
67
|
+
# ensures that a new IndexWriter is instantiated next time writer is called.
|
68
|
+
def close
|
69
|
+
@@writer.close rescue nil
|
70
|
+
# Ferret will SEGFAULT otherwise.
|
71
|
+
@@writer = nil
|
120
72
|
end
|
121
73
|
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
IndexWriter.
|
74
|
+
# Only one IndexWriter should be instantiated.
|
75
|
+
# If one already exists, returns it.
|
76
|
+
# Creates it otherwise.
|
77
|
+
def writer
|
78
|
+
@@writer ||= Ferret::Index::IndexWriter.new(default_index_params)
|
127
79
|
end
|
128
80
|
|
129
|
-
def
|
130
|
-
|
131
|
-
|
81
|
+
def index
|
82
|
+
Ferret::Index::Index.new(default_index_params)
|
83
|
+
end
|
84
|
+
|
85
|
+
def ensure_index_existence
|
86
|
+
index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
|
132
87
|
end
|
133
88
|
|
134
89
|
private
|
135
90
|
|
91
|
+
def index_exists?
|
92
|
+
index_filename and File.exists?(index_filename)
|
93
|
+
end
|
94
|
+
|
95
|
+
def index_filename
|
96
|
+
Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
|
97
|
+
end
|
98
|
+
|
136
99
|
def log(hash)
|
137
100
|
hash.each{|level,message|
|
138
101
|
IndexerLogger.send(level,message)
|
139
102
|
}
|
140
|
-
end
|
103
|
+
end
|
104
|
+
|
105
|
+
def default_index_params
|
106
|
+
{:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer, :field_infos => default_field_infos}
|
107
|
+
end
|
108
|
+
|
109
|
+
def default_field_infos
|
110
|
+
returning Ferret::Index::FieldInfos.new do |field_infos|
|
111
|
+
field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized)
|
112
|
+
field_infos.add_field(:content, :store => :yes, :index => :yes)
|
113
|
+
field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
|
114
|
+
field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5)
|
115
|
+
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
116
|
+
field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
|
117
|
+
field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes)
|
118
|
+
field_infos.add_field(:language, :store => :yes, :index => :yes)
|
119
|
+
end
|
120
|
+
end
|
141
121
|
end
|
142
122
|
end
|
@@ -109,12 +109,12 @@ class PlainTextExtractor
|
|
109
109
|
# and if probability score is higher than 90%.
|
110
110
|
def extract_content_and_language
|
111
111
|
content=extract_content
|
112
|
-
return
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
112
|
+
return {:content => content} unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
|
113
|
+
Picolena::UseLanguageRecognition,
|
114
|
+
# Is a language guesser already installed?
|
115
|
+
PlainTextExtractor.language_guesser,
|
116
|
+
# Language recognition is too unreliable for small files.
|
117
|
+
content.size > 500].all?
|
118
118
|
language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
|
119
119
|
lang_guesser.write content
|
120
120
|
lang_guesser.close_write
|
@@ -125,6 +125,6 @@ class PlainTextExtractor
|
|
125
125
|
lang unless score<0.9
|
126
126
|
end
|
127
127
|
}
|
128
|
-
|
128
|
+
{:content => content, :language => language}
|
129
129
|
end
|
130
130
|
end
|
@@ -13,9 +13,11 @@ class Query
|
|
13
13
|
/\b#{:AND.l}\b/=>'AND',
|
14
14
|
/\b#{:OR.l}\b/=>'OR',
|
15
15
|
/\b#{:NOT.l}\b/=>'NOT',
|
16
|
+
/(#{:filename.l}):/=>'filename:',
|
16
17
|
/(#{:filetype.l}):/=>'filetype:',
|
17
18
|
/#{:content.l}:/ => 'content:',
|
18
|
-
|
19
|
+
/(#{:modified.l}):/ => 'modified:',
|
20
|
+
/(#{:language.l}):/ => 'language:',
|
19
21
|
/\b#{:LIKE.l}\s+(\S+)/=>'\1~'
|
20
22
|
}
|
21
23
|
to_en.inject(raw_query){|mem,non_english_to_english_keyword|
|
@@ -25,7 +27,7 @@ class Query
|
|
25
27
|
|
26
28
|
# Instantiates a QueryParser once, and keeps it in cache.
|
27
29
|
def parser
|
28
|
-
@@parser ||= Ferret::QueryParser.new(:fields => [:content, :
|
30
|
+
@@parser ||= Ferret::QueryParser.new(:fields => [:content, :filename, :basename, :filetype, :modified], :or_default => false, :analyzer=>Picolena::Analyzer)
|
29
31
|
end
|
30
32
|
end
|
31
33
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
%h2
|
2
2
|
=link_to icon_and_filename_for(document), download_document_path(document.probably_unique_id)
|
3
|
+
=language_icon_for(document)
|
3
4
|
%small=number_to_percentage(document.score*100, :precision=>1)
|
4
5
|
=highlight_matching_content(document)
|
5
6
|
%p=link_to_containing_directory(document)
|
@@ -2,17 +2,17 @@ desc 'Ferret index maintenance tasks'
|
|
2
2
|
namespace :index do
|
3
3
|
desc 'Clear indexes'
|
4
4
|
task :clear => :environment do
|
5
|
-
|
5
|
+
Indexer.clear! :all
|
6
6
|
end
|
7
7
|
|
8
8
|
desc 'Create index'
|
9
9
|
task :create => :environment do
|
10
|
-
Indexer.index_every_directory(
|
10
|
+
Indexer.index_every_directory(remove_first=true)
|
11
11
|
end
|
12
12
|
|
13
13
|
desc 'Update index'
|
14
14
|
task :update => :environment do
|
15
|
-
Indexer.index_every_directory
|
15
|
+
Indexer.index_every_directory
|
16
16
|
end
|
17
17
|
|
18
18
|
# Search index with query "some query" :
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
Flag icons - http://www.famfamfam.com
|
2
|
+
|
3
|
+
These icons are public domain, and as such are free for any use (attribution appreciated but not required).
|
4
|
+
|
5
|
+
Note that these flags are named using the ISO3166-1 alpha-2 country codes where appropriate. A list of codes can be found at http://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
|
6
|
+
|
7
|
+
If you find these icons useful, please donate via paypal to mjames@gmail.com (or click the donate button available at http://www.famfamfam.com/lab/icons/silk)
|
8
|
+
|
9
|
+
Contact: mjames@gmail.com
|
@@ -10,14 +10,14 @@ describe "Finder without index on disk" do
|
|
10
10
|
end
|
11
11
|
|
12
12
|
before(:each) do
|
13
|
-
|
13
|
+
Indexer.clear!
|
14
14
|
end
|
15
15
|
|
16
16
|
it "should create index" do
|
17
17
|
Picolena::IndexedDirectories.replace({'spec/test_dirs/indexed/just_one_doc'=>'//justonedoc/'})
|
18
|
-
lambda {@finder_with_new_index=Finder.new("test moi")}.should change(
|
18
|
+
lambda {@finder_with_new_index=Finder.new("test moi")}.should change(Indexer, :index_exists?).from(false).to(true)
|
19
19
|
File.exists?(File.join(@new_index_path,'_0.cfs')).should be_true
|
20
|
-
|
20
|
+
Indexer.index.size.should >0
|
21
21
|
end
|
22
22
|
|
23
23
|
it "should raise if index is still empty after trying to create it" do
|
@@ -35,16 +35,19 @@ end
|
|
35
35
|
|
36
36
|
fields={
|
37
37
|
# description => key
|
38
|
-
:content
|
39
|
-
:
|
40
|
-
:
|
41
|
-
:
|
42
|
-
:
|
38
|
+
:content => :content,
|
39
|
+
:complete_path => :complete_path,
|
40
|
+
:basename => :basename,
|
41
|
+
:filename => :filename,
|
42
|
+
:extension => :filetype,
|
43
|
+
:modification_time => :modified,
|
44
|
+
:probably_unique_id => :probably_unique_id,
|
45
|
+
:language => :language
|
43
46
|
}
|
44
47
|
|
45
48
|
describe "Basic Finder" do
|
46
49
|
before(:all) do
|
47
|
-
Indexer.index_every_directory(
|
50
|
+
Indexer.index_every_directory(remove_first=true)
|
48
51
|
end
|
49
52
|
|
50
53
|
it "should accept one parameter as query, and 2 optionals for paginating" do
|
@@ -82,7 +85,7 @@ describe "Basic Finder" do
|
|
82
85
|
|
83
86
|
fields.each_pair do |description,field_name|
|
84
87
|
it "should index #{description} as :#{field_name}" do
|
85
|
-
|
88
|
+
Indexer.index.field_infos[field_name].should be_an_instance_of(Ferret::Index::FieldInfo)
|
86
89
|
end
|
87
90
|
end
|
88
91
|
|
@@ -21,7 +21,7 @@ describe Finder do
|
|
21
21
|
File.utime(0, once_upon_a_time, 'spec/test_dirs/indexed/basic/basic.pdf')
|
22
22
|
File.utime(0, a_bit_later, 'spec/test_dirs/indexed/yet_another_dir/office2003-word-template.dot')
|
23
23
|
File.utime(0, nineties, 'spec/test_dirs/indexed/others/placeholder.txt')
|
24
|
-
Indexer.index_every_directory(
|
24
|
+
Indexer.index_every_directory(remove_first=true)
|
25
25
|
end
|
26
26
|
|
27
27
|
it "should find documents according to their basename when specified with basename:query" do
|
@@ -30,8 +30,8 @@ describe Finder do
|
|
30
30
|
matching_documents_filename.should include("crossed.text")
|
31
31
|
end
|
32
32
|
|
33
|
-
it "should find documents according to their filename when specified with file:query" do
|
34
|
-
Finder.new("
|
33
|
+
it "should find documents according to their filename when specified with file:query or filename:query" do
|
34
|
+
Finder.new("filename:crossed.text").matching_documents.collect{|d| d.content}.should include("txt inside!")
|
35
35
|
Finder.new("file:crossed.txt").matching_documents.collect{|d| d.content}.should include("text inside!")
|
36
36
|
end
|
37
37
|
|
@@ -47,9 +47,9 @@ describe Finder do
|
|
47
47
|
end
|
48
48
|
|
49
49
|
it "should give a boost to basename, filename and filetype in index" do
|
50
|
-
index=
|
50
|
+
index=Indexer.index
|
51
51
|
index.field_infos[:basename].boost.should > 1.0
|
52
|
-
index.field_infos[:
|
52
|
+
index.field_infos[:filename].boost.should > 1.0
|
53
53
|
index.field_infos[:filetype].boost.should > 1.0
|
54
54
|
end
|
55
55
|
|
@@ -2,7 +2,7 @@ require File.dirname(__FILE__) + '/../spec_helper'
|
|
2
2
|
|
3
3
|
describe "PlainTextExtractors" do
|
4
4
|
before(:all) do
|
5
|
-
|
5
|
+
Indexer.ensure_index_existence
|
6
6
|
end
|
7
7
|
|
8
8
|
PlainTextExtractor.all.each{|extractor|
|
@@ -29,14 +29,14 @@ describe "PlainTextExtractors" do
|
|
29
29
|
}
|
30
30
|
|
31
31
|
it "should guess language when enough content is available" do
|
32
|
-
Document.new("spec/test_dirs/indexed/lang/goethe").
|
33
|
-
Document.new("spec/test_dirs/indexed/lang/shakespeare").
|
34
|
-
Document.new("spec/test_dirs/indexed/lang/lorca").
|
35
|
-
Document.new("spec/test_dirs/indexed/lang/hugo").
|
32
|
+
Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de"
|
33
|
+
Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en"
|
34
|
+
Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es"
|
35
|
+
Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr"
|
36
36
|
end
|
37
37
|
|
38
38
|
it "should not try to guess language when file is too small" do
|
39
|
-
Document.new("spec/test_dirs/indexed/basic/hello.rb").
|
40
|
-
Document.new("spec/test_dirs/indexed/README").
|
39
|
+
Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil
|
40
|
+
Document.new("spec/test_dirs/indexed/README").language.should be_nil
|
41
41
|
end
|
42
42
|
end
|
data/lib/picolena/version.rb
CHANGED
data/tasks/hack.rake
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
desc 'Create development picolena structure inside lib/picolena/templates'
|
2
|
-
task :lets_hack do
|
2
|
+
task :lets_hack => :clean do
|
3
3
|
picolena_root=File.join(File.dirname(__FILE__),'..')
|
4
4
|
Dir.chdir(picolena_root){
|
5
5
|
# Doesn't overwrite any file, Doesn't create any index, Doesn't launch any spec.
|
6
|
-
system("ruby bin/picolena lib/picolena/templates/spec/test_dirs/indexed --
|
6
|
+
system("ruby bin/picolena lib/picolena/templates/spec/test_dirs/indexed --no-index --no-spec --destination=lib/picolena/templates")
|
7
7
|
}
|
8
8
|
puts <<-EXPLAIN
|
9
9
|
|
data/website/index.html
CHANGED
@@ -33,7 +33,7 @@
|
|
33
33
|
<h1>Picolena</h1>
|
34
34
|
<div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'>
|
35
35
|
<p>Get Version</p>
|
36
|
-
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.4</a>
|
36
|
+
<a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.5</a>
|
37
37
|
</div>
|
38
38
|
<h1>→ ‘picolena’</h1>
|
39
39
|
|
data.tar.gz.sig
CHANGED
@@ -1,2 +1 @@
|
|
1
|
-
|
2
|
-
��V���[;#̧KM���$�;=X�~�>���� wYI7��3ksv��A߶� ��0�GZTi7$�����>@
|
1
|
+
B�8Ǣ�����ԝ�ŗFA�sέ�%l�ѵ�Aw�k>�6�w���|ĝW^9>]���k��i����I٤�e�Z7٭Px���UK��+r�>P��al�<�T+eL@�HD�!�@��X�nV鐎wa<��b臋�g����,q���m�{i��2����#�m�=�܈ϲH'W�má؝=cm��ݔ�^��㩫҃L=�ˁ�"�r�L�{7�{�R'4�������k����hkx�����=��6�j
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picolena
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Duminil
|
@@ -30,7 +30,7 @@ cert_chain:
|
|
30
30
|
qvI9FgPZ1QTG5uZAlBbk6d6JU2XfpA==
|
31
31
|
-----END CERTIFICATE-----
|
32
32
|
|
33
|
-
date: 2008-04-
|
33
|
+
date: 2008-04-25 00:00:00 +02:00
|
34
34
|
default_executable:
|
35
35
|
dependencies:
|
36
36
|
- !ruby/object:Gem::Dependency
|
@@ -107,6 +107,7 @@ extra_rdoc_files:
|
|
107
107
|
- History.txt
|
108
108
|
- Manifest.txt
|
109
109
|
- README.txt
|
110
|
+
- lib/picolena/templates/public/images/flags/readme.txt
|
110
111
|
- lib/picolena/templates/public/robots.txt
|
111
112
|
- lib/picolena/templates/spec/test_dirs/indexed/basic/crossed.txt
|
112
113
|
- lib/picolena/templates/spec/test_dirs/indexed/basic/plain.txt
|
@@ -156,8 +157,6 @@ files:
|
|
156
157
|
- lib/picolena/templates/app/helpers/documents_helper.rb
|
157
158
|
- lib/picolena/templates/app/models/document.rb
|
158
159
|
- lib/picolena/templates/app/models/finder.rb
|
159
|
-
- lib/picolena/templates/app/models/index_reader.rb
|
160
|
-
- lib/picolena/templates/app/models/index_writer.rb
|
161
160
|
- lib/picolena/templates/app/models/indexer.rb
|
162
161
|
- lib/picolena/templates/app/models/plain_text_extractor.rb
|
163
162
|
- lib/picolena/templates/app/models/query.rb
|
@@ -209,6 +208,22 @@ files:
|
|
209
208
|
- lib/picolena/templates/public/help/PicolenaHowTo-de.pdf
|
210
209
|
- lib/picolena/templates/public/help/PicolenaHowTo-de.tex
|
211
210
|
- lib/picolena/templates/public/images/bg.gif
|
211
|
+
- lib/picolena/templates/public/images/flags/ar.png
|
212
|
+
- lib/picolena/templates/public/images/flags/be.png
|
213
|
+
- lib/picolena/templates/public/images/flags/ca.png
|
214
|
+
- lib/picolena/templates/public/images/flags/de.png
|
215
|
+
- lib/picolena/templates/public/images/flags/el.png
|
216
|
+
- lib/picolena/templates/public/images/flags/en.png
|
217
|
+
- lib/picolena/templates/public/images/flags/es.png
|
218
|
+
- lib/picolena/templates/public/images/flags/fr.png
|
219
|
+
- lib/picolena/templates/public/images/flags/ga.png
|
220
|
+
- lib/picolena/templates/public/images/flags/hr.png
|
221
|
+
- lib/picolena/templates/public/images/flags/it.png
|
222
|
+
- lib/picolena/templates/public/images/flags/nl.png
|
223
|
+
- lib/picolena/templates/public/images/flags/pl.png
|
224
|
+
- lib/picolena/templates/public/images/flags/pt-br.png
|
225
|
+
- lib/picolena/templates/public/images/flags/pt-pt.png
|
226
|
+
- lib/picolena/templates/public/images/flags/readme.txt
|
212
227
|
- lib/picolena/templates/public/images/icons/cad.png
|
213
228
|
- lib/picolena/templates/public/images/icons/code.png
|
214
229
|
- lib/picolena/templates/public/images/icons/doc.png
|
@@ -254,8 +269,6 @@ files:
|
|
254
269
|
- lib/picolena/templates/spec/models/finder_spec.rb
|
255
270
|
- lib/picolena/templates/spec/models/host_indexing_system_spec.rb
|
256
271
|
- lib/picolena/templates/spec/models/index_directories_spec.rb
|
257
|
-
- lib/picolena/templates/spec/models/index_reader_spec.rb
|
258
|
-
- lib/picolena/templates/spec/models/index_writer_spec.rb
|
259
272
|
- lib/picolena/templates/spec/models/indexer_spec.rb
|
260
273
|
- lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
|
261
274
|
- lib/picolena/templates/spec/models/query_spec.rb
|
metadata.gz.sig
CHANGED
Binary file
|
@@ -1,54 +0,0 @@
|
|
1
|
-
class IndexReader < Ferret::Index::Index
|
2
|
-
def initialize(params={})
|
3
|
-
# Add needed parameters
|
4
|
-
params.merge!(:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer)
|
5
|
-
# Creates the IndexReader
|
6
|
-
super(params)
|
7
|
-
end
|
8
|
-
|
9
|
-
# Returns the number of times a file is present in the index.
|
10
|
-
# index_reader.doc_freq(field, term) → integer
|
11
|
-
# Return the number of documents in which the term term appears in the field field.
|
12
|
-
def occurences_number(complete_path)
|
13
|
-
# complete_path_query = Ferret::Search::TermQuery.new(:complete_path, complete_path)
|
14
|
-
search_by_complete_path(complete_path).total_hits
|
15
|
-
end
|
16
|
-
|
17
|
-
def search_by_complete_path(complete_path)
|
18
|
-
search('complete_path:"'<<complete_path<<'"')
|
19
|
-
end
|
20
|
-
|
21
|
-
def delete_by_complete_path(complete_path)
|
22
|
-
search_by_complete_path(complete_path).hits.each{|hit|
|
23
|
-
delete(hit.doc)
|
24
|
-
}
|
25
|
-
close
|
26
|
-
end
|
27
|
-
|
28
|
-
|
29
|
-
# Validation methods.
|
30
|
-
|
31
|
-
def should_have_documents
|
32
|
-
raise IndexError, "no document found" unless has_documents?
|
33
|
-
end
|
34
|
-
|
35
|
-
# Returns true if there's at least one document indexed.
|
36
|
-
def has_documents?
|
37
|
-
size>0
|
38
|
-
end
|
39
|
-
|
40
|
-
class<<self
|
41
|
-
|
42
|
-
def ensure_existence
|
43
|
-
Indexer.index_every_directory(update=false) unless exists? or RAILS_ENV=="production"
|
44
|
-
end
|
45
|
-
|
46
|
-
def exists?
|
47
|
-
filename and File.exists?(filename)
|
48
|
-
end
|
49
|
-
|
50
|
-
def filename
|
51
|
-
Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
@@ -1,33 +0,0 @@
|
|
1
|
-
class IndexWriter < Ferret::Index::IndexWriter
|
2
|
-
def initialize(params={})
|
3
|
-
# Add needed parameters
|
4
|
-
params.merge!(:create_if_missing => true,
|
5
|
-
:path => Picolena::IndexSavePath,
|
6
|
-
:analyzer => Picolena::Analyzer
|
7
|
-
# huge performance impact?
|
8
|
-
# :auto_flush => true
|
9
|
-
)
|
10
|
-
# Creates the IndexWriter
|
11
|
-
super(params)
|
12
|
-
# Add required fields (content, filetype, probably_unique_id, ...)
|
13
|
-
add_fields!
|
14
|
-
end
|
15
|
-
|
16
|
-
def self.remove
|
17
|
-
Dir.glob(File.join(Picolena::IndexSavePath,'*')).each{|f| FileUtils.rm(f) if File.file?(f)}
|
18
|
-
end
|
19
|
-
|
20
|
-
private
|
21
|
-
def add_fields!
|
22
|
-
# No need to re-create any field.
|
23
|
-
return unless field_infos.fields.empty?
|
24
|
-
field_infos.add_field(:complete_path, :store => :yes, :index => :yes)
|
25
|
-
field_infos.add_field(:content, :store => :yes, :index => :yes)
|
26
|
-
field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
|
27
|
-
field_infos.add_field(:file, :store => :no, :index => :yes, :boost => 1.5)
|
28
|
-
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
|
29
|
-
field_infos.add_field(:date, :store => :yes, :index => :yes)
|
30
|
-
field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes)
|
31
|
-
field_infos.add_field(:lang, :store => :yes, :index => :yes)
|
32
|
-
end
|
33
|
-
end
|