picolena 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +8 -0
- data/Manifest.txt +28 -15
- data/README.txt +1 -1
- data/config/files_to_clean +2 -1
- data/config/hoe.rb +1 -1
- data/lib/picolena/config/basic.rb +46 -35
- data/lib/picolena/config/icons_and_filetypes.yml +69 -0
- data/lib/picolena/config/indexed_directories.yml +1 -1
- data/lib/picolena/picolena_generator.rb +3 -1
- data/lib/picolena/templates/app/controllers/application.rb +2 -2
- data/lib/picolena/templates/app/controllers/documents_controller.rb +1 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +7 -26
- data/lib/picolena/templates/app/models/document.rb +32 -14
- data/lib/picolena/templates/app/models/finder.rb +21 -78
- data/lib/picolena/templates/app/models/index_reader.rb +56 -0
- data/lib/picolena/templates/app/models/index_writer.rb +36 -0
- data/lib/picolena/templates/app/models/indexer.rb +142 -0
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +122 -0
- data/lib/picolena/templates/app/models/query.rb +31 -0
- data/lib/picolena/templates/app/views/documents/_document.html.haml +2 -2
- data/lib/picolena/templates/config/environment.rb +2 -2
- data/lib/picolena/templates/config/environments/development.rb +1 -1
- data/lib/picolena/templates/config/environments/production.rb +1 -1
- data/lib/picolena/templates/config/environments/test.rb +1 -1
- data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +2 -0
- data/lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb +3 -1
- data/lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb +6 -0
- data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb +2 -0
- data/lib/picolena/templates/config/initializers/006_load_icons.rb +8 -0
- data/lib/picolena/templates/lib/core_exts.rb +20 -1
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +72 -0
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/adobe.pdf.rb +3 -3
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/html.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.excel.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.powerpoint.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.rtf.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.word.rb +4 -4
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.presentation.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.spreadsheet.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.text.rb +2 -2
- data/lib/picolena/templates/lib/{filters → plain_text_extractors}/plain_text.rb +3 -3
- data/lib/picolena/templates/lib/tasks/index.rake +4 -6
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +5 -5
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -13
- data/lib/picolena/templates/spec/models/document_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/finder_spec.rb +5 -70
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +6 -2
- data/lib/picolena/templates/spec/models/index_directories_spec.rb +4 -4
- data/lib/picolena/templates/spec/models/index_reader_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/index_writer_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/indexer_spec.rb +7 -0
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +42 -0
- data/lib/picolena/templates/spec/models/query_spec.rb +56 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe +42 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo +83 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca +86 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare +90 -0
- data/lib/picolena/version.rb +1 -1
- data/tasks/hack.rake +2 -1
- data/website/index.html +2 -2
- data.tar.gz.sig +0 -0
- metadata +30 -17
- metadata.gz.sig +0 -0
- data/lib/picolena/templates/config/initializers/004_load_filters.rb +0 -6
- data/lib/picolena/templates/lib/ff.rb +0 -117
- data/lib/picolena/templates/lib/filter.rb +0 -75
- data/lib/picolena/templates/lib/filter_DSL.rb +0 -77
- data/lib/picolena/templates/spec/models/filters_spec.rb +0 -30
@@ -0,0 +1,56 @@
|
|
1
|
+
# Read-side handle on the Picolena index: a Ferret::Index::Index that is
# always opened on Picolena::IndexSavePath with Picolena::Analyzer.
class IndexReader < Ferret::Index::Index
  # Opens the index.
  #
  # params - optional Hash of extra options forwarded to the superclass.
  #          :path and :analyzer are always overridden.
  def initialize(params={})
    # Merge into a copy so the caller's hash is not modified.
    super(params.merge(:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer))
  end

  # Returns the number of hits found in the index for the given complete path
  # (i.e. how many times that file is present in the index).
  def occurences_number(complete_path)
    search_by_complete_path(complete_path).total_hits
  end

  # Searches the index for documents whose :complete_path field matches the
  # given path (the path is wrapped in double quotes inside the query).
  def search_by_complete_path(complete_path)
    # Interpolation builds the same query string without mutating a literal.
    search("complete_path:\"#{complete_path}\"")
  end

  # Deletes every document found for the given complete path,
  # then closes this index.
  def delete_by_complete_path(complete_path)
    search_by_complete_path(complete_path).hits.each{|hit|
      delete(hit.doc)
    }
    close
  end

  # Validation methods.

  # Raises IndexError unless at least one document is indexed.
  def should_have_documents
    raise IndexError, "no document found" unless has_documents?
  end

  # Returns true if there's at least one document indexed.
  def has_documents?
    size > 0
  end

  class<<self
    # Builds the whole index from scratch when no index file exists yet.
    # Does nothing in the "production" environment.
    def ensure_existence
      return if exists? || RAILS_ENV=="production"
      Indexer.index_every_directory(false)
    end

    # Returns a true value when an index file is present on disk.
    def exists?
      index_file = filename
      # File.exist? replaces File.exists?, which is no longer available in current Ruby.
      index_file && File.exist?(index_file)
    end

    # Returns the first '*.cfs' file found in Picolena::IndexSavePath,
    # or nil when there is none.
    def filename
      Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Write-side handle on the Picolena index: a Ferret::Index::IndexWriter that
# is always opened on Picolena::IndexSavePath with Picolena::Analyzer, and
# that declares the fields Picolena documents use.
class IndexWriter < Ferret::Index::IndexWriter
  # Opens (creating it if missing) the index and declares its fields.
  #
  # params - optional Hash of extra options forwarded to the superclass.
  #          :create_if_missing, :path and :analyzer are always overridden.
  def initialize(params={})
    # Merge into a copy so the caller's hash is not modified.
    options = params.merge(:create_if_missing => true,
                           :path              => Picolena::IndexSavePath,
                           :analyzer          => Picolena::Analyzer
                           # huge performance impact?
                           # :auto_flush => true
                          )
    # Creates the IndexWriter
    super(options)
    # Add required fields (content, filetype, probably_unique_id, ...)
    add_fields!
  end

  # Removes every regular file found directly in Picolena::IndexSavePath.
  def self.remove
    Dir.glob(File.join(Picolena::IndexSavePath,'*')).each{|f| FileUtils.rm(f) if File.file?(f)}
  end

  private

  # Declares the index fields, unless some fields already exist.
  def add_fields!
    # No need to re-create any field.
    return unless field_infos.fields.empty?
    field_infos.add_field(:complete_path,      :store => :yes, :index => :yes)
    field_infos.add_field(:content,            :store => :yes, :index => :yes)
    field_infos.add_field(:basename,           :store => :no,  :index => :yes, :boost => 1.5)
    field_infos.add_field(:file,               :store => :no,  :index => :yes, :boost => 1.5)
    field_infos.add_field(:filetype,           :store => :no,  :index => :yes, :boost => 1.5)
    field_infos.add_field(:date,               :store => :yes, :index => :yes)
    field_infos.add_field(:probably_unique_id, :store => :no,  :index => :yes)
    field_infos.add_field(:lang,               :store => :yes, :index => :yes)
  end
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# Walks the indexed directories and adds, updates or removes files in the
# index through a memoized IndexWriter / IndexReader pair.
class Indexer
  # This regexp defines which files should *not* be indexed.
  @@exclude = /(Thumbs\.db)/
  # Number of threads that will be used during indexing process
  @@max_threads_number = 8

  class << self
    # Returns the Hash of index fields that can be derived from the path alone.
    def fields_for(complete_path)
      {
        :complete_path      => complete_path,
        :probably_unique_id => complete_path.base26_hash,
        :file               => File.basename(complete_path),
        :basename           => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '),
        :filetype           => File.extname(complete_path),
        :date               => File.mtime(complete_path).strftime("%Y%m%d%H%M%S")
      }
    end

    # Indexes every directory listed in Picolena::IndexedDirectories.
    #
    # update - when true (default) the existing index is updated;
    #          when false the index is reset and rebuilt.
    def index_every_directory(update=true)
      log :debug => "Indexing every directory"

      start=Time.now
      @update = update
      reset! unless update

      Picolena::IndexedDirectories.each{|dir, alias_dir|
        index_directory_with_multithreads(dir)
      }

      writer.optimize
      writer.close
      # The writer has just been closed: forget it so that the next call
      # opens a fresh one instead of reusing a closed handle.
      # (Launching index_every_directory twice in a row used to crash with
      #  "[BUG] Segmentation fault" on ruby 1.8.6.)
      @@writer = nil

      log :debug => "Indexing done in #{Time.now-start} s."
    end

    # Indexes every regular file found under dir (recursively), skipping
    # files matching @@exclude, spreading the work over several threads.
    def index_directory_with_multithreads(dir)
      # FIXME: Don't know why, but if more than one thread is created while update the index,
      # indexer raises:
      #
      #   current thread not owner
      #   /usr/lib/ruby/1.8/monitor.rb:278:in `mon_check_owner'
      #   /home/www/picolena/lib/picolena/templates/lib/core_exts.rb:32:in `join'
      #   ...
      #
      # So Index creation is multithreaded, Index update is monothreaded.
      threads_number = @update ? 1 : @@max_threads_number
      log :debug => "Indexing #{dir}, #{threads_number} thread(s)"

      indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
        File.file?(filename) && filename !~ @@exclude
      }

      indexing_list_chunks=indexing_list.in_transposed_chunks(threads_number)

      indexing_list_chunks.each_with_thread{|chunk|
        chunk.each{|filename|
          add_or_update_file(filename)
        }
      }
    end

    # Adds the file to the index. In update mode, first checks what the index
    # already holds for this path:
    #   0 occurrence  -> the file is added;
    #   1 occurrence  -> the file is re-indexed only if modified since;
    #   more          -> existing entries are removed and the file is added.
    def add_or_update_file(complete_path)
      should_be_added = true
      if @update then
        log :debug => "What to do with #{complete_path} ?"
        occurences = reader.occurences_number(complete_path)
        log :debug => "\tappears #{occurences} times in the index"
        case occurences
        when 0
          #Nothing to do here, the file will be added.
        when 1
          d=Document.find_by_complete_path(complete_path)
          if File.mtime(complete_path).strftime("%Y%m%d%H%M%S").to_i > d.mtime then
            log :debug => "\thas been modified"
            delete_file(complete_path)
          else
            should_be_added = false
            log :debug => "\thas not been modified. leaving it"
          end
        else
          delete_file(complete_path)
        end
      end
      add_file(complete_path) if should_be_added
    end

    # Writes the file to the index. When content extraction fails or yields
    # only whitespace, the file is still indexed, without :content and :lang.
    def add_file(complete_path)
      log :debug => "Adding #{complete_path}"
      # NOTE(review): the result is not used below; the call is kept as-is
      # because File.mime is defined elsewhere — confirm it can be dropped.
      mime_type=File.mime(complete_path)
      fields = fields_for(complete_path)

      begin
        text, lang = PlainTextExtractor.extract_content_and_language_from(complete_path)
        raise "\tempty document #{complete_path}" if text.strip.empty?
        fields[:content] = text
        log :debug => "language found: #{lang}" if lang
        fields[:lang] = lang
      rescue => e
        log :debug => "\tindexing without content: #{e.message}"
      end

      writer << fields
    end

    # Memoized IndexWriter.
    def writer
      @@writer ||= IndexWriter.new
    end

    # Memoized IndexReader.
    def reader
      @@reader ||= IndexReader.new
    end

    # Forgets the memoized writer and reader and removes the index files.
    def reset!
      log :debug => "Resetting Index"
      @@writer=nil
      @@reader=nil
      IndexWriter.remove
    end

    # Removes every index entry for the given path.
    def delete_file(complete_path)
      log :debug => "\tRemoving from index"
      reader.delete_by_complete_path(complete_path)
      # IndexReader#delete_by_complete_path closes the reader once done:
      # forget it so that the next access opens a fresh one.
      @@reader = nil
    end

    private

    # Sends each message of the Hash to IndexerLogger at the given level,
    # e.g. log :debug => "message"
    def log(hash)
      hash.each{|level,message|
        IndexerLogger.send(level,message)
      }
    end
  end
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'plain_text_extractor_DSL'
|
2
|
+
|
3
|
+
# Class side: registry of every defined extractor (stored in
# Picolena::Extractors) plus lookup/extraction entry points.
# Instance side: one extractor definition (see PlainTextExtractorDSL) able to
# convert its current +source+ file to plain text.
class PlainTextExtractor
  include PlainTextExtractorDSL

  class<<self
    # Returns every defined extractor
    def all
      Picolena::Extractors
    end

    # Add an extractor to the extractors list
    def add(extractor)
      all<<extractor
    end

    # Calls block for each extractor
    def each(&block)
      all.each(&block)
    end

    # Returns every required dependency for every defined extractor
    # (flattened, without nil or duplicates, sorted; computed once).
    def dependencies
      @@dependencies||=all.collect{|extractor| extractor.dependencies}.flatten.compact.uniq.sort
    end

    # Returns every supported file extensions (computed once).
    def supported_extensions
      @@supported_exts||=all.collect{|extractor| extractor.exts}.flatten.compact.uniq
    end

    # Finds which extractor should be used for a given file, according to its extension.
    # The registered extractor itself is returned, with its source set to filename.
    # Raises ArgumentError if the file is unsupported.
    def find_by_filename(filename)
      ext=File.ext_as_sym(filename)
      found_extractor=all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}")
      found_extractor.source=filename
      found_extractor
    end

    # Launches extractor on given file and outputs plain text result
    def extract_content_from(source)
      find_by_filename(source).extract_content
    end

    # Launches extractor on given file and returns [content, language].
    def extract_content_and_language_from(source)
      find_by_filename(source).extract_content_and_language
    end

    # Returns the language guesser command ('mguesser -n1'),
    # or nil when `which mguesser` outputs nothing.
    def language_guesser
      @@language_guesser||=('mguesser -n1' unless IO.popen("which mguesser"){|i| i.read}.empty?)
    end
  end

  attr_accessor :source

  # Parses command in order to know which programs are needed.
  # rspec will then check that every dependency is installed on the system.
  # For a String command, returns the first word of each pipe-separated part;
  # otherwise returns the dependencies declared with which_requires.
  def dependencies
    if command.is_a?(String) then
      command.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first}
    else
      @dependencies
    end
  end

  ## Conversion part

  # destination method can be used by some conversion command that cannot output to stdout (example?)
  # a file containing plain text result will first be written by command, and then be read by extract_content.
  # The path is computed once and then shared.
  def destination
    require 'tmpdir'
    @@temp_file_as_destination ||= File.join(Dir::tmpdir,"ferret_#{Time.now.to_i}")
  end

  # Replaces generic command with specific source and destination (if specified) files.
  # Both paths are wrapped in double quotes.
  def specific_command
    # Block form of String#sub: the replacement is inserted verbatim, so
    # backslashes in a path are not interpreted as back-references.
    command.sub('SOURCE'){ "\"#{source}\"" }.sub('DESTINATION'){ "\"#{destination}\"" }
  end

  # Returns plain text content of source file
  def extract_content
    if command.is_a?(String) then
      # If command is a String, launch it as an external command.
      if command.include?('DESTINATION') then
        # If command includes 'DESTINATION' keyword,
        # launches the command and returns the content of
        # DESTINATION file.
        system(specific_command)
        File.read_and_remove(destination)
      else
        # Otherwise, launches the command and returns STDOUT.
        IO.popen(specific_command){|io| io.read}
      end
    else
      # command is a Block.
      # Returns the result of command.call,
      # with source file as parameter.
      command.call(source)
    end
  end

  # Returns plain text content and language of source file,
  # using mguesser to guess used language.
  # This method only returns probable language if the content is bigger than 500 chars
  # and if probability score is at least 90%.
  def extract_content_and_language
    content=extract_content
    # Language recognition is too unreliable for small files.
    return [content, nil] unless Picolena::UseLanguageRecognition && PlainTextExtractor.language_guesser && content.size > 500
    language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
      lang_guesser.write content
      lang_guesser.close_write
      output=lang_guesser.read
      # Expected output line: score <TAB> language <TAB> encoding
      if output=~/^([01]\.\d+)\t(\w+)\t(\w+)/ then
        score, lang, encoding = $1.to_f, $2, $3
        # Language recognition isn't reliable if score is too low.
        lang unless score<0.9
      end
    }
    [content,language]
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Turns raw (possibly localized) user queries into Ferret queries.
class Query
  class << self
    # Parses a raw String query, after translating its localized keywords
    # to english, and returns what the query parser produces.
    def extract_from(raw_query)
      english_query = convert_to_english(raw_query)
      parser.parse(english_query)
    end

    private

    # Replaces localized query keywords (boolean operators, field prefixes
    # and the LIKE operator) by the english ones Ferret can parse.
    def convert_to_english(raw_query)
      translations = {
        /\b#{:AND.l}\b/          => 'AND',
        /\b#{:OR.l}\b/           => 'OR',
        /\b#{:NOT.l}\b/          => 'NOT',
        /(#{:filetype.l}):/      => 'filetype:',
        /#{:content.l}:/         => 'content:',
        /#{:date.l}:/            => 'date:',
        /\b#{:LIKE.l}\s+(\S+)/   => '\1~'
      }
      english = raw_query
      translations.each{|localized_pattern, english_keyword|
        english = english.gsub(localized_pattern, english_keyword)
      }
      english
    end

    # Instantiates a QueryParser once, and keeps it in cache.
    def parser
      @@parser ||= Ferret::QueryParser.new(:fields => [:content, :file, :basename, :filetype, :date], :or_default => false, :analyzer=>Picolena::Analyzer)
    end
  end
end
|
@@ -3,8 +3,8 @@
|
|
3
3
|
%small=number_to_percentage(document.score*100, :precision=>1)
|
4
4
|
=highlight_matching_content(document)
|
5
5
|
%p=link_to_containing_directory(document)
|
6
|
-
-
|
6
|
+
-if document.supported?
|
7
7
|
%p
|
8
8
|
=link_to_plain_text_content(document)
|
9
9
|
=link_to_cached_content(document)
|
10
|
-
%hr/
|
10
|
+
%hr/
|
@@ -1,4 +1,4 @@
|
|
1
|
-
%w(rubygems paginator pathname logger).each{|lib| require lib}
|
1
|
+
%w(rubygems paginator fileutils pathname logger thread).each{|lib| require lib}
|
2
2
|
|
3
3
|
# Uncomment below to force Rails into production mode when
|
4
4
|
# you don't control web/app server and can't set it the proper way
|
@@ -7,7 +7,7 @@
|
|
7
7
|
# Specifies gem version of Rails to use when vendor/rails is not present
|
8
8
|
RAILS_GEM_VERSION = '2.0.2' unless defined? RAILS_GEM_VERSION
|
9
9
|
|
10
|
-
|
10
|
+
IndexerLogger=Logger.new($stdout)
|
11
11
|
|
12
12
|
# Bootstrap the Rails environment, frameworks, and default configuration
|
13
13
|
require File.join(File.dirname(__FILE__), 'boot')
|
@@ -1,3 +1,4 @@
|
|
1
|
+
module Picolena
|
1
2
|
#Loading directories to be indexed
|
2
3
|
indexed_dir_config_file='config/custom/indexed_directories.yml'
|
3
4
|
IndexedDirectories={}
|
@@ -6,3 +7,4 @@ YAML.load_file(indexed_dir_config_file)[RAILS_ENV].each_pair{|abs_or_rel_path, a
|
|
6
7
|
}
|
7
8
|
|
8
9
|
IndexSavePath=File.join(IndexesSavePath,ENV["RAILS_ENV"] || "development")
|
10
|
+
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
module Picolena
|
1
2
|
#Deny all, Allow only IPs described in config/custom/white_list_ip.yml
|
2
3
|
white_list_ip_config_file='config/custom/white_list_ip.yml'
|
3
4
|
WhiteListIPs=Regexp.new(
|
@@ -5,4 +6,5 @@ WhiteListIPs=Regexp.new(
|
|
5
6
|
YAML.load_file(white_list_ip_config_file)["Allow"].collect{|ip|
|
6
7
|
ip.downcase.include?("all") ? /.*/ : Regexp.escape(ip)
|
7
8
|
}.join("|")<<")"
|
8
|
-
) rescue /^(127\.0\.0\.1|0\.0\.0\.0)/
|
9
|
+
) rescue /^(127\.0\.0\.1|0\.0\.0\.0)/
|
10
|
+
end
|
data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
+
module Picolena
|
1
2
|
custom_localization_yml=File.join(RAILS_ROOT,'config/custom/title_and_names_and_links.yml')
|
2
3
|
|
3
4
|
YAML.load_file(custom_localization_yml).each{|key_name, custom_translation|
|
4
5
|
Globalite.localizations[key_name.to_sym]=custom_translation unless custom_translation.blank?
|
5
6
|
}
|
7
|
+
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
# Builds FiletypeToIconSymbol, a Hash mapping each lowercased filetype to an
# icon symbol, from config/custom/icons_and_filetypes.yml.
# Each YAML entry maps an icon name to a whitespace-separated list of filetypes.
icons_config_file='config/custom/icons_and_filetypes.yml'
FiletypeToIconSymbol={}
YAML.load_file(icons_config_file).each_pair{|icon_name, filetypes|
  icon_symbol=icon_name.to_sym
  # String#split without argument ignores leading and repeated whitespace,
  # so no empty-string filetype ends up as a key.
  filetypes.split.each{|filetype|
    FiletypeToIconSymbol[filetype.downcase]=icon_symbol
  }
}
|
@@ -17,11 +17,30 @@ end
|
|
17
17
|
|
18
18
|
class String
|
19
19
|
# Creates a "probably unique" id with the desired length, composed only of lowercase letters.
|
20
|
-
def base26_hash(length=HashLength)
|
20
|
+
def base26_hash(length=Picolena::HashLength)
|
21
21
|
Digest::MD5.hexdigest(self).to_i(16).to_s(26).tr('0-9a-p', 'a-z')[-length,length]
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
+
module Enumerable
  # Calls the block once per element, each call running in its own Thread,
  # and waits until every thread has finished.
  # Returns the Array of threads that were started.
  def each_with_thread(&block)
    workers = collect{|item|
      Thread.new(item){|thread_item| block.call(thread_item)}
    }
    workers.each{|worker| worker.join}
  end
end
|
35
|
+
|
36
|
+
class Array
  # Distributes the elements over n chunks in round-robin order: the element
  # at index i goes into chunk i % n, so chunk sizes differ by at most one.
  #
  #   [1,2,3,4,5].in_transposed_chunks(2) #=> [[1,3,5],[2,4]]
  #
  # n - number of chunks, a positive Integer.
  #
  # Always returns an Array of exactly n Arrays (some may be empty).
  # nil elements are kept in place.
  def in_transposed_chunks(n)
    chunks = Array.new(n){ [] }
    each_with_index{|element, index| chunks[index % n] << element}
    chunks
  end
end
|
43
|
+
|
25
44
|
class File
|
26
45
|
def self.ext_as_sym(filename)
|
27
46
|
File.extname(filename).sub(/^\./,'').downcase.to_sym rescue :no_extension
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# Defines plain text extractors with DSL
|
2
|
+
# For example, to convert "Microsoft Office Word document" to plain text
|
3
|
+
# PlainTextExtractor.new {
|
4
|
+
# every :doc, :dot
|
5
|
+
# as "application/msword"
|
6
|
+
# aka "Microsoft Office Word document"
|
7
|
+
# with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
|
8
|
+
# which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
|
9
|
+
# or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
|
10
|
+
# }
|
11
|
+
|
12
|
+
# DSL methods used inside a PlainTextExtractor.new { ... } block to describe
# one extractor: handled extensions, mime type, description, conversion
# command and spec examples.
module PlainTextExtractorDSL
  attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples

  # Evaluates the definition block in the context of the new extractor, then
  # registers the extractor and its mime type.
  def initialize(&block)
    @content_and_file_examples=[]
    self.instance_eval(&block)
    PlainTextExtractor.add(self)
    MimeType.add(self.exts,self.mime_name)
  end

  # Declares the file extensions (as symbols) handled by this extractor.
  def every(*exts)
    @exts=exts
  end

  # Declares the mime type name.
  def as(mime_name)
    @mime_name=mime_name
  end

  # Declares the human readable description.
  def aka(description)
    @description=description
  end

  # Declares the dependencies explicitly (they cannot be parsed from a block command).
  def which_requires(*dependencies)
    @dependencies=dependencies
  end

  #used by rspec to test extractors:
  # which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
  # or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
  #
  #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed,
  #and if plain text output from the extractor applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file'
  def which_should_for_example_extract(content, file)
    @content_and_file_examples << [content,file[:from]]
  end

  #it allows to define specs in this way:
  # which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
  # or_extract 'text inside!', :from => 'crossed.txt'
  alias_method :or_extract, :which_should_for_example_extract

  # Declares how content is extracted. Accepts either:
  #   * a String command ("antiword SOURCE"),
  #   * a Hash of command => platform ("antiword SOURCE" => :on_linux, "..." => :on_windows),
  #   * or a block receiving the source filename.
  # Raises if no command is given, or if the Hash has none for the current platform.
  def with(command_as_hash_or_string=nil,&block)
    #TODO: Find a better way to manage platforms.
    platform=case RUBY_PLATFORM
      when /linux|darwin|bsd/
        # Unix-like systems share the :on_linux shell commands.
        # Tested first, because "darwin" also contains "win".
        :on_linux
      when /win|mingw/
        :on_windows
    end
    @command=case command_as_hash_or_string
      when String
        command_as_hash_or_string
      when Hash
        command_as_hash_or_string.invert[platform] ||
          raise("No command defined for this extractor on #{RUBY_PLATFORM}: #{description}")
      else
        block || raise("No command defined for this extractor: #{description}")
    end
    # Errors are silenced for single commands; piped commands are left untouched.
    # += builds a new String, so neither the caller's String nor a frozen Hash key is mutated.
    @command+=' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|'))
  end
end
|
@@ -4,10 +4,10 @@
|
|
4
4
|
# Installation: Ubuntu xpdf-utils package
|
5
5
|
# Home page: http://www.foolabs.com/xpdf/
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
PlainTextExtractor.new {
|
8
|
+
every :pdf
|
9
9
|
as "application/pdf"
|
10
10
|
aka "Adobe Portable Document Format"
|
11
11
|
with "pdftotext -enc UTF-8 SOURCE -" => :on_linux, "some other command" => :on_windows
|
12
12
|
which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
|
13
|
-
}
|
13
|
+
}
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#Excel 97-2003
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
PlainTextExtractor.new {
|
4
|
+
every :xls
|
5
5
|
as "application/excel"
|
6
6
|
aka "Microsoft Office Excel document"
|
7
7
|
with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux, "some other command" => :on_windows
|
@@ -11,8 +11,8 @@ PlainText.extract {
|
|
11
11
|
#Excel 2007
|
12
12
|
|
13
13
|
require 'zip/zip'
|
14
|
-
|
15
|
-
|
14
|
+
PlainTextExtractor.new {
|
15
|
+
every :xlsx
|
16
16
|
as 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
|
17
17
|
aka "Microsoft Office 2007 Excel spreadsheet"
|
18
18
|
with {|source|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
#Powerpoint 97-2003
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
PlainTextExtractor.new {
|
4
|
+
every :ppt, :pps
|
5
5
|
as "application/powerpoint"
|
6
6
|
aka "Microsoft Office Powerpoint document"
|
7
7
|
with "catppt SOURCE" => :on_linux, "some other command" => :on_windows
|
@@ -13,8 +13,8 @@ PlainText.extract {
|
|
13
13
|
#Powerpoint 2007
|
14
14
|
|
15
15
|
require 'zip/zip'
|
16
|
-
|
17
|
-
|
16
|
+
PlainTextExtractor.new {
|
17
|
+
every :pptx
|
18
18
|
as 'application/vnd.openxmlformats-officedocument.presentationml.presentation' #could that mime BE any longer?
|
19
19
|
aka "Microsoft Office 2007 Powerpoint document"
|
20
20
|
with {|source|
|
@@ -4,8 +4,8 @@
|
|
4
4
|
# Installation: Ubuntu unrtf package
|
5
5
|
# http://www.gnu.org/software/unrtf/unrtf.html
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
PlainTextExtractor.new {
|
8
|
+
every :rtf
|
9
9
|
as "application/rtf"
|
10
10
|
aka "Microsoft Rich Text Format"
|
11
11
|
with "unrtf SOURCE -t text" => :on_linux, "some other command" => :on_windows
|