picolena 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. data/History.txt +8 -0
  2. data/Manifest.txt +28 -15
  3. data/README.txt +1 -1
  4. data/config/files_to_clean +2 -1
  5. data/config/hoe.rb +1 -1
  6. data/lib/picolena/config/basic.rb +46 -35
  7. data/lib/picolena/config/icons_and_filetypes.yml +69 -0
  8. data/lib/picolena/config/indexed_directories.yml +1 -1
  9. data/lib/picolena/picolena_generator.rb +3 -1
  10. data/lib/picolena/templates/app/controllers/application.rb +2 -2
  11. data/lib/picolena/templates/app/controllers/documents_controller.rb +1 -1
  12. data/lib/picolena/templates/app/helpers/documents_helper.rb +7 -26
  13. data/lib/picolena/templates/app/models/document.rb +32 -14
  14. data/lib/picolena/templates/app/models/finder.rb +21 -78
  15. data/lib/picolena/templates/app/models/index_reader.rb +56 -0
  16. data/lib/picolena/templates/app/models/index_writer.rb +36 -0
  17. data/lib/picolena/templates/app/models/indexer.rb +142 -0
  18. data/lib/picolena/templates/app/models/plain_text_extractor.rb +122 -0
  19. data/lib/picolena/templates/app/models/query.rb +31 -0
  20. data/lib/picolena/templates/app/views/documents/_document.html.haml +2 -2
  21. data/lib/picolena/templates/config/environment.rb +2 -2
  22. data/lib/picolena/templates/config/environments/development.rb +1 -1
  23. data/lib/picolena/templates/config/environments/production.rb +1 -1
  24. data/lib/picolena/templates/config/environments/test.rb +1 -1
  25. data/lib/picolena/templates/config/initializers/002_load_indexed_dirs.rb +2 -0
  26. data/lib/picolena/templates/config/initializers/003_load_white_list_IPs.rb +3 -1
  27. data/lib/picolena/templates/config/initializers/004_load_plain_text_extractors.rb +6 -0
  28. data/lib/picolena/templates/config/initializers/005_load_custom_title_and_names_and_links.rb +2 -0
  29. data/lib/picolena/templates/config/initializers/006_load_icons.rb +8 -0
  30. data/lib/picolena/templates/lib/core_exts.rb +20 -1
  31. data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +72 -0
  32. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/adobe.pdf.rb +3 -3
  33. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/html.rb +2 -2
  34. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.excel.rb +4 -4
  35. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.powerpoint.rb +4 -4
  36. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.rtf.rb +2 -2
  37. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/ms.word.rb +4 -4
  38. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.presentation.rb +2 -2
  39. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.spreadsheet.rb +2 -2
  40. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/opendocument.text.rb +2 -2
  41. data/lib/picolena/templates/lib/{filters → plain_text_extractors}/plain_text.rb +3 -3
  42. data/lib/picolena/templates/lib/tasks/index.rake +4 -6
  43. data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
  44. data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +5 -5
  45. data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +1 -1
  46. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +13 -13
  47. data/lib/picolena/templates/spec/models/document_spec.rb +1 -1
  48. data/lib/picolena/templates/spec/models/finder_spec.rb +5 -70
  49. data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +6 -2
  50. data/lib/picolena/templates/spec/models/index_directories_spec.rb +4 -4
  51. data/lib/picolena/templates/spec/models/index_reader_spec.rb +7 -0
  52. data/lib/picolena/templates/spec/models/index_writer_spec.rb +7 -0
  53. data/lib/picolena/templates/spec/models/indexer_spec.rb +7 -0
  54. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +42 -0
  55. data/lib/picolena/templates/spec/models/query_spec.rb +56 -0
  56. data/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe +42 -0
  57. data/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo +83 -0
  58. data/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca +86 -0
  59. data/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare +90 -0
  60. data/lib/picolena/version.rb +1 -1
  61. data/tasks/hack.rake +2 -1
  62. data/website/index.html +2 -2
  63. data.tar.gz.sig +0 -0
  64. metadata +30 -17
  65. metadata.gz.sig +0 -0
  66. data/lib/picolena/templates/config/initializers/004_load_filters.rb +0 -6
  67. data/lib/picolena/templates/lib/ff.rb +0 -117
  68. data/lib/picolena/templates/lib/filter.rb +0 -75
  69. data/lib/picolena/templates/lib/filter_DSL.rb +0 -77
  70. data/lib/picolena/templates/spec/models/filters_spec.rb +0 -30
@@ -0,0 +1,56 @@
1
# Read-side handle on the Picolena Ferret index.
#
# Subclass of Ferret::Index::Index that is always opened on
# Picolena::IndexSavePath with Picolena::Analyzer, and that adds lookups and
# deletions keyed on the :complete_path field.
class IndexReader < Ferret::Index::Index
  # Opens the index.
  #
  # params:: extra options forwarded to the Ferret superclass. :path and
  #          :analyzer are always overridden. The caller's hash is left
  #          untouched (a merged copy is passed on).
  def initialize(params={})
    super(params.merge(:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer))
  end

  # Returns the number of hits found in the index for the given complete path
  # (i.e. how many times the file is present in the index).
  def occurences_number(complete_path)
    search_by_complete_path(complete_path).total_hits
  end

  # Runs a phrase search on the :complete_path field and returns the search result.
  #
  # NOTE(review): complete_path is inserted verbatim between double quotes, so
  # a path that itself contains a double quote produces a malformed query.
  # Escaping rules of the Ferret query parser should be confirmed before changing this.
  def search_by_complete_path(complete_path)
    search("complete_path:\"#{complete_path}\"")
  end

  # Deletes every index entry found for complete_path, then closes this reader.
  # The instance must not be used again after this call.
  def delete_by_complete_path(complete_path)
    search_by_complete_path(complete_path).hits.each{|hit|
      delete(hit.doc)
    }
    close
  end


  # Validation methods.

  # Raises IndexError unless at least one document is indexed.
  def should_have_documents
    raise IndexError, "no document found" unless has_documents?
  end

  # Returns true if there's at least one document indexed.
  def has_documents?
    size>0
  end

  class<<self

    # Builds the index from scratch (Indexer.index_every_directory with
    # update=false) when no index file exists. Never does so in production.
    def ensure_existence
      Indexer.index_every_directory(false) unless exists? || RAILS_ENV=="production"
    end

    # Returns a true value if an index file is present on disk,
    # nil when no index file was found at all.
    def exists?
      index_file = filename
      # File.exist? replaces File.exists?, which has been removed from recent Rubies.
      index_file && File.exist?(index_file)
    end

    # Returns the first '*.cfs' file found in Picolena::IndexSavePath, or nil.
    def filename
      Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
    end
  end
end
@@ -0,0 +1,36 @@
1
# Write-side handle on the Picolena Ferret index.
#
# Subclass of Ferret::Index::IndexWriter that is always opened on
# Picolena::IndexSavePath with Picolena::Analyzer, creates the index when it
# is missing, and declares the fields Picolena stores for each document.
class IndexWriter < Ferret::Index::IndexWriter
  # Opens (or creates) the index and makes sure the required fields exist.
  #
  # params:: extra options forwarded to the Ferret superclass.
  #          :create_if_missing, :path and :analyzer are always overridden.
  #          The caller's hash is left untouched (a merged copy is passed on).
  def initialize(params={})
    # :auto_flush => true is deliberately not set (huge performance impact?).
    super(params.merge(:create_if_missing => true,
                       :path => Picolena::IndexSavePath,
                       :analyzer => Picolena::Analyzer))
    # Add required fields (content, filetype, probably_unique_id, ...)
    add_fields!
  end

  # Deletes every regular file found directly inside Picolena::IndexSavePath.
  # Sub-directories are left alone.
  def self.remove
    Dir.glob(File.join(Picolena::IndexSavePath,'*')).each{|f| FileUtils.rm(f) if File.file?(f)}
  end

  private

  # Declares the index fields, unless the index already has some:
  # there is no need to re-create any field.
  def add_fields!
    return unless field_infos.fields.empty?
    field_infos.add_field(:complete_path, :store => :yes, :index => :yes)
    field_infos.add_field(:content, :store => :yes, :index => :yes)
    field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
    field_infos.add_field(:file, :store => :no, :index => :yes, :boost => 1.5)
    field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
    field_infos.add_field(:date, :store => :yes, :index => :yes)
    field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes)
    field_infos.add_field(:lang, :store => :yes, :index => :yes)
  end
end
@@ -0,0 +1,142 @@
1
# Walks the indexed directories and adds, updates or removes documents in the
# index through IndexWriter and IndexReader. Every method is a class method.
class Indexer
  # This regexp defines which files should *not* be indexed.
  @@exclude = /(Thumbs\.db)/
  # Number of threads that will be used during indexing process
  @@max_threads_number = 8

  class << self
    # Returns the Hash of index fields that can be derived from the path alone
    # (no content extraction): path, hashed id, file name, readable base name,
    # extension and modification date formatted as YYYYmmddHHMMSS.
    def fields_for(complete_path)
      {
        :complete_path => complete_path,
        :probably_unique_id => complete_path.base26_hash,
        :file => File.basename(complete_path),
        :basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '),
        :filetype => File.extname(complete_path),
        :date => File.mtime(complete_path).strftime("%Y%m%d%H%M%S")
      }
    end

    # Indexes every directory listed in Picolena::IndexedDirectories, then
    # optimizes and closes the writer.
    #
    # update:: when true (default) the existing index is updated file by file;
    #          when false the index is wiped first and rebuilt from scratch.
    def index_every_directory(update=true)
      log :debug => "Indexing every directory"

      start=Time.now
      @update = update
      reset! unless update

      Picolena::IndexedDirectories.each{|dir, alias_dir|
        index_directory_with_multithreads(dir)
      }

      writer.optimize
      writer.close
      # The writer is memoized: forget it once closed, so that the next run
      # opens a fresh IndexWriter instead of reusing the closed instance.
      # NOTE(review): the original FIXME reported a segfault when this method
      # was launched twice in a row; presumably caused by that reuse - confirm.
      @@writer = nil

      log :debug => "Indexing done in #{Time.now-start} s."
    end

    # Indexes every regular file found under dir (recursively), except those
    # matching @@exclude, spreading the files over several threads.
    #
    # FIXME (kept from the original): if more than one thread is created while
    # updating the index, indexer raises "current thread not owner"
    # (monitor.rb, mon_check_owner). So Index creation is multithreaded,
    # Index update is monothreaded.
    def index_directory_with_multithreads(dir)
      threads_number = @update ? 1 : @@max_threads_number
      log :debug => "Indexing #{dir}, #{threads_number} thread(s)"

      indexing_list=Dir[File.join(dir,"**/*")].select{|filename|
        File.file?(filename) && filename !~ @@exclude
      }

      indexing_list_chunks=indexing_list.in_transposed_chunks(threads_number)

      indexing_list_chunks.each_with_thread{|chunk|
        chunk.each{|filename|
          add_or_update_file(filename)
        }
      }
    end

    # Adds complete_path to the index. In update mode, first looks at how many
    # times the file is already indexed:
    # * 0 times: the file is simply added;
    # * 1 time: it is removed and re-added only if the file on disk is newer
    #   than the indexed document, otherwise it is left alone;
    # * more than once: every occurence is removed and the file is re-added.
    def add_or_update_file(complete_path)
      should_be_added = true
      if @update then
        log :debug => "What to do with #{complete_path} ?"
        occurences = reader.occurences_number(complete_path)
        log :debug => "\tappears #{occurences} times in the index"
        case occurences
        when 0
          #Nothing to do here, the file will be added.
        when 1
          d=Document.find_by_complete_path(complete_path)
          if File.mtime(complete_path).strftime("%Y%m%d%H%M%S").to_i > d.mtime then
            log :debug => "\thas been modified"
            delete_file(complete_path)
          else
            should_be_added = false
            log :debug => "\thas not been modified. leaving it"
          end
        else
          delete_file(complete_path)
        end
      end
      add_file(complete_path) if should_be_added
    end

    # Writes one document to the index. Content and language are added when
    # they can be extracted; when extraction fails or yields only whitespace,
    # the document is still indexed with the path-derived fields.
    def add_file(complete_path)
      log :debug => "Adding #{complete_path}"
      # NOTE(review): the result of File.mime is not used here. The call is
      # kept because File.mime is defined outside this file and may raise or
      # have side effects - confirm before removing.
      File.mime(complete_path)
      fields = fields_for(complete_path)

      begin
        text, lang = PlainTextExtractor.extract_content_and_language_from(complete_path)
        raise "\tempty document #{complete_path}" if text.strip.empty?
        fields[:content] = text
        log :debug => "language found: #{lang}" if lang
        fields[:lang] = lang
      rescue => e
        log :debug => "\tindexing without content: #{e.message}"
      end

      writer << fields
    end

    # Memoized IndexWriter, created on first use.
    def writer
      @@writer ||= IndexWriter.new
    end

    # Memoized IndexReader, created on first use.
    def reader
      @@reader ||= IndexReader.new
    end

    # Forgets the memoized writer and reader and removes the index files.
    def reset!
      log :debug => "Resetting Index"
      @@writer=nil
      @@reader=nil
      IndexWriter.remove
    end

    # Removes every index entry of complete_path.
    def delete_file(complete_path)
      log :debug => "\tRemoving from index"
      reader.delete_by_complete_path(complete_path)
      # IndexReader#delete_by_complete_path closes the reader it is called on:
      # forget the memoized instance so that the next lookup opens a new one.
      @@reader = nil
    end

    private

    # Sends each message to IndexerLogger at the given level,
    # e.g. log :debug => "message".
    def log(hash)
      hash.each{|level,message|
        IndexerLogger.send(level,message)
      }
    end
  end
end
@@ -0,0 +1,122 @@
1
+ require 'plain_text_extractor_DSL'
2
+
3
# A PlainTextExtractor knows how to turn one family of files (selected by
# extension) into plain text, either by running an external command or by
# calling a Ruby block. Extractors are declared with PlainTextExtractorDSL and
# registered in Picolena::Extractors.
class PlainTextExtractor
  include PlainTextExtractorDSL
  class<<self
    # Returns every defined extractor
    def all
      Picolena::Extractors
    end

    # Add an extractor to the extractors list
    def add(extractor)
      all<<extractor
    end

    # Calls block for each extractor
    def each(&block)
      all.each(&block)
    end

    # Returns every required dependency for every defined extractor
    # (flattened, without nil or duplicates, sorted). Memoized.
    def dependencies
      @@dependencies||=all.collect{|extractor| extractor.dependencies}.flatten.compact.uniq.sort
    end

    # Returns every supported file extensions. Memoized.
    def supported_extensions
      @@supported_exts||=all.collect{|extractor| extractor.exts}.flatten.compact.uniq
    end

    # Finds which extractor should be used for a given file, according to its extension
    # Raises ArgumentError if the file is unsupported.
    #
    # The returned object is a copy of the registered extractor with its
    # source set to filename. The registered extractors are shared, and the
    # indexer extracts files from several threads: setting source on the
    # shared instance would let one thread overwrite the file another thread
    # is about to convert.
    def find_by_filename(filename)
      ext=File.ext_as_sym(filename)
      registered=all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}")
      found_extractor=registered.dup
      found_extractor.source=filename
      found_extractor
    end

    # Launches extractor on given file and outputs plain text result
    def extract_content_from(source)
      find_by_filename(source).extract_content
    end

    # Launches extractor on given file and returns [plain text, language].
    def extract_content_and_language_from(source)
      find_by_filename(source).extract_content_and_language
    end

    # Returns the command used to guess languages ('mguesser -n1'),
    # or nil when `which mguesser` finds nothing.
    # The probe runs only once, whatever its outcome.
    def language_guesser
      unless defined?(@@language_guesser)
        @@language_guesser=('mguesser -n1' unless IO.popen("which mguesser"){|i| i.read}.empty?)
      end
      @@language_guesser
    end
  end

  attr_accessor :source

  # Copies made by dup/clone get their own temporary destination file.
  def initialize_copy(original)
    super
    @destination=nil
  end

  # Parses command in order to know which programs are needed.
  # rspec will then check that every dependency is installed on the system
  #
  # For a String command, returns the first word of each part of the pipeline.
  # Otherwise returns the dependencies declared with which_requires (may be nil).
  def dependencies
    if command.is_a?(String) then
      command.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first}
    else
      @dependencies
    end
  end

  ## Conversion part

  # destination method can be used by some conversion command that cannot output to stdout
  # a file containing plain text result will first be written by command, and then be read by extract_content.
  #
  # The path is memoized per extractor instance; the name includes the
  # process id and the object id so that concurrent extractions do not share
  # one file.
  def destination
    require 'tmpdir'
    @destination ||= File.join(Dir::tmpdir,"ferret_#{Time.now.to_i}_#{Process.pid}_#{object_id}")
  end

  # Replaces generic command with specific source and destination (if specified) files.
  # Both paths are wrapped in double quotes.
  #
  # The block form of sub is used on purpose: with a String replacement,
  # backslashes in the path would be interpreted as back-references.
  #
  # NOTE(review): double quotes do not neutralise `"`, `$` or backquotes in a
  # file name on a POSIX shell; proper shell escaping should be considered if
  # indexed file names cannot be trusted.
  def specific_command
    command.sub('SOURCE'){ "\"#{source}\"" }.sub('DESTINATION'){ "\"#{destination}\"" }
  end

  # Returns plain text content of source file
  def extract_content
    if command.is_a?(String) then
      # If command is a String, launch it in a shell.
      if command.include?('DESTINATION') then
        # If command includes 'DESTINATION' keyword,
        # launches the command and returns the content of
        # DESTINATION file.
        system(specific_command)
        File.read_and_remove(destination)
      else
        # Otherwise, launches the command and returns STDOUT.
        IO.popen(specific_command){|io| io.read}
      end
    else
      # command is a Block.
      # Returns the result of command.call,
      # with source file as parameter.
      command.call(source)
    end
  end

  # Returns plain text content and language of source file,
  # using mguesser to guess used language.
  # This method only returns probable language if the content is bigger than 500 chars
  # and if probability score is at least 90%. Otherwise language is nil.
  def extract_content_and_language
    content=extract_content
    # Language recognition is too unreliable for small files.
    return [content, nil] unless Picolena::UseLanguageRecognition && PlainTextExtractor.language_guesser && content.size > 500
    language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
      lang_guesser.write content
      lang_guesser.close_write
      output=lang_guesser.read
      # Expected line format: score<TAB>language<TAB>encoding
      if output=~/^([01]\.\d+)\t(\w+)\t(\w+)/ then
        score, lang = $1.to_f, $2
        # Language recognition isn't reliable if score is too low.
        lang unless score<0.9
      end
    }
    [content,language]
  end
end
@@ -0,0 +1,31 @@
1
# Turns the raw query typed by the user into a Ferret query.
class Query
  class << self
    # Returns a Ferret::Query from a raw String query.
    def extract_from(raw_query)
      parser.parse(convert_to_english(raw_query))
    end

    private

    # Converts query keywords to english so they can be parsed by Ferret.
    #
    # Localized keywords (Symbol#l) are replaced by their english counterparts:
    # AND, OR, NOT, the field prefixes filetype:/content:/date:, and
    # "LIKE word", which becomes "word~".
    #
    # The translations are looked up on every call (nothing is cached here),
    # applied in the order listed below, and escaped before being turned into
    # a regexp so that a translation containing characters such as '.' or '('
    # is matched literally.
    def convert_to_english(raw_query)
      localized=lambda{|keyword| Regexp.escape(keyword.l.to_s)}
      to_en=[
        [/\b#{localized.call(:AND)}\b/,        'AND'],
        [/\b#{localized.call(:OR)}\b/,         'OR'],
        [/\b#{localized.call(:NOT)}\b/,        'NOT'],
        [/#{localized.call(:filetype)}:/,      'filetype:'],
        [/#{localized.call(:content)}:/,       'content:'],
        [/#{localized.call(:date)}:/,          'date:'],
        [/\b#{localized.call(:LIKE)}\s+(\S+)/, '\1~']
      ]
      to_en.inject(raw_query){|query,(pattern,english)|
        query.gsub(pattern,english)
      }
    end

    # Instantiates a QueryParser once, and keeps it in cache.
    def parser
      @@parser ||= Ferret::QueryParser.new(:fields => [:content, :file, :basename, :filetype, :date], :or_default => false, :analyzer=>Picolena::Analyzer)
    end
  end
end
@@ -3,8 +3,8 @@
3
3
  %small=number_to_percentage(document.score*100, :precision=>1)
4
4
  =highlight_matching_content(document)
5
5
  %p=link_to_containing_directory(document)
6
- - if document.supported?
6
+ -if document.supported?
7
7
  %p
8
8
  =link_to_plain_text_content(document)
9
9
  =link_to_cached_content(document)
10
- %hr/
10
+ %hr/
@@ -1,4 +1,4 @@
1
- %w(rubygems paginator pathname logger).each{|lib| require lib}
1
+ %w(rubygems paginator fileutils pathname logger thread).each{|lib| require lib}
2
2
 
3
3
  # Uncomment below to force Rails into production mode when
4
4
  # you don't control web/app server and can't set it the proper way
@@ -7,7 +7,7 @@
7
7
  # Specifies gem version of Rails to use when vendor/rails is not present
8
8
  RAILS_GEM_VERSION = '2.0.2' unless defined? RAILS_GEM_VERSION
9
9
 
10
- IndexLogger=Logger.new($stdout)
10
+ IndexerLogger=Logger.new($stdout)
11
11
 
12
12
  # Bootstrap the Rails environment, frameworks, and default configuration
13
13
  require File.join(File.dirname(__FILE__), 'boot')
@@ -18,4 +18,4 @@ config.action_view.cache_template_extensions = false
18
18
  config.action_mailer.raise_delivery_errors = false
19
19
 
20
20
 
21
- IndexLogger.level = Logger::DEBUG
21
+ IndexerLogger.level = Logger::DEBUG
@@ -18,4 +18,4 @@ config.action_view.cache_template_loading = true
18
18
  # Disable delivery errors, bad email addresses will be ignored
19
19
  # config.action_mailer.raise_delivery_errors = false
20
20
 
21
- IndexLogger.level = Logger::INFO
21
+ IndexerLogger.level = Logger::INFO
@@ -22,4 +22,4 @@ config.action_controller.allow_forgery_protection = false
22
22
  config.action_mailer.delivery_method = :test
23
23
 
24
24
 
25
- IndexLogger.level = Logger::WARN
25
+ IndexerLogger.level = Logger::WARN
@@ -1,3 +1,4 @@
1
+ module Picolena
1
2
  #Loading directories to be indexed
2
3
  indexed_dir_config_file='config/custom/indexed_directories.yml'
3
4
  IndexedDirectories={}
@@ -6,3 +7,4 @@ YAML.load_file(indexed_dir_config_file)[RAILS_ENV].each_pair{|abs_or_rel_path, a
6
7
  }
7
8
 
8
9
  IndexSavePath=File.join(IndexesSavePath,ENV["RAILS_ENV"] || "development")
10
+ end
@@ -1,3 +1,4 @@
1
+ module Picolena
1
2
  #Deny all, Allow only IPs described in config/custom/white_list_ip.yml
2
3
  white_list_ip_config_file='config/custom/white_list_ip.yml'
3
4
  WhiteListIPs=Regexp.new(
@@ -5,4 +6,5 @@ WhiteListIPs=Regexp.new(
5
6
  YAML.load_file(white_list_ip_config_file)["Allow"].collect{|ip|
6
7
  ip.downcase.include?("all") ? /.*/ : Regexp.escape(ip)
7
8
  }.join("|")<<")"
8
- ) rescue /^(127\.0\.0\.1|0\.0\.0\.0)/
9
+ ) rescue /^(127\.0\.0\.1|0\.0\.0\.0)/
10
+ end
@@ -0,0 +1,6 @@
1
# Loads every plain text extractor definition found in
# RAILS_ROOT/lib/plain_text_extractors. Each definition registers itself in
# Picolena::Extractors when it is required.
require 'core_exts'
require 'plain_text_extractor_DSL'
Picolena::Extractors=[]
# Files are required in sorted order: the order returned by Dir.glob is not
# guaranteed on every Ruby version, and the registration order decides which
# extractor is found first for a given extension.
Dir.glob(File.join(RAILS_ROOT,'lib/plain_text_extractors/*.rb')).sort.each{|extractor|
  require extractor
}
@@ -1,5 +1,7 @@
1
+ module Picolena
1
2
  custom_localization_yml=File.join(RAILS_ROOT,'config/custom/title_and_names_and_links.yml')
2
3
 
3
4
  YAML.load_file(custom_localization_yml).each{|key_name, custom_translation|
4
5
  Globalite.localizations[key_name.to_sym]=custom_translation unless custom_translation.blank?
5
6
  }
7
+ end
@@ -0,0 +1,8 @@
1
# Builds FiletypeToIconSymbol, a Hash mapping a lowercase filetype to the
# Symbol of its icon, from config/custom/icons_and_filetypes.yml.
# In that file each key is an icon name and each value a whitespace-separated
# list of filetypes.
icons_config_file='config/custom/icons_and_filetypes.yml'
FiletypeToIconSymbol={}
YAML.load_file(icons_config_file).each_pair{|icon_name, filetypes|
  icon_symbol=icon_name.to_sym
  # to_s: an icon declared without any filetype loads as nil.
  # split without argument: repeated or leading whitespace must not create
  # an empty-string filetype.
  filetypes.to_s.split.each{|filetype|
    FiletypeToIconSymbol[filetype.downcase]=icon_symbol
  }
}
@@ -17,11 +17,30 @@ end
17
17
 
18
18
  class String
19
19
  # Creates a "probably unique" id with the desired length, composed only of lowercase letters.
20
- def base26_hash(length=HashLength)
20
+ def base26_hash(length=Picolena::HashLength)
21
21
  Digest::MD5.hexdigest(self).to_i(16).to_s(26).tr('0-9a-p', 'a-z')[-length,length]
22
22
  end
23
23
  end
24
24
 
25
module Enumerable
  # Calls the block once per element, each call running in its own Thread,
  # and waits until every thread has finished.
  # Returns the Array of threads that were started.
  def each_with_thread(&block)
    workers = map do |item|
      Thread.new(item) { |element| block.call(element) }
    end
    workers.each { |worker| worker.join }
  end
end
35
+
36
class Array
  # Distributes the elements over n arrays in round-robin order: element 0
  # goes to chunk 0, element 1 to chunk 1, ..., element n to chunk 0 again.
  #
  #   [1,2,3,4,5].in_transposed_chunks(2)  #=> [[1,3,5],[2,4]]
  #
  # Always returns exactly n arrays (some may be empty). Elements are kept
  # as they are, including nil ones.
  # Raises ArgumentError unless n is at least 1.
  def in_transposed_chunks(n)
    raise ArgumentError, "number of chunks must be at least 1" if n < 1
    chunks = Array.new(n) { [] }
    each_with_index { |element, index| chunks[index % n] << element }
    chunks
  end
end
42
+ end
43
+
25
44
  class File
26
45
  def self.ext_as_sym(filename)
27
46
  File.extname(filename).sub(/^\./,'').downcase.to_sym rescue :no_extension
@@ -0,0 +1,72 @@
1
+ # Defines plain text extractors with DSL
2
+ # For example, to convert "Microsoft Office Word document" to plain text
3
+ # PlainTextExtractor.new {
4
+ # every :doc, :dot
5
+ # as "application/msword"
6
+ # aka "Microsoft Office Word document"
7
+ # with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
8
+ # which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
9
+ # or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
10
+ # }
11
+
12
# DSL methods available inside the block given to PlainTextExtractor.new.
# Each method stores one piece of the extractor definition.
module PlainTextExtractorDSL
  attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples

  # Evaluates the definition block in the context of the new extractor, then
  # registers the extractor (PlainTextExtractor.add) and its mime type
  # (MimeType.add).
  def initialize(&block)
    @content_and_file_examples=[]
    self.instance_eval(&block)
    PlainTextExtractor.add(self)
    MimeType.add(self.exts,self.mime_name)
  end

  # Declares the file extensions (as symbols) handled by the extractor.
  def every(*exts)
    @exts=exts
  end

  # Declares the mime type name.
  def as(mime_name)
    @mime_name=mime_name
  end

  # Declares the human readable description of the file type.
  def aka(description)
    @description=description
  end

  # Declares the required dependencies explicitly.
  def which_requires(*dependencies)
    @dependencies=dependencies
  end

  #used by rspec to test extractors:
  #  which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
  #  or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
  #
  #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed,
  #and if plain text output from the extractor applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file'
  #
  #Stores a [content, file name] pair; file is a Hash whose :from key holds the file name.
  def which_should_for_example_extract(content, file)
    @content_and_file_examples << [content,file[:from]]
  end

  #it allows to define specs in this way:
  #  which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
  #  or_extract 'text inside!', :from => 'crossed.txt'
  alias_method :or_extract, :which_should_for_example_extract

  # Declares how the content is extracted. Accepts either:
  # * a String: the command to run;
  # * a Hash of command => platform (:on_linux / :on_windows): the command
  #   declared for the current platform is used;
  # * a block: called with the source file name.
  # Raises if none of them is given.
  #
  # On linux, ' 2>/dev/null' is appended to String commands that contain no pipe.
  def with(command_as_hash_or_string=nil,&block)
    #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
    # NOTE(review): /win/ also matches "darwin", so OS X is currently
    # treated as :on_windows.
    platform=case RUBY_PLATFORM
    when /linux/
      :on_linux
    when /win/
      :on_windows
    end
    @command=case command_as_hash_or_string
    when String
      # Work on a copy: the command may be appended to below, and the
      # caller's String must not be modified (it may also be frozen).
      command_as_hash_or_string.dup
    when Hash
      # Hash stores a frozen copy of String keys, so the command read back
      # from the inverted hash is frozen: dup gives a String that can be
      # appended to below.
      command_as_hash_or_string.invert[platform].dup
    else
      block || raise("No command defined for this extractor: #{description}")
    end
    @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|'))
  end
end
@@ -4,10 +4,10 @@
4
4
  # Installation: Ubuntu xpdf-utils package
5
5
  # Home page: http://www.foolabs.com/xpdf/
6
6
 
7
- PlainText.extract {
8
- from :pdf
7
+ PlainTextExtractor.new {
8
+ every :pdf
9
9
  as "application/pdf"
10
10
  aka "Adobe Portable Document Format"
11
11
  with "pdftotext -enc UTF-8 SOURCE -" => :on_linux, "some other command" => :on_windows
12
12
  which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
13
- }
13
+ }
@@ -1,5 +1,5 @@
1
- PlainText.extract {
2
- from :html, :htm
1
+ PlainTextExtractor.new {
2
+ every :html, :htm
3
3
  as "text/html"
4
4
  aka "HyperText Markup Language document"
5
5
  with {|source|
@@ -1,7 +1,7 @@
1
1
  #Excel 97-2003
2
2
 
3
- PlainText.extract {
4
- from :xls
3
+ PlainTextExtractor.new {
4
+ every :xls
5
5
  as "application/excel"
6
6
  aka "Microsoft Office Excel document"
7
7
  with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux, "some other command" => :on_windows
@@ -11,8 +11,8 @@ PlainText.extract {
11
11
  #Excel 2007
12
12
 
13
13
  require 'zip/zip'
14
- PlainText.extract {
15
- from :xlsx
14
+ PlainTextExtractor.new {
15
+ every :xlsx
16
16
  as 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
17
17
  aka "Microsoft Office 2007 Excel spreadsheet"
18
18
  with {|source|
@@ -1,7 +1,7 @@
1
1
  #Powerpoint 97-2003
2
2
 
3
- PlainText.extract {
4
- from :ppt, :pps
3
+ PlainTextExtractor.new {
4
+ every :ppt, :pps
5
5
  as "application/powerpoint"
6
6
  aka "Microsoft Office Powerpoint document"
7
7
  with "catppt SOURCE" => :on_linux, "some other command" => :on_windows
@@ -13,8 +13,8 @@ PlainText.extract {
13
13
  #Powerpoint 2007
14
14
 
15
15
  require 'zip/zip'
16
- PlainText.extract {
17
- from :pptx
16
+ PlainTextExtractor.new {
17
+ every :pptx
18
18
  as 'application/vnd.openxmlformats-officedocument.presentationml.presentation' #could that mime BE any longer?
19
19
  aka "Microsoft Office 2007 Powerpoint document"
20
20
  with {|source|
@@ -4,8 +4,8 @@
4
4
  # Installation: Ubuntu unrtf package
5
5
  # http://www.gnu.org/software/unrtf/unrtf.html
6
6
 
7
- PlainText.extract {
8
- from :rtf
7
+ PlainTextExtractor.new {
8
+ every :rtf
9
9
  as "application/rtf"
10
10
  aka "Microsoft Rich Text Format"
11
11
  with "unrtf SOURCE -t text" => :on_linux, "some other command" => :on_windows