picolena 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. data/History.txt +14 -0
  2. data/Manifest.txt +28 -8
  3. data/config/files_to_clean +1 -0
  4. data/config/requirements.rb +1 -1
  5. data/lib/picolena/config/basic.rb +2 -1
  6. data/lib/picolena/config/icons_and_filetypes.yml +5 -0
  7. data/lib/picolena/picolena_generator.rb +3 -1
  8. data/lib/picolena/templates/app/helpers/documents_helper.rb +4 -4
  9. data/lib/picolena/templates/app/models/document.rb +27 -4
  10. data/lib/picolena/templates/app/models/indexer.rb +6 -2
  11. data/lib/picolena/templates/app/models/plain_text_extractor.rb +27 -13
  12. data/lib/picolena/templates/app/models/query.rb +2 -2
  13. data/lib/picolena/templates/app/views/documents/_document.html.haml +1 -1
  14. data/lib/picolena/templates/config/environments/development.rb +2 -0
  15. data/lib/picolena/templates/config/initializers/001_load_ferret.rb +17 -0
  16. data/lib/picolena/templates/config/initializers/{001_load_custom_config.rb → 002_load_custom_config.rb} +1 -2
  17. data/lib/picolena/templates/config/initializers/{002_load_indexed_dirs.rb → 003_load_indexed_dirs.rb} +0 -0
  18. data/lib/picolena/templates/config/initializers/{003_load_white_list_IPs.rb → 004_load_white_list_IPs.rb} +0 -0
  19. data/lib/picolena/templates/config/initializers/{004_load_plain_text_extractors.rb → 005_load_plain_text_extractors.rb} +1 -1
  20. data/lib/picolena/templates/config/initializers/{005_load_custom_title_and_names_and_links.rb → 006_load_custom_title_and_names_and_links.rb} +0 -0
  21. data/lib/picolena/templates/config/initializers/{006_load_icons.rb → 007_load_icons.rb} +0 -0
  22. data/lib/picolena/templates/config/initializers/{007_load_performance_tweaks.rb → 008_load_performance_tweaks.rb} +0 -0
  23. data/lib/picolena/templates/lib/core_exts.rb +52 -0
  24. data/lib/picolena/templates/lib/development_helpers.rb +35 -0
  25. data/lib/picolena/templates/lib/plain_text_extractor_dsl.rb +128 -0
  26. data/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb +2 -2
  27. data/lib/picolena/templates/lib/plain_text_extractors/adobe.photoshop.rb +12 -0
  28. data/lib/picolena/templates/lib/plain_text_extractors/html.rb +1 -1
  29. data/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb +4 -4
  30. data/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb +4 -4
  31. data/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb +3 -3
  32. data/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb +4 -4
  33. data/lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb +2 -2
  34. data/lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb +2 -2
  35. data/lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb +2 -2
  36. data/lib/picolena/templates/lib/plain_text_extractors/pictures.rb +15 -4
  37. data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb +9 -2
  38. data/lib/picolena/templates/lib/plain_text_extractors/rar.rb +18 -0
  39. data/lib/picolena/templates/lib/plain_text_extractors/videos.rb +13 -0
  40. data/lib/picolena/templates/lib/plain_text_extractors/zip.rb +17 -0
  41. data/lib/picolena/templates/lib/tasks/extract.rake +16 -0
  42. data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
  43. data/lib/picolena/templates/public/images/thumbnails/NOTE +2 -0
  44. data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +8 -0
  45. data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +12 -1
  46. data/lib/picolena/templates/spec/models/basic_finder_spec.rb +6 -4
  47. data/lib/picolena/templates/spec/models/document_spec.rb +24 -4
  48. data/lib/picolena/templates/spec/models/finder_spec.rb +18 -11
  49. data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +1 -1
  50. data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +25 -8
  51. data/lib/picolena/templates/spec/models/query_spec.rb +4 -5
  52. data/lib/picolena/templates/spec/spec_helper.rb +9 -0
  53. data/lib/picolena/templates/spec/test_dirs/indexed/archives/dumb_file.rar +0 -0
  54. data/lib/picolena/templates/spec/test_dirs/indexed/archives/some_test_files.zip +0 -0
  55. data/lib/picolena/templates/spec/test_dirs/indexed/basic/fake_thumbnailer +14 -0
  56. data/lib/picolena/templates/spec/test_dirs/indexed/media/badminton.avi +0 -0
  57. data/lib/picolena/templates/spec/test_dirs/indexed/media/caution.tif +0 -0
  58. data/lib/picolena/templates/spec/test_dirs/indexed/media/cygnus.jpeg +0 -0
  59. data/lib/picolena/templates/spec/test_dirs/indexed/media/diceface.eps +79 -0
  60. data/lib/picolena/templates/spec/test_dirs/indexed/media/glass.png +0 -0
  61. data/lib/picolena/templates/spec/test_dirs/indexed/media/gnu.bmp +0 -0
  62. data/lib/picolena/templates/spec/test_dirs/indexed/media/picolena.psd +0 -0
  63. data/lib/picolena/templates/spec/test_dirs/indexed/media/rails_logo_remix.gif +0 -0
  64. data/lib/picolena/templates/spec/test_dirs/indexed/media/warning.tiff +0 -0
  65. data/lib/picolena/version.rb +1 -1
  66. data/website/index.html +1 -1
  67. metadata +31 -32
  68. data.tar.gz.sig +0 -0
  69. data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +0 -88
  70. metadata.gz.sig +0 -0
@@ -3,6 +3,15 @@ class String
3
3
  def base26_hash(length=Picolena::HashLength)
4
4
  Digest::MD5.hexdigest(self).to_i(16).to_s(26).tr('0-9a-p', 'a-z')[-length,length]
5
5
  end
6
+
7
+ # Returns true iff self is an available command on the system
8
+ # >> "grep".installed?
9
+ # => true
10
+ # >> "sdfgsdfgsdf".installed?
11
+ # => false
12
+ def installed?
13
+ !IO.popen("which #{self}"){|i| i.read}.empty?
14
+ end
6
15
  end
7
16
 
8
17
  module Enumerable
@@ -100,4 +109,47 @@ class File
100
109
  def self.plain_text?(filename)
101
110
  %x{file -i "#{filename}"} =~ /: text\//
102
111
  end
112
+
113
+ # For a given file, returns the path at which a thumbnail should be saved
114
+ def self.thumbnail_path(filename, public_dir=false)
115
+ thumb=expand_path(filename).base26_hash+'.jpg'
116
+ public_dir ? File.join('thumbnails', thumb) : File.join(RAILS_ROOT, 'public/images/thumbnails', thumb)
117
+ end
118
+ end
119
+
120
+ class Object
121
+ # [1,2,3].is_an?(Array) just looks better than [1,2,3].is_a?(Array)
122
+ alias_method :is_an?, :is_a?
123
+ end
124
+
125
+ module Kernel
126
+ require 'open3'
127
+ # Executes a command and returns stdout while silencing stderr
128
+ # NOTE: Restricted to systems on which forking is possible. How can this be done on Windows?
129
+ def silently_execute(command)
130
+ Open3.popen3(command){|i,e,o| e.read}
131
+ end
132
+ end
133
+
134
+
135
+ # A PlainTextExtractor.command can be either a String, a Block or undefined.
136
+ class String
137
+ # For a given *nix command line, returns an Array of required commands:
138
+ # >> "xls2csv SOURCE | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'".dependencies
139
+ # => ["xls2csv", "grep", "sed"]
140
+ def dependencies
141
+ self.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first}
142
+ end
143
+ end
144
+
145
+ class Proc
146
+ def dependencies
147
+ []
148
+ end
149
+ end
150
+
151
+ class NilClass
152
+ def dependencies
153
+ []
154
+ end
103
155
  end
@@ -0,0 +1,35 @@
1
+ # D.query displays matching_documents for query, and returns the document
2
+ # with the highest score.
3
+ # Useful for development and debugging purposes
4
+ #
5
+ # >> D.test
6
+ # 71 document(s) found for test:
7
+ # for_test.txt
8
+ # some_test_files.zip
9
+ # plain.txt
10
+ # another_plain.text
11
+ # other_basic.PDF
12
+ # basic.pdf
13
+ # basic.odt
14
+ # basic.tex
15
+ # queens.for
16
+ # README
17
+ # ...........
18
+ # => "spec/test_dirs/indexed/just_one_doc/for_test.txt (82.7%)"
19
+ class D
20
+ def self.method_missing(query,*params)
21
+ self[query.to_s] || super
22
+ end
23
+ def self.[](query)
24
+ f=Finder.new(query.to_s)
25
+ hits=f.total_hits
26
+ if hits > 0 then
27
+ puts "#{hits} document(s) found for #{query}:"
28
+ f.matching_documents.each{|doc| puts " "+doc.filename}
29
+ puts " ..........." if hits > f.matching_documents.size
30
+ f.matching_documents.first
31
+ else
32
+ puts "Nothing found for #{query}"
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,128 @@
1
+ # Defines plain text extractors with a DSL
2
+ # For example, to convert "Microsoft Office Word document" to plain text
3
+ # PlainTextExtractor.new {
4
+ # every :doc, :dot
5
+ # as "application/msword"
6
+ # aka "Microsoft Office Word document"
7
+ # extract_content_with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
8
+ # which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
9
+ # or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
10
+ # }
11
+
12
+ module PlainTextExtractorDSL
13
+ attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples, :thumbnail_command
14
+
15
+ def initialize(&block)
16
+ @content_and_file_examples=[]
17
+ self.instance_eval(&block)
18
+ PlainTextExtractor.add(self)
19
+ end
20
+
21
+ def every(*exts)
22
+ @exts ||=[]
23
+ @exts |= exts
24
+ end
25
+
26
+ def as(mime_name)
27
+ @mime_name=mime_name
28
+ end
29
+
30
+ def aka(description)
31
+ @description=description
32
+ end
33
+
34
+ def which_requires(*dependencies)
35
+ @dependencies=dependencies
36
+ end
37
+
38
+ #used by rspec to test extractors:
39
+ # which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
40
+ # or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
41
+ #
42
+ #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed,
43
+ #and if plain text output from the extractor applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file'
44
+ def which_should_for_example_extract(content, file)
45
+ @content_and_file_examples << [content,file[:from]]
46
+ end
47
+
48
+ #it allows specs to be defined in this way:
49
+ # which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
50
+ # or_extract 'text inside!', :from => 'crossed.txt'
51
+ alias_method :or_extract, :which_should_for_example_extract
52
+
53
+ def extract_content_with(command_as_hash_or_string=nil,&block)
54
+ #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
55
+ @command=case command_as_hash_or_string
56
+ when String
57
+ command_as_hash_or_string
58
+ when Hash
59
+ command_for_current_platform(command_as_hash_or_string)
60
+ else
61
+ block || raise("No command defined for this extractor: #{description}")
62
+ end
63
+ end
64
+
65
+ def extract_thumbnail_with(command_as_hash_or_string=nil, &block)
66
+ #TODO: Don't ignore block and use it as in extract_content_with
67
+ @thumbnail_command=case command_as_hash_or_string
68
+ when String
69
+ command_as_hash_or_string
70
+ when Hash
71
+ command_for_current_platform(command_as_hash_or_string)
72
+ end
73
+ end
74
+
75
+ # Unpack an archive and extract content from every supported file
76
+ def extract_content_from_archive_with(unpack_command)
77
+ #FIXME: Cleaner code needed!
78
+ @command=lambda {|source|
79
+ begin
80
+ global_temp_dir = File.join(Dir::tmpdir, 'picolena_archive_temp')
81
+ specific_temp_dir = File.join(global_temp_dir, source.base26_hash)
82
+ FileUtils.mkpath specific_temp_dir
83
+ specific_unpack_command=unpack_command.sub('SOURCE','"'<<source<<'"').sub(/TE?MPDIR/,'"'<<specific_temp_dir<<'"')
84
+ silently_execute(specific_unpack_command)
85
+ Dir["#{specific_temp_dir}/**/*"].select{|f| File.file?(f)}.map{|filename|
86
+ content=PlainTextExtractor.extract_content_from(filename) rescue "---"
87
+ ["##"<<filename.sub(specific_temp_dir,'').gsub('/', '>'),
88
+ content]
89
+ }.join("\n")
90
+ ensure
91
+ FileUtils.remove_entry_secure(specific_temp_dir)
92
+ FileUtils.rmdir(global_temp_dir) rescue "not empty"
93
+ end
94
+ }
95
+ (@dependencies||=[])<<unpack_command.dependencies
96
+ end
97
+
98
+ private
99
+ def command_for_current_platform(command_as_hash)
100
+ # Allows one to write
101
+ # with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
102
+ # "some other command" => :on_windows
103
+ #
104
+ # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
105
+ # on windows, it returns "some other command"
106
+ #
107
+ # If commands for linux & mac os were different:
108
+ # with "some command" => :on_linux,
109
+ # "another command" => :on_mac_os,
110
+ # "yet another command" => :on_windows
111
+ #
112
+ #NOTE: What to do when no command is defined for a given platform?
113
+ command_as_hash.invert.find{|platforms,command|
114
+ platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(current_platform_symbol)
115
+ }.last.dup
116
+ end
117
+
118
+ def current_platform_symbol
119
+ @@platform_symbol||=case RUBY_PLATFORM
120
+ when /linux/
121
+ :linux
122
+ when /win/
123
+ :windows
124
+ when /darwin/
125
+ :mac_os
126
+ end
127
+ end
128
+ end
@@ -8,7 +8,7 @@ PlainTextExtractor.new {
8
8
  every :pdf
9
9
  as "application/pdf"
10
10
  aka "Adobe Portable Document Format"
11
- with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
12
- "some other command" => :on_windows
11
+ extract_content_with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
12
+ "some other command" => :on_windows
13
13
  which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
14
14
  }
@@ -0,0 +1,12 @@
1
+ PlainTextExtractor.new {
2
+ every :psd
3
+ as "image/adobe.photoshop"
4
+ aka "Adobe Photoshop Format"
5
+
6
+ #NOTE: PSD gets its own Extractor since convert method is different from one-layer pictures
7
+ # and needs -flatten option
8
+ extract_thumbnail_with 'convert SOURCE -flatten -thumbnail 80x80 -quality 50 THUMBNAIL'
9
+
10
+ extract_content_with 'exiftool SOURCE'
11
+ which_should_for_example_extract '"Adobe Photoshop CS2 Windows" 584x150', :from => 'picolena.psd'
12
+ }
@@ -2,7 +2,7 @@ PlainTextExtractor.new {
2
2
  every :html, :htm
3
3
  as "text/html"
4
4
  aka "HyperText Markup Language document"
5
- with {|source|
5
+ extract_content_with {|source|
6
6
  encoding=File.encoding(source)
7
7
  if encoding.empty? or encoding.gsub(/[^\w]/,'').downcase=="utf8" then
8
8
  %x{html2text -nobs "#{source}"}
@@ -4,8 +4,8 @@ PlainTextExtractor.new {
4
4
  every :xls
5
5
  as "application/excel"
6
6
  aka "Microsoft Office Excel document"
7
- with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
8
- "some other command" => :on_windows
7
+ extract_content_with "xls2csv SOURCE | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
9
9
  which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
10
10
  }
11
11
 
@@ -16,7 +16,7 @@ PlainTextExtractor.new {
16
16
  every :xlsx
17
17
  as 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
18
18
  aka "Microsoft Office 2007 Excel spreadsheet"
19
- with {|source|
19
+ extract_content_with {|source|
20
20
  Zip::ZipFile.open(source){|zipfile|
21
21
  text_cells=zipfile.read("xl/sharedStrings.xml").split(/</).grep(/^t/).collect{|l|
22
22
  l.sub(/^[^>]+>/,'')
@@ -39,4 +39,4 @@ PlainTextExtractor.new {
39
39
  ## Home page: http://www.winfield.demon.nl/
40
40
 
41
41
  ## MS OOXML excel to text conversion:
42
- ## Ruby code written by Eric DUMINIL
42
+ ## Ruby code written by Eric DUMINIL
@@ -4,8 +4,8 @@ PlainTextExtractor.new {
4
4
  every :ppt, :pps
5
5
  as "application/powerpoint"
6
6
  aka "Microsoft Office Powerpoint document"
7
- with "catppt SOURCE" => :on_linux_and_mac_os,
8
- "some other command" => :on_windows
7
+ extract_content_with "catppt SOURCE" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
9
9
  which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
10
10
  #FIXME: it seems that catppt cannot open .pps files.
11
11
  #or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'
@@ -18,7 +18,7 @@ PlainTextExtractor.new {
18
18
  every :pptx
19
19
  as 'application/vnd.openxmlformats-officedocument.presentationml.presentation' #could that mime BE any longer?
20
20
  aka "Microsoft Office 2007 Powerpoint document"
21
- with {|source|
21
+ extract_content_with {|source|
22
22
  Zip::ZipFile.open(source){|zipfile|
23
23
  slides=zipfile.entries.select{|l| l.name=~/^ppt\/slides\/slide\d+.xml/}
24
24
  slides.collect{|entry|
@@ -38,4 +38,4 @@ PlainTextExtractor.new {
38
38
  ## Home page: http://www.wagner.pp.ru/~vitus/software/catdoc/
39
39
 
40
40
  ## MS OOXML powerpoint to text conversion:
41
- ## Ruby code written by Eric DUMINIL
41
+ ## Ruby code written by Eric DUMINIL
@@ -8,7 +8,7 @@ PlainTextExtractor.new {
8
8
  every :rtf
9
9
  as "application/rtf"
10
10
  aka "Microsoft Rich Text Format"
11
- with "unrtf SOURCE -t text" => :on_linux_and_mac_os,
12
- "some other command" => :on_windows
11
+ extract_content_with "unrtf SOURCE -t text" => :on_linux_and_mac_os,
12
+ "some other command" => :on_windows
13
13
  which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
14
- }
14
+ }
@@ -4,8 +4,8 @@ PlainTextExtractor.new {
4
4
  every :doc, :dot
5
5
  as "application/msword"
6
6
  aka "Microsoft Office Word document"
7
- with "antiword SOURCE" => :on_linux_and_mac_os,
8
- "some other command" => :on_windows
7
+ extract_content_with "antiword SOURCE" => :on_linux_and_mac_os,
8
+ "some other command" => :on_windows
9
9
  which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
10
10
  or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
11
11
  }
@@ -17,7 +17,7 @@ PlainTextExtractor.new {
17
17
  every :docx, :dotx
18
18
  as 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
19
19
  aka "Microsoft Office 2007 Word document"
20
- with {|source|
20
+ extract_content_with {|source|
21
21
  Zip::ZipFile.open(source){|zipfile|
22
22
  zipfile.read("word/document.xml").split(/</).grep(/^w:t/).collect{|l|
23
23
  l.sub(/^[^>]+>/,'')
@@ -35,4 +35,4 @@ PlainTextExtractor.new {
35
35
  ## Home page: http://www.winfield.demon.nl/
36
36
 
37
37
  ## MS OOXML word to text conversion:
38
- ## Ruby code written by Eric DUMINIL
38
+ ## Ruby code written by Eric DUMINIL
@@ -5,7 +5,7 @@ PlainTextExtractor.new {
5
5
  every :odp
6
6
  as 'application/vnd.oasis.opendocument.presentation'
7
7
  aka "Open Document Format for presentation"
8
- with {|source|
8
+ extract_content_with {|source|
9
9
  Zip::ZipFile.open(source){|zipfile|
10
10
  zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
11
11
  l.sub(/^[^>]+>/,'')
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
13
13
  }
14
14
  }
15
15
  which_should_for_example_extract 'Picolena can it find me maybe!', :from => 'ubuntu_theme.odp'
16
- }
16
+ }
@@ -5,7 +5,7 @@ PlainTextExtractor.new {
5
5
  every :ods
6
6
  as 'application/vnd.oasis.opendocument.spreadsheet'
7
7
  aka "Open Document Format for spreadsheet"
8
- with {|source|
8
+ extract_content_with {|source|
9
9
  Zip::ZipFile.open(source){|zipfile|
10
10
  zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
11
11
  l.sub(/^[^>]+>/,'')
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
13
13
  }
14
14
  }
15
15
  which_should_for_example_extract 'Cessna F-172P G-BIDF, serial number 2045', :from => 'weight_and_balance.ods'
16
- }
16
+ }
@@ -5,7 +5,7 @@ PlainTextExtractor.new {
5
5
  every :odt
6
6
  as 'application/vnd.oasis.opendocument.text'
7
7
  aka "Open Document Format for text"
8
- with {|source|
8
+ extract_content_with {|source|
9
9
  Zip::ZipFile.open(source){|zipfile|
10
10
  zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
11
11
  l.sub(/^[^>]+>/,'')
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
13
13
  }
14
14
  }
15
15
  which_should_for_example_extract 'written with OpenOffice.org', :from => 'basic.odt'
16
- }
16
+ }
@@ -1,8 +1,19 @@
1
1
  PlainTextExtractor.new {
2
- every :bmp, :crw, :eps, :gif, :jpeg, :jpg, :nef, :png, :psd, :raw, :tif, :tiff
2
+ every :bmp, :crw, :eps, :gif, :jpeg, :jpg, :nef, :png, :raw, :tif, :tiff
3
3
  as "image/*"
4
4
  aka "some picture"
5
- with 'exiftool SOURCE'
6
- which_requires 'exiftool'
7
- which_should_for_example_extract 'Eric Duminil Nikon D90', :from => 'crow.jpg'
5
+
6
+ extract_thumbnail_with 'convert -quality 50 -thumbnail 80x80 SOURCE THUMBNAIL'
7
+
8
+ extract_content_with 'exiftool SOURCE'
9
+ which_should_for_example_extract 'Eric Duminil Nikon D90' , :from => 'crow.jpg'
10
+ or_extract '64x64 BMP' , :from => 'gnu.bmp'
11
+ or_extract 'application/postscript 258x43' , :from => 'diceface.eps'
12
+ or_extract 'Panasonic DMC-FZ8 320x240' , :from => 'glass.png'
13
+ or_extract 'Panasonic DMC-FZ8 "35mm equivalent: 432.0mm"', :from => 'cygnus.jpeg'
14
+ or_extract '"1990 bytes" 24x24 LZW' , :from => 'warning.tiff'
15
+ or_extract '"1978 bytes" 24x24 LZW' , :from => 'caution.tif'
16
+ or_extract 'GIF 110x140' , :from => 'rails_logo_remix.gif'
17
+ # Raw pictures (.nef, .crw, .raw) would also need to be tested, but their size doesn't make it worth including
18
+ # corresponding files in the repository. Specs will therefore stay with "Not Yet Implemented" status.
8
19
  }
@@ -1,8 +1,15 @@
1
1
  PlainTextExtractor.new {
2
- every :txt, :text, :tex, :for, :cpp, :c, :rb, :ins, :vee, :java, :no_extension
2
+ every :txt, :text
3
+ every :tex, :bib, :for, :cpp, :c, :rb, :ins, :vee, :java
4
+ every :ini
5
+ every :sub, :srt
6
+ #NOTE: Could be interesting to extract thumbnail from vCards
7
+ every :vcf, :vcard
8
+ every :no_extension
9
+
3
10
  as "application/plain"
4
11
  aka "plain text file"
5
- with {|source|
12
+ extract_content_with {|source|
6
13
  raise "binary file" unless File.plain_text?(source)
7
14
  encoding=File.encoding(source)
8
15
  if encoding.empty? then
@@ -0,0 +1,18 @@
1
+ PlainTextExtractor.new {
2
+ every :rar
3
+ as "archive/rar"
4
+ aka "RAR Archive"
5
+
6
+ # If a non-free version of unrar is available, use it
7
+ # because unrar-nonfree supports more archives than unrar-free
8
+ if "unrar".installed? then
9
+ extract_content_from_archive_with "unrar x SOURCE TEMPDIR"
10
+ else
11
+ # falls back to unrar-free otherwise
12
+ extract_content_from_archive_with "unrar-free --extract SOURCE TEMPDIR"
13
+ end
14
+
15
+ which_should_for_example_extract 'IAE2ORREucIRPx+XgpYcYoO8Twz1TN5/LezRbdwWonlAqpDanBTR+McCehXpk7Pz',
16
+ :from => 'dumb_file.rar'
17
+ or_extract '"(Same file, but inside one directory)"', :from => 'dumb_file.rar'
18
+ }