picolena 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +14 -0
- data/Manifest.txt +28 -8
- data/config/files_to_clean +1 -0
- data/config/requirements.rb +1 -1
- data/lib/picolena/config/basic.rb +2 -1
- data/lib/picolena/config/icons_and_filetypes.yml +5 -0
- data/lib/picolena/picolena_generator.rb +3 -1
- data/lib/picolena/templates/app/helpers/documents_helper.rb +4 -4
- data/lib/picolena/templates/app/models/document.rb +27 -4
- data/lib/picolena/templates/app/models/indexer.rb +6 -2
- data/lib/picolena/templates/app/models/plain_text_extractor.rb +27 -13
- data/lib/picolena/templates/app/models/query.rb +2 -2
- data/lib/picolena/templates/app/views/documents/_document.html.haml +1 -1
- data/lib/picolena/templates/config/environments/development.rb +2 -0
- data/lib/picolena/templates/config/initializers/001_load_ferret.rb +17 -0
- data/lib/picolena/templates/config/initializers/{001_load_custom_config.rb → 002_load_custom_config.rb} +1 -2
- data/lib/picolena/templates/config/initializers/{002_load_indexed_dirs.rb → 003_load_indexed_dirs.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{003_load_white_list_IPs.rb → 004_load_white_list_IPs.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{004_load_plain_text_extractors.rb → 005_load_plain_text_extractors.rb} +1 -1
- data/lib/picolena/templates/config/initializers/{005_load_custom_title_and_names_and_links.rb → 006_load_custom_title_and_names_and_links.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{006_load_icons.rb → 007_load_icons.rb} +0 -0
- data/lib/picolena/templates/config/initializers/{007_load_performance_tweaks.rb → 008_load_performance_tweaks.rb} +0 -0
- data/lib/picolena/templates/lib/core_exts.rb +52 -0
- data/lib/picolena/templates/lib/development_helpers.rb +35 -0
- data/lib/picolena/templates/lib/plain_text_extractor_dsl.rb +128 -0
- data/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/adobe.photoshop.rb +12 -0
- data/lib/picolena/templates/lib/plain_text_extractors/html.rb +1 -1
- data/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb +3 -3
- data/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb +4 -4
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb +2 -2
- data/lib/picolena/templates/lib/plain_text_extractors/pictures.rb +15 -4
- data/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb +9 -2
- data/lib/picolena/templates/lib/plain_text_extractors/rar.rb +18 -0
- data/lib/picolena/templates/lib/plain_text_extractors/videos.rb +13 -0
- data/lib/picolena/templates/lib/plain_text_extractors/zip.rb +17 -0
- data/lib/picolena/templates/lib/tasks/extract.rake +16 -0
- data/lib/picolena/templates/lib/tasks/install_dependencies.rake +1 -1
- data/lib/picolena/templates/public/images/thumbnails/NOTE +2 -0
- data/lib/picolena/templates/spec/controllers/documents_controller_spec.rb +8 -0
- data/lib/picolena/templates/spec/helpers/documents_helper_spec.rb +12 -1
- data/lib/picolena/templates/spec/models/basic_finder_spec.rb +6 -4
- data/lib/picolena/templates/spec/models/document_spec.rb +24 -4
- data/lib/picolena/templates/spec/models/finder_spec.rb +18 -11
- data/lib/picolena/templates/spec/models/host_indexing_system_spec.rb +1 -1
- data/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb +25 -8
- data/lib/picolena/templates/spec/models/query_spec.rb +4 -5
- data/lib/picolena/templates/spec/spec_helper.rb +9 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/archives/dumb_file.rar +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/archives/some_test_files.zip +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/basic/fake_thumbnailer +14 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/badminton.avi +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/caution.tif +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/cygnus.jpeg +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/diceface.eps +79 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/glass.png +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/gnu.bmp +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/picolena.psd +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/rails_logo_remix.gif +0 -0
- data/lib/picolena/templates/spec/test_dirs/indexed/media/warning.tiff +0 -0
- data/lib/picolena/version.rb +1 -1
- data/website/index.html +1 -1
- metadata +31 -32
- data.tar.gz.sig +0 -0
- data/lib/picolena/templates/lib/plain_text_extractor_DSL.rb +0 -88
- metadata.gz.sig +0 -0
File without changes
|
File without changes
|
@@ -3,6 +3,15 @@ class String
|
|
3
3
|
# Digest of the string made only of the letters a-z: the MD5 hex digest is
# read as an integer, written in base 26, its digits are mapped onto letters,
# and the last +length+ characters are returned.
def base26_hash(length=Picolena::HashLength)
  md5_as_integer = Digest::MD5.hexdigest(self).to_i(16)
  letters = md5_as_integer.to_s(26).tr('0-9a-p', 'a-z')
  letters[-length, length]
end
|
6
|
+
|
7
|
+
# Tells whether `which` prints anything for the command named by self.
#   >> "grep".installed?
#   => true
#   >> "sdfgsdfgsdf".installed?
#   => false
def installed?
  which_output = IO.popen("which #{self}") { |pipe| pipe.read }
  !which_output.empty?
end
|
6
15
|
end
|
7
16
|
|
8
17
|
module Enumerable
|
@@ -100,4 +109,47 @@ class File
|
|
100
109
|
# Runs `file -i` on +filename+ and looks for ": text/" in what it prints.
# Returns the position of the match (truthy), or nil when there is none.
def self.plain_text?(filename)
  file_report = %x{file -i "#{filename}"}
  file_report =~ /: text\//
end
|
112
|
+
|
113
|
+
# Path at which the thumbnail of +filename+ should be saved.
# The thumbnail is named after the base26 hash of the expanded path, plus '.jpg'.
# With a truthy +public_dir+ the path is relative ('thumbnails/...');
# otherwise it lies under RAILS_ROOT/public/images/thumbnails.
def self.thumbnail_path(filename, public_dir=false)
  thumbnail_name = expand_path(filename).base26_hash + '.jpg'
  return File.join('thumbnails', thumbnail_name) if public_dir
  File.join(RAILS_ROOT, 'public/images/thumbnails', thumbnail_name)
end
|
118
|
+
end
|
119
|
+
|
120
|
+
class Object
  # Same check as is_a?, for class names where "an" reads better:
  #   [1,2,3].is_an?(Array)
  alias is_an? is_a?
end
|
124
|
+
|
125
|
+
module Kernel
  require 'open3'

  # Executes +command+ and returns everything it wrote to stdout.
  # Whatever the command writes to stderr is read and discarded.
  # NOTE: Restricted to systems on which forking is possible. How to do on windows?
  def silently_execute(command)
    Open3.popen3(command) do |stdin, stdout, stderr|
      # Nothing is ever sent to the command: closing stdin gives a command
      # that waits for input an EOF instead of letting it block forever.
      stdin.close
      # stderr is drained concurrently; otherwise a command writing more than
      # a pipe buffer to stderr would stall and stdout.read would never return.
      stderr_drain = Thread.new { stderr.read }
      output = stdout.read
      stderr_drain.join
      output
    end
  end
end
|
133
|
+
|
134
|
+
|
135
|
+
# A PlainTextExtractor.command can be either a String, a Block or undefined.
|
136
|
+
class String
  # For a given *nix command line, returns an Array of required commands:
  #   >> "xls2csv SOURCE | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'".dependencies
  #   => ["xls2csv", "grep", "sed"]
  #
  # Every pipe-separated segment contributes its first word. Leading
  # whitespace is ignored and empty segments (an empty command line, or the
  # gap inside "||") contribute nothing, so the result only holds names.
  # NOTE: a '|' inside a quoted argument is still treated as a separator.
  def dependencies
    split(/\|/).collect{|command_part| command_part.split.first}.compact
  end
end
|
144
|
+
|
145
|
+
class Proc
  # Always an empty Array: no command names are derived from a Proc.
  def dependencies
    Array.new
  end
end
|
150
|
+
|
151
|
+
class NilClass
  # Always an empty Array: nil names no command.
  def dependencies
    Array.new
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# D.query displays matching_documents for query, and returns the document
|
2
|
+
# with the highest score.
|
3
|
+
# Useful for development and debugging purposes
|
4
|
+
#
|
5
|
+
# >> D.test
|
6
|
+
# 71 document(s) found for test:
|
7
|
+
# for_test.txt
|
8
|
+
# some_test_files.zip
|
9
|
+
# plain.txt
|
10
|
+
# another_plain.text
|
11
|
+
# other_basic.PDF
|
12
|
+
# basic.pdf
|
13
|
+
# basic.odt
|
14
|
+
# basic.tex
|
15
|
+
# queens.for
|
16
|
+
# README
|
17
|
+
# ...........
|
18
|
+
# => "spec/test_dirs/indexed/just_one_doc/for_test.txt (82.7%)"
|
19
|
+
class D
  # Any unknown class method is treated as a query: D.foo behaves like D["foo"].
  # When that query returns nothing, the call falls through to the default
  # method_missing.
  def self.method_missing(query,*params)
    best_match = self[query.to_s]
    return best_match if best_match
    super
  end

  # Prints the hit count and the filenames of the matching documents for
  # +query+, then returns the first matching document.
  # Prints a notice and returns nil when there is no hit.
  def self.[](query)
    finder = Finder.new(query.to_s)
    hit_count = finder.total_hits
    unless hit_count > 0
      puts "Nothing found for #{query}"
      return nil
    end
    puts "#{hit_count} document(s) found for #{query}:"
    finder.matching_documents.each do |document|
      puts " "+document.filename
    end
    # More hits than listed documents: show that the list is truncated.
    puts " ..........." if hit_count > finder.matching_documents.size
    finder.matching_documents.first
  end
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
# Defines plain text extractors with DSL
|
2
|
+
# For example, to convert "Microsoft Office Word document" to plain text
|
3
|
+
# PlainTextExtractor.new {
|
4
|
+
# every :doc, :dot
|
5
|
+
# as "application/msword"
|
6
|
+
# aka "Microsoft Office Word document"
|
7
|
+
# extract_content_with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
|
8
|
+
# which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
|
9
|
+
# or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
|
10
|
+
# }
|
11
|
+
|
12
|
+
module PlainTextExtractorDSL
  attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples, :thumbnail_command

  # Evaluates the DSL +block+ in the context of the new extractor,
  # then registers the extractor with PlainTextExtractor.
  def initialize(&block)
    @content_and_file_examples=[]
    self.instance_eval(&block)
    PlainTextExtractor.add(self)
  end

  # Declares the file extensions handled by this extractor.
  # Can be called several times: extensions accumulate without duplicates.
  def every(*exts)
    @exts ||=[]
    @exts |= exts
  end

  # Declares the MIME name of the handled files.
  def as(mime_name)
    @mime_name=mime_name
  end

  # Declares a human readable description of the handled files.
  def aka(description)
    @description=description
  end

  # Declares explicitly the commands this extractor depends on.
  def which_requires(*dependencies)
    @dependencies=dependencies
  end

  #used by rspec to test extractors:
  # which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
  # or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
  #
  #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed,
  #and if plain text output from the extractor applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file'
  def which_should_for_example_extract(content, file)
    @content_and_file_examples << [content,file[:from]]
  end

  #it allows to define specs in this way:
  # which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
  # or_extract 'text inside!', :from => 'crossed.txt'
  alias_method :or_extract, :which_should_for_example_extract

  # Defines how content is extracted. Accepts either
  #  * a String: the command line itself,
  #  * a Hash of command => platforms: the command for the current platform is kept,
  #  * a block, when no String or Hash is given.
  # Raises if none of those is supplied.
  def extract_content_with(command_as_hash_or_string=nil,&block)
    #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
    @command=case command_as_hash_or_string
    when String
      command_as_hash_or_string
    when Hash
      command_for_current_platform(command_as_hash_or_string)
    else
      block || raise("No command defined for this extractor: #{description}")
    end
  end

  # Defines how a thumbnail is extracted, with a String or a Hash of
  # command => platforms. Anything else leaves the thumbnail command nil.
  def extract_thumbnail_with(command_as_hash_or_string=nil, &block)
    #TODO: Don't ignore block and use it as in extract_content_with
    @thumbnail_command=case command_as_hash_or_string
    when String
      command_as_hash_or_string
    when Hash
      command_for_current_platform(command_as_hash_or_string)
    end
  end

  # Unpack an archive and extract content from every supported file.
  # +unpack_command+ is a command line in which SOURCE stands for the archive
  # and TEMPDIR (or TMPDIR) for the directory to unpack into.
  # The commands it requires are appended to the extractor's dependencies.
  def extract_content_from_archive_with(unpack_command)
    #FIXME: Cleaner code needed!
    @command=lambda {|source|
      global_temp_dir = File.join(Dir::tmpdir, 'picolena_archive_temp')
      specific_temp_dir = File.join(global_temp_dir, source.base26_hash)
      begin
        FileUtils.mkpath specific_temp_dir
        # Block form of sub: the replacement is inserted literally, so a
        # backslash in a path is never read as a back-reference.
        specific_unpack_command=unpack_command.sub('SOURCE'){ '"'+source+'"' }.sub(/TE?MPDIR/){ '"'+specific_temp_dir+'"' }
        silently_execute(specific_unpack_command)
        Dir["#{specific_temp_dir}/**/*"].select{|f| File.file?(f)}.map{|filename|
          # Best effort: a file that cannot be extracted contributes "---".
          content=PlainTextExtractor.extract_content_from(filename) rescue "---"
          ["##"+filename.sub(specific_temp_dir,'').gsub('/', '>'),
           content]
        }.join("\n")
      ensure
        # Only remove what exists, so that a failure while creating the
        # directory is not masked by a second error raised from here.
        FileUtils.remove_entry_secure(specific_temp_dir) if File.exist?(specific_temp_dir)
        FileUtils.rmdir(global_temp_dir) rescue "not empty"
      end
    }
    # concat keeps the list flat, as which_requires does.
    (@dependencies||=[]).concat(unpack_command.dependencies)
  end

  private
  def command_for_current_platform(command_as_hash)
    # Allows to write
    #  with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
    #       "some other command"            => :on_windows
    #
    # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
    # on windows, it returns "some other command"
    #
    # If commands for linux & mac os were different :
    #  with "some command"        => :on_linux,
    #       "another command"     => :on_mac_os,
    #       "yet another command" => :on_windows
    #
    # Raises a RuntimeError when no command is defined for the current platform.
    _platforms, command = command_as_hash.invert.find{|platforms,_command|
      platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(current_platform_symbol)
    }
    raise("No command defined for platform #{current_platform_symbol.inspect} in #{command_as_hash.inspect}") unless command
    command.dup
  end

  # Symbol for the platform ruby is running on, computed once.
  def current_platform_symbol
    @@platform_symbol||=platform_symbol_for(RUBY_PLATFORM)
  end

  # Maps a RUBY_PLATFORM-like string to :linux, :mac_os or :windows
  # (nil for anything else).
  def platform_symbol_for(ruby_platform)
    case ruby_platform
    when /linux/
      :linux
    # darwin must be tested before /win/, since "darwin" contains "win".
    when /darwin/
      :mac_os
    when /win/
      :windows
    end
  end
end
|
@@ -8,7 +8,7 @@ PlainTextExtractor.new {
|
|
8
8
|
every :pdf
|
9
9
|
as "application/pdf"
|
10
10
|
aka "Adobe Portable Document Format"
|
11
|
-
|
12
|
-
|
11
|
+
extract_content_with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
|
12
|
+
"some other command" => :on_windows
|
13
13
|
which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
|
14
14
|
}
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# Extractor for Adobe Photoshop documents: thumbnail through convert,
# content through exiftool.
PlainTextExtractor.new do
  every :psd
  as "image/adobe.photoshop"
  aka "Adobe Photoshop Format"

  #NOTE: PSD gets its own Extractor since convert method is different from one-layer pictures
  #      and needs -flatten option
  extract_thumbnail_with 'convert SOURCE -flatten -thumbnail 80x80 -quality 50 THUMBNAIL'

  extract_content_with 'exiftool SOURCE'
  which_should_for_example_extract '"Adobe Photoshop CS2 Windows" 584x150', :from => 'picolena.psd'
end
|
@@ -2,7 +2,7 @@ PlainTextExtractor.new {
|
|
2
2
|
every :html, :htm
|
3
3
|
as "text/html"
|
4
4
|
aka "HyperText Markup Language document"
|
5
|
-
|
5
|
+
extract_content_with {|source|
|
6
6
|
encoding=File.encoding(source)
|
7
7
|
if encoding.empty? or encoding.gsub(/[^\w]/,'').downcase=="utf8" then
|
8
8
|
%x{html2text -nobs "#{source}"}
|
@@ -4,8 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :xls
|
5
5
|
as "application/excel"
|
6
6
|
aka "Microsoft Office Excel document"
|
7
|
-
|
8
|
-
|
7
|
+
extract_content_with "xls2csv SOURCE | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
9
9
|
which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
|
10
10
|
}
|
11
11
|
|
@@ -16,7 +16,7 @@ PlainTextExtractor.new {
|
|
16
16
|
every :xlsx
|
17
17
|
as 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
|
18
18
|
aka "Microsoft Office 2007 Excel spreadsheet"
|
19
|
-
|
19
|
+
extract_content_with {|source|
|
20
20
|
Zip::ZipFile.open(source){|zipfile|
|
21
21
|
text_cells=zipfile.read("xl/sharedStrings.xml").split(/</).grep(/^t/).collect{|l|
|
22
22
|
l.sub(/^[^>]+>/,'')
|
@@ -39,4 +39,4 @@ PlainTextExtractor.new {
|
|
39
39
|
## Home page: http://www.winfield.demon.nl/
|
40
40
|
|
41
41
|
## MS OOXML excel to text conversion:
|
42
|
-
## Ruby code written by Eric DUMINIL
|
42
|
+
## Ruby code written by Eric DUMINIL
|
@@ -4,8 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :ppt, :pps
|
5
5
|
as "application/powerpoint"
|
6
6
|
aka "Microsoft Office Powerpoint document"
|
7
|
-
|
8
|
-
|
7
|
+
extract_content_with "catppt SOURCE" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
9
9
|
which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
|
10
10
|
#FIXME: it seems that catppt cannot open .pps files.
|
11
11
|
#or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'
|
@@ -18,7 +18,7 @@ PlainTextExtractor.new {
|
|
18
18
|
every :pptx
|
19
19
|
as 'application/vnd.openxmlformats-officedocument.presentationml.presentation' #could that mime BE any longer?
|
20
20
|
aka "Microsoft Office 2007 Powerpoint document"
|
21
|
-
|
21
|
+
extract_content_with {|source|
|
22
22
|
Zip::ZipFile.open(source){|zipfile|
|
23
23
|
slides=zipfile.entries.select{|l| l.name=~/^ppt\/slides\/slide\d+.xml/}
|
24
24
|
slides.collect{|entry|
|
@@ -38,4 +38,4 @@ PlainTextExtractor.new {
|
|
38
38
|
## Home page: http://www.wagner.pp.ru/~vitus/software/catdoc/
|
39
39
|
|
40
40
|
## MS OOXML powerpoint to text conversion:
|
41
|
-
## Ruby code written by Eric DUMINIL
|
41
|
+
## Ruby code written by Eric DUMINIL
|
@@ -8,7 +8,7 @@ PlainTextExtractor.new {
|
|
8
8
|
every :rtf
|
9
9
|
as "application/rtf"
|
10
10
|
aka "Microsoft Rich Text Format"
|
11
|
-
|
12
|
-
|
11
|
+
extract_content_with "unrtf SOURCE -t text" => :on_linux_and_mac_os,
|
12
|
+
"some other command" => :on_windows
|
13
13
|
which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
|
14
|
-
}
|
14
|
+
}
|
@@ -4,8 +4,8 @@ PlainTextExtractor.new {
|
|
4
4
|
every :doc, :dot
|
5
5
|
as "application/msword"
|
6
6
|
aka "Microsoft Office Word document"
|
7
|
-
|
8
|
-
|
7
|
+
extract_content_with "antiword SOURCE" => :on_linux_and_mac_os,
|
8
|
+
"some other command" => :on_windows
|
9
9
|
which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
|
10
10
|
or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
|
11
11
|
}
|
@@ -17,7 +17,7 @@ PlainTextExtractor.new {
|
|
17
17
|
every :docx, :dotx
|
18
18
|
as 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
19
19
|
aka "Microsoft Office 2007 Word document"
|
20
|
-
|
20
|
+
extract_content_with {|source|
|
21
21
|
Zip::ZipFile.open(source){|zipfile|
|
22
22
|
zipfile.read("word/document.xml").split(/</).grep(/^w:t/).collect{|l|
|
23
23
|
l.sub(/^[^>]+>/,'')
|
@@ -35,4 +35,4 @@ PlainTextExtractor.new {
|
|
35
35
|
## Home page: http://www.winfield.demon.nl/
|
36
36
|
|
37
37
|
## MS OOXML word to text conversion:
|
38
|
-
## Ruby code written by Eric DUMINIL
|
38
|
+
## Ruby code written by Eric DUMINIL
|
@@ -5,7 +5,7 @@ PlainTextExtractor.new {
|
|
5
5
|
every :odp
|
6
6
|
as 'application/vnd.oasis.opendocument.presentation'
|
7
7
|
aka "Open Document Format for presentation"
|
8
|
-
|
8
|
+
extract_content_with {|source|
|
9
9
|
Zip::ZipFile.open(source){|zipfile|
|
10
10
|
zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
|
11
11
|
l.sub(/^[^>]+>/,'')
|
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
|
|
13
13
|
}
|
14
14
|
}
|
15
15
|
which_should_for_example_extract 'Picolena can it find me maybe!', :from => 'ubuntu_theme.odp'
|
16
|
-
}
|
16
|
+
}
|
@@ -5,7 +5,7 @@ PlainTextExtractor.new {
|
|
5
5
|
every :ods
|
6
6
|
as 'application/vnd.oasis.opendocument.spreadsheet'
|
7
7
|
aka "Open Document Format for spreadsheet"
|
8
|
-
|
8
|
+
extract_content_with {|source|
|
9
9
|
Zip::ZipFile.open(source){|zipfile|
|
10
10
|
zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
|
11
11
|
l.sub(/^[^>]+>/,'')
|
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
|
|
13
13
|
}
|
14
14
|
}
|
15
15
|
which_should_for_example_extract 'Cessna F-172P G-BIDF, serial number 2045', :from => 'weight_and_balance.ods'
|
16
|
-
}
|
16
|
+
}
|
@@ -5,7 +5,7 @@ PlainTextExtractor.new {
|
|
5
5
|
every :odt
|
6
6
|
as 'application/vnd.oasis.opendocument.text'
|
7
7
|
aka "Open Document Format for text"
|
8
|
-
|
8
|
+
extract_content_with {|source|
|
9
9
|
Zip::ZipFile.open(source){|zipfile|
|
10
10
|
zipfile.read("content.xml").split(/</).grep(/^text:(p|span)/).collect{|l|
|
11
11
|
l.sub(/^[^>]+>/,'')
|
@@ -13,4 +13,4 @@ PlainTextExtractor.new {
|
|
13
13
|
}
|
14
14
|
}
|
15
15
|
which_should_for_example_extract 'written with OpenOffice.org', :from => 'basic.odt'
|
16
|
-
}
|
16
|
+
}
|
@@ -1,8 +1,19 @@
|
|
1
1
|
PlainTextExtractor.new {
|
2
|
-
every :bmp, :crw, :eps, :gif, :jpeg, :jpg, :nef, :png, :
|
2
|
+
every :bmp, :crw, :eps, :gif, :jpeg, :jpg, :nef, :png, :raw, :tif, :tiff
|
3
3
|
as "image/*"
|
4
4
|
aka "some picture"
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
|
6
|
+
extract_thumbnail_with 'convert -quality 50 -thumbnail 80x80 SOURCE THUMBNAIL'
|
7
|
+
|
8
|
+
extract_content_with 'exiftool SOURCE'
|
9
|
+
which_should_for_example_extract 'Eric Duminil Nikon D90' , :from => 'crow.jpg'
|
10
|
+
or_extract '64x64 BMP' , :from => 'gnu.bmp'
|
11
|
+
or_extract 'application/postscript 258x43' , :from => 'diceface.eps'
|
12
|
+
or_extract 'Panasonic DMC-FZ8 320x240' , :from => 'glass.png'
|
13
|
+
or_extract 'Panasonic DMC-FZ8 "35mm equivalent: 432.0mm"', :from => 'cygnus.jpeg'
|
14
|
+
or_extract '"1990 bytes" 24x24 LZW' , :from => 'warning.tiff'
|
15
|
+
or_extract '"1978 bytes" 24x24 LZW' , :from => 'caution.tif'
|
16
|
+
or_extract 'GIF 110x140' , :from => 'rails_logo_remix.gif'
|
17
|
+
# Raw pictures (.nef, .crw, .raw) would also need to be tested, but their size doesn't make it worth including
|
18
|
+
# corresponding files in the repository. Specs will therefore stay with "Not Yet Implemented" status.
|
8
19
|
}
|
@@ -1,8 +1,15 @@
|
|
1
1
|
PlainTextExtractor.new {
|
2
|
-
every :txt, :text
|
2
|
+
every :txt, :text
|
3
|
+
every :tex, :bib, :for, :cpp, :c, :rb, :ins, :vee, :java
|
4
|
+
every :ini
|
5
|
+
every :sub, :srt
|
6
|
+
#NOTE: Could be interesting to extract thumbnail from vCards
|
7
|
+
every :vcf, :vcard
|
8
|
+
every :no_extension
|
9
|
+
|
3
10
|
as "application/plain"
|
4
11
|
aka "plain text file"
|
5
|
-
|
12
|
+
extract_content_with {|source|
|
6
13
|
raise "binary file" unless File.plain_text?(source)
|
7
14
|
encoding=File.encoding(source)
|
8
15
|
if encoding.empty? then
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Extractor for RAR archives: the archive is unpacked and the content of
# every file it holds is extracted.
PlainTextExtractor.new do
  every :rar
  as "archive/rar"
  aka "RAR Archive"

  # If a non-free version of unrar is available, uses it
  # because unrar-nonfree supports more archives than unrar-free;
  # falls back to unrar-free otherwise.
  unpack_command = "unrar".installed? ? "unrar x SOURCE TEMPDIR" : "unrar-free --extract SOURCE TEMPDIR"
  extract_content_from_archive_with unpack_command

  which_should_for_example_extract 'IAE2ORREucIRPx+XgpYcYoO8Twz1TN5/LezRbdwWonlAqpDanBTR+McCehXpk7Pz',
                                   :from => 'dumb_file.rar'
  or_extract '"(Same file, but inside one directory)"', :from => 'dumb_file.rar'
end
|