docsplit 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.5.2' # Keep version in sync with docsplit.rb
4
- s.date = '2011-05-13'
3
+ s.version = '0.6.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2011-09-13'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.5.2' # Keep in sync with gemspec.
4
+ VERSION = '0.6.0' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -19,7 +19,9 @@ module Docsplit
19
19
 
20
20
  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
21
21
 
22
- # Check for all dependencies, and warn of their absence.
22
+ ESCAPE = lambda {|x| Shellwords.shellescape(x) }
23
+
24
+ # Check for all dependencies, and note their absence.
23
25
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
24
26
  DEPENDENCIES.each_key do |dep|
25
27
  dirs.each do |dir|
@@ -28,7 +30,6 @@ module Docsplit
28
30
  break
29
31
  end
30
32
  end
31
- warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
32
33
  end
33
34
 
34
35
  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
@@ -62,11 +63,13 @@ module Docsplit
62
63
  [docs].flatten.each do |doc|
63
64
  ext = File.extname(doc)
64
65
  basename = File.basename(doc, ext)
65
- if GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
66
- `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
66
+ escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
67
+
68
+ if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
69
+ `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
67
70
  else
68
71
  options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
69
- run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
72
+ run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
70
73
  end
71
74
  end
72
75
  end
@@ -113,6 +116,7 @@ end
113
116
 
114
117
  require 'tmpdir'
115
118
  require 'fileutils'
119
+ require 'shellwords'
116
120
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
117
121
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
118
122
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
@@ -85,6 +85,9 @@ Options:
85
85
  opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
86
86
  @options[:format] = t.split(',')
87
87
  end
88
+ opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
89
+ @options[:density] = d
90
+ end
88
91
  opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
89
92
  @options[:ocr] = o
90
93
  end
@@ -4,9 +4,9 @@ module Docsplit
4
4
  # nicely sized images.
5
5
  class ImageExtractor
6
6
 
7
- DENSITY_ARG = "-density 150"
8
7
  MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
9
8
  DEFAULT_FORMAT = :png
9
+ DEFAULT_DENSITY = '150'
10
10
 
11
11
  # Extract a list of PDFs as rasterized page images, according to the
12
12
  # configuration in options.
@@ -32,16 +32,17 @@ module Docsplit
32
32
  basename = File.basename(pdf, File.extname(pdf))
33
33
  directory = directory_for(size)
34
34
  pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
35
+ escaped_pdf = ESCAPE[pdf]
35
36
  FileUtils.mkdir_p(directory) unless File.exists?(directory)
36
- common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
37
+ common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
37
38
  if previous
38
39
  FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
39
40
  result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
40
41
  raise ExtractionFailed, result if $? != 0
41
42
  else
42
43
  page_list(pages).each do |page|
43
- out_file = File.join(directory, "#{basename}_#{page}.#{format}")
44
- cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
44
+ out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
45
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
45
46
  result = `#{cmd}`.chomp
46
47
  raise ExtractionFailed, result if $? != 0
47
48
  end
@@ -57,6 +58,7 @@ module Docsplit
57
58
  def extract_options(options)
58
59
  @output = options[:output] || '.'
59
60
  @pages = options[:pages]
61
+ @density = options[:density] || DEFAULT_DENSITY
60
62
  @formats = [options[:format] || DEFAULT_FORMAT].flatten
61
63
  @sizes = [options[:size]].flatten.compact
62
64
  @sizes = [nil] if @sizes.empty?
@@ -18,7 +18,7 @@ module Docsplit
18
18
  # Pull out a single datum from a pdf.
19
19
  def extract(key, pdfs, opts)
20
20
  pdf = [pdfs].flatten.first
21
- cmd = "pdfinfo #{pdf} 2>&1"
21
+ cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
22
22
  result = `#{cmd}`.chomp
23
23
  raise ExtractionFailed, result if $? != 0
24
24
  match = result.match(MATCHERS[key])
@@ -11,7 +11,7 @@ module Docsplit
11
11
  pdf_name = File.basename(pdf, File.extname(pdf))
12
12
  page_path = File.join(@output, "#{pdf_name}_%d.pdf")
13
13
  FileUtils.mkdir_p @output unless File.exists?(@output)
14
- cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
14
+ cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
15
15
  result = `#{cmd}`.chomp
16
16
  FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
17
17
  raise ExtractionFailed, result if $? != 0
@@ -16,7 +16,7 @@ module Docsplit
16
16
 
17
17
  NO_TEXT_DETECTED = /---------\n\Z/
18
18
 
19
- OCR_FLAGS = '-density 200x200 -colorspace GRAY'
19
+ OCR_FLAGS = '-density 400x400 -colorspace GRAY'
20
20
  MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
21
21
 
22
22
  MIN_TEXT_PER_PAGE = 100 # in bytes
@@ -45,7 +45,7 @@ module Docsplit
45
45
 
46
46
  # Does a PDF have any text embedded?
47
47
  def contains_text?(pdf)
48
- fonts = `pdffonts #{pdf} 2>&1`
48
+ fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
49
49
  !fonts.match(NO_TEXT_DETECTED)
50
50
  end
51
51
 
@@ -59,19 +59,22 @@ module Docsplit
59
59
  def extract_from_ocr(pdf, pages)
60
60
  tempdir = Dir.mktmpdir
61
61
  base_path = File.join(@output, @pdf_name)
62
+ escaped_pdf = ESCAPE[pdf]
62
63
  if pages
63
64
  pages.each do |page|
64
65
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
66
+ escaped_tiff = ESCAPE[tiff]
65
67
  file = "#{base_path}_#{page}"
66
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
67
- run "tesseract #{tiff} #{file} -l eng 2>&1"
68
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69
+ run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
68
70
  clean_text(file + '.txt') if @clean_ocr
69
71
  FileUtils.remove_entry_secure tiff
70
72
  end
71
73
  else
72
74
  tiff = "#{tempdir}/#{@pdf_name}.tif"
73
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
74
- run "tesseract #{tiff} #{base_path} -l eng 2>&1"
75
+ escaped_tiff = ESCAPE[tiff]
76
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77
+ run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
75
78
  clean_text(base_path + '.txt') if @clean_ocr
76
79
  end
77
80
  ensure
@@ -100,14 +103,14 @@ module Docsplit
100
103
  # Extract the full contents of a pdf as a single file, directly.
101
104
  def extract_full(pdf)
102
105
  text_path = File.join(@output, "#{@pdf_name}.txt")
103
- run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
106
+ run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
104
107
  end
105
108
 
106
109
  # Extract the contents of a single page of text, directly, adding it to
107
110
  # the `@pages_to_ocr` list if the text length is inadequate.
108
111
  def extract_page(pdf, page)
109
112
  text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
110
- run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
113
+ run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
111
114
  unless @forbid_ocr
112
115
  @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
113
116
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 7
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 5
9
- - 2
10
- version: 0.5.2
8
+ - 6
9
+ - 0
10
+ version: 0.6.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2011-05-13 00:00:00 Z
19
+ date: 2011-09-13 00:00:00 Z
20
20
  dependencies: []
21
21
 
22
22
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"