docsplit 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.5.2' # Keep version in sync with docsplit.rb
4
- s.date = '2011-05-13'
3
+ s.version = '0.6.0' # Keep version in sync with docsplit.rb
4
+ s.date = '2011-09-13'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
@@ -1,7 +1,7 @@
1
1
  # The Docsplit module delegates to the Java PDF extractors.
2
2
  module Docsplit
3
3
 
4
- VERSION = '0.5.2' # Keep in sync with gemspec.
4
+ VERSION = '0.6.0' # Keep in sync with gemspec.
5
5
 
6
6
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
7
7
 
@@ -19,7 +19,9 @@ module Docsplit
19
19
 
20
20
  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
21
21
 
22
- # Check for all dependencies, and warn of their absence.
22
+ ESCAPE = lambda {|x| Shellwords.shellescape(x) }
23
+
24
+ # Check for all dependencies, and note their absence.
23
25
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
24
26
  DEPENDENCIES.each_key do |dep|
25
27
  dirs.each do |dir|
@@ -28,7 +30,6 @@ module Docsplit
28
30
  break
29
31
  end
30
32
  end
31
- warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
32
33
  end
33
34
 
34
35
  # Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
@@ -62,11 +63,13 @@ module Docsplit
62
63
  [docs].flatten.each do |doc|
63
64
  ext = File.extname(doc)
64
65
  basename = File.basename(doc, ext)
65
- if GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
66
- `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
66
+ escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
67
+
68
+ if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
69
+ `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
67
70
  else
68
71
  options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
69
- run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
72
+ run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
70
73
  end
71
74
  end
72
75
  end
@@ -113,6 +116,7 @@ end
113
116
 
114
117
  require 'tmpdir'
115
118
  require 'fileutils'
119
+ require 'shellwords'
116
120
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
117
121
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
118
122
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
@@ -85,6 +85,9 @@ Options:
85
85
  opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
86
86
  @options[:format] = t.split(',')
87
87
  end
88
+ opts.on('-d', '--density [NUM]', 'set image density (DPI) when rasterizing') do |d|
89
+ @options[:density] = d
90
+ end
88
91
  opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
89
92
  @options[:ocr] = o
90
93
  end
@@ -4,9 +4,9 @@ module Docsplit
4
4
  # nicely sized images.
5
5
  class ImageExtractor
6
6
 
7
- DENSITY_ARG = "-density 150"
8
7
  MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
9
8
  DEFAULT_FORMAT = :png
9
+ DEFAULT_DENSITY = '150'
10
10
 
11
11
  # Extract a list of PDFs as rasterized page images, according to the
12
12
  # configuration in options.
@@ -32,16 +32,17 @@ module Docsplit
32
32
  basename = File.basename(pdf, File.extname(pdf))
33
33
  directory = directory_for(size)
34
34
  pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
35
+ escaped_pdf = ESCAPE[pdf]
35
36
  FileUtils.mkdir_p(directory) unless File.exists?(directory)
36
- common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
37
+ common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
37
38
  if previous
38
39
  FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
39
40
  result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
40
41
  raise ExtractionFailed, result if $? != 0
41
42
  else
42
43
  page_list(pages).each do |page|
43
- out_file = File.join(directory, "#{basename}_#{page}.#{format}")
44
- cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
44
+ out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
45
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
45
46
  result = `#{cmd}`.chomp
46
47
  raise ExtractionFailed, result if $? != 0
47
48
  end
@@ -57,6 +58,7 @@ module Docsplit
57
58
  def extract_options(options)
58
59
  @output = options[:output] || '.'
59
60
  @pages = options[:pages]
61
+ @density = options[:density] || DEFAULT_DENSITY
60
62
  @formats = [options[:format] || DEFAULT_FORMAT].flatten
61
63
  @sizes = [options[:size]].flatten.compact
62
64
  @sizes = [nil] if @sizes.empty?
@@ -18,7 +18,7 @@ module Docsplit
18
18
  # Pull out a single datum from a pdf.
19
19
  def extract(key, pdfs, opts)
20
20
  pdf = [pdfs].flatten.first
21
- cmd = "pdfinfo #{pdf} 2>&1"
21
+ cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
22
22
  result = `#{cmd}`.chomp
23
23
  raise ExtractionFailed, result if $? != 0
24
24
  match = result.match(MATCHERS[key])
@@ -11,7 +11,7 @@ module Docsplit
11
11
  pdf_name = File.basename(pdf, File.extname(pdf))
12
12
  page_path = File.join(@output, "#{pdf_name}_%d.pdf")
13
13
  FileUtils.mkdir_p @output unless File.exists?(@output)
14
- cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
14
+ cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
15
15
  result = `#{cmd}`.chomp
16
16
  FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
17
17
  raise ExtractionFailed, result if $? != 0
@@ -16,7 +16,7 @@ module Docsplit
16
16
 
17
17
  NO_TEXT_DETECTED = /---------\n\Z/
18
18
 
19
- OCR_FLAGS = '-density 200x200 -colorspace GRAY'
19
+ OCR_FLAGS = '-density 400x400 -colorspace GRAY'
20
20
  MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
21
21
 
22
22
  MIN_TEXT_PER_PAGE = 100 # in bytes
@@ -45,7 +45,7 @@ module Docsplit
45
45
 
46
46
  # Does a PDF have any text embedded?
47
47
  def contains_text?(pdf)
48
- fonts = `pdffonts #{pdf} 2>&1`
48
+ fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
49
49
  !fonts.match(NO_TEXT_DETECTED)
50
50
  end
51
51
 
@@ -59,19 +59,22 @@ module Docsplit
59
59
  def extract_from_ocr(pdf, pages)
60
60
  tempdir = Dir.mktmpdir
61
61
  base_path = File.join(@output, @pdf_name)
62
+ escaped_pdf = ESCAPE[pdf]
62
63
  if pages
63
64
  pages.each do |page|
64
65
  tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
66
+ escaped_tiff = ESCAPE[tiff]
65
67
  file = "#{base_path}_#{page}"
66
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
67
- run "tesseract #{tiff} #{file} -l eng 2>&1"
68
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69
+ run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
68
70
  clean_text(file + '.txt') if @clean_ocr
69
71
  FileUtils.remove_entry_secure tiff
70
72
  end
71
73
  else
72
74
  tiff = "#{tempdir}/#{@pdf_name}.tif"
73
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
74
- run "tesseract #{tiff} #{base_path} -l eng 2>&1"
75
+ escaped_tiff = ESCAPE[tiff]
76
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77
+ run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
75
78
  clean_text(base_path + '.txt') if @clean_ocr
76
79
  end
77
80
  ensure
@@ -100,14 +103,14 @@ module Docsplit
100
103
  # Extract the full contents of a pdf as a single file, directly.
101
104
  def extract_full(pdf)
102
105
  text_path = File.join(@output, "#{@pdf_name}.txt")
103
- run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
106
+ run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
104
107
  end
105
108
 
106
109
  # Extract the contents of a single page of text, directly, adding it to
107
110
  # the `@pages_to_ocr` list if the text length is inadequate.
108
111
  def extract_page(pdf, page)
109
112
  text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
110
- run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
113
+ run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
111
114
  unless @forbid_ocr
112
115
  @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
113
116
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 7
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 5
9
- - 2
10
- version: 0.5.2
8
+ - 6
9
+ - 0
10
+ version: 0.6.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jeremy Ashkenas
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2011-05-13 00:00:00 Z
19
+ date: 2011-09-13 00:00:00 Z
20
20
  dependencies: []
21
21
 
22
22
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"