docsplit 0.6.3 → 0.6.4

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.6.3' # Keep version in sync with docsplit.rb
4
- s.date = '2011-11-23'
3
+ s.version = '0.6.4' # Keep version in sync with docsplit.rb
4
+ s.date = '2012-11-12'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,13 +1,20 @@
1
+ require 'tmpdir'
2
+ require 'fileutils'
3
+ require 'shellwords'
4
+
1
5
  # The Docsplit module delegates to the Java PDF extractors.
2
6
  module Docsplit
3
7
 
4
- VERSION = '0.6.3' # Keep in sync with gemspec.
8
+ VERSION = '0.6.4' # Keep in sync with gemspec.
9
+
10
+ ESCAPE = lambda {|x| Shellwords.shellescape(x) }
5
11
 
6
12
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
13
+ ESCAPED_ROOT = ESCAPE[ROOT]
7
14
 
8
- CLASSPATH = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
15
+ CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
9
16
 
10
- LOGGING = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"
17
+ LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
11
18
 
12
19
  HEADLESS = "-Djava.awt.headless=true"
13
20
 
@@ -20,9 +27,7 @@ module Docsplit
20
27
 
21
28
  GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
22
29
 
23
- DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
24
-
25
- ESCAPE = lambda {|x| Shellwords.shellescape(x) }
30
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
26
31
 
27
32
  # Check for all dependencies, and note their absence.
28
33
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -71,7 +76,7 @@ module Docsplit
71
76
  if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
72
77
  `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
73
78
  else
74
- options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ROOT}/vendor/conf/document-formats.js"
79
+ options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
75
80
  run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
76
81
  end
77
82
  end
@@ -117,9 +122,6 @@ module Docsplit
117
122
 
118
123
  end
119
124
 
120
- require 'tmpdir'
121
- require 'fileutils'
122
- require 'shellwords'
123
125
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
124
126
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
125
127
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
@@ -94,6 +94,9 @@ Options:
94
94
  opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
95
95
  @options[:clean] = false
96
96
  end
97
+ opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
+ @options[:language] = l
99
+ end
97
100
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
98
101
  @options[:rolling] = true
99
102
  end
@@ -42,7 +42,7 @@ module Docsplit
42
42
  else
43
43
  page_list(pages).each do |page|
44
44
  out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
45
- cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
45
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
46
46
  result = `#{cmd}`.chomp
47
47
  raise ExtractionFailed, result if $? != 0
48
48
  end
@@ -11,7 +11,12 @@ module Docsplit
11
11
  pdf_name = File.basename(pdf, File.extname(pdf))
12
12
  page_path = File.join(@output, "#{pdf_name}_%d.pdf")
13
13
  FileUtils.mkdir_p @output unless File.exists?(@output)
14
- cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
14
+
15
+ cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
16
+ "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
17
+ else
18
+ "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
19
+ end
15
20
  result = `#{cmd}`.chomp
16
21
  FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
17
22
  raise ExtractionFailed, result if $? != 0
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
4
+ hash: 15
5
+ prerelease:
5
6
  segments:
6
7
  - 0
7
8
  - 6
8
- - 3
9
- version: 0.6.3
9
+ - 4
10
+ version: 0.6.4
10
11
  platform: ruby
11
12
  authors:
12
13
  - Jeremy Ashkenas
@@ -16,8 +17,7 @@ autorequire:
16
17
  bindir: bin
17
18
  cert_chain: []
18
19
 
19
- date: 2011-11-23 00:00:00 -06:00
20
- default_executable:
20
+ date: 2012-11-12 00:00:00 Z
21
21
  dependencies: []
22
22
 
23
23
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
@@ -51,7 +51,6 @@ files:
51
51
  - docsplit.gemspec
52
52
  - LICENSE
53
53
  - README
54
- has_rdoc: true
55
54
  homepage: http://documentcloud.github.com/docsplit/
56
55
  licenses: []
57
56
 
@@ -61,23 +60,27 @@ rdoc_options: []
61
60
  require_paths:
62
61
  - lib
63
62
  required_ruby_version: !ruby/object:Gem::Requirement
63
+ none: false
64
64
  requirements:
65
65
  - - ">="
66
66
  - !ruby/object:Gem::Version
67
+ hash: 3
67
68
  segments:
68
69
  - 0
69
70
  version: "0"
70
71
  required_rubygems_version: !ruby/object:Gem::Requirement
72
+ none: false
71
73
  requirements:
72
74
  - - ">="
73
75
  - !ruby/object:Gem::Version
76
+ hash: 3
74
77
  segments:
75
78
  - 0
76
79
  version: "0"
77
80
  requirements: []
78
81
 
79
82
  rubyforge_project: docsplit
80
- rubygems_version: 1.3.6
83
+ rubygems_version: 1.8.24
81
84
  signing_key:
82
85
  specification_version: 3
83
86
  summary: Break Apart Documents into Images, Text, Pages and PDFs