docsplit 0.6.3 → 0.6.4

Sign up to get free protection for your applications and to get access to all the features.
data/docsplit.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.6.3' # Keep version in sync with docsplit.rb
4
- s.date = '2011-11-23'
3
+ s.version = '0.6.4' # Keep version in sync with docsplit.rb
4
+ s.date = '2012-11-12'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
7
7
  s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
data/lib/docsplit.rb CHANGED
@@ -1,13 +1,20 @@
1
+ require 'tmpdir'
2
+ require 'fileutils'
3
+ require 'shellwords'
4
+
1
5
  # The Docsplit module delegates to the Java PDF extractors.
2
6
  module Docsplit
3
7
 
4
- VERSION = '0.6.3' # Keep in sync with gemspec.
8
+ VERSION = '0.6.4' # Keep in sync with gemspec.
9
+
10
+ ESCAPE = lambda {|x| Shellwords.shellescape(x) }
5
11
 
6
12
  ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
13
+ ESCAPED_ROOT = ESCAPE[ROOT]
7
14
 
8
- CLASSPATH = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
15
+ CLASSPATH = "#{ESCAPED_ROOT}/build#{File::PATH_SEPARATOR}#{ESCAPED_ROOT}/vendor/'*'"
9
16
 
10
- LOGGING = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"
17
+ LOGGING = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
11
18
 
12
19
  HEADLESS = "-Djava.awt.headless=true"
13
20
 
@@ -20,9 +27,7 @@ module Docsplit
20
27
 
21
28
  GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
22
29
 
23
- DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
24
-
25
- ESCAPE = lambda {|x| Shellwords.shellescape(x) }
30
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false}
26
31
 
27
32
  # Check for all dependencies, and note their absence.
28
33
  dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
@@ -71,7 +76,7 @@ module Docsplit
71
76
  if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
72
77
  `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
73
78
  else
74
- options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ROOT}/vendor/conf/document-formats.js"
79
+ options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
75
80
  run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
76
81
  end
77
82
  end
@@ -117,9 +122,6 @@ module Docsplit
117
122
 
118
123
  end
119
124
 
120
- require 'tmpdir'
121
- require 'fileutils'
122
- require 'shellwords'
123
125
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
124
126
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
125
127
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
@@ -94,6 +94,9 @@ Options:
94
94
  opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
95
95
  @options[:clean] = false
96
96
  end
97
+ opts.on('-l', '--language [LANGUAGE]', 'set the language (ISO 639-2/T code) for text extraction') do |l|
98
+ @options[:language] = l
99
+ end
97
100
  opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
98
101
  @options[:rolling] = true
99
102
  end
@@ -42,7 +42,7 @@ module Docsplit
42
42
  else
43
43
  page_list(pages).each do |page|
44
44
  out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
45
- cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
45
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
46
46
  result = `#{cmd}`.chomp
47
47
  raise ExtractionFailed, result if $? != 0
48
48
  end
@@ -11,7 +11,12 @@ module Docsplit
11
11
  pdf_name = File.basename(pdf, File.extname(pdf))
12
12
  page_path = File.join(@output, "#{pdf_name}_%d.pdf")
13
13
  FileUtils.mkdir_p @output unless File.exists?(@output)
14
- cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
14
+
15
+ cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatibility
16
+ "pdftailor unstitch --output #{ESCAPE[page_path]} #{ESCAPE[pdf]} 2>&1"
17
+ else
18
+ "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
19
+ end
15
20
  result = `#{cmd}`.chomp
16
21
  FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
17
22
  raise ExtractionFailed, result if $? != 0
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- prerelease: false
4
+ hash: 15
5
+ prerelease:
5
6
  segments:
6
7
  - 0
7
8
  - 6
8
- - 3
9
- version: 0.6.3
9
+ - 4
10
+ version: 0.6.4
10
11
  platform: ruby
11
12
  authors:
12
13
  - Jeremy Ashkenas
@@ -16,8 +17,7 @@ autorequire:
16
17
  bindir: bin
17
18
  cert_chain: []
18
19
 
19
- date: 2011-11-23 00:00:00 -06:00
20
- default_executable:
20
+ date: 2012-11-12 00:00:00 Z
21
21
  dependencies: []
22
22
 
23
23
  description: " Docsplit is a command-line utility and Ruby library for splitting apart\n documents into their component parts: searchable UTF-8 plain text, page\n images or thumbnails in any format, PDFs, single pages, and document\n metadata (title, author, number of pages...)\n"
@@ -51,7 +51,6 @@ files:
51
51
  - docsplit.gemspec
52
52
  - LICENSE
53
53
  - README
54
- has_rdoc: true
55
54
  homepage: http://documentcloud.github.com/docsplit/
56
55
  licenses: []
57
56
 
@@ -61,23 +60,27 @@ rdoc_options: []
61
60
  require_paths:
62
61
  - lib
63
62
  required_ruby_version: !ruby/object:Gem::Requirement
63
+ none: false
64
64
  requirements:
65
65
  - - ">="
66
66
  - !ruby/object:Gem::Version
67
+ hash: 3
67
68
  segments:
68
69
  - 0
69
70
  version: "0"
70
71
  required_rubygems_version: !ruby/object:Gem::Requirement
72
+ none: false
71
73
  requirements:
72
74
  - - ">="
73
75
  - !ruby/object:Gem::Version
76
+ hash: 3
74
77
  segments:
75
78
  - 0
76
79
  version: "0"
77
80
  requirements: []
78
81
 
79
82
  rubyforge_project: docsplit
80
- rubygems_version: 1.3.6
83
+ rubygems_version: 1.8.24
81
84
  signing_key:
82
85
  specification_version: 3
83
86
  summary: Break Apart Documents into Images, Text, Pages and PDFs