docsplit 0.7.6 → 0.8.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/docsplit.gemspec +1 -1
- data/lib/docsplit.rb +3 -2
- data/lib/docsplit/pdfshaver_extractor.rb +65 -0
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74386527dc285350f68f32adc8ff00d8386fceb6
|
4
|
+
data.tar.gz: 9af1f5b45c5f684393732ab7e25a8c30c0ab357a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 42fe34c591dec03a4392bb2b0c43b1c224062f1ffdb5608bf8671487c3441ac703871e1da48d05428a90f7abb11b191ecfa4eb5d2778b8e886827523250f1561
|
7
|
+
data.tar.gz: de4a3bc034585d2dbcc952fd5276c41d00710f8171671b818bc540032309c1e9c9a0914657cae08f478112366d86f1fc68f4df2f0ae085f132927b74fb1f45bd
|
data/docsplit.gemspec
CHANGED
data/lib/docsplit.rb
CHANGED
@@ -5,7 +5,7 @@ require 'shellwords'
|
|
5
5
|
# The Docsplit module delegates to the Java PDF extractors.
|
6
6
|
module Docsplit
|
7
7
|
|
8
|
-
VERSION = '0.7.6' # Keep in sync with gemspec.
|
8
|
+
VERSION = '0.8.0' # Keep in sync with gemspec.
|
9
9
|
|
10
10
|
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
11
11
|
|
@@ -56,7 +56,7 @@ module Docsplit
|
|
56
56
|
def self.extract_images(pdfs, opts={})
|
57
57
|
pdfs = ensure_pdfs(pdfs)
|
58
58
|
opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
|
59
|
-
|
59
|
+
PDFShaverExtractor.new.extract(pdfs, opts)
|
60
60
|
end
|
61
61
|
|
62
62
|
# Use JODCConverter to extract the documents as PDFs.
|
@@ -101,6 +101,7 @@ module Docsplit
|
|
101
101
|
end
|
102
102
|
|
103
103
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
104
|
+
require "#{Docsplit::ROOT}/lib/docsplit/pdfshaver_extractor"
|
104
105
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
105
106
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
106
107
|
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'pdfshaver'
|
2
|
+
module Docsplit
  # Renders pages of PDF documents to image files through the PDFShaver
  # library, writing one image per requested page, format and size.
  class PDFShaverExtractor

    # NOTE(review): MEMORY_ARGS and DEFAULT_DENSITY are not referenced by any
    # method in this class; they are kept in case other code reads them.
    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Render the requested pages of every PDF in +paths+ to image files named
    # "<pdf basename>_<page number>.<format>".
    #
    # paths   - a (possibly nested) Array of PDF file paths.
    # options - a Hash of settings; see extract_options for the keys read.
    #
    # Raises ExtractionFailed (carrying the underlying message) when
    # PDFShaver rejects a path with an ArgumentError.
    # Returns the flattened list of paths.
    def extract(paths, options={})
      extract_options(options)
      paths.flatten.each do |pdf_path|
        pdf      = open_document(pdf_path)
        # The base name only depends on the path, so compute it once per PDF.
        pdf_name = File.basename(pdf_path, File.extname(pdf_path))
        pdf.pages(extract_page_list(@pages)).each do |page|
          @formats.each do |format|
            @sizes.each do |size_string|
              directory   = directory_for(size_string)
              destination = File.join(directory, "#{pdf_name}_#{page.number}.#{format}")
              # mkdir_p takes a literal filesystem path. Shell-escaping it
              # would create a differently named directory (e.g. "my\ docs")
              # than the one `destination` points into.
              FileUtils.mkdir_p(directory)

              # presumably converts a GraphicsMagick-style geometry string
              # into the dimensions PDFShaver renders at -- confirm in PDFShaver.
              dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
              page.render(destination, dimensions)
            end
          end
        end
      end
    end

    private

    # Open +pdf_path+ with PDFShaver, translating its ArgumentError into
    # ExtractionFailed while preserving the original message.
    def open_document(pdf_path)
      PDFShaver::Document.new(pdf_path)
    rescue ArgumentError => e
      raise ExtractionFailed, e.message
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the expanded list of requested page numbers.
    #
    # pages - nil, or a String such as "1-3,5" (comma-separated numbers and
    #         dash-separated ranges).
    #
    # Returns :all when +pages+ is nil, otherwise a sorted Array of unique
    # Integers.
    def extract_page_list(pages)
      return :all if pages.nil?
      numbers = pages.split(',').map do |entry|
        if entry.include?('-')
          bounds = entry.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          entry.to_i
        end
      end
      numbers.flatten.uniq.sort
    end

    # Copy the supported options into instance variables, applying defaults.
    #
    # Reads :output (default '.'), :pages, :density, :format (one or many,
    # default :png), :size (one or many) and :rolling.
    def extract_options(options)
      @output  = options[:output] || '.'
      @pages   = options[:pages]
      # NOTE(review): @density and @rolling are stored but never read by this
      # class.
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      # A single nil size means "no explicit size": one pass over the size
      # loop, writing straight into the output directory.
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.6
|
4
|
+
version: 0.8.0.alpha
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -33,6 +33,7 @@ files:
|
|
33
33
|
- lib/docsplit/info_extractor.rb
|
34
34
|
- lib/docsplit/page_extractor.rb
|
35
35
|
- lib/docsplit/pdf_extractor.rb
|
36
|
+
- lib/docsplit/pdfshaver_extractor.rb
|
36
37
|
- lib/docsplit/text_cleaner.rb
|
37
38
|
- lib/docsplit/text_extractor.rb
|
38
39
|
- lib/docsplit/transparent_pdfs.rb
|
@@ -61,12 +62,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
61
62
|
version: '0'
|
62
63
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
64
|
requirements:
|
64
|
-
- - ">="
|
65
|
+
- - ">"
|
65
66
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0'
|
67
|
+
version: 1.3.1
|
67
68
|
requirements: []
|
68
69
|
rubyforge_project: docsplit
|
69
|
-
rubygems_version: 2.
|
70
|
+
rubygems_version: 2.4.5
|
70
71
|
signing_key:
|
71
72
|
specification_version: 4
|
72
73
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|