docsplit 0.7.6 → 0.8.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/docsplit.gemspec +1 -1
- data/lib/docsplit.rb +3 -2
- data/lib/docsplit/pdfshaver_extractor.rb +65 -0
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74386527dc285350f68f32adc8ff00d8386fceb6
|
4
|
+
data.tar.gz: 9af1f5b45c5f684393732ab7e25a8c30c0ab357a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 42fe34c591dec03a4392bb2b0c43b1c224062f1ffdb5608bf8671487c3441ac703871e1da48d05428a90f7abb11b191ecfa4eb5d2778b8e886827523250f1561
|
7
|
+
data.tar.gz: de4a3bc034585d2dbcc952fd5276c41d00710f8171671b818bc540032309c1e9c9a0914657cae08f478112366d86f1fc68f4df2f0ae085f132927b74fb1f45bd
|
data/docsplit.gemspec
CHANGED
data/lib/docsplit.rb
CHANGED
@@ -5,7 +5,7 @@ require 'shellwords'
|
|
5
5
|
# The Docsplit module delegates to the Java PDF extractors.
|
6
6
|
module Docsplit
|
7
7
|
|
8
|
-
VERSION = '0.
|
8
|
+
VERSION = '0.8.0' # Keep in sync with gemspec.
|
9
9
|
|
10
10
|
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
|
11
11
|
|
@@ -56,7 +56,7 @@ module Docsplit
|
|
56
56
|
def self.extract_images(pdfs, opts={})
|
57
57
|
pdfs = ensure_pdfs(pdfs)
|
58
58
|
opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
|
59
|
-
|
59
|
+
PDFShaverExtractor.new.extract(pdfs, opts)
|
60
60
|
end
|
61
61
|
|
62
62
|
# Use JODCConverter to extract the documents as PDFs.
|
@@ -101,6 +101,7 @@ module Docsplit
|
|
101
101
|
end
|
102
102
|
|
103
103
|
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
|
104
|
+
require "#{Docsplit::ROOT}/lib/docsplit/pdfshaver_extractor"
|
104
105
|
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
|
105
106
|
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
|
106
107
|
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'pdfshaver'
|
2
|
+
module Docsplit
  # Renders pages of PDF documents to image files through the PDFShaver
  # library.
  #
  # For each requested page, format and size, one image is written to
  # "<directory>/<pdf basename>_<page number>.<format>", where the directory
  # is chosen by #directory_for.
  class PDFShaverExtractor

    # NOTE(review): MEMORY_ARGS is not referenced anywhere in this class; it
    # is kept because other code may read the constant -- confirm before
    # removing.
    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Render images for every PDF in +paths+.
    #
    # paths   - An Array (possibly nested) of PDF file paths.
    # options - A Hash; the keys read are :output, :pages, :density, :format,
    #           :size and :rolling (see #extract_options).
    #
    # Returns the flattened list of paths.
    # Raises ExtractionFailed when PDFShaver::Document.new rejects a path with
    # an ArgumentError; the original error message is preserved.
    def extract(paths, options={})
      extract_options(options)
      paths.flatten.each do |pdf_path|
        begin
          pdf = PDFShaver::Document.new(pdf_path)
        rescue ArgumentError => e
          # Keep the underlying reason so callers can tell why it failed.
          raise ExtractionFailed, "Unable to open #{pdf_path}: #{e.message}"
        end

        # The output base name depends only on the PDF, not on page/format/size.
        pdf_name = File.basename(pdf_path, File.extname(pdf_path))

        pdf.pages(extract_page_list(@pages)).each do |page|
          @formats.each do |format|
            @sizes.each do |size_string|
              directory   = directory_for(size_string)
              destination = File.join(directory, "#{pdf_name}_#{page.number}.#{format}")

              # FileUtils does not go through a shell, so the path must NOT be
              # shell-escaped: escaping would create a directory with literal
              # backslashes in its name while the image is rendered to the
              # unescaped destination.
              FileUtils.mkdir_p(directory)

              dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
              page.render(destination, dimensions)
            end
          end
        end
      end
    end

    private

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    #
    # Returns the absolute path of the directory for +size+.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the expanded list of requested page numbers.
    #
    # pages - nil, or a String such as "1-3,7" (comma-separated page numbers
    #         and inclusive "first-last" ranges).
    #
    # Returns :all when +pages+ is nil, otherwise a sorted Array of unique
    # Integers. Non-numeric tokens become 0 (String#to_i semantics).
    def extract_page_list(pages)
      return :all if pages.nil?
      numbers = pages.split(',').flat_map do |token|
        if token.include?('-')
          bounds = token.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          token.to_i
        end
      end
      numbers.uniq.sort
    end

    # Copy the recognised options into instance state, applying defaults.
    #
    # @density and @rolling are stored but not read by any method in this
    # class.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      # A single nil size means "no explicit size": one pass, images written
      # straight into the output directory.
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end

  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: docsplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0.alpha
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy Ashkenas
|
@@ -33,6 +33,7 @@ files:
|
|
33
33
|
- lib/docsplit/info_extractor.rb
|
34
34
|
- lib/docsplit/page_extractor.rb
|
35
35
|
- lib/docsplit/pdf_extractor.rb
|
36
|
+
- lib/docsplit/pdfshaver_extractor.rb
|
36
37
|
- lib/docsplit/text_cleaner.rb
|
37
38
|
- lib/docsplit/text_extractor.rb
|
38
39
|
- lib/docsplit/transparent_pdfs.rb
|
@@ -61,12 +62,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
61
62
|
version: '0'
|
62
63
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
64
|
requirements:
|
64
|
-
- - "
|
65
|
+
- - ">"
|
65
66
|
- !ruby/object:Gem::Version
|
66
|
-
version:
|
67
|
+
version: 1.3.1
|
67
68
|
requirements: []
|
68
69
|
rubyforge_project: docsplit
|
69
|
-
rubygems_version: 2.
|
70
|
+
rubygems_version: 2.4.5
|
70
71
|
signing_key:
|
71
72
|
specification_version: 4
|
72
73
|
summary: Break Apart Documents into Images, Text, Pages and PDFs
|