docsplit 0.7.6 → 0.8.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6c4106dcd5d8d9f8f6a1915a99a438b293154e1e
4
- data.tar.gz: 90450ce6412bbedb022f4bc68ec7171f47b5d829
3
+ metadata.gz: 74386527dc285350f68f32adc8ff00d8386fceb6
4
+ data.tar.gz: 9af1f5b45c5f684393732ab7e25a8c30c0ab357a
5
5
  SHA512:
6
- metadata.gz: 1f6ccf476687ce1bf3a5559f07d0f7d8ebd2a80034b102b3058f538fb962a3b537b8e3eaeb245df27f14a4dc70716b69e34599bb50edf3e99e7b8a7b3f38d98d
7
- data.tar.gz: 912d974bc4ed17942d32a932232439cd2df6903d6d20e72af31e0e80a1c70fc5e58d4be63bd00f245c53be90dc93a815ffd41a25268072367a1a244a5cb59ec4
6
+ metadata.gz: 42fe34c591dec03a4392bb2b0c43b1c224062f1ffdb5608bf8671487c3441ac703871e1da48d05428a90f7abb11b191ecfa4eb5d2778b8e886827523250f1561
7
+ data.tar.gz: de4a3bc034585d2dbcc952fd5276c41d00710f8171671b818bc540032309c1e9c9a0914657cae08f478112366d86f1fc68f4df2f0ae085f132927b74fb1f45bd
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.7.6' # Keep version in sync with docsplit.rb
3
+ s.version = '0.8.0.alpha' # Keep version in sync with docsplit.rb
4
4
  s.date = '2014-11-17'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.6' # Keep in sync with gemspec.
8
+ VERSION = '0.8.0' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -56,7 +56,7 @@ module Docsplit
56
56
  def self.extract_images(pdfs, opts={})
57
57
  pdfs = ensure_pdfs(pdfs)
58
58
  opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
59
- ImageExtractor.new.extract(pdfs, opts)
59
+ PDFShaverExtractor.new.extract(pdfs, opts)
60
60
  end
61
61
 
62
62
  # Use JODCConverter to extract the documents as PDFs.
@@ -101,6 +101,7 @@ module Docsplit
101
101
  end
102
102
 
103
103
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
104
+ require "#{Docsplit::ROOT}/lib/docsplit/pdfshaver_extractor"
104
105
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
105
106
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
106
107
  require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
@@ -0,0 +1,65 @@
1
+ require 'pdfshaver'
2
module Docsplit
  # Renders pages of PDF documents to image files by delegating to the
  # PDFShaver library (PDFShaver::Document / page#render).
  class PDFShaverExtractor

    # NOTE(review): MEMORY_ARGS is not referenced anywhere in this class; it is
    # kept so that any outside reference to the constant continues to resolve.
    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Render the requested pages of every PDF in +paths+ to image files.
    #
    # paths   - A (possibly nested) Array of PDF file paths.
    # options - Hash of extraction options:
    #           :output - directory to write into (default: '.')
    #           :pages  - comma separated page list such as "1-3,5"; when
    #                     absent, :all is passed to PDFShaver
    #           :format - one format or an Array of formats (default: :png)
    #           :size   - one geometry string or an Array of them; each is
    #                     handed to the page's
    #                     extract_dimensions_from_gm_geometry_string
    #
    # One file named "<pdf basename>_<page number>.<format>" is rendered for
    # every page/format/size combination.
    #
    # Returns the flattened list of paths.
    # Raises ExtractionFailed when PDFShaver rejects a path with ArgumentError.
    def extract(paths, options={})
      extract_options(options)
      # The page selection is identical for every document, so build it once.
      page_list = extract_page_list(@pages)

      paths.flatten.each do |pdf_path|
        begin
          pdf = PDFShaver::Document.new(pdf_path)
        rescue ArgumentError => e
          # Preserve the underlying reason instead of raising a bare error.
          raise ExtractionFailed, "Unable to open #{pdf_path}: #{e.message}"
        end

        pdf_name = File.basename(pdf_path, File.extname(pdf_path))

        pdf.pages(page_list).each do |page|
          @formats.each do |format|
            @sizes.each do |size_string|
              directory   = directory_for(size_string)
              destination = File.join(directory, "#{pdf_name}_#{page.number}.#{format}")

              # FileUtils works on the literal path and never invokes a shell.
              # Shell-escaping here would create a differently named directory
              # than the one +destination+ points into.
              FileUtils.mkdir_p directory

              dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
              page.render(destination, dimensions)
            end
          end
        end
      end
    end

    private

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    # Returns the absolute path of the directory for +size+.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the expanded list of requested page numbers.
    #
    # pages - nil, or a String such as "1-3,5".
    #
    # Returns :all when +pages+ is nil, otherwise a sorted Array of unique
    # Integers. Tokens are converted with to_i, so non-numeric text becomes 0,
    # and a descending range such as "5-3" contributes no pages.
    def extract_page_list(pages)
      return :all if pages.nil?
      pages.split(',').flat_map { |token|
        if token.include?('-')
          bounds = token.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          token.to_i
        end
      }.uniq.sort
    end

    # Copy the recognised options into instance variables, applying defaults.
    # @density and @rolling are stored but not read elsewhere in this class.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      # A single nil entry keeps the size loop running exactly once when no
      # size was requested.
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.6
4
+ version: 0.8.0.alpha
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -33,6 +33,7 @@ files:
33
33
  - lib/docsplit/info_extractor.rb
34
34
  - lib/docsplit/page_extractor.rb
35
35
  - lib/docsplit/pdf_extractor.rb
36
+ - lib/docsplit/pdfshaver_extractor.rb
36
37
  - lib/docsplit/text_cleaner.rb
37
38
  - lib/docsplit/text_extractor.rb
38
39
  - lib/docsplit/transparent_pdfs.rb
@@ -61,12 +62,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
61
62
  version: '0'
62
63
  required_rubygems_version: !ruby/object:Gem::Requirement
63
64
  requirements:
64
- - - ">="
65
+ - - ">"
65
66
  - !ruby/object:Gem::Version
66
- version: '0'
67
+ version: 1.3.1
67
68
  requirements: []
68
69
  rubyforge_project: docsplit
69
- rubygems_version: 2.2.2
70
+ rubygems_version: 2.4.5
70
71
  signing_key:
71
72
  specification_version: 4
72
73
  summary: Break Apart Documents into Images, Text, Pages and PDFs