docsplit 0.7.6 → 0.8.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6c4106dcd5d8d9f8f6a1915a99a438b293154e1e
4
- data.tar.gz: 90450ce6412bbedb022f4bc68ec7171f47b5d829
3
+ metadata.gz: 74386527dc285350f68f32adc8ff00d8386fceb6
4
+ data.tar.gz: 9af1f5b45c5f684393732ab7e25a8c30c0ab357a
5
5
  SHA512:
6
- metadata.gz: 1f6ccf476687ce1bf3a5559f07d0f7d8ebd2a80034b102b3058f538fb962a3b537b8e3eaeb245df27f14a4dc70716b69e34599bb50edf3e99e7b8a7b3f38d98d
7
- data.tar.gz: 912d974bc4ed17942d32a932232439cd2df6903d6d20e72af31e0e80a1c70fc5e58d4be63bd00f245c53be90dc93a815ffd41a25268072367a1a244a5cb59ec4
6
+ metadata.gz: 42fe34c591dec03a4392bb2b0c43b1c224062f1ffdb5608bf8671487c3441ac703871e1da48d05428a90f7abb11b191ecfa4eb5d2778b8e886827523250f1561
7
+ data.tar.gz: de4a3bc034585d2dbcc952fd5276c41d00710f8171671b818bc540032309c1e9c9a0914657cae08f478112366d86f1fc68f4df2f0ae085f132927b74fb1f45bd
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'docsplit'
3
- s.version = '0.7.6' # Keep version in sync with docsplit.rb
3
+ s.version = '0.8.0.alpha' # Keep version in sync with docsplit.rb
4
4
  s.date = '2014-11-17'
5
5
 
6
6
  s.homepage = "http://documentcloud.github.com/docsplit/"
@@ -5,7 +5,7 @@ require 'shellwords'
5
5
  # The Docsplit module delegates to the Java PDF extractors.
6
6
  module Docsplit
7
7
 
8
- VERSION = '0.7.6' # Keep in sync with gemspec.
8
VERSION = '0.8.0.alpha' # Keep in sync with gemspec.
9
9
 
10
10
  ESCAPE = lambda {|x| Shellwords.shellescape(x) }
11
11
 
@@ -56,7 +56,7 @@ module Docsplit
56
56
  def self.extract_images(pdfs, opts={})
57
57
  pdfs = ensure_pdfs(pdfs)
58
58
  opts[:pages] = normalize_value(opts[:pages]) if opts[:pages]
59
- ImageExtractor.new.extract(pdfs, opts)
59
+ PDFShaverExtractor.new.extract(pdfs, opts)
60
60
  end
61
61
 
62
62
  # Use JODConverter to extract the documents as PDFs.
@@ -101,6 +101,7 @@ module Docsplit
101
101
  end
102
102
 
103
103
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
104
+ require "#{Docsplit::ROOT}/lib/docsplit/pdfshaver_extractor"
104
105
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
105
106
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
106
107
  require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
@@ -0,0 +1,65 @@
1
+ require 'pdfshaver'
2
module Docsplit
  # Renders pages of PDF documents to image files through PDFShaver.
  #
  # Every requested page is rendered once per requested format and size, and
  # written to "<pdf basename>_<page number>.<format>" inside the directory
  # chosen by #directory_for.
  class PDFShaverExtractor

    # NOTE(review): MEMORY_ARGS is not referenced anywhere in this class;
    # presumably carried over from a shell-based extractor -- confirm before removing.
    MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
    DEFAULT_FORMAT  = :png
    DEFAULT_DENSITY = '150'

    # Render the requested pages of each PDF in +paths+ as images.
    #
    # paths   - Array (may be nested; it is flattened) of PDF file paths.
    # options - Hash of options, see #extract_options for the keys that are read.
    #
    # Returns the flattened list of paths.
    # Raises ExtractionFailed if PDFShaver::Document.new rejects a path with
    # an ArgumentError.
    def extract(paths, options={})
      extract_options(options)
      paths.flatten.each do |pdf_path|
        pdf      = open_document(pdf_path)
        # The base name only depends on the PDF, so compute it once per document.
        pdf_name = File.basename(pdf_path, File.extname(pdf_path))
        pdf.pages(extract_page_list(@pages)).each do |page|
          @formats.each do |format|
            @sizes.each do |size_string|
              directory   = directory_for(size_string)
              destination = File.join(directory, "#{pdf_name}_#{page.number}.#{format}")
              # The directory is handed straight to FileUtils, never to a shell,
              # so it must NOT be shell-escaped: escaping would create a
              # differently named directory whenever the path contains spaces
              # or other special characters, and the render below would then
              # target a directory that does not exist.
              FileUtils.mkdir_p(directory)

              dimensions = page.extract_dimensions_from_gm_geometry_string(size_string)
              page.render(destination, dimensions)
            end
          end
        end
      end
    end

    private

    # Open +pdf_path+ with PDFShaver, translating an ArgumentError raised by
    # PDFShaver::Document.new into ExtractionFailed while keeping its message.
    def open_document(pdf_path)
      PDFShaver::Document.new(pdf_path)
    rescue ArgumentError => e
      raise ExtractionFailed, e.message
    end

    # If there's only one size requested, generate the images directly into
    # the output directory. Multiple sizes each get a directory of their own.
    def directory_for(size)
      path = @sizes.length == 1 ? @output : File.join(@output, size)
      File.expand_path(path)
    end

    # Generate the expanded list of requested page numbers.
    #
    # pages - nil, or a comma-separated String of page numbers and
    #         "first-last" ranges, e.g. "1-3,5".
    #
    # Returns :all when +pages+ is nil, otherwise a sorted Array of unique
    # Integers.
    def extract_page_list(pages)
      return :all if pages.nil?
      pages.split(',').flat_map { |range|
        if range.include?('-')
          bounds = range.split('-')
          (bounds.first.to_i..bounds.last.to_i).to_a
        else
          range.to_i
        end
      }.uniq.sort
    end

    # Copy the supported options into instance variables, applying defaults.
    #
    # NOTE(review): @density and @rolling are stored but not read anywhere in
    # this class -- confirm whether they are still needed.
    def extract_options(options)
      @output  = options[:output]  || '.'
      @pages   = options[:pages]
      @density = options[:density] || DEFAULT_DENSITY
      @formats = [options[:format] || DEFAULT_FORMAT].flatten
      @sizes   = [options[:size]].flatten.compact
      # With no explicit size, run the size loop once with a nil size.
      @sizes   = [nil] if @sizes.empty?
      @rolling = !!options[:rolling]
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: docsplit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.6
4
+ version: 0.8.0.alpha
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Ashkenas
@@ -33,6 +33,7 @@ files:
33
33
  - lib/docsplit/info_extractor.rb
34
34
  - lib/docsplit/page_extractor.rb
35
35
  - lib/docsplit/pdf_extractor.rb
36
+ - lib/docsplit/pdfshaver_extractor.rb
36
37
  - lib/docsplit/text_cleaner.rb
37
38
  - lib/docsplit/text_extractor.rb
38
39
  - lib/docsplit/transparent_pdfs.rb
@@ -61,12 +62,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
61
62
  version: '0'
62
63
  required_rubygems_version: !ruby/object:Gem::Requirement
63
64
  requirements:
64
- - - ">="
65
+ - - ">"
65
66
  - !ruby/object:Gem::Version
66
- version: '0'
67
+ version: 1.3.1
67
68
  requirements: []
68
69
  rubyforge_project: docsplit
69
- rubygems_version: 2.2.2
70
+ rubygems_version: 2.4.5
70
71
  signing_key:
71
72
  specification_version: 4
72
73
  summary: Break Apart Documents into Images, Text, Pages and PDFs