pdf-reader-extract-images 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ae5af6eccd6fb11766f5a3ede1ccbc689ad2ce72c570f481a9a6635c3622df9d
4
+ data.tar.gz: 3d319014cc564e6ce848853ec81cff50f43039d303d194e6d60dc036e508dce5
5
+ SHA512:
6
+ metadata.gz: e5db5d318b64a06e3c2270e3efd23aa44aedea20e68cf32cf6a2c6bbac4861d9ab0e4d9ff85dba998e0f0f75c3810ac8e7cf7c3643bd9754ad925b7170d5a1f4
7
+ data.tar.gz: 550f9a3778412f308043d979365d712170533cb02c65afbdbce5ec33b0f5f6ebd4685c2f4830ab7a1b2b00cff71de5e4a16410d3ebb4458fe662a91c40aaaf72
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
data/.tool-versions ADDED
@@ -0,0 +1 @@
1
+ ruby 3.0.1
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in pdf-reader-extract-images.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Stefan Wienert
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # Pdf::Reader::Extract::Images
2
+
3
+ ExtractImages
4
+
5
+ Based upon the [Example from Pdf::Reader](https://github.com/yob/pdf-reader/blob/main/examples/extract_images.rb), battle hardened in our applicant tracking system with tens of thousands of PDFs.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'pdf-reader-extract-images'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle install
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install pdf-reader-extract-images
22
+
23
+ ## Usage
24
+
25
+ ```ruby
26
+ require 'pdf-reader-extract-images'
27
+
28
+ reader = PDF::Reader.new(pdf)
29
+ images = Pdf::Reader::ExtractImages.extract_all(reader)
30
+
31
+ # pass an image limit to ignore gigantic image-only pdfs
32
+ images = Pdf::Reader::ExtractImages.extract_all(reader, limit: 50)
33
+
34
+ # [
35
+ # {
36
+ # :filename => "1-1-Im1.jpg",
37
+ # :width => 1772,
38
+ # :height => 591,
39
+ # :blob => "....",
40
+ # }
41
+ # ]
42
+
43
+ # OR you can just scan a single Pdf::Reader Page
44
+
45
+ reader.pages.each do |page|
46
+ images = Pdf::Reader::ExtractImages.extract_from_pdf_page(page)
47
+ end
48
+ ```
49
+
50
+ ## Limitations
51
+
52
+ There are some PDFs which have tons of images. Make sure to limit the timeout of an extraction somehow.
53
+
54
+ Also, some PDFs produce hundreds of images. Make sure to limit further processing down the line.
55
+
56
+ Unfortunately, there is no public test suite. We have a private test suite that tests live PDFs which we cannot share. If you'd like to contribute problematic PDFs, feel free to open a PR!
57
+
58
+ ## License
59
+
60
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ task default: %i[]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "pdf/reader/extract/images"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,73 @@
1
+ module Pdf::Reader::ExtractImages
2
# Collects the images embedded in a PDF page. It walks the page's XObjects,
# descends into form XObjects, and turns every image stream into a hash
# (:filename, :blob, :width, :height) via the Tiff / Jpg / Raw converters.
class Extractor
  # @param limit [Numeric] maximum number of image slots this extractor fills
  #   before it stops scanning. Streams that fail to convert still occupy a
  #   slot, because the check runs before nil entries are compacted away.
  def initialize(limit = Float::INFINITY)
    @images = []
    @limit = limit
  end

  # Scans +page+ and returns every image collected by this instance so far.
  #
  # @param page [#xobjects] a page-like object; +#number+ is used for the
  #   filename prefix when available
  # @return [Array<Hash>] extracted images with failed conversions removed
  def page(page)
    process_page(page, 0)
    # compact! returns nil when nothing was removed, so return @images explicitly.
    @images.compact!
    @images
  end

  private

  # Memoized hash; not referenced anywhere else in this class.
  def complete_refs
    @complete_refs ||= {}
  end

  # Visits every XObject of +page+, extracting images and recursing into forms.
  #
  # @param page [#xobjects] page or form XObject to scan
  # @param count [Integer] number of images seen so far (used in filenames)
  # @return [Integer] the updated image counter
  def process_page(page, count)
    page.xobjects.each do |name, stream|
      # Stop once the limit is reached (>=, so no more than +limit+ slots are
      # filled) and always hand back the counter so a recursive caller never
      # receives nil.
      return count if @images.length >= @limit

      case stream.hash[:Subtype]
      when :Image
        count += 1
        number = page.respond_to?(:number) ? page.number : 1
        @images << extract_image_from_stream(stream, filename: "#{number}-#{count}-#{name}")
      when :Form
        # A nested form is only scanned when the parent exposes #objects.
        next unless page.respond_to?(:objects)

        count = process_page(PDF::Reader::FormXObject.new(page, stream), count)
      end
    end
    count
  end

  # Converts one image stream according to its :Filter entry.
  #
  # @param stream [#hash, #data] the image stream
  # @param filename [String] base filename without extension
  # @return [Hash, nil] image hash, or nil when the converter gave up
  def extract_image_from_stream(stream, filename:)
    case stream.hash[:Filter]
    when :CCITTFaxDecode
      ignoring_malformed { Tiff.new(stream).save("#{filename}.tif") }
    when :DCTDecode
      Jpg.new(stream).save("#{filename}.jpg")
    when [:FlateDecode, :DCTDecode], :FlateDecode
      unzipped = Zlib::Inflate.inflate(stream.data)
      if stream.hash[:ColorSpace]
        Raw.new(stream, unzipped).save("#{filename}.tif")
      else
        # No colour space declared: hand the inflated bytes back untouched.
        {
          blob: unzipped,
          width: stream.hash[:Width],
          height: stream.hash[:Height],
          filename: "#{filename}.jpg"
        }
      end
    else
      ignoring_malformed { Raw.new(stream).save("#{filename}.tif") }
    end
  end

  # Runs the block and maps PDF::Reader::MalformedPDFError to nil so that one
  # broken stream does not abort the whole page.
  def ignoring_malformed
    yield
  rescue PDF::Reader::MalformedPDFError
    nil
  end
end
73
+ end
@@ -0,0 +1,26 @@
1
+ require 'image_processing'
2
+
3
+ module Pdf::Reader::ExtractImages
4
# Wraps a DCTDecode image stream and returns its payload as a JPEG blob.
class Jpg
  attr_reader :stream

  # @param stream [#hash, #data] the image stream to export
  def initialize(stream)
    @stream = stream
  end

  # Builds the image hash for this stream.
  #
  # When the stream is DeviceCMYK and its data contains the string "Adobe",
  # the bytes are run through ImageMagick's negate before being returned;
  # otherwise the stream data is passed through unchanged.
  #
  # @param filename [String] name reported in the result (also used as the
  #   suffix of the scratch file handed to ImageMagick)
  # @return [Hash] :filename, :blob, :width, :height
  def save(filename)
    data = stream.data
    blob = negate?(data) ? negated(data, filename) : data
    { filename: filename, blob: blob, width: stream.hash[:Width], height: stream.hash[:Height] }
  end

  private

  # True for CMYK streams whose data carries an "Adobe" marker string.
  def negate?(data)
    stream.hash[:ColorSpace] == :DeviceCMYK && data['Adobe']
  end

  # Writes +data+ to a scratch file, negates it with ImageMagick and returns
  # the resulting bytes. Tempfile.create removes the scratch file when the
  # block exits, and the processed file is released explicitly, so neither
  # has to wait for garbage collection to be cleaned up.
  def negated(data, filename)
    Tempfile.create(['extract', filename]) do |scratch|
      scratch.binmode
      scratch.write(data)
      scratch.flush
      processed = ImageProcessing::MiniMagick.source(scratch.path).negate.call
      begin
        processed.read
      ensure
        processed.close! if processed.respond_to?(:close!)
      end
    end
  end
end
26
+ end
@@ -0,0 +1,118 @@
1
+ module Pdf::Reader::ExtractImages
2
# Wraps uncompressed image samples in a minimal, synthesized TIFF container.
#
# All multi-byte values are packed with explicit little-endian directives
# ("v"/"V") because the header announces "II" (Intel) byte order; the output
# therefore no longer depends on the byte order of the host.
class Raw
  TIFF_HEADER_SIZE = 10 # byte order (2) + magic (2) + IFD offset (4) + entry count (2)
  IFD_ENTRY_SIZE = 12   # tag (2) + type (2) + count (4) + value/offset (4)
  NEXT_IFD_SIZE = 4     # pointer to the next IFD, always 0 here

  attr_reader :stream

  # @param stream [#hash, #unfiltered_data] the image stream
  # @param data [String] decoded sample bytes; only the CMYK writer uses this
  #   value, the gray and RGB writers read +stream.unfiltered_data+
  def initialize(stream, data = stream.unfiltered_data)
    @stream = stream
    @data = data
  end

  # Picks a writer based on the stream's :ColorSpace entry. For array colour
  # spaces the first match among DeviceCMYK, DeviceRGB, DeviceGray wins.
  #
  # @param filename [String] name reported in the result
  # @return [Hash, nil] :filename, :blob, :width, :height, or nil (after a
  #   warning) when the colour space is not supported
  def save(filename)
    color_space = @stream.hash[:ColorSpace]
    case color_space
    when :DeviceCMYK then save_cmyk(filename)
    when :DeviceGray then save_gray(filename)
    when :DeviceRGB then save_rgb(filename)
    else
      if color_space.is_a?(Array)
        return save_cmyk(filename) if color_space.include?(:DeviceCMYK)
        return save_rgb(filename) if color_space.include?(:DeviceRGB)
        return save_gray(filename) if color_space.include?(:DeviceGray)
      end
      warn "unsupport color depth #{color_space} #{filename}"
    end
  end

  private

  def save_cmyk(filename)
    h = stream.hash[:Height]
    w = stream.hash[:Width]
    bpc = stream.hash[:BitsPerComponent]
    after_ifd = ifd_end(10)

    entries = [
      short_tag(256, 1, w),             # image width
      short_tag(257, 1, h),             # image height
      long_tag(258, 4, after_ifd),      # bits per sample (offset of 4 values)
      short_tag(259, 1, 1),             # compression
      short_tag(262, 1, 5),             # colorspace - separation
      long_tag(273, 1, after_ifd + 16), # data offset (after the 4 values)
      short_tag(277, 1, 4),             # samples per pixel
      long_tag(279, 1, @data.bytesize), # data byte size
      short_tag(284, 1, 1),             # planar config
      long_tag(332, 1, 1)               # inkset - CMYK
    ]
    tiff = assemble(entries, [bpc, bpc, bpc, bpc].pack("V4"), @data)
    { filename: filename, blob: tiff, width: w, height: h }
  end

  def save_gray(filename)
    h = stream.hash[:Height]
    w = stream.hash[:Width]
    # Honour the stream's bit depth like the CMYK/RGB writers do; 8 is kept
    # as the fallback when the stream does not declare one.
    bpc = stream.hash[:BitsPerComponent] || 8
    pixels = stream.unfiltered_data

    entries = [
      short_tag(256, 1, w),              # image width
      short_tag(257, 1, h),              # image height
      short_tag(258, 1, bpc),            # bits per sample
      short_tag(259, 1, 1),              # compression
      short_tag(262, 1, 1),              # colorspace - grayscale
      long_tag(273, 1, ifd_end(9)),      # data offset
      short_tag(277, 1, 1),              # samples per pixel
      long_tag(279, 1, pixels.bytesize), # data byte size
      short_tag(284, 1, 1)               # planar config
    ]
    tiff = assemble(entries, "".b, pixels)
    { filename: filename, blob: tiff, width: w, height: h }
  end

  def save_rgb(filename)
    h = stream.hash[:Height]
    w = stream.hash[:Width]
    bpc = stream.hash[:BitsPerComponent]
    pixels = stream.unfiltered_data
    after_ifd = ifd_end(8)

    entries = [
      short_tag(256, 1, w),             # image width
      short_tag(257, 1, h),             # image height
      long_tag(258, 3, after_ifd),      # bits per sample (offset of 3 values)
      short_tag(259, 1, 1),             # compression
      short_tag(262, 1, 2),             # colorspace - RGB
      long_tag(273, 1, after_ifd + 12), # data offset (after the 3 values)
      short_tag(277, 1, 3),             # samples per pixel
      long_tag(279, 1, pixels.bytesize) # data byte size
    ]
    tiff = assemble(entries, [bpc, bpc, bpc].pack("V3"), pixels)
    # Report the dimensions like the CMYK and gray writers do.
    { filename: filename, blob: tiff, width: w, height: h }
  end

  # Offset of the first byte after an IFD holding +tag_count+ entries.
  def ifd_end(tag_count)
    TIFF_HEADER_SIZE + (tag_count * IFD_ENTRY_SIZE) + NEXT_IFD_SIZE
  end

  # IFD entry of TIFF type 4 (LONG).
  def long_tag(tag, count, value)
    [tag, 4, count, value].pack("vvVV")
  end

  # IFD entry of TIFF type 3 (SHORT).
  def short_tag(tag, count, value)
    [tag, 3, count, value].pack("vvVV")
  end

  # Concatenates header, IFD entries, next-IFD pointer, out-of-line values
  # (+extra+) and the pixel data into one binary string.
  def assemble(entries, extra, pixels)
    # header = byte order "II", version magic 42, offset of directory, entry count
    tiff = [73, 73, 42, 8, entries.size].pack("CCvVv")
    entries.each { |entry| tiff << entry }
    tiff << [0].pack("V") # next IFD pointer
    tiff << extra
    tiff << pixels
  end
end
117
+ end
118
+
@@ -0,0 +1,50 @@
1
module Pdf::Reader::ExtractImages
  # Wraps a CCITTFaxDecode image stream in a minimal, synthesized TIFF
  # container without decoding the fax data itself.
  class Tiff
    attr_reader :stream

    # @param stream [#hash, #data] the image stream
    def initialize(stream)
      @stream = stream
    end

    # Exports the stream when its :DecodeParms carry a :K value <= 0.
    #
    # @param filename [String] name reported in the result
    # @return [Hash, nil] :filename, :blob, :width, :height, or nil (after a
    #   warning) when :K is missing or positive
    def save(filename)
      # A stream without :DecodeParms is treated like one without :K instead
      # of raising NoMethodError on nil.
      k = (stream.hash[:DecodeParms] || {})[:K]
      if !k.nil? && k <= 0
        save_group_four(filename)
      else
        warn "#{filename}: CCITT non-group 4/2D image."
      end
    end

    private

    # Group 4, 2D
    def save_group_four(filename)
      h = stream.hash[:Height]
      w = stream.hash[:Width]
      len = stream.hash[:Length]
      cols = stream.hash[:DecodeParms][:Columns]

      # Synthesize a TIFF header. Values are packed little-endian ("v"/"V")
      # to match the "II" byte-order mark written at the start.
      long_tag = ->(tag, value) { [tag, 4, 1, value].pack("vvVV") }
      short_tag = ->(tag, value) { [tag, 3, 1, value].pack("vvVV") }
      # header = byte order, version magic, offset of directory, directory count,
      # followed by a series of tags containing metadata: 259 is a magic number for
      # the compression type; 273 is the offset of the image data.
      tiff = [73, 73, 42, 8, 5].pack("CCvVv") \
             + short_tag.call(256, cols) \
             + short_tag.call(257, h) \
             + short_tag.call(259, 4) \
             + long_tag.call(273, (10 + (5 * 12) + 4)) \
             + long_tag.call(279, len) \
             + [0].pack("V") \
             + stream.data
      { filename: filename, blob: tiff, width: w, height: h }
    end
  end
end
49
+
50
+
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
# Top-level namespace of the pdf-reader-extract-images gem.
module Pdf
  module Reader
    # Namespace holding the image extraction classes and the gem version.
    module ExtractImages
      # Version string of the gem; the gemspec reads it as spec.version.
      VERSION = "0.1.0"
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require "zeitwerk"
2
+ loader = Zeitwerk::Loader.for_gem
3
+ loader.ignore(__FILE__)
4
+ loader.setup
5
+
6
+ module Pdf::Reader
7
# Public entry points for extracting embedded images.
module ExtractImages
  # Extracts the images of a single page.
  #
  # @param page [Object] page handed to Extractor#page
  # @param limit [Numeric] passed to Extractor.new (unbounded by default)
  # @return [Array<Hash>] whatever Extractor#page returns for the page
  def self.extract_from_pdf_page(page, limit: Float::INFINITY)
    # Extractor is a class, so it has to be instantiated with .new(limit);
    # calling Extractor(limit) would look for a method named Extractor.
    Extractor.new(limit).page(page)
  end

  # Extracts the images of every page of a reader.
  #
  # A fresh Extractor is created for each page, so +limit+ applies to each
  # page separately rather than to the document as a whole.
  #
  # @param pdf_reader [#pages] object whose pages are scanned in order
  # @param limit [Numeric] passed to Extractor.new for every page
  # @return [Array<Hash>] images of all pages, flattened, without nil entries
  def self.extract_all(pdf_reader, limit: Float::INFINITY)
    pdf_reader.pages.flat_map { |page| Extractor.new(limit).page(page) }.compact
  end
end
16
+ end
17
+
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/pdf/reader/extract_images/version"
4
+
5
# Gem specification for pdf-reader-extract-images.
Gem::Specification.new do |s|
  # Identity and authorship
  s.name    = "pdf-reader-extract-images"
  s.version = Pdf::Reader::ExtractImages::VERSION
  s.authors = ["Stefan Wienert"]
  s.email   = ["info@stefanwienert.de"]

  # Description and project links
  s.summary     = "Extract all images with format conversions based upon Pdf::Reader library"
  s.description = s.summary
  s.homepage    = "https://github.com/pludoni/pdf-reader-extract-images"
  s.license     = "MIT"
  s.required_ruby_version = Gem::Requirement.new(">= 2.4.0")

  s.metadata["homepage_uri"] = s.homepage

  # Package every file tracked by git (queried from the gemspec's directory),
  # leaving out anything under test/, spec/ or features/.
  s.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match(%r{\A(?:test|spec|features)/}) }
  end
  s.bindir        = "exe"
  s.executables   = s.files.grep(%r{\Aexe/}).map { |path| File.basename(path) }
  s.require_paths = ["lib"]

  # Runtime dependencies
  s.add_dependency "pdf-reader", ">= 2.5.0"
  s.add_dependency "image_processing"
  s.add_dependency "zeitwerk"
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf-reader-extract-images
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Stefan Wienert
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-08-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pdf-reader
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 2.5.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 2.5.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: image_processing
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: zeitwerk
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Extract all images with format conversions based upon Pdf::Reader library
56
+ email:
57
+ - info@stefanwienert.de
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".tool-versions"
64
+ - Gemfile
65
+ - LICENSE.txt
66
+ - README.md
67
+ - Rakefile
68
+ - bin/console
69
+ - bin/setup
70
+ - lib/pdf-reader-extract-images.rb
71
+ - lib/pdf/reader/extract_images/extractor.rb
72
+ - lib/pdf/reader/extract_images/jpg.rb
73
+ - lib/pdf/reader/extract_images/raw.rb
74
+ - lib/pdf/reader/extract_images/tiff.rb
75
+ - lib/pdf/reader/extract_images/version.rb
76
+ - pdf-reader-extract-images.gemspec
77
+ homepage: https://github.com/pludoni/pdf-reader-extract-images
78
+ licenses:
79
+ - MIT
80
+ metadata:
81
+ homepage_uri: https://github.com/pludoni/pdf-reader-extract-images
82
+ post_install_message:
83
+ rdoc_options: []
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: 2.4.0
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ requirements: []
97
+ rubygems_version: 3.2.15
98
+ signing_key:
99
+ specification_version: 4
100
+ summary: Extract all images with format conversions based upon Pdf::Reader library
101
+ test_files: []