pdf-reader-extract-images 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: ae5af6eccd6fb11766f5a3ede1ccbc689ad2ce72c570f481a9a6635c3622df9d
4
+ data.tar.gz: 3d319014cc564e6ce848853ec81cff50f43039d303d194e6d60dc036e508dce5
5
+ SHA512:
6
+ metadata.gz: e5db5d318b64a06e3c2270e3efd23aa44aedea20e68cf32cf6a2c6bbac4861d9ab0e4d9ff85dba998e0f0f75c3810ac8e7cf7c3643bd9754ad925b7170d5a1f4
7
+ data.tar.gz: 550f9a3778412f308043d979365d712170533cb02c65afbdbce5ec33b0f5f6ebd4685c2f4830ab7a1b2b00cff71de5e4a16410d3ebb4458fe662a91c40aaaf72
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
data/.tool-versions ADDED
@@ -0,0 +1 @@
1
+ ruby 3.0.1
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in pdf-reader-extract-images.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Stefan Wienert
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # Pdf::Reader::ExtractImages
2
+
3
+ ExtractImages
4
+
5
+ Based upon the [Example from Pdf::Reader](https://github.com/yob/pdf-reader/blob/main/examples/extract_images.rb), battle hardened in our applicant tracking system with tens of thousands of PDFs.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'pdf-reader-extract-images'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle install
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install pdf-reader-extract-images
22
+
23
+ ## Usage
24
+
25
+ ```ruby
26
+ require 'pdf-reader-extract-images'
27
+
28
+ reader = PDF::Reader.new(pdf)
29
+ images = Pdf::Reader::ExtractImages.extract_all(reader)
30
+
31
+ # pass an image limit to ignore gigantic image-only pdfs
32
+ images = Pdf::Reader::ExtractImages.extract_all(reader, limit: 50)
33
+
34
+ # [
35
+ # {
36
+ # :filename => "1-1-Im1.jpg",
37
+ # :width => 1772,
38
+ # :height => 591,
39
+ # :blob => "....",
40
+ # }
41
+ # ]
42
+
43
+ # OR you can just scan a single Pdf::Reader Page
44
+
45
+ reader.pages.each do |page|
46
+ images = Pdf::Reader::ExtractImages.extract_from_pdf_page(page)
47
+ end
48
+ ```
49
+
50
+ ## Limitations
51
+
52
+ Some PDFs contain a very large number of images. Make sure to enforce a timeout around the extraction.
53
+
54
+ Also, some PDFs produce hundreds of images. Make sure to limit further processing down the line.
55
+
56
+ Unfortunately, there is no public test suite. We have a private test suite that tests live PDFs which we cannot share. If you'd like to contribute problematic PDFs, feel free to open a PR!
57
+
58
+ ## License
59
+
60
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ task default: %i[]
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Starts an IRB session with the gem loaded, for interactive experimentation.

require "bundler/setup"
# The gem's entry point is lib/pdf-reader-extract-images.rb; the previously
# required path "pdf/reader/extract/images" is not shipped with the gem and
# raised LoadError.
require "pdf-reader-extract-images"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,73 @@
1
+ module Pdf::Reader::ExtractImages
2
# Walks the XObjects of a page (recursing into form XObjects) and turns every
# image stream it finds into a hash such as
# { filename: "1-1-Im1.jpg", blob: "...", width: 10, height: 10 }.
class Extractor
  # @param limit [Numeric] maximum number of images this extractor collects
  def initialize(limit = Float::INFINITY)
    @images = []
    @limit = limit
  end

  # Extracts the images reachable from +page+.
  #
  # @param page [#xobjects] a page-like object; its +number+ (when it responds
  #   to it) is used as the filename prefix, otherwise 1 is used
  # @return [Array<Hash>] the images collected so far by this extractor, never
  #   more than the configured limit and never containing nil
  def page(page)
    process_page(page, 0)
    @images
  end

  private

  # True once the configured number of images has been collected.
  def limit_reached?
    @images.length >= @limit
  end

  # Visits every XObject of +page+ and returns the running image counter.
  #
  # The counter is always returned, also when the limit stops the walk early,
  # so that a nested form can never hand nil back to its caller's counter.
  def process_page(page, count)
    page.xobjects.each do |name, stream|
      break if limit_reached?

      case stream.hash[:Subtype]
      when :Image
        count += 1
        number = page.respond_to?(:number) ? page.number : 1
        image = extract_image_from_stream(stream, filename: "#{number}-#{count}-#{name}")
        # Failed conversions yield nil; skipping them here keeps them from
        # counting against the limit.
        @images << image if image
      when :Form
        # Forms are only expanded when the current page-like object responds
        # to #objects.
        if page.respond_to?(:objects)
          count = process_page(PDF::Reader::FormXObject.new(page, stream), count)
        end
      end
    end
    count
  end

  # Picks the converter matching the stream's :Filter entry.
  #
  # @return [Hash, nil] the image hash, or nil when the stream could not be
  #   converted
  def extract_image_from_stream(stream, filename:)
    case stream.hash[:Filter]
    when :CCITTFaxDecode
      skip_malformed { Tiff.new(stream).save("#{filename}.tif") }
    when :DCTDecode
      Jpg.new(stream).save("#{filename}.jpg")
    when [:FlateDecode, :DCTDecode], :FlateDecode
      extract_deflated(stream, filename)
    else
      skip_malformed { Raw.new(stream).save("#{filename}.tif") }
    end
  end

  # Inflates the stream; with a :ColorSpace entry the samples are wrapped in a
  # TIFF, otherwise the inflated bytes are returned as-is under a .jpg name.
  def extract_deflated(stream, filename)
    unzipped = Zlib::Inflate.inflate(stream.data)
    return Raw.new(stream, unzipped).save("#{filename}.tif") if stream.hash[:ColorSpace]

    {
      blob: unzipped,
      width: stream.hash[:Width],
      height: stream.hash[:Height],
      filename: "#{filename}.jpg"
    }
  end

  # Runs the block and maps PDF::Reader::MalformedPDFError to nil so that a
  # single broken stream does not abort the whole extraction.
  def skip_malformed
    yield
  rescue PDF::Reader::MalformedPDFError
    nil
  end
end
73
+ end
@@ -0,0 +1,26 @@
1
+ require 'image_processing'
2
+
3
+ module Pdf::Reader::ExtractImages
4
# Returns the bytes of a DCT (JPEG) encoded image stream, re-encoding them
# with negated colours when the stream is DeviceCMYK and contains "Adobe".
class Jpg
  attr_reader :stream

  # @param stream [#hash, #data] the image stream; +hash+ holds the PDF
  #   dictionary entries (:Width, :Height, :ColorSpace), +data+ the raw bytes
  def initialize(stream)
    @stream = stream
  end

  # @param filename [String] name reported in the result (nothing is written
  #   to that path)
  # @return [Hash] { filename:, blob:, width:, height: }
  def save(filename)
    data = stream.data
    blob = negate?(data) ? negated(data, filename) : data
    { filename: filename, blob: blob, width: stream.hash[:Width], height: stream.hash[:Height] }
  end

  private

  # NOTE(review): presumably Adobe-written CMYK JPEGs store inverted channel
  # values, hence the negation - confirm against the PDFs this was tuned on.
  def negate?(data)
    stream.hash[:ColorSpace] == :DeviceCMYK && data.include?("Adobe")
  end

  # Round-trips the JPEG through ImageMagick's negate and returns the new bytes.
  def negated(data, filename)
    # Loaded lazily: only this code path needs ImageMagick, and the MiniMagick
    # backend is not loaded by a plain `require "image_processing"`.
    require "tempfile"
    require "image_processing/mini_magick"

    # Tempfile.create closes and unlinks the source file when the block ends.
    Tempfile.create(["extract", filename]) do |source|
      source.binmode
      source.write(data)
      source.flush
      result = ImageProcessing::MiniMagick.source(source.path).negate.call
      begin
        result.read
      ensure
        # #call hands back a Tempfile; release it instead of waiting for GC.
        result.close!
      end
    end
  end
end
26
+ end
@@ -0,0 +1,118 @@
1
+ module Pdf::Reader::ExtractImages
2
# Wraps the decoded samples of an image stream in a synthesized, uncompressed
# TIFF so that they can be opened by ordinary image tools.
class Raw
  attr_reader :stream

  # Order in which colour spaces are probed when :ColorSpace is an array.
  ARRAY_COLOR_SPACE_ORDER = %i[DeviceCMYK DeviceRGB DeviceGray].freeze

  # Sizes (in bytes) of the fixed parts written before the sample data.
  TIFF_HEADER_SIZE = 10
  TIFF_TAG_SIZE = 12
  NEXT_IFD_POINTER_SIZE = 4

  # @param stream [#hash, #unfiltered_data] the image stream
  # @param data [String] already decoded samples; only the CMYK writer uses
  #   this value, the gray and RGB writers read +stream.unfiltered_data+
  def initialize(stream, data = stream.unfiltered_data)
    @stream = stream
    @data = data
  end

  # Builds the TIFF matching the stream's colour space.
  #
  # @param filename [String] name reported in the result
  # @return [Hash, nil] { filename:, blob:, width:, height: }, or nil (after a
  #   warning) when the colour space is not supported
  def save(filename)
    color_space = stream.hash[:ColorSpace]
    family =
      if color_space.is_a?(Array)
        ARRAY_COLOR_SPACE_ORDER.find { |name| color_space.include?(name) }
      else
        color_space
      end

    case family
    when :DeviceCMYK then save_cmyk(filename)
    when :DeviceGray then save_gray(filename)
    when :DeviceRGB then save_rgb(filename)
    else
      warn "unsupport color depth #{color_space} #{filename}"
    end
  end

  private

  # One 12-byte directory entry holding a LONG (type 4) value.
  def long_tag(tag, count, value)
    [tag, 4, count, value].pack("ssII")
  end

  # One 12-byte directory entry holding a SHORT (type 3) value.
  def short_tag(tag, count, value)
    [tag, 3, count, value].pack("ssII")
  end

  # Assembles header, directory, optional per-sample bit depths and samples.
  #
  # The block receives the offset right after the directory (where
  # +sample_bits+ is stored) and the offset of the sample data, and must
  # return the +tag_count+ directory entries.
  def build_tiff(tag_count, pixels, sample_bits = "")
    extras_offset = TIFF_HEADER_SIZE + (tag_count * TIFF_TAG_SIZE) + NEXT_IFD_POINTER_SIZE
    pixels_offset = extras_offset + sample_bits.bytesize

    # header = byte order, version magic, offset of directory, directory count
    tiff = [73, 73, 42, 8, tag_count].pack("ccsIs")
    yield(extras_offset, pixels_offset).each { |entry| tiff << entry }
    tiff << [0].pack("I") # next IFD pointer
    tiff << sample_bits
    tiff << pixels
  end

  def save_cmyk(filename)
    h = stream.hash[:Height]
    w = stream.hash[:Width]
    bpc = stream.hash[:BitsPerComponent]

    tiff = build_tiff(10, @data, [bpc, bpc, bpc, bpc].pack("IIII")) do |bits_offset, data_offset|
      [
        short_tag(256, 1, w),               # image width
        short_tag(257, 1, h),               # image height
        long_tag(258, 4, bits_offset),      # bits per pixel
        short_tag(259, 1, 1),               # compression
        short_tag(262, 1, 5),               # colorspace - separation
        long_tag(273, 1, data_offset),      # data offset
        short_tag(277, 1, 4),               # samples per pixel
        long_tag(279, 1, @data.bytesize),   # data byte size
        short_tag(284, 1, 1),               # planar config
        long_tag(332, 1, 1)                 # inkset - CMYK
      ]
    end
    { filename: filename, blob: tiff, width: w, height: h }
  end

  def save_gray(filename)
    h = stream.hash[:Height]
    w = stream.hash[:Width]
    # Declare the stream's real bit depth; 8 is kept as the fallback when the
    # entry is missing.
    bpc = stream.hash[:BitsPerComponent] || 8
    pixels = stream.unfiltered_data # decoded once, used for size and payload

    tiff = build_tiff(9, pixels) do |_bits_offset, data_offset|
      [
        short_tag(256, 1, w),               # image width
        short_tag(257, 1, h),               # image height
        short_tag(258, 1, bpc),             # bits per pixel
        short_tag(259, 1, 1),               # compression
        short_tag(262, 1, 1),               # colorspace - grayscale
        long_tag(273, 1, data_offset),      # data offset
        short_tag(277, 1, 1),               # samples per pixel
        long_tag(279, 1, pixels.bytesize),  # data byte size
        short_tag(284, 1, 1)                # planar config
      ]
    end
    { filename: filename, blob: tiff, width: w, height: h }
  end

  def save_rgb(filename)
    h = stream.hash[:Height]
    w = stream.hash[:Width]
    bpc = stream.hash[:BitsPerComponent]
    pixels = stream.unfiltered_data # decoded once, used for size and payload

    tiff = build_tiff(8, pixels, [bpc, bpc, bpc].pack("III")) do |bits_offset, data_offset|
      [
        short_tag(256, 1, w),               # image width
        short_tag(257, 1, h),               # image height
        long_tag(258, 3, bits_offset),      # bits per pixel
        short_tag(259, 1, 1),               # compression
        short_tag(262, 1, 2),               # colorspace - RGB
        long_tag(273, 1, data_offset),      # data offset
        short_tag(277, 1, 3),               # samples per pixel
        long_tag(279, 1, pixels.bytesize)   # data byte size
      ]
    end
    # width/height are reported like in the CMYK and gray results.
    { filename: filename, blob: tiff, width: w, height: h }
  end
end
117
+ end
118
+
@@ -0,0 +1,50 @@
1
# Namespaces are opened one by one (as in this gem's version.rb) so that the
# file also loads when Pdf::Reader has not been defined yet.
module Pdf
  module Reader
    module ExtractImages
      # Wraps CCITT fax encoded image data in a synthesized TIFF container
      # without re-encoding it.
      class Tiff
        attr_reader :stream

        # @param stream [#hash, #data] the image stream; +hash+ holds the PDF
        #   dictionary entries, +data+ the still-encoded bytes
        def initialize(stream)
          @stream = stream
        end

        # @param filename [String] name reported in the result
        # @return [Hash, nil] { filename:, blob:, width:, height: } when :K is
        #   present and <= 0; otherwise nil after a warning
        def save(filename)
          k = decode_parms[:K]
          if !k.nil? && k <= 0
            save_group_four(filename)
          else
            warn "#{filename}: CCITT non-group 4/2D image."
          end
        end

        private

        # The :DecodeParms dictionary, or an empty hash when the entry is
        # missing or not a dictionary, so lookups never hit nil.
        def decode_parms
          parms = stream.hash[:DecodeParms]
          parms.is_a?(Hash) ? parms : {}
        end

        def long_tag(tag, value)
          [tag, 4, 1, value].pack("ssII")
        end

        def short_tag(tag, value)
          [tag, 3, 1, value].pack("ssII")
        end

        # Group 4, 2D
        def save_group_four(filename)
          h = stream.hash[:Height]
          w = stream.hash[:Width]
          # NOTE(review): falls back to the image width when :Columns is not
          # given - confirm this matches the encoder's column count.
          cols = decode_parms[:Columns] || w
          data = stream.data

          # Synthesize a TIFF header
          # header = byte order, version magic, offset of directory, directory count,
          # followed by a series of tags containing metadata: 259 is a magic number for
          # the compression type; 273 is the offset of the image data.
          tiff = [73, 73, 42, 8, 5].pack("ccsIs")
          tiff << short_tag(256, cols)
          tiff << short_tag(257, h)
          tiff << short_tag(259, 4)
          tiff << long_tag(273, (10 + (5 * 12) + 4))
          # The byte count describes exactly the bytes appended below rather
          # than trusting the dictionary's :Length entry.
          tiff << long_tag(279, data.bytesize)
          tiff << [0].pack("I")
          tiff << data
          { filename: filename, blob: tiff, width: w, height: h }
        end
      end
    end
  end
end
+ end
49
+
50
+
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdf
4
+ module Reader
5
+ module ExtractImages
6
+ VERSION = "0.1.0"
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,17 @@
1
+ require "zeitwerk"
2
+ loader = Zeitwerk::Loader.for_gem
3
+ loader.ignore(__FILE__)
4
+ loader.setup
5
+
6
module Pdf
  module Reader
    # Public entry points of the gem.
    module ExtractImages
      # Extracts the images of a single page.
      #
      # @param page [#xobjects] the page to scan
      # @param limit [Numeric] handed to the Extractor created for this page
      # @return [Array<Hash>] whatever Extractor#page returns for the page
      def self.extract_from_pdf_page(page, limit: Float::INFINITY)
        # Was `Extractor(limit).new`, which calls a nonexistent method named
        # Extractor and raised NoMethodError.
        Extractor.new(limit).page(page)
      end

      # Extracts the images of every page of +pdf_reader+.
      #
      # A fresh Extractor is created per page, so +limit+ applies to each page
      # separately, not to the document as a whole.
      #
      # @param pdf_reader [#pages] the opened document
      # @param limit [Numeric] handed to each page's Extractor
      # @return [Array<Hash>] the images of all pages, in page order
      def self.extract_all(pdf_reader, limit: Float::INFINITY)
        pdf_reader.pages.flat_map { |page| Extractor.new(limit).page(page) }.compact
      end
    end
  end
end
17
+
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/pdf/reader/extract_images/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "pdf-reader-extract-images"
7
+ spec.version = Pdf::Reader::ExtractImages::VERSION
8
+ spec.authors = ["Stefan Wienert"]
9
+ spec.email = ["info@stefanwienert.de"]
10
+
11
+ spec.summary = "Extract all images with format conversions based upon Pdf::Reader library"
12
+ spec.description = spec.summary
13
+ spec.homepage = "https://github.com/pludoni/pdf-reader-extract-images"
14
+ spec.license = "MIT"
15
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.4.0")
16
+
17
+ spec.metadata["homepage_uri"] = spec.homepage
18
+
19
+ # Specify which files should be added to the gem when it is released.
20
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
21
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
22
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
23
+ end
24
+ spec.bindir = "exe"
25
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
26
+ spec.require_paths = ["lib"]
27
+
28
+ spec.add_dependency "pdf-reader", ">= 2.5.0"
29
+ spec.add_dependency "image_processing"
30
+ spec.add_dependency "zeitwerk"
31
+ end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf-reader-extract-images
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Stefan Wienert
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-08-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: pdf-reader
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 2.5.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 2.5.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: image_processing
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: zeitwerk
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Extract all images with format conversions based upon Pdf::Reader library
56
+ email:
57
+ - info@stefanwienert.de
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".tool-versions"
64
+ - Gemfile
65
+ - LICENSE.txt
66
+ - README.md
67
+ - Rakefile
68
+ - bin/console
69
+ - bin/setup
70
+ - lib/pdf-reader-extract-images.rb
71
+ - lib/pdf/reader/extract_images/extractor.rb
72
+ - lib/pdf/reader/extract_images/jpg.rb
73
+ - lib/pdf/reader/extract_images/raw.rb
74
+ - lib/pdf/reader/extract_images/tiff.rb
75
+ - lib/pdf/reader/extract_images/version.rb
76
+ - pdf-reader-extract-images.gemspec
77
+ homepage: https://github.com/pludoni/pdf-reader-extract-images
78
+ licenses:
79
+ - MIT
80
+ metadata:
81
+ homepage_uri: https://github.com/pludoni/pdf-reader-extract-images
82
+ post_install_message:
83
+ rdoc_options: []
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: 2.4.0
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ requirements: []
97
+ rubygems_version: 3.2.15
98
+ signing_key:
99
+ specification_version: 4
100
+ summary: Extract all images with format conversions based upon Pdf::Reader library
101
+ test_files: []