pdf-reader 2.4.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +31 -0
- data/README.md +17 -2
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +62 -21
- data/lib/pdf/reader/encoding.rb +1 -1
- data/lib/pdf/reader/error.rb +3 -3
- data/lib/pdf/reader/filter/ascii85.rb +5 -1
- data/lib/pdf/reader/filter/depredict.rb +3 -3
- data/lib/pdf/reader/filter/flate.rb +28 -16
- data/lib/pdf/reader/font.rb +3 -1
- data/lib/pdf/reader/glyph_hash.rb +15 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/object_hash.rb +3 -1
- data/lib/pdf/reader/orientation_detector.rb +2 -2
- data/lib/pdf/reader/page.rb +28 -0
- data/lib/pdf/reader/page_layout.rb +19 -13
- data/lib/pdf/reader/page_state.rb +7 -5
- data/lib/pdf/reader/page_text_receiver.rb +22 -1
- data/lib/pdf/reader/parser.rb +8 -6
- data/lib/pdf/reader/width_calculator/built_in.rb +7 -15
- data/lib/pdf/reader/xref.rb +6 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +11 -0
- metadata +17 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ccc4d14f5820ca798f6eafa1c0978207759ec1668c6f6307acb7cd43bcd0626e
|
4
|
+
data.tar.gz: 466bfe0a91f57463a56d9697ccd2529f981c6917e4ed578b4103f2bc87065522
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45d6c16b3d9ed029e6eb5a45cc64aa95e7ada2950e052053cbe0b6f5aae632f824a86f0505a5cee660abd1cd896177a0637a2f2f5a3f3633e829e8d46fb59817
|
7
|
+
data.tar.gz: e3e566344bd5560387577597dea20b2f7da40aed2a7fa8b8d074c0742486db59d7e349f6c38c91c8dcd9b0a8cf2aa4c19a00d0ee097003449504b3f06f18ca3c
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,34 @@
|
|
1
|
+
v2.6.0 (12th November 2021)
|
2
|
+
- Text extraction improvements
|
3
|
+
- Improved text layout on pages with a variery of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
4
|
+
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
5
|
+
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
6
|
+
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
7
|
+
- Performance improvements
|
8
|
+
- Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
|
9
|
+
- Optional dependency on ascii81_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
|
10
|
+
- Successfully parse more files
|
11
|
+
- Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
|
12
|
+
- Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
|
13
|
+
- Increase the amount of junk bytes we detect and skip at the end of a file (382)
|
14
|
+
- Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
|
15
|
+
- Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
|
16
|
+
- Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
|
17
|
+
|
18
|
+
v2.5.0 (6th June 2021)
|
19
|
+
- bump minimum ruby version to 2.0
|
20
|
+
- Correctly handle trascoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
21
|
+
- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
|
22
|
+
- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
|
23
|
+
- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
|
24
|
+
|
25
|
+
v2.4.2 (28th January 2021)
|
26
|
+
- relax ASCII85 dependency to allow 1.x
|
27
|
+
- improved support for decompressing objects with slightly malformed zlib data
|
28
|
+
|
29
|
+
v.2.4.1 (24th September 2020)
|
30
|
+
- Re-vendor font metrics from Adobe to clarify their license
|
31
|
+
|
1
32
|
v2.4.0 (21st November 2019)
|
2
33
|
- Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
|
3
34
|
thousands of characters is still slower than it was in 2.2.1, but it might tolerable
|
data/README.md
CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
|
|
166
166
|
The easiest way to explain how this works in practice is to show some examples.
|
167
167
|
Check out the examples/ directory for a few files.
|
168
168
|
|
169
|
+
# Alternate Decoder
|
170
|
+
|
171
|
+
For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
|
172
|
+
|
173
|
+
First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
require "pdf-reader"
|
177
|
+
require "ascii85_native"
|
178
|
+
```
|
179
|
+
|
180
|
+
Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
|
181
|
+
|
169
182
|
# Known Limitations
|
170
183
|
|
171
184
|
Occasionally some text cannot be extracted properly due to the way it has been
|
@@ -176,8 +189,10 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
176
189
|
|
177
190
|
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
191
|
|
179
|
-
* PDF Specification:
|
192
|
+
* PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
|
193
|
+
|
194
|
+
* Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
195
|
|
181
|
-
* PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
196
|
+
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
197
|
|
183
198
|
* Developing with PDF (book): http://shop.oreilly.com/product/0636920025269.do
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 32
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/extract_fonts.rb
CHANGED
@@ -17,8 +17,8 @@ module ExtractFonts
|
|
17
17
|
return count if page.fonts.nil? || page.fonts.empty?
|
18
18
|
|
19
19
|
page.fonts.each do |label, font|
|
20
|
-
next if complete_refs[
|
21
|
-
complete_refs[
|
20
|
+
next if complete_refs[label]
|
21
|
+
complete_refs[label] = true
|
22
22
|
|
23
23
|
process_font(page, font)
|
24
24
|
|
@@ -39,7 +39,7 @@ module ExtractFonts
|
|
39
39
|
when :TrueType, :CIDFontType2 then
|
40
40
|
ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
|
41
41
|
else
|
42
|
-
$stderr.puts "unsupported font type #{font[:Subtype]}"
|
42
|
+
$stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -68,10 +68,15 @@ module ExtractFonts
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if ARGV.size == 0 # default file name
|
72
|
+
ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
|
73
|
+
end
|
74
|
+
|
72
75
|
extractor = ExtractFonts::Extractor.new
|
73
76
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
+
ARGV.each do |arg|
|
78
|
+
PDF::Reader.open(arg) do |reader|
|
79
|
+
page = reader.page(1)
|
80
|
+
extractor.page(page)
|
81
|
+
end
|
77
82
|
end
|