rpdfium 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +615 -1317
- data/README.md +73 -78
- data/lib/rpdfium/annotation/annotation.rb +10 -8
- data/lib/rpdfium/document.rb +49 -22
- data/lib/rpdfium/errors.rb +2 -2
- data/lib/rpdfium/form/form.rb +9 -9
- data/lib/rpdfium/image/embedded.rb +17 -16
- data/lib/rpdfium/io/png.rb +9 -9
- data/lib/rpdfium/page.rb +561 -526
- data/lib/rpdfium/raw.rb +216 -203
- data/lib/rpdfium/search/search.rb +5 -5
- data/lib/rpdfium/structure/attachment.rb +6 -6
- data/lib/rpdfium/structure/element.rb +74 -74
- data/lib/rpdfium/structure/outline.rb +2 -2
- data/lib/rpdfium/structure/tree.rb +56 -55
- data/lib/rpdfium/table/cells.rb +36 -33
- data/lib/rpdfium/table/debugger.rb +12 -12
- data/lib/rpdfium/table/edges.rb +51 -49
- data/lib/rpdfium/table/extractor.rb +35 -34
- data/lib/rpdfium/table/table.rb +65 -62
- data/lib/rpdfium/util/cluster.rb +35 -33
- data/lib/rpdfium/util/column_inference.rb +34 -32
- data/lib/rpdfium/util/label_matcher.rb +30 -30
- data/lib/rpdfium/util/text_extraction.rb +15 -15
- data/lib/rpdfium/util/word_extractor.rb +49 -48
- data/lib/rpdfium/util/word_merger.rb +25 -24
- data/lib/rpdfium/version.rb +1 -1
- data/lib/rpdfium.rb +17 -15
- metadata +1 -1
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
|
-
#
|
|
5
|
-
#
|
|
4
|
+
# Text search within the page, based on FPDFText_Find*.
|
|
5
|
+
# Keeps the state (cursor) and supports forward/backward.
|
|
6
6
|
#
|
|
7
|
-
#
|
|
7
|
+
# Example:
|
|
8
8
|
# page.search("totale").each_match { |m| p m[:bbox], m[:text] }
|
|
9
9
|
class Search
|
|
10
10
|
include Enumerable
|
|
@@ -43,8 +43,8 @@ module Rpdfium
|
|
|
43
43
|
@state[:handle]
|
|
44
44
|
end
|
|
45
45
|
|
|
46
|
-
#
|
|
47
|
-
# :text, :rects (array
|
|
46
|
+
# Iterates over all forward occurrences. Returns a hash with :char_index, :length,
|
|
47
|
+
# :text, :rects (an array of top-down bboxes: one per text line).
|
|
48
48
|
def each_match
|
|
49
49
|
return enum_for(:each_match) unless block_given?
|
|
50
50
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
|
-
#
|
|
4
|
+
# Files embedded in the PDF (attachments). PDFium exposes them via FPDFDoc_GetAttachment.
|
|
5
5
|
class Attachment
|
|
6
6
|
attr_reader :document, :index, :handle
|
|
7
7
|
|
|
@@ -16,7 +16,7 @@ module Rpdfium
|
|
|
16
16
|
Raw.read_utf16_string(:FPDFAttachment_GetName, @handle)
|
|
17
17
|
end
|
|
18
18
|
|
|
19
|
-
#
|
|
19
|
+
# Returns the bytes of the attached file. Probe-then-fetch pattern.
|
|
20
20
|
def bytes
|
|
21
21
|
out_size = FFI::MemoryPointer.new(:ulong)
|
|
22
22
|
Raw.FPDFAttachment_GetFile(@handle, FFI::Pointer::NULL, 0, out_size)
|
|
@@ -25,10 +25,10 @@ module Rpdfium
|
|
|
25
25
|
|
|
26
26
|
buf = FFI::MemoryPointer.new(:uchar, n)
|
|
27
27
|
Raw.FPDFAttachment_GetFile(@handle, buf, n, out_size)
|
|
28
|
-
#
|
|
29
|
-
# PDFium
|
|
30
|
-
#
|
|
31
|
-
#
|
|
28
|
+
# Read n bytes (the size of OUR buffer), not out_size.read_ulong:
|
|
29
|
+
# PDFium may update out_size with a value different from n (e.g. the
|
|
30
|
+
# total size required), which would read past the buffer → IndexError.
|
|
31
|
+
# If the actual write is < n, the remainder is filled with NUL.
|
|
32
32
|
buf.read_bytes(n)
|
|
33
33
|
end
|
|
34
34
|
|
|
@@ -2,20 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Structure
|
|
5
|
-
# Element
|
|
5
|
+
# Element of a tagged PDF StructTree.
|
|
6
6
|
#
|
|
7
|
-
#
|
|
8
|
-
# `Document`, `P` (
|
|
9
|
-
# `TH`, `TD`, `Figure`, `Span`, `Lbl`, `LI`, `Caption`,
|
|
10
|
-
# PDF spec §14.8
|
|
7
|
+
# An Element represents a node of the document's logical structure:
|
|
8
|
+
# `Document`, `P` (paragraph), `H1`..`H6` (headings), `Table`, `TR`,
|
|
9
|
+
# `TH`, `TD`, `Figure`, `Span`, `Lbl`, `LI`, `Caption`, etc. See
|
|
10
|
+
# PDF spec §14.8 for the complete taxonomy.
|
|
11
11
|
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
#
|
|
12
|
+
# Elements have no independent lifetime: they belong to the Tree that
|
|
13
|
+
# produced them. When the Tree is closed, the elements become
|
|
14
|
+
# invalid. Do not call methods on an element after `tree.close`.
|
|
15
15
|
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
16
|
+
# All methods are read-only: PDFium exposes no API to modify the
|
|
17
|
+
# StructTree (it is a "read-only" structure even in its public C
|
|
18
|
+
# API).
|
|
19
19
|
class Element
|
|
20
20
|
attr_reader :handle, :tree
|
|
21
21
|
|
|
@@ -24,75 +24,75 @@ module Rpdfium
|
|
|
24
24
|
@handle = handle
|
|
25
25
|
end
|
|
26
26
|
|
|
27
|
-
#
|
|
28
|
-
# Nil
|
|
27
|
+
# Structural type of the element (e.g. "P", "H1", "Table", "TR", "TD").
|
|
28
|
+
# Nil if PDFium cannot read it (placeholder element).
|
|
29
29
|
def type
|
|
30
30
|
read_utf16_string(:FPDF_StructElement_GetType)
|
|
31
31
|
end
|
|
32
32
|
|
|
33
|
-
#
|
|
34
|
-
#
|
|
35
|
-
#
|
|
33
|
+
# Type of the underlying PDF object: usually "StructElem", but may
|
|
34
|
+
# be "MCR" (Marked Content Reference) or "OBJR" (Object Reference)
|
|
35
|
+
# for specialized nodes. Most users use `type`.
|
|
36
36
|
def obj_type
|
|
37
37
|
read_utf16_string(:FPDF_StructElement_GetObjType)
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
# Title attribute (
|
|
41
|
-
#
|
|
40
|
+
# Title attribute (rare, used in some documents to give the element
|
|
41
|
+
# a descriptive name, e.g. "Capitolo 1").
|
|
42
42
|
def title
|
|
43
43
|
read_utf16_string(:FPDF_StructElement_GetTitle)
|
|
44
44
|
end
|
|
45
45
|
|
|
46
|
-
# ID
|
|
47
|
-
# StructTreeRoot).
|
|
48
|
-
# attribute
|
|
46
|
+
# Unique ID of the element (if declared in the /ID dictionary of
|
|
47
|
+
# the StructTreeRoot). Enables cross-element references (e.g. the
|
|
48
|
+
# Headers attribute of a TD cell pointing to a TH by id).
|
|
49
49
|
def id
|
|
50
50
|
read_utf16_string(:FPDF_StructElement_GetID)
|
|
51
51
|
end
|
|
52
52
|
|
|
53
|
-
#
|
|
54
|
-
#
|
|
53
|
+
# Language declared on the element (e.g. "it-IT", "en-US"). Inherited
|
|
54
|
+
# from the parent if not overridden. Useful for language-aware pipelines.
|
|
55
55
|
def lang
|
|
56
56
|
read_utf16_string(:FPDF_StructElement_GetLang)
|
|
57
57
|
end
|
|
58
58
|
|
|
59
|
-
# ActualText: override
|
|
60
|
-
#
|
|
61
|
-
# ("∫" → "integral"),
|
|
62
|
-
#
|
|
59
|
+
# ActualText: override of the "logical" text for the element. Resolves
|
|
60
|
+
# ligatures (the PDF shows the single glyph `ﬁ` but actual_text says "fi"), math symbols
|
|
61
|
+
# ("∫" → "integral"), abbreviations. When present, it takes precedence
|
|
62
|
+
# over the graphical text for accessibility and search.
|
|
63
63
|
def actual_text
|
|
64
64
|
read_utf16_string(:FPDF_StructElement_GetActualText)
|
|
65
65
|
end
|
|
66
66
|
|
|
67
|
-
# AltText:
|
|
68
|
-
#
|
|
67
|
+
# AltText: alternative text for Figure / Formula / images. PDF/UA
|
|
68
|
+
# requires every Figure to have a non-empty alt_text.
|
|
69
69
|
def alt_text
|
|
70
70
|
read_utf16_string(:FPDF_StructElement_GetAltText)
|
|
71
71
|
end
|
|
72
72
|
|
|
73
|
-
# Expansion text
|
|
74
|
-
#
|
|
73
|
+
# Expansion text for abbreviations (e.g. an element of type "Span"
|
|
74
|
+
# with content "Dr." and expansion "Doctor"). Used for text-to-speech.
|
|
75
75
|
def expansion
|
|
76
76
|
read_utf16_string(:FPDF_StructElement_GetExpansion)
|
|
77
77
|
end
|
|
78
78
|
|
|
79
|
-
# Marked Content IDs
|
|
80
|
-
# 1 MCID (
|
|
81
|
-
# mcid=N)
|
|
82
|
-
# `<TR>` —
|
|
79
|
+
# Marked Content IDs linked to this element. An element typically has
|
|
80
|
+
# 1 MCID (e.g. a `<P>` holds all the paragraph text inside a BDC with
|
|
81
|
+
# mcid=N) or 0 (a pure structural element: `<Document>`, `<Table>`,
|
|
82
|
+
# `<TR>` — their MCIDs reside in the leaf children).
|
|
83
83
|
#
|
|
84
|
-
#
|
|
85
|
-
#
|
|
84
|
+
# To link an MCID to the page text: read the page objects and group
|
|
85
|
+
# by `FPDFPageObj_GetMarkedContentID`. See `Element#text`.
|
|
86
86
|
def marked_content_ids
|
|
87
87
|
first = Raw.FPDF_StructElement_GetMarkedContentID(@handle)
|
|
88
88
|
count = Raw.FPDF_StructElement_GetMarkedContentIdCount(@handle)
|
|
89
|
-
#
|
|
90
|
-
#
|
|
91
|
-
#
|
|
89
|
+
# Cases: GetMarkedContentIdCount returns -1 when there are no direct
|
|
90
|
+
# MCIDs (structural element). GetMarkedContentID returns -1 in the
|
|
91
|
+
# same case.
|
|
92
92
|
return [] if count <= 0 && first < 0
|
|
93
93
|
|
|
94
|
-
#
|
|
95
|
-
# 0
|
|
94
|
+
# When a single MCID exists, GetMarkedContentIdCount may return
|
|
95
|
+
# 0 or -1 while GetMarkedContentID provides the value. Coalesce:
|
|
96
96
|
if count <= 0
|
|
97
97
|
first >= 0 ? [first] : []
|
|
98
98
|
else
|
|
@@ -103,8 +103,8 @@ module Rpdfium
|
|
|
103
103
|
end
|
|
104
104
|
end
|
|
105
105
|
|
|
106
|
-
#
|
|
107
|
-
# (top-to-bottom, left-to-right
|
|
106
|
+
# Direct children of the element. Ordered as declared in the PDF
|
|
107
|
+
# (top-to-bottom, left-to-right for reading order).
|
|
108
108
|
def children
|
|
109
109
|
n = Raw.FPDF_StructElement_CountChildren(@handle)
|
|
110
110
|
return [] if n <= 0
|
|
@@ -115,7 +115,7 @@ module Rpdfium
|
|
|
115
115
|
end
|
|
116
116
|
end
|
|
117
117
|
|
|
118
|
-
# Parent. Nil
|
|
118
|
+
# Parent. Nil for root elements (direct children of the StructTree).
|
|
119
119
|
def parent
|
|
120
120
|
h = Raw.FPDF_StructElement_GetParent(@handle)
|
|
121
121
|
return nil if h.null?
|
|
@@ -123,9 +123,9 @@ module Rpdfium
|
|
|
123
123
|
Element.new(@tree, h)
|
|
124
124
|
end
|
|
125
125
|
|
|
126
|
-
#
|
|
127
|
-
#
|
|
128
|
-
#
|
|
126
|
+
# Depth-first walk of the entire sub-tree starting from this element.
|
|
127
|
+
# Visits self first, then recursively the children.
|
|
128
|
+
# Without a block returns an Enumerator.
|
|
129
129
|
def walk(&block)
|
|
130
130
|
return enum_for(:walk) unless block
|
|
131
131
|
|
|
@@ -133,26 +133,26 @@ module Rpdfium
|
|
|
133
133
|
children.each { |c| c.walk(&block) }
|
|
134
134
|
end
|
|
135
135
|
|
|
136
|
-
#
|
|
137
|
-
#
|
|
136
|
+
# Leaves of the sub-tree (elements without children). These are the
|
|
137
|
+
# nodes that typically hold the direct MCID.
|
|
138
138
|
def leaves
|
|
139
139
|
return [self] if children.empty?
|
|
140
140
|
|
|
141
141
|
children.flat_map(&:leaves)
|
|
142
142
|
end
|
|
143
143
|
|
|
144
|
-
#
|
|
145
|
-
# 1.
|
|
146
|
-
# 2.
|
|
147
|
-
# +
|
|
148
|
-
#
|
|
144
|
+
# Text of the element, reconstructed from the page via MCID. Resolution:
|
|
145
|
+
# 1. If `actual_text` is present, use it (handles ligatures/abbreviations).
|
|
146
|
+
# 2. Otherwise collect all MCIDs of the sub-tree (this element
|
|
147
|
+
# + recursively the children) and concatenate the text of the page
|
|
148
|
+
# objects with those MCIDs, in document order.
|
|
149
149
|
#
|
|
150
|
-
#
|
|
151
|
-
#
|
|
150
|
+
# For pure structural elements (`Table`, `TR`) the text is the
|
|
151
|
+
# concatenation of all descendants — useful as a "summary".
|
|
152
152
|
def text
|
|
153
153
|
return actual_text if actual_text && !actual_text.empty?
|
|
154
154
|
|
|
155
|
-
#
|
|
155
|
+
# Collect MCIDs of the entire sub-tree depth-first
|
|
156
156
|
all_mcids = []
|
|
157
157
|
walk { |el| all_mcids.concat(el.marked_content_ids) }
|
|
158
158
|
return "" if all_mcids.empty?
|
|
@@ -161,11 +161,11 @@ module Rpdfium
|
|
|
161
161
|
all_mcids.filter_map { |id| mcid_map[id] }.join
|
|
162
162
|
end
|
|
163
163
|
|
|
164
|
-
#
|
|
165
|
-
#
|
|
166
|
-
# Scope, Headers, BBox,
|
|
167
|
-
# Float, String, true/false,
|
|
168
|
-
#
|
|
164
|
+
# Structural PDF attributes. Returns a Hash { name => value } with
|
|
165
|
+
# all attributes declared on this element (RowSpan, ColSpan,
|
|
166
|
+
# Scope, Headers, BBox, etc.). Values are Ruby-native: Integer,
|
|
167
|
+
# Float, String, true/false, or Array for "Headers" attributes that
|
|
168
|
+
# contain lists of IDs.
|
|
169
169
|
def attributes
|
|
170
170
|
result = {}
|
|
171
171
|
attr_count = Raw.FPDF_StructElement_GetAttributeCount(@handle)
|
|
@@ -204,9 +204,9 @@ module Rpdfium
|
|
|
204
204
|
|
|
205
205
|
private
|
|
206
206
|
|
|
207
|
-
#
|
|
208
|
-
#
|
|
209
|
-
#
|
|
207
|
+
# UTF-16 string read helper with proper probe-then-fetch. PDFium
|
|
208
|
+
# returns the number of bytes required (including the null
|
|
209
|
+
# terminator), even when the buffer is too small.
|
|
210
210
|
def read_utf16_string(fn_name)
|
|
211
211
|
needed = Raw.send(fn_name, @handle, FFI::Pointer::NULL, 0)
|
|
212
212
|
return nil if needed < 2
|
|
@@ -215,7 +215,7 @@ module Rpdfium
|
|
|
215
215
|
written = Raw.send(fn_name, @handle, buf, needed)
|
|
216
216
|
return nil if written < 2
|
|
217
217
|
|
|
218
|
-
# Clamp:
|
|
218
|
+
# Clamp: read at most the allocated buffer minus the null terminator.
|
|
219
219
|
payload = [written - 2, needed - 2].min
|
|
220
220
|
return nil if payload <= 0
|
|
221
221
|
|
|
@@ -235,7 +235,7 @@ module Rpdfium
|
|
|
235
235
|
n = len_buf.read_ulong
|
|
236
236
|
return nil if n.zero?
|
|
237
237
|
|
|
238
|
-
# GetName
|
|
238
|
+
# GetName returns a single-byte string (normally ASCII), not UTF-16
|
|
239
239
|
name_buf.read_bytes(n).force_encoding("UTF-8").delete("\u0000")
|
|
240
240
|
end
|
|
241
241
|
|
|
@@ -244,7 +244,7 @@ module Rpdfium
|
|
|
244
244
|
return nil if val_handle.null?
|
|
245
245
|
|
|
246
246
|
type = Raw.FPDF_StructElement_Attr_GetType(val_handle)
|
|
247
|
-
# Type codes
|
|
247
|
+
# Type codes from fpdf_structtree.h:
|
|
248
248
|
# 1 = Boolean, 2 = Number, 3 = String, 4 = Blob,
|
|
249
249
|
# 5 = Name, 6 = Array, 7 = Dictionary
|
|
250
250
|
case type
|
|
@@ -258,15 +258,15 @@ module Rpdfium
|
|
|
258
258
|
read_attr_string_value(val_handle)
|
|
259
259
|
when 4 # Blob (raw bytes)
|
|
260
260
|
read_attr_blob_value(val_handle)
|
|
261
|
-
when 6 # Array →
|
|
261
|
+
when 6 # Array → recursively collect the children
|
|
262
262
|
n = Raw.FPDF_StructElement_Attr_CountChildren(val_handle)
|
|
263
263
|
(0...n).filter_map do |i|
|
|
264
264
|
child = Raw.FPDF_StructElement_Attr_GetChildAtIndex(val_handle, i)
|
|
265
265
|
next nil if child.null?
|
|
266
266
|
|
|
267
|
-
#
|
|
268
|
-
#
|
|
269
|
-
#
|
|
267
|
+
# For each child apply the same read via type. But there is no
|
|
268
|
+
# "name" to access Attr_GetValue on a child; the child is
|
|
269
|
+
# already an FPDF_STRUCTELEMENT_ATTR_VALUE. Read it directly.
|
|
270
270
|
read_attr_value_handle(child)
|
|
271
271
|
end
|
|
272
272
|
else
|
|
@@ -294,7 +294,7 @@ module Rpdfium
|
|
|
294
294
|
|
|
295
295
|
def read_attr_string_value(val_handle)
|
|
296
296
|
len_buf = FFI::MemoryPointer.new(:ulong)
|
|
297
|
-
# Probe size
|
|
297
|
+
# Probe the size
|
|
298
298
|
Raw.FPDF_StructElement_Attr_GetStringValue(val_handle,
|
|
299
299
|
FFI::Pointer::NULL, 0, len_buf)
|
|
300
300
|
n = len_buf.read_ulong
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
|
-
#
|
|
4
|
+
# Document bookmark (outline) tree. Built recursively.
|
|
5
5
|
class Outline
|
|
6
6
|
attr_reader :title, :page_index, :children
|
|
7
7
|
|
|
@@ -32,7 +32,7 @@ module Rpdfium
|
|
|
32
32
|
result
|
|
33
33
|
end
|
|
34
34
|
|
|
35
|
-
#
|
|
35
|
+
# Flat preorder iterator: useful for generating a linear table of contents.
|
|
36
36
|
def self.flatten(outline_tree, depth = 0, &block)
|
|
37
37
|
outline_tree.each do |item|
|
|
38
38
|
block.call(item, depth)
|
|
@@ -2,39 +2,40 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Structure
|
|
5
|
-
# StructTree
|
|
5
|
+
# StructTree of a tagged PDF page.
|
|
6
6
|
#
|
|
7
|
-
#
|
|
8
|
-
# Word/LibreOffice/InDesign),
|
|
9
|
-
# Document → P, H1, Table, TR, TH, TD, Figure,
|
|
7
|
+
# For tagged PDFs (PDF/UA, accessibility-friendly exports from
|
|
8
|
+
# Word/LibreOffice/InDesign), it exposes the logical structure of the
|
|
9
|
+
# document: Document → P, H1, Table, TR, TH, TD, Figure, etc.
|
|
10
10
|
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
13
|
-
#
|
|
11
|
+
# For NON-tagged PDFs, `Page#struct_tree` returns nil. For "tagged but
|
|
12
|
+
# empty" PDFs (e.g. CR Banca d'Italia, StructTreeRoot present but with
|
|
13
|
+
# placeholder elements without type/MCID), `Tree#empty?` returns true.
|
|
14
14
|
#
|
|
15
|
-
# Lifecycle:
|
|
16
|
-
# `FPDF_StructTree_Close`
|
|
17
|
-
#
|
|
15
|
+
# Lifecycle: the Tree holds a PDFium handle that is "owning" — calling
|
|
16
|
+
# `FPDF_StructTree_Close` deallocates it. PDFium automatically
|
|
17
|
+
# deallocates the struct tree when the document is closed, so in
|
|
18
|
+
# practice:
|
|
18
19
|
#
|
|
19
|
-
# -
|
|
20
|
-
# `FPDF_CloseDocument` (zero
|
|
21
|
-
# in
|
|
22
|
-
# -
|
|
20
|
+
# - if you never close the tree explicitly, PDFium frees it with
|
|
21
|
+
# `FPDF_CloseDocument` (zero persistent leak, but the tree stays
|
|
22
|
+
# in memory until the doc is closed — it can be on the order of megabytes)
|
|
23
|
+
# - for deterministic control (release immediately), use the block:
|
|
23
24
|
#
|
|
24
25
|
# page.struct_tree do |tree|
|
|
25
26
|
# tree.walk { |el| ... }
|
|
26
27
|
# end
|
|
27
|
-
#
|
|
28
|
+
# on exit from the block the tree is closed, even on exception.
|
|
28
29
|
#
|
|
29
|
-
#
|
|
30
|
-
#
|
|
31
|
-
#
|
|
32
|
-
# via Document
|
|
33
|
-
#
|
|
30
|
+
# As a design choice we do NOT use `ObjectSpace.define_finalizer`: if
|
|
31
|
+
# the GC were to call `FPDF_StructTree_Close` after the document had
|
|
32
|
+
# already been closed, this would cause a use-after-free → segfault.
|
|
33
|
+
# Closing via Document is always safe; closing via Tree.close (explicit
|
|
34
|
+
# or through a block) requires the document to still be alive.
|
|
34
35
|
class Tree
|
|
35
36
|
attr_reader :handle, :page
|
|
36
37
|
|
|
37
|
-
#
|
|
38
|
+
# Returns nil if the page is not tagged. Otherwise a Tree.
|
|
38
39
|
def self.for_page(page)
|
|
39
40
|
h = Raw.FPDF_StructTree_GetForPage(page.handle)
|
|
40
41
|
return nil if h.null?
|
|
@@ -48,23 +49,23 @@ module Rpdfium
|
|
|
48
49
|
@closed = false
|
|
49
50
|
@mcid_text_cache = nil
|
|
50
51
|
|
|
51
|
-
#
|
|
52
|
-
# ~CPDF_StructTree()
|
|
53
|
-
#
|
|
54
|
-
#
|
|
55
|
-
# - close
|
|
52
|
+
# NOTE: no finalizer. FPDF_StructTree_Close is "owning": it calls
|
|
53
|
+
# ~CPDF_StructTree() which frees the object. If the PDF document
|
|
54
|
+
# is closed before the tree, the GC finalizer would call Close on
|
|
55
|
+
# already-freed memory → segfault. Safe lifetime:
|
|
56
|
+
# - explicit close via `tree.close` or via the block
|
|
56
57
|
# `page.struct_tree { |tree| ... }`
|
|
57
|
-
# -
|
|
58
|
-
#
|
|
59
|
-
# persistent,
|
|
58
|
+
# - if nobody closes it explicitly, PDFium frees the tree
|
|
59
|
+
# together with the document at `FPDF_CloseDocument` (no
|
|
60
|
+
# persistent leak, only memory held until the doc is closed)
|
|
60
61
|
end
|
|
61
62
|
|
|
62
63
|
def closed?
|
|
63
64
|
@closed
|
|
64
65
|
end
|
|
65
66
|
|
|
66
|
-
#
|
|
67
|
-
#
|
|
67
|
+
# Explicit close (idempotent). After close, do not call methods on
|
|
68
|
+
# this Tree nor on the Elements it generated.
|
|
68
69
|
def close
|
|
69
70
|
return if @closed
|
|
70
71
|
|
|
@@ -73,15 +74,15 @@ module Rpdfium
|
|
|
73
74
|
@mcid_text_cache = nil
|
|
74
75
|
end
|
|
75
76
|
|
|
76
|
-
#
|
|
77
|
-
#
|
|
78
|
-
#
|
|
77
|
+
# Number of root elements (direct children of the StructTreeRoot for
|
|
78
|
+
# this page). Typically 1 (`<Document>`), but it can be arbitrarily
|
|
79
|
+
# high on odd PDFs (e.g. cu.pdf: 717 placeholders).
|
|
79
80
|
def root_count
|
|
80
81
|
n = Raw.FPDF_StructTree_CountChildren(@handle)
|
|
81
82
|
[n, 0].max
|
|
82
83
|
end
|
|
83
84
|
|
|
84
|
-
#
|
|
85
|
+
# Root elements (direct children of the StructTreeRoot). Typically 1
|
|
85
86
|
# (`<Document>`).
|
|
86
87
|
def roots
|
|
87
88
|
(0...root_count).filter_map do |i|
|
|
@@ -90,42 +91,42 @@ module Rpdfium
|
|
|
90
91
|
end
|
|
91
92
|
end
|
|
92
93
|
|
|
93
|
-
# True
|
|
94
|
-
#
|
|
95
|
-
# CR Banca d'Italia:
|
|
96
|
-
#
|
|
94
|
+
# True if the tree is structurally empty (no element with a readable
|
|
95
|
+
# type among the roots). A common case for "fake-tagged" PDFs such as
|
|
96
|
+
# CR Banca d'Italia: the StructTreeRoot exists but the elements are
|
|
97
|
+
# empty placeholders.
|
|
97
98
|
def empty?
|
|
98
99
|
return true if root_count.zero?
|
|
99
100
|
|
|
100
101
|
roots.none? { |r| r.type || r.children.any? }
|
|
101
102
|
end
|
|
102
103
|
|
|
103
|
-
#
|
|
104
|
-
# `roots.flat_map(&:walk)`.
|
|
104
|
+
# Depth-first walk of ALL the elements of the tree. Equivalent to
|
|
105
|
+
# `roots.flat_map { |r| r.walk.to_a }`. Without a block it returns an Enumerator.
|
|
105
106
|
def walk(&block)
|
|
106
107
|
return enum_for(:walk) unless block
|
|
107
108
|
|
|
108
109
|
roots.each { |r| r.walk(&block) }
|
|
109
110
|
end
|
|
110
111
|
|
|
111
|
-
#
|
|
112
|
-
# "Figure").
|
|
113
|
-
# "P", "H1",
|
|
112
|
+
# Finds all the elements of the specified type (e.g. "Table", "P",
|
|
113
|
+
# "Figure"). Case-sensitive comparison (PDF types are "Table",
|
|
114
|
+
# "P", "H1", etc.).
|
|
114
115
|
def find_all(type:)
|
|
115
116
|
walk.select { |el| el.type == type }
|
|
116
117
|
end
|
|
117
118
|
|
|
118
|
-
#
|
|
119
|
-
#
|
|
119
|
+
# Returns all the elements of type "Table". Convenient for semantic
|
|
120
|
+
# table extraction.
|
|
120
121
|
def tables
|
|
121
122
|
find_all(type: "Table")
|
|
122
123
|
end
|
|
123
124
|
|
|
124
|
-
# Page objects
|
|
125
|
-
#
|
|
126
|
-
#
|
|
125
|
+
# Page objects grouped by Marked Content ID, to allow Element#text
|
|
126
|
+
# to resolve the text of its MCIDs. The map is built only once per
|
|
127
|
+
# Tree and cached.
|
|
127
128
|
#
|
|
128
|
-
#
|
|
129
|
+
# Public but intended for internal use; not part of the stable API.
|
|
129
130
|
def mcid_text_map
|
|
130
131
|
@mcid_text_cache ||= build_mcid_text_map
|
|
131
132
|
end
|
|
@@ -137,9 +138,9 @@ module Rpdfium
|
|
|
137
138
|
|
|
138
139
|
private
|
|
139
140
|
|
|
140
|
-
#
|
|
141
|
-
#
|
|
142
|
-
#
|
|
141
|
+
# Iterates all the page objects (incl. Form XObject) and groups their
|
|
142
|
+
# text by MCID. The probe-then-fetch pattern on FPDFTextObj_GetText
|
|
143
|
+
# is well-established (see Page#read_text_obj_text_fast).
|
|
143
144
|
def build_mcid_text_map
|
|
144
145
|
map = Hash.new { |h, k| h[k] = +"" }
|
|
145
146
|
tp = @page.text_page
|
|
@@ -170,8 +171,8 @@ module Rpdfium
|
|
|
170
171
|
end
|
|
171
172
|
|
|
172
173
|
def read_text_obj_text(obj, tp, buf)
|
|
173
|
-
# Probe
|
|
174
|
-
# content runs,
|
|
174
|
+
# Probe with a 1024-byte buffer (sufficient for 99% of marked
|
|
175
|
+
# content runs, which are typically single words or short phrases).
|
|
175
176
|
needed = Raw.FPDFTextObj_GetText(obj, tp.handle, buf, 1024)
|
|
176
177
|
return nil if needed < 2
|
|
177
178
|
|