rpdfium 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +615 -1317
- data/README.md +73 -78
- data/lib/rpdfium/annotation/annotation.rb +10 -8
- data/lib/rpdfium/document.rb +49 -22
- data/lib/rpdfium/errors.rb +2 -2
- data/lib/rpdfium/form/form.rb +9 -9
- data/lib/rpdfium/image/embedded.rb +17 -16
- data/lib/rpdfium/io/png.rb +9 -9
- data/lib/rpdfium/page.rb +561 -526
- data/lib/rpdfium/raw.rb +216 -203
- data/lib/rpdfium/search/search.rb +5 -5
- data/lib/rpdfium/structure/attachment.rb +6 -6
- data/lib/rpdfium/structure/element.rb +74 -74
- data/lib/rpdfium/structure/outline.rb +2 -2
- data/lib/rpdfium/structure/tree.rb +56 -55
- data/lib/rpdfium/table/cells.rb +36 -33
- data/lib/rpdfium/table/debugger.rb +12 -12
- data/lib/rpdfium/table/edges.rb +51 -49
- data/lib/rpdfium/table/extractor.rb +35 -34
- data/lib/rpdfium/table/table.rb +65 -62
- data/lib/rpdfium/util/cluster.rb +35 -33
- data/lib/rpdfium/util/column_inference.rb +34 -32
- data/lib/rpdfium/util/label_matcher.rb +30 -30
- data/lib/rpdfium/util/text_extraction.rb +15 -15
- data/lib/rpdfium/util/word_extractor.rb +49 -48
- data/lib/rpdfium/util/word_merger.rb +25 -24
- data/lib/rpdfium/version.rb +1 -1
- data/lib/rpdfium.rb +17 -15
- metadata +1 -1
|
@@ -2,28 +2,29 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Util
|
|
5
|
-
#
|
|
5
|
+
# Extracts "words" from a list of chars, closely following
|
|
6
|
+
# pdfplumber.WordExtractor.
|
|
6
7
|
#
|
|
7
|
-
#
|
|
8
|
-
# 1.
|
|
9
|
-
#
|
|
10
|
-
# 2. Cluster
|
|
11
|
-
# 3.
|
|
12
|
-
#
|
|
13
|
-
#
|
|
14
|
-
# 4.
|
|
8
|
+
# Algorithm:
|
|
9
|
+
# 1. Sort the chars by (top, x0): rows top-to-bottom, chars
|
|
10
|
+
# left-to-right within each row.
|
|
11
|
+
# 2. Cluster by top with `y_tolerance` → "logical rows" of chars.
|
|
12
|
+
# 3. Within each row, cluster by horizontal gap: two chars belong to
|
|
13
|
+
# the same word if `next.x0 - prev.x1 <= x_tolerance`. A whitespace
|
|
14
|
+
# char also separates the word (unless `keep_blank_chars`).
|
|
15
|
+
# 4. For each cluster of chars, emit a word: concatenated text, bbox.
|
|
15
16
|
#
|
|
16
|
-
#
|
|
17
|
-
# -
|
|
18
|
-
#
|
|
19
|
-
# -
|
|
20
|
-
#
|
|
21
|
-
# `chars` (top, x0).
|
|
22
|
-
# -
|
|
23
|
-
#
|
|
17
|
+
# Differences from pdfplumber (simplifications acceptable for our use):
|
|
18
|
+
# - We do not handle rotated `line_dir`/`char_dir` (text rotated away
|
|
19
|
+
# from horizontal ltr): not relevant for current use cases.
|
|
20
|
+
# - We do not handle `use_text_flow` (ordering based on the content
|
|
21
|
+
# stream): our chars already arrive from PDFium in geometric order
|
|
22
|
+
# via `chars` (top, x0).
|
|
23
|
+
# - We do not handle `expand_ligatures`: PDFium usually expands the
|
|
24
|
+
# codepoints correctly already at the char level.
|
|
24
25
|
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
26
|
+
# These differences are documented; if ever needed they can be added
|
|
27
|
+
# as feature toggles without changing the default path.
|
|
27
28
|
class WordExtractor
|
|
28
29
|
DEFAULT_X_TOLERANCE = 3.0
|
|
29
30
|
DEFAULT_Y_TOLERANCE = 3.0
|
|
@@ -40,13 +41,13 @@ module Rpdfium
|
|
|
40
41
|
@extra_attrs = extra_attrs || []
|
|
41
42
|
end
|
|
42
43
|
|
|
43
|
-
#
|
|
44
|
-
#
|
|
45
|
-
#
|
|
44
|
+
# Returns an Array of Hashes: { text:, x0:, x1:, top:, bottom:, chars: }.
|
|
45
|
+
# If `extra_attrs` is non-empty, each word also splits when these
|
|
46
|
+
# attributes change (e.g. different fontname/size → different words).
|
|
46
47
|
def extract_words(chars)
|
|
47
48
|
return [] if chars.empty?
|
|
48
49
|
|
|
49
|
-
# Fast path:
|
|
50
|
+
# Fast path: a single char → 1 trivial word (if not whitespace).
|
|
50
51
|
if chars.size == 1
|
|
51
52
|
c = chars.first
|
|
52
53
|
return [] if blank?(c) && !@keep_blank_chars
|
|
@@ -54,35 +55,35 @@ module Rpdfium
|
|
|
54
55
|
return [build_word([c])]
|
|
55
56
|
end
|
|
56
57
|
|
|
57
|
-
# 1.
|
|
58
|
+
# 1. Sort by (top, x0). Top-down, left-to-right.
|
|
58
59
|
sorted = chars.sort_by { |c| [c[:top], c[:x0]] }
|
|
59
60
|
|
|
60
|
-
# 2. Cluster
|
|
61
|
-
# `presorted: true`: sorted
|
|
62
|
-
#
|
|
63
|
-
# sort
|
|
61
|
+
# 2. Cluster into rows by `top`.
|
|
62
|
+
# `presorted: true`: sorted is already ordered by [top, x0], hence
|
|
63
|
+
# implicitly also by top — cluster_objects skips its own internal
|
|
64
|
+
# sort.
|
|
64
65
|
rows = Cluster.cluster_objects(sorted, :top,
|
|
65
66
|
tolerance: @y_tolerance,
|
|
66
67
|
presorted: true)
|
|
67
68
|
|
|
68
69
|
words = []
|
|
69
70
|
rows.each do |row|
|
|
70
|
-
# Re-sort
|
|
71
|
+
# Re-sort by x0 within each clustered row.
|
|
71
72
|
#
|
|
72
|
-
#
|
|
73
|
-
# [top, x0],
|
|
74
|
-
#
|
|
75
|
-
#
|
|
76
|
-
#
|
|
77
|
-
#
|
|
78
|
-
#
|
|
79
|
-
#
|
|
80
|
-
#
|
|
81
|
-
#
|
|
73
|
+
# NOTE: in principle the input `sorted` is already ordered by
|
|
74
|
+
# [top, x0], so the top clusters should already be in x0 order.
|
|
75
|
+
# BUT the global sort `[top, x0]` strictly respects the order by
|
|
76
|
+
# top — if two chars of the same visual row have different tops
|
|
77
|
+
# within tolerance (e.g. the lowercase "i" often has a top value lower
|
|
78
|
+
# by 0.008pt than the other letters because of how PDFium computes
|
|
79
|
+
# the bbox), the global sort interleaves them. cluster_objects by
|
|
80
|
+
# :top does not internally reorder the chars, so a char with a
|
|
81
|
+
# slightly lower top ends up AHEAD of all the other letters of the
|
|
82
|
+
# word.
|
|
82
83
|
#
|
|
83
|
-
#
|
|
84
|
-
# 414.9869 → output `iCategora`
|
|
85
|
-
#
|
|
84
|
+
# Real example: "Categoria" where "i" has top=414.9789 and the
|
|
85
|
+
# others 414.9869 → output `iCategora` instead of `Categoria`.
|
|
86
|
+
# The fix is simply to re-sort by x0 within the row.
|
|
86
87
|
row_sorted = row.sort_by { |c| c[:x0] }
|
|
87
88
|
|
|
88
89
|
word_chars = []
|
|
@@ -91,8 +92,8 @@ module Rpdfium
|
|
|
91
92
|
words << build_word(word_chars) unless word_chars.empty?
|
|
92
93
|
word_chars = []
|
|
93
94
|
end
|
|
94
|
-
# Whitespace:
|
|
95
|
-
#
|
|
95
|
+
# Whitespace: by default we use it as a separator (we discard it).
|
|
96
|
+
# With keep_blank_chars=true we include it in the current word.
|
|
96
97
|
if blank?(c) && !@keep_blank_chars
|
|
97
98
|
words << build_word(word_chars) unless word_chars.empty?
|
|
98
99
|
word_chars = []
|
|
@@ -111,15 +112,15 @@ module Rpdfium
|
|
|
111
112
|
def char_begins_new_word?(prev, curr)
|
|
112
113
|
return false if prev.nil?
|
|
113
114
|
|
|
114
|
-
#
|
|
115
|
+
# Horizontal gap (PDF font hinting may cause a slight overlap, i.e. a gap <= 0)
|
|
115
116
|
gap = curr[:x0] - prev[:x1]
|
|
116
117
|
return true if gap > @x_tolerance
|
|
117
118
|
|
|
118
|
-
#
|
|
119
|
-
#
|
|
119
|
+
# Row change (can happen if y_tolerance is large but two chars are
|
|
120
|
+
# nonetheless on different rows)
|
|
120
121
|
return true if (curr[:top] - prev[:top]).abs > @y_tolerance
|
|
121
122
|
|
|
122
|
-
#
|
|
123
|
+
# Change of a required extra_attr
|
|
123
124
|
@extra_attrs.any? { |attr| prev[attr] != curr[attr] }
|
|
124
125
|
end
|
|
125
126
|
|
|
@@ -2,30 +2,30 @@
|
|
|
2
2
|
|
|
3
3
|
module Rpdfium
|
|
4
4
|
module Util
|
|
5
|
-
#
|
|
6
|
-
#
|
|
5
|
+
# Merges adjacent words on the same row into a single word with an
|
|
6
|
+
# aggregated bbox and concatenated text.
|
|
7
7
|
#
|
|
8
|
-
#
|
|
8
|
+
# Three strategies are available as separate methods:
|
|
9
9
|
#
|
|
10
|
-
# - `merge_by_proximity` —
|
|
11
|
-
#
|
|
10
|
+
# - `merge_by_proximity` — merges all adjacent words that satisfy the
|
|
11
|
+
# proximity criterion. Base strategy.
|
|
12
12
|
#
|
|
13
|
-
# - `merge_by_label` —
|
|
14
|
-
# (
|
|
15
|
-
#
|
|
16
|
-
# in
|
|
13
|
+
# - `merge_by_label` — merges only words that share the same "label"
|
|
14
|
+
# (external key computed by the caller). Useful for preserving
|
|
15
|
+
# semantics when different labels fall on the same row (e.g. flags
|
|
16
|
+
# in adjacent columns).
|
|
17
17
|
#
|
|
18
|
-
# - `merge_unlabeled` —
|
|
19
|
-
#
|
|
18
|
+
# - `merge_unlabeled` — merges only "orphan" words (label nil), leaving
|
|
19
|
+
# labeled ones intact. Inverse of merge_by_label.
|
|
20
20
|
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
21
|
+
# All return a new list of words, with merged ones represented as the
|
|
22
|
+
# hash `{ text:, x0:, x1:, top:, bottom: }`.
|
|
23
23
|
#
|
|
24
|
-
# @example merge
|
|
24
|
+
# @example merge by proximity
|
|
25
25
|
# merger = Rpdfium::Util::WordMerger.new(x_gap: 20.0, y_tol: 3.0)
|
|
26
26
|
# merged = merger.merge_by_proximity(words)
|
|
27
27
|
#
|
|
28
|
-
# @example merge
|
|
28
|
+
# @example merge by label, with the label provided by the caller
|
|
29
29
|
# labels_by_word = words.each_with_object({}) { |w, h| h[w] = compute_label(w) }
|
|
30
30
|
# merged = merger.merge_by_label(words, labels_by_word)
|
|
31
31
|
class WordMerger
|
|
@@ -37,21 +37,22 @@ module Rpdfium
|
|
|
37
37
|
@y_tol = y_tol
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
#
|
|
40
|
+
# Merges all adjacent words (same row + horizontal gap ≤ x_gap).
|
|
41
41
|
def merge_by_proximity(words)
|
|
42
42
|
merge_groups(words) { |a, b| true }
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
-
#
|
|
46
|
-
# @param labels_by_word [Hash] mapping word → label (
|
|
47
|
-
#
|
|
45
|
+
# Merges only words with the same label.
|
|
46
|
+
# @param labels_by_word [Hash] mapping word → label (any type).
|
|
47
|
+
# Words with the same label are merged; words with different
|
|
48
|
+
# labels are not.
|
|
48
49
|
def merge_by_label(words, labels_by_word)
|
|
49
50
|
merge_groups(words) do |a, b|
|
|
50
51
|
labels_by_word[a] == labels_by_word[b]
|
|
51
52
|
end
|
|
52
53
|
end
|
|
53
54
|
|
|
54
|
-
#
|
|
55
|
+
# Merges only words with a nil label (orphans).
|
|
55
56
|
def merge_unlabeled(words, labels_by_word)
|
|
56
57
|
merge_groups(words) do |a, b|
|
|
57
58
|
labels_by_word[a].nil? && labels_by_word[b].nil?
|
|
@@ -60,10 +61,10 @@ module Rpdfium
|
|
|
60
61
|
|
|
61
62
|
private
|
|
62
63
|
|
|
63
|
-
#
|
|
64
|
-
#
|
|
65
|
-
# (
|
|
66
|
-
#
|
|
64
|
+
# Generic merging algorithm: iterates over the words sorted by
|
|
65
|
+
# (top, x0) and groups them when they satisfy both the geometric
|
|
66
|
+
# criterion (same row and narrow horizontal gap) and the `yield`
|
|
67
|
+
# predicate provided by the caller.
|
|
67
68
|
def merge_groups(words)
|
|
68
69
|
return [] if words.empty?
|
|
69
70
|
|
data/lib/rpdfium/version.rb
CHANGED
data/lib/rpdfium.rb
CHANGED
|
@@ -3,9 +3,9 @@
|
|
|
3
3
|
require_relative "rpdfium/version"
|
|
4
4
|
require_relative "rpdfium/errors"
|
|
5
5
|
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
# Rpdfium::Binary.library_path
|
|
6
|
+
# Loads the companion gem rpdfium-binary if present: this must happen BEFORE
|
|
7
|
+
# raw.rb, which calls ffi_lib at require time and queries
|
|
8
|
+
# Rpdfium::Binary.library_path to find the absolute path to the .so/.dylib.
|
|
9
9
|
begin
|
|
10
10
|
require "rpdfium/binary"
|
|
11
11
|
rescue LoadError
|
|
@@ -54,22 +54,24 @@ module Rpdfium
|
|
|
54
54
|
Document.open(input, password: password, &block)
|
|
55
55
|
end
|
|
56
56
|
|
|
57
|
-
#
|
|
57
|
+
# Extract all the text of all pages, one string per page.
|
|
58
58
|
def self.extract_text(input, password: nil)
|
|
59
|
-
open(input, password: password)
|
|
59
|
+
open(input, password: password) do |doc|
|
|
60
|
+
doc.each_page_streaming.map(&:text)
|
|
61
|
+
end
|
|
60
62
|
end
|
|
61
63
|
|
|
62
|
-
#
|
|
63
|
-
#
|
|
64
|
+
# Extract all the tables of all pages.
|
|
65
|
+
# Returns Array<{ page: Integer, rows: Array<Array<String, nil>> }>.
|
|
64
66
|
#
|
|
65
|
-
# `keep_blank_rows: false` (default)
|
|
66
|
-
#
|
|
67
|
-
#
|
|
68
|
-
#
|
|
69
|
-
#
|
|
67
|
+
# `keep_blank_rows: false` (default) removes the completely empty rows
|
|
68
|
+
# that the `:text` strategy of words_to_edges_h generates by construction (each
|
|
69
|
+
# visual row produces two edges, top + bottom, and between pairs of adjacent
|
|
70
|
+
# edges "spurious rows" form, with a height equal to the line gap).
|
|
71
|
+
# With `keep_blank_rows: true` you get the raw output of Table#extract.
|
|
70
72
|
def self.extract_tables(input, password: nil, keep_blank_rows: false, **opts)
|
|
71
73
|
open(input, password: password) do |doc|
|
|
72
|
-
doc.flat_map do |page|
|
|
74
|
+
doc.each_page_streaming.flat_map do |page|
|
|
73
75
|
Table::Extractor.new(page, **opts).extract.map do |rows|
|
|
74
76
|
rows = rows.reject { |r| r.all? { |c| c.nil? || c.empty? } } unless keep_blank_rows
|
|
75
77
|
{ page: page.index, rows: rows }
|
|
@@ -78,11 +80,11 @@ module Rpdfium
|
|
|
78
80
|
end
|
|
79
81
|
end
|
|
80
82
|
|
|
81
|
-
#
|
|
83
|
+
# Render each page to a PNG inside output_dir.
|
|
82
84
|
def self.render_to_pngs(input, output_dir:, scale: 2.0, password: nil)
|
|
83
85
|
Dir.mkdir(output_dir) unless Dir.exist?(output_dir)
|
|
84
86
|
open(input, password: password) do |doc|
|
|
85
|
-
doc.map do |page|
|
|
87
|
+
doc.each_page_streaming.map do |page|
|
|
86
88
|
path = File.join(output_dir, format("page_%04d.png", page.index + 1))
|
|
87
89
|
page.render_to_png(path, scale: scale)
|
|
88
90
|
path
|