rpdfium 0.4.2 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/lib/rpdfium/page.rb +29 -9
- data/lib/rpdfium/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b20ddc3e261b5d9a8a8fa0d84c8ff65e26a6a952a7e3d57dcfe64680944c652c
|
|
4
|
+
data.tar.gz: cf5c332e8231777062ecd593458b2192794dec3429a2875cf679279ee4ac6a5f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d59b9039332abdf90ee46032ac7b6501b99b753e9045da3526c410e2806a53b2b5dd416ec83c254d2692a5926f64ff0aea8c1c5859633a26a26e84a43fd2cdb5
|
|
7
|
+
data.tar.gz: 7e218d1c1f6632ae871fa6b0453de3e97a256a9e9871788f4bed34a93c3ee71d91bee3042712ab083be963fec38fa5aa9bb20a608970bf45e7358115ddf554ed
|
data/CHANGELOG.md
CHANGED
|
@@ -8,6 +8,34 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
|
8
8
|
|
|
9
9
|
## [Unreleased]
|
|
10
10
|
|
|
11
|
+
## [0.4.4] - 2026-06-18
|
|
12
|
+
|
|
13
|
+
### Changed
|
|
14
|
+
|
|
15
|
+
- **`Page#words` allocates ~77% fewer objects.** The row/word pipeline was
|
|
16
|
+
rebuilt to avoid transient allocations: the by-row sort now uses a
|
|
17
|
+
comparator block instead of `sort_by { [top, x0] }` (no per-char 2-element
|
|
18
|
+
key array or Schwartzian pair array), the per-row sort runs in place with
|
|
19
|
+
`sort!`, and `word_from_chars` folds `text`/`top`/`bottom` in a single pass
|
|
20
|
+
instead of three separate `map`+`join`/`min`/`max` passes (each of which
|
|
21
|
+
allocated an intermediate array). Measured on a real 3567-char page (438
|
|
22
|
+
words): 6552 → 1534 allocated objects per call (1.84 → 0.43 objects/char).
|
|
23
|
+
Behavior is unchanged.
|
|
24
|
+
|
|
25
|
+
## [0.4.3] - 2026-06-16
|
|
26
|
+
|
|
27
|
+
### Fixed
|
|
28
|
+
|
|
29
|
+
- **`Page#words` now returns numeric `top`/`bottom` coordinates.** Each word's
|
|
30
|
+
`:top` and `:bottom` fields were computed with `chars.min { |c| c[:top] }` /
|
|
31
|
+
`chars.max { |c| c[:bottom] }`. The block form of `Enumerable#min`/`max` must
|
|
32
|
+
return a comparator (-1/0/1) and receives two arguments, so the numeric value
|
|
33
|
+
returned by the single-argument block was used as a broken comparator and the
|
|
34
|
+
method returned the whole char hash instead of the position. As a result,
|
|
35
|
+
`word[:top]` and `word[:bottom]` were no longer positional numbers. Fixed by
|
|
36
|
+
using `chars.map { |c| c[:top] }.min` / `.max`, which return the expected
|
|
37
|
+
scalar.
|
|
38
|
+
|
|
11
39
|
## [0.4.2] - 2026-06-15
|
|
12
40
|
|
|
13
41
|
### Added
|
data/lib/rpdfium/page.rb
CHANGED
|
@@ -876,13 +876,20 @@ module Rpdfium
|
|
|
876
876
|
cs = chars(**char_opts)
|
|
877
877
|
return [] if cs.empty?
|
|
878
878
|
|
|
879
|
-
# Group into rows by y
|
|
880
|
-
|
|
879
|
+
# Group into rows by y. Comparator sort instead of sort_by avoids one
|
|
880
|
+
# 2-element key array per char (plus the Schwartzian pair array) — the
|
|
881
|
+
# dominant transient allocation on pages with thousands of chars.
|
|
882
|
+
sorted_cs = cs.sort do |a, b|
|
|
883
|
+
c = a[:top] <=> b[:top]
|
|
884
|
+
c.zero? ? a[:x0] <=> b[:x0] : c
|
|
885
|
+
end
|
|
886
|
+
rows = group_consecutive(sorted_cs) do |a, b|
|
|
881
887
|
(a[:top] - b[:top]).abs <= y_tolerance
|
|
882
888
|
end
|
|
883
889
|
|
|
884
890
|
rows.flat_map do |row|
|
|
885
|
-
|
|
891
|
+
# In-place: `row` is a fresh group array, no need for a new copy.
|
|
892
|
+
sorted = row.sort! { |a, b| a[:x0] <=> b[:x0] }
|
|
886
893
|
# Split on gap > x_tolerance or explicit space
|
|
887
894
|
word_groups = []
|
|
888
895
|
buf = []
|
|
@@ -1595,14 +1602,27 @@ module Rpdfium
|
|
|
1595
1602
|
end
|
|
1596
1603
|
|
|
1597
1604
|
def word_from_chars(chars)
|
|
1605
|
+
# Single pass: fold text/top/bottom together instead of 3 separate
|
|
1606
|
+
# map+join/min/max passes (each allocating an intermediate array).
|
|
1607
|
+
first = chars.first
|
|
1608
|
+
text = +''
|
|
1609
|
+
top = first[:top]
|
|
1610
|
+
bottom = first[:bottom]
|
|
1611
|
+
chars.each do |c|
|
|
1612
|
+
text << c[:char].to_s
|
|
1613
|
+
t = c[:top]
|
|
1614
|
+
b = c[:bottom]
|
|
1615
|
+
top = t if t < top
|
|
1616
|
+
bottom = b if b > bottom
|
|
1617
|
+
end
|
|
1598
1618
|
{
|
|
1599
|
-
text:
|
|
1600
|
-
x0:
|
|
1619
|
+
text: text,
|
|
1620
|
+
x0: first[:x0],
|
|
1601
1621
|
x1: chars.last[:x1],
|
|
1602
|
-
top:
|
|
1603
|
-
bottom:
|
|
1604
|
-
fontsize:
|
|
1605
|
-
font:
|
|
1622
|
+
top: top,
|
|
1623
|
+
bottom: bottom,
|
|
1624
|
+
fontsize: first[:fontsize],
|
|
1625
|
+
font: first[:font],
|
|
1606
1626
|
chars: chars
|
|
1607
1627
|
}
|
|
1608
1628
|
end
|
data/lib/rpdfium/version.rb
CHANGED