pdfsink 0.1.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +10 -0
- data/LICENSE +21 -0
- data/README.md +148 -0
- data/Rakefile +28 -0
- data/ext/pdfsink/extconf.rb +68 -0
- data/lib/pdfsink/cli.rb +146 -0
- data/lib/pdfsink/document.rb +84 -0
- data/lib/pdfsink/error.rb +33 -0
- data/lib/pdfsink/page.rb +97 -0
- data/lib/pdfsink/pdfsink-rs +0 -0
- data/lib/pdfsink/railtie.rb +23 -0
- data/lib/pdfsink/table_strategy.rb +47 -0
- data/lib/pdfsink/version.rb +8 -0
- data/lib/pdfsink.rb +81 -0
- data/pdfsink.gemspec +30 -0
- metadata +62 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 34d1b6e0a8b05f517a8e65a67a6d0c403aff224c571259a64d3d2dd37e1a7a30
|
|
4
|
+
data.tar.gz: c001deebaf3d8f7f56321cdea83c1afa18574a3d5c5c6e096bb03fcdebfe3637
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 46ef50b4483f780005b46923364ddab7dab72179fb2eed24d053cdbf8254532a0ff66f71f7e60c6502ab7fa1ba1f17334827672ce7534e7cbe3267a5077f8158
|
|
7
|
+
data.tar.gz: 4fed8ef3d23fd84df18f36573f7ad4a56b12d4edc6bfc8579928d381a02271227884342ccd418bb3428f5c91033e54f8ccabf41e32da907599428d3bdb54c963
|
data/Gemfile
ADDED
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Accountaim
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# Pdfsink
|
|
2
|
+
|
|
3
|
+
Ruby wrapper for [pdfsink-rs](https://github.com/clark-labs-inc/pdfsink-rs), a
|
|
4
|
+
fast pure-Rust PDF extraction tool. Extract text, words, page objects, tables,
|
|
5
|
+
hyperlinks, and regex matches from PDFs — without a Python runtime.
|
|
6
|
+
|
|
7
|
+
The gem shells out to the bundled `pdfsink-rs` binary and parses its JSON
|
|
8
|
+
output, so there is nothing to load into your Ruby process and no FFI.
|
|
9
|
+
|
|
10
|
+
## Requirements
|
|
11
|
+
|
|
12
|
+
- Ruby >= 3.2
|
|
13
|
+
- For source installs only: a Rust toolchain (`cargo`) to compile the binary.
|
|
14
|
+
Precompiled platform gems bundle the binary and need no toolchain.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```ruby
|
|
19
|
+
# Gemfile
|
|
20
|
+
gem "pdfsink"
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
bundle install
|
|
25
|
+
# or
|
|
26
|
+
gem install pdfsink
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
On supported platforms (`x86_64-linux`, `aarch64-linux`, `x86_64-darwin`,
|
|
30
|
+
`arm64-darwin`) a precompiled gem ships the binary. Elsewhere the source gem
|
|
31
|
+
compiles it on install via `cargo install pdfsink-rs`.
|
|
32
|
+
|
|
33
|
+
### Building from source
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
git clone https://github.com/AccountAim/pdfsink-ruby
|
|
37
|
+
cd pdfsink-ruby
|
|
38
|
+
bundle install
|
|
39
|
+
bundle exec rake cargo:build # compiles the binary into lib/pdfsink/
|
|
40
|
+
bundle exec rspec
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
To point the gem at a binary you built elsewhere, set `PDFSINK_BIN`:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
export PDFSINK_BIN=/path/to/pdfsink-rs
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
### Open a document
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
doc = Pdfsink.open("report.pdf")
|
|
55
|
+
doc.page_count # => 12
|
|
56
|
+
doc.pages # => [#<Pdfsink::Page number=1 612.0x792.0>, ...]
|
|
57
|
+
doc.each_page { |page| puts page.extract_text }
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Extract text
|
|
61
|
+
|
|
62
|
+
```ruby
|
|
63
|
+
page = doc.page(1)
|
|
64
|
+
page.extract_text # => "Quarterly Report\n..."
|
|
65
|
+
|
|
66
|
+
# One-shot, without holding a Document:
|
|
67
|
+
Pdfsink.extract_text("report.pdf", page: 1)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Words with positions
|
|
71
|
+
|
|
72
|
+
```ruby
|
|
73
|
+
page.extract_words
|
|
74
|
+
# => [{"text" => "Quarterly", "x0" => 72.0, "top" => 90.1, ...}, ...]
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Page objects
|
|
78
|
+
|
|
79
|
+
```ruby
|
|
80
|
+
page.object_counts # => {"chars" => 812, "lines" => 14, "rects" => 3, ...}
|
|
81
|
+
page.objects # => {"chars" => [...], "lines" => [...], ...}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Tables
|
|
85
|
+
|
|
86
|
+
```ruby
|
|
87
|
+
page.tables # default strategy ("lines")
|
|
88
|
+
page.tables(strategy: :text) # infer from text alignment
|
|
89
|
+
# => [["Name", "Age", "City"], ["Alice", "31", "Oakland"], ...]
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Strategies: `:lines`, `:lines_strict`, `:text`, `:explicit`
|
|
93
|
+
(see `Pdfsink::TableStrategy`).
|
|
94
|
+
|
|
95
|
+
### Hyperlinks
|
|
96
|
+
|
|
97
|
+
```ruby
|
|
98
|
+
page.links
|
|
99
|
+
# => [{"uri" => "https://example.com", "x0" => 72.0, ...}]
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Search
|
|
103
|
+
|
|
104
|
+
```ruby
|
|
105
|
+
page.search(/total:\s*\$\d+/i)
|
|
106
|
+
# => [{"text" => "Total: $420", "x0" => ..., ...}]
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Binary version
|
|
110
|
+
|
|
111
|
+
```ruby
|
|
112
|
+
Pdfsink.version # => "0.2.8"
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Rails Integration
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
# config/application.rb (or an initializer)
|
|
119
|
+
config.pdfsink.default_table_strategy = :text
|
|
120
|
+
config.pdfsink.binary_path = Rails.root.join("bin/pdfsink-rs").to_s
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Configuration
|
|
124
|
+
|
|
125
|
+
```ruby
|
|
126
|
+
Pdfsink.configure do |config|
|
|
127
|
+
config.default_table_strategy = :text # used when Page#tables gets no strategy
|
|
128
|
+
end
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
The binary is located in this order:
|
|
132
|
+
|
|
133
|
+
1. `PDFSINK_BIN` environment variable (used only if it points to an executable file)
|
|
134
|
+
2. `lib/pdfsink/pdfsink-rs` inside the gem (bundled / built)
|
|
135
|
+
3. `ext/pdfsink/bin/pdfsink-rs` (dev location)
|
|
136
|
+
4. `pdfsink-rs` on `PATH`
|
|
137
|
+
|
|
138
|
+
## Development
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
bundle install
|
|
142
|
+
bundle exec rake cargo:build
|
|
143
|
+
bundle exec rspec
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
MIT — see [LICENSE](LICENSE).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rspec/core/rake_task"
|
|
4
|
+
|
|
5
|
+
RSpec::Core::RakeTask.new(:spec)
|
|
6
|
+
|
|
7
|
+
namespace :cargo do
  desc "Build the pdfsink-rs binary into lib/pdfsink/"
  task :build do
    # Pinned crate release and the name of the executable it produces.
    version = "0.2.8"
    binary  = "pdfsink-rs"

    # cargo installs into a staging root; the binary is then copied next to
    # the Ruby sources under lib/pdfsink.
    cargo_root = File.expand_path("ext/pdfsink/cargo-root", __dir__)
    target_dir = File.expand_path("lib/pdfsink", __dir__)

    install_cmd = [
      "cargo", "install", "pdfsink-rs",
      "--version", version,
      "--bin", binary,
      "--root", cargo_root,
      "--force"
    ]
    sh(*install_cmd)

    artifact = File.join(cargo_root, "bin", binary)
    abort "ERROR: binary not found at #{artifact}" unless File.exist?(artifact)

    mkdir_p target_dir
    cp artifact, target_dir, verbose: true
    chmod 0o755, File.join(target_dir, binary)
  end
end
|
|
27
|
+
|
|
28
|
+
task default: :spec
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# extconf.rb -- invoked by `gem install` or `bundle install` to build the
|
|
4
|
+
# pdfsink-rs CLI binary from the published crate.
|
|
5
|
+
#
|
|
6
|
+
# Requirements:
|
|
7
|
+
# - cargo / rustc (Rust toolchain, 1.88+)
|
|
8
|
+
#
|
|
9
|
+
# Precompiled platform gems ship the binary in lib/pdfsink/ and strip this
|
|
10
|
+
# extension, so this script only runs for source installs.
|
|
11
|
+
|
|
12
|
+
require "fileutils"
|
|
13
|
+
|
|
14
|
+
CRATE     = "pdfsink-rs"
CRATE_VER = "0.2.8"
BIN_NAME  = "pdfsink-rs"
EXT_DIR   = __dir__
LIB_DIR   = File.expand_path("../../lib/pdfsink", EXT_DIR)

# Writes a Makefile into the extension directory whose `all`, `install`
# and `clean` targets are empty.
def write_dummy_makefile
  makefile = File.join(EXT_DIR, "Makefile")
  File.write(makefile, "all:\ninstall:\nclean:\n")
end

# ── Nothing to build when an executable binary is already in lib/pdfsink ──

bundled = File.join(LIB_DIR, BIN_NAME)

if File.executable?(bundled)
  puts "#{BIN_NAME} already exists in #{LIB_DIR}, skipping Rust build."
  write_dummy_makefile
  exit 0
end

# ── A Rust toolchain is required from here on ─────────────────────────

cargo_found = system("command -v cargo > /dev/null 2>&1")

abort(<<~MSG) unless cargo_found
  ERROR: `cargo` not found on PATH.

  The pdfsink gem requires the Rust toolchain to compile the native binary.
  Install Rust via https://rustup.rs and ensure `cargo` is on your PATH,
  then run `gem install pdfsink` again.
MSG

# ── Install the pinned crate into a staging root under ext/pdfsink ────

stage = File.join(EXT_DIR, "cargo-root")
FileUtils.mkdir_p(stage)

puts "Installing #{CRATE} v#{CRATE_VER} (release)..."

install_cmd = [
  "cargo", "install", CRATE,
  "--version", CRATE_VER,
  "--bin", BIN_NAME,
  "--root", stage,
  "--force"
]
abort "ERROR: cargo install #{CRATE} failed" unless system(*install_cmd)

# ── Copy the built executable into lib/pdfsink/ ───────────────────────

built = File.join(stage, "bin", BIN_NAME)
abort "ERROR: binary not found at #{built}" unless File.exist?(built)

FileUtils.mkdir_p(LIB_DIR)
dest = File.join(LIB_DIR, BIN_NAME)
FileUtils.cp(built, dest, verbose: true)
FileUtils.chmod(0o755, dest)
puts "Installed #{BIN_NAME} into #{LIB_DIR}"

write_dummy_makefile
|
data/lib/pdfsink/cli.rb
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "open3"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
module Pdfsink
  # Low-level runner for the pdfsink-rs CLI binary.
  #
  # This module is not intended for direct use -- see the public API on
  # {Pdfsink}, {Pdfsink::Document}, and {Pdfsink::Page} instead. Every
  # extraction method shells out to the binary, returning either its raw
  # stdout (for +text+) or the parsed JSON it prints.
  #
  # @api private
  module Cli
    BINARY = "pdfsink-rs"

    class << self
      # Override the resolved binary path (mainly for tests). Assigning nil
      # makes the next call to {binary} resolve the path again.
      attr_writer :binary

      # Path of the pdfsink-rs binary, resolved once and memoized.
      # Search order:
      #   1. PDFSINK_BIN environment variable (only if it is executable)
      #   2. lib/pdfsink/ inside the gem (where extconf.rb copies the build)
      #   3. ext/pdfsink/bin/ (dev / cargo-install location)
      #   4. The bare name, resolved against PATH at exec time
      #
      # @return [String]
      def binary
        @binary ||= find_binary
      end

      # The pdfsink-rs crate version the gem was built against, e.g. "0.2.8".
      #
      # This returns the Pdfsink::PDFSINK_RS_VERSION constant; the binary
      # itself is not invoked.
      #
      # @return [String]
      def version
        Pdfsink::PDFSINK_RS_VERSION
      end

      # Document-level metadata for every page: dimensions, rotation, bbox,
      # and per-page object counts.
      #
      # @param path [String]
      # @return [Hash, nil] nil when the binary prints nothing
      def info(path)
        run_json("info", path)
      end

      # Extracted text for a single page (raw stdout, not JSON).
      #
      # @param path [String]
      # @param page [Integer] 1-based page number
      # @return [String]
      def text(path, page)
        run("text", path, page.to_s)
      end

      # Words with positions for a single page.
      #
      # @return [Array<Hash>, nil] nil when the binary prints nothing
      def words(path, page)
        run_json("words", path, page.to_s)
      end

      # Regex search matches for a single page.
      #
      # @param pattern [String] pattern text, passed to the binary verbatim
      # @return [Array<Hash>, nil] nil when the binary prints nothing
      def search(path, page, pattern)
        run_json("search", path, page.to_s, pattern)
      end

      # All page objects (chars, lines, rects, curves, images, ...) as a dict.
      #
      # @return [Hash, nil] nil when the binary prints nothing
      def objects(path, page)
        run_json("objects", path, page.to_s)
      end

      # Hyperlinks on a single page.
      #
      # @return [Array<Hash>, nil] nil when the binary prints nothing
      def links(path, page)
        run_json("links", path, page.to_s)
      end

      # Extracted table for a single page, or nil if none was found.
      #
      # @param strategy [String] one of "lines", "lines_strict", "text", "explicit"
      # @return [Array<Array>, nil]
      def table(path, page, strategy)
        run_json("table", path, page.to_s, strategy)
      end

      private

      # Resolves the binary location following the order documented on
      # {binary}. A PDFSINK_BIN value that is not an executable file is
      # skipped rather than reported.
      #
      # @return [String]
      def find_binary
        if (env = ENV["PDFSINK_BIN"]) && File.executable?(env)
          return env
        end

        gem_root = File.expand_path("../..", __dir__)

        candidates = [
          File.join(gem_root, "lib", "pdfsink", BINARY),
          File.join(gem_root, "ext", "pdfsink", "bin", BINARY),
        ]
        candidates.each { |path| return path if File.executable?(path) }

        # Fall back to PATH resolution at exec time.
        BINARY
      end

      # Runs the binary with +command+ and +args+ and returns its stdout.
      #
      # The argv is handed to Open3 as separate elements, so no shell is
      # involved and arguments need no quoting.
      #
      # @param command [String] pdfsink-rs subcommand
      # @param args [Array<String>] remaining arguments
      # @return [String] captured standard output
      # @raise [CommandError] if the process exits non-zero
      # @raise [BinaryNotFoundError] if the binary is missing or cannot be
      #   executed
      def run(command, *args)
        argv = [binary, command, *args]
        stdout, stderr, status = Open3.capture3(*argv)

        unless status.success?
          raise CommandError.new(
            "pdfsink-rs #{command} failed: #{stderr.strip}",
            command: argv, status: status.exitstatus, stderr: stderr
          )
        end

        stdout
      rescue Errno::ENOENT
        raise BinaryNotFoundError,
              "Could not find the pdfsink-rs binary (looked for #{binary.inspect}).\n\n" \
              "Build it with:\n" \
              "  rake cargo:build\n\n" \
              "Or set PDFSINK_BIN to the full path of the binary."
      rescue Errno::EACCES
        # The path exists but cannot be executed (for example an explicit
        # Cli.binary= override pointing at a file without the execute bit).
        # Surface it as a Pdfsink::Error instead of leaking a raw errno.
        raise BinaryNotFoundError,
              "Could not execute the pdfsink-rs binary (#{binary.inspect}): permission denied.\n\n" \
              "Make sure the file is executable, or set PDFSINK_BIN to the full path of the binary."
      end

      # Runs the binary and parses its stdout as JSON.
      #
      # @return [Object, nil] the parsed JSON, or nil when stdout is blank
      # @raise [ParseError] if stdout is not valid JSON
      def run_json(command, *args)
        out = run(command, *args)
        return nil if out.strip.empty?

        JSON.parse(out)
      rescue JSON::ParserError => e
        raise ParseError, "pdfsink-rs #{command} returned invalid JSON: #{e.message}"
      end
    end
  end
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfsink
  # A PDF document opened from a file on disk.
  #
  # Opening is cheap: only the path is validated. The document's +info+
  # payload (page count and per-page metadata) is fetched from the binary on
  # first access and then reused. {Page} objects are created on demand and
  # memoized per page number.
  #
  # @example
  #   doc = Pdfsink::Document.open("report.pdf")
  #   doc.page_count             # => 12
  #   doc.page(1).extract_text
  #   doc.pages.flat_map(&:extract_words)
  class Document
    # @return [String] absolute path to the PDF file
    attr_reader :path

    # Open a PDF document.
    #
    # @param path [String] path to a PDF file
    # @return [Document]
    # @raise [Errno::ENOENT] if the file does not exist
    def self.open(path)
      new(path)
    end

    # @param path [String] path to a PDF file (expanded to an absolute path)
    # @raise [Errno::ENOENT] if nothing exists at the expanded path
    def initialize(path)
      @path = File.expand_path(path)
      raise Errno::ENOENT, @path unless File.exist?(@path)

      @pages = {}
    end

    # Document and per-page metadata as returned by the binary.
    #
    # The CLI runner yields nil when the binary prints nothing; that case is
    # reported as a ParseError here so callers never index into nil.
    #
    # @return [Hash]
    # @raise [ParseError] if the binary produced no info payload
    def info
      @info ||= Cli.info(path) ||
                raise(ParseError, "pdfsink-rs info returned no output for #{path}")
    end

    # @return [Integer] number of pages
    def page_count
      info["page_count"]
    end
    alias length page_count
    alias size page_count

    # Fetch a single page.
    #
    # @param number [Integer] 1-based page number
    # @return [Page]
    # @raise [RangeError] if +number+ is not an Integer within 1..page_count
    def page(number)
      unless number.is_a?(Integer) && number >= 1 && number <= page_count
        raise RangeError, "page #{number} out of range (1..#{page_count})"
      end

      # info["pages"] is indexed from zero, page numbers from one.
      @pages[number] ||= Page.new(self, number, info["pages"][number - 1])
    end

    # All pages, in order.
    #
    # @return [Array<Page>]
    def pages
      (1..page_count).map { |n| page(n) }
    end

    # Iterate over each page.
    #
    # @yieldparam page [Page]
    # @return [Enumerator] if no block is given
    def each_page(&block)
      return enum_for(:each_page) unless block

      pages.each(&block)
    end

    # @return [String] short description including the path and page count
    def inspect
      "#<Pdfsink::Document path=#{path.inspect} pages=#{page_count}>"
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfsink
  # Common ancestor of the gem-specific error classes defined below.
  Error = Class.new(StandardError)

  # Signals that the pdfsink-rs binary could not be located.
  BinaryNotFoundError = Class.new(Error)

  # Signals that the pdfsink-rs binary exited with a non-zero status.
  #
  # Besides the message it exposes the executed argv, the exit status and
  # the captured standard error.
  class CommandError < Error
    # @!attribute [r] command
    #   @return [Array<String>] the argv that was executed
    # @!attribute [r] status
    #   @return [Integer, nil] the process exit status
    # @!attribute [r] stderr
    #   @return [String] captured standard error
    attr_reader :command, :status, :stderr

    # @param message [String] human-readable description
    # @param command [Array<String>] the argv that was executed
    # @param status [Integer, nil] the process exit status
    # @param stderr [String] captured standard error
    def initialize(message, command:, status:, stderr:)
      super(message)
      @command = command
      @status  = status
      @stderr  = stderr
    end
  end

  # Signals that the binary's stdout was not the JSON that was expected.
  ParseError = Class.new(Error)
end
|
data/lib/pdfsink/page.rb
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfsink
  # A single page of a {Document}.
  #
  # {#extract_text}, {#extract_words}, {#objects} and {#links} shell out to
  # the pdfsink-rs binary and memoize any non-nil result, so repeated reads
  # don't re-spawn the process. {#search} and {#tables} take arguments and
  # run the binary on every call. Page-level metadata (dimensions, rotation,
  # bbox, object counts) comes from the document's +info+ payload and needs
  # no extra spawn.
  #
  # @example
  #   doc  = Pdfsink::Document.open("report.pdf")
  #   page = doc.page(1)
  #   page.width            # => 612.0
  #   page.extract_text     # => "Quarterly Report\n..."
  #   page.tables           # => [["Q1", "Q2"], ["10", "20"]]
  class Page
    # Ruby Regexp option bits mapped to inline flag letters. Ruby's
    # MULTILINE option means "dot matches newline", which inline-flag
    # syntaxes spell +s+.
    INLINE_FLAGS = {
      Regexp::IGNORECASE => "i",
      Regexp::EXTENDED => "x",
      Regexp::MULTILINE => "s"
    }.freeze
    private_constant :INLINE_FLAGS

    # @return [Integer] 1-based page number
    attr_reader :number

    # @param document [Document]
    # @param number [Integer] 1-based page number
    # @param meta [Hash] the per-page slice of the document +info+ payload
    def initialize(document, number, meta)
      @document = document
      @number = number
      @meta = meta
    end

    # @return [Float] page width in PDF points
    def width = @meta["width"]

    # @return [Float] page height in PDF points
    def height = @meta["height"]

    # @return [Integer] clockwise rotation in degrees (0, 90, 180, 270)
    def rotation = @meta["rotation"]

    # @return [Hash] the page bounding box ({"x0", "top", "x1", "bottom"})
    def bbox = @meta["bbox"]

    # @return [Hash] counts of each object kind on the page
    def object_counts = @meta["object_counts"]

    # The page's text in reading order.
    #
    # @return [String]
    def extract_text
      @extract_text ||= Cli.text(path, number)
    end

    # Words with positions and font metadata.
    #
    # @return [Array<Hash>]
    def extract_words
      @extract_words ||= Cli.words(path, number)
    end

    # Every page object (chars, lines, rects, curves, images, annots, ...).
    #
    # @return [Hash] keyed by object kind
    def objects
      @objects ||= Cli.objects(path, number)
    end

    # Hyperlinks on the page.
    #
    # @return [Array<Hash>]
    def links
      @links ||= Cli.links(path, number)
    end

    # Regex search matches within the page text.
    #
    # A String (or anything else responding to +to_s+) is passed to the
    # binary unchanged. A Regexp contributes its source, and its +i+, +x+
    # and +m+ options are carried over as a leading inline flag group, so
    # +/total/i+ is sent as "(?i)total" instead of losing the flag.
    #
    # @param pattern [String, Regexp] the pattern to search for
    # @return [Array<Hash>]
    def search(pattern)
      Cli.search(path, number, pattern_text(pattern))
    end

    # The page's largest detected table, or nil if none is found.
    #
    # @param strategy [Symbol, String, nil] table-detection strategy
    # @return [Array<Array>, nil] rows of cells
    def tables(strategy: nil)
      Cli.table(path, number, TableStrategy.resolve(strategy))
    end

    # @return [String] short description with the page number and size
    def inspect
      "#<Pdfsink::Page number=#{number} #{width}x#{height}>"
    end

    private

    def path = @document.path

    # Converts a search pattern into the text handed to the binary.
    #
    # NOTE(review): assumes the binary's regex engine accepts a leading
    # inline flag group such as "(?i)" -- confirm against pdfsink-rs.
    #
    # @param pattern [String, Regexp]
    # @return [String]
    def pattern_text(pattern)
      return pattern.to_s unless pattern.is_a?(Regexp)

      flags = INLINE_FLAGS.filter_map { |bit, letter| letter if pattern.options.anybits?(bit) }.join
      flags.empty? ? pattern.source : "(?#{flags})#{pattern.source}"
    end
  end
end
|
|
Binary file
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfsink
  # Rails integration. Loaded automatically when +Rails::Railtie+ is
  # defined (see +lib/pdfsink.rb+).
  #
  # Exposes +config.pdfsink+ and, during initialization, copies any values
  # set on it into the gem.
  #
  # @example config/application.rb
  #   config.pdfsink.default_table_strategy = :text
  #   config.pdfsink.binary_path = Rails.root.join("bin/pdfsink-rs").to_s
  class Railtie < Rails::Railtie
    config.pdfsink = ActiveSupport::OrderedOptions.new

    initializer "pdfsink.configure" do |app|
      settings = app.config.pdfsink

      # Only a truthy strategy replaces the gem's current default.
      Pdfsink.configure do |gem_config|
        strategy = settings.default_table_strategy
        gem_config.default_table_strategy = settings.default_table_strategy if strategy
      end

      # An explicit binary path replaces the gem's own binary lookup.
      binary_path = settings.binary_path
      Cli.binary = settings.binary_path if binary_path
    end
  end
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pdfsink
  # Names of the table-detection strategies understood by pdfsink-rs.
  #
  # Every constant is the exact string passed to the CLI's +table+ command.
  # The public API takes symbols and strings alike; {TableStrategy.resolve}
  # turns either into the canonical string.
  #
  # @example
  #   page.tables(strategy: Pdfsink::TableStrategy::TEXT)
  #   page.tables(strategy: :text)   # same thing
  module TableStrategy
    # Cell boundaries come from ruling lines.
    LINES = "lines"

    # As LINES, but only lines meeting at corners delimit cells.
    LINES_STRICT = "lines_strict"

    # Boundaries are inferred from text alignment (no ruling lines needed).
    TEXT = "text"

    # Boundaries are the explicit vertical/horizontal lines the caller gives.
    EXPLICIT = "explicit"

    # Every strategy name that {resolve} accepts.
    ALL = [LINES, LINES_STRICT, TEXT, EXPLICIT].freeze

    # Turns a strategy argument into the string the CLI expects.
    #
    # nil selects the configured default strategy, or "lines" when no
    # default is configured. Any other value is stringified, lower-cased
    # and stripped before being checked against {ALL}.
    #
    # @param name [Symbol, String, nil]
    # @return [String]
    # @raise [ArgumentError] if the value is not a known strategy
    def self.resolve(name)
      requested = name.nil? ? Pdfsink.configuration.default_table_strategy : name
      requested = LINES if requested.nil?

      normalized = requested.to_s.downcase.strip
      unless ALL.include?(normalized)
        raise ArgumentError, "Unknown table strategy: #{requested.inspect}. " \
                             "Known strategies: #{ALL.join(', ')}"
      end

      normalized
    end
  end
end
|
data/lib/pdfsink.rb
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "pdfsink/version"
|
|
4
|
+
require_relative "pdfsink/error"
|
|
5
|
+
require_relative "pdfsink/table_strategy"
|
|
6
|
+
require_relative "pdfsink/cli"
|
|
7
|
+
require_relative "pdfsink/page"
|
|
8
|
+
require_relative "pdfsink/document"
|
|
9
|
+
|
|
10
|
+
# Load the Railtie only when Rails is present.
|
|
11
|
+
require_relative "pdfsink/railtie" if defined?(Rails::Railtie)
|
|
12
|
+
|
|
13
|
+
# Pdfsink wraps the pdfsink-rs CLI, a fast pure-Rust PDF extraction tool,
|
|
14
|
+
# exposing text, word, object, table, link, and search extraction to Ruby.
|
|
15
|
+
#
|
|
16
|
+
# @example Open a document and read a page
|
|
17
|
+
# doc = Pdfsink.open("report.pdf")
|
|
18
|
+
# doc.page_count # => 12
|
|
19
|
+
# doc.page(1).extract_text # => "Quarterly Report\n..."
|
|
20
|
+
#
|
|
21
|
+
# @example One-shot text extraction
|
|
22
|
+
# Pdfsink.extract_text("report.pdf", page: 1)
|
|
23
|
+
#
|
|
24
|
+
# @example Tables
|
|
25
|
+
# Pdfsink.open("invoice.pdf").page(1).tables(strategy: :text)
|
|
26
|
+
module Pdfsink
  # ── Configuration ────────────────────────────────────────────────────

  # Gem-wide settings; the shared instance is reached through
  # {Pdfsink.configuration}.
  class Configuration
    # Strategy used by {Page#tables} when none is given. Defaults to "lines".
    # @return [Symbol, String, nil]
    attr_accessor :default_table_strategy

    # Starts with no default strategy set.
    def initialize
      @default_table_strategy = nil
    end
  end

  # The shared configuration object, created on first use.
  #
  # @return [Configuration]
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Hands the shared configuration to the given block so it can be changed.
  #
  # @example
  #   Pdfsink.configure do |config|
  #     config.default_table_strategy = :text
  #   end
  #
  # @yieldparam config [Configuration]
  def self.configure
    yield configuration
  end

  # ── Public API ───────────────────────────────────────────────────────

  # Open a PDF document.
  #
  # @param path [String] path to a PDF file
  # @return [Document]
  def self.open(path) = Document.open(path)

  # Extract the text of a single page in one call, without building a
  # {Document}. The path is expanded to an absolute path first.
  #
  # @param path [String]
  # @param page [Integer] 1-based page number
  # @return [String]
  def self.extract_text(path, page: 1)
    absolute_path = File.expand_path(path)
    Cli.text(absolute_path, page)
  end

  # The version of the underlying pdfsink-rs binary the gem was built with.
  #
  # @return [String] e.g. "0.2.8"
  def self.version = Cli.version
end
|
data/pdfsink.gemspec
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/pdfsink/version"
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |gem|
  # ── Identity ──────────────────────────────────────────────────────────
  gem.name     = "pdfsink"
  gem.version  = Pdfsink::VERSION
  gem.authors  = ["Accountaim"]
  gem.license  = "MIT"
  gem.homepage = "https://github.com/AccountAim/pdfsink-ruby"

  gem.summary     = "Ruby wrapper for pdfsink-rs: fast pure-Rust PDF extraction"
  gem.description = <<~DESC
    A Ruby gem that wraps the pdfsink-rs CLI, a fast pure-Rust PDF extraction
    tool, providing text, word, object, table, link, and regex-search
    extraction from PDFs for use in Rails applications.
  DESC

  gem.required_ruby_version = ">= 3.2.0"

  # ── Contents ──────────────────────────────────────────────────────────
  # Globs are evaluated from the gemspec's own directory so the listed
  # paths are relative to the gem root.
  packaged = ["{lib,ext}/**/*", "Gemfile", "Rakefile", "pdfsink.gemspec", "LICENSE", "README.md"]
  gem.files = Dir.chdir(__dir__) { Dir[*packaged] }

  gem.require_paths = ["lib"]
  gem.extensions    = ["ext/pdfsink/extconf.rb"]

  # ── Registry metadata ─────────────────────────────────────────────────
  gem.metadata = {
    "source_code_uri" => "https://github.com/AccountAim/pdfsink-ruby",
    "rubygems_mfa_required" => "true"
  }
end
|
metadata
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: pdfsink
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: x86_64-linux
|
|
6
|
+
authors:
|
|
7
|
+
- Accountaim
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-06-05 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: |
|
|
14
|
+
A Ruby gem that wraps the pdfsink-rs CLI, a fast pure-Rust PDF extraction
|
|
15
|
+
tool, providing text, word, object, table, link, and regex-search
|
|
16
|
+
extraction from PDFs for use in Rails applications.
|
|
17
|
+
email:
|
|
18
|
+
executables: []
|
|
19
|
+
extensions: []
|
|
20
|
+
extra_rdoc_files: []
|
|
21
|
+
files:
|
|
22
|
+
- Gemfile
|
|
23
|
+
- LICENSE
|
|
24
|
+
- README.md
|
|
25
|
+
- Rakefile
|
|
26
|
+
- ext/pdfsink/extconf.rb
|
|
27
|
+
- lib/pdfsink.rb
|
|
28
|
+
- lib/pdfsink/cli.rb
|
|
29
|
+
- lib/pdfsink/document.rb
|
|
30
|
+
- lib/pdfsink/error.rb
|
|
31
|
+
- lib/pdfsink/page.rb
|
|
32
|
+
- lib/pdfsink/pdfsink-rs
|
|
33
|
+
- lib/pdfsink/railtie.rb
|
|
34
|
+
- lib/pdfsink/table_strategy.rb
|
|
35
|
+
- lib/pdfsink/version.rb
|
|
36
|
+
- pdfsink.gemspec
|
|
37
|
+
homepage: https://github.com/AccountAim/pdfsink-ruby
|
|
38
|
+
licenses:
|
|
39
|
+
- MIT
|
|
40
|
+
metadata:
|
|
41
|
+
source_code_uri: https://github.com/AccountAim/pdfsink-ruby
|
|
42
|
+
rubygems_mfa_required: 'true'
|
|
43
|
+
post_install_message:
|
|
44
|
+
rdoc_options: []
|
|
45
|
+
require_paths:
|
|
46
|
+
- lib
|
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
48
|
+
requirements:
|
|
49
|
+
- - ">="
|
|
50
|
+
- !ruby/object:Gem::Version
|
|
51
|
+
version: 3.2.0
|
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
|
+
requirements:
|
|
54
|
+
- - ">="
|
|
55
|
+
- !ruby/object:Gem::Version
|
|
56
|
+
version: '0'
|
|
57
|
+
requirements: []
|
|
58
|
+
rubygems_version: 3.5.22
|
|
59
|
+
signing_key:
|
|
60
|
+
specification_version: 4
|
|
61
|
+
summary: 'Ruby wrapper for pdfsink-rs: fast pure-Rust PDF extraction'
|
|
62
|
+
test_files: []
|