pdfsink 0.1.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: fa9c60cd090b037fe5f5db680e8b0d7c5cc76a72875d9206c8101226c8b78594
4
+ data.tar.gz: 96cbe481494324c0fdf804a97198b3e8e838241313f3aa8bf820a81a8b288907
5
+ SHA512:
6
+ metadata.gz: 7559f1b0284502e0608e6f794df291cc38282df8117c2ef74696cab4a33842143a3d11ea3fa19996f216c0d0aff8c5943d8a8e1c73aacc938885aae5b00f07e9
7
+ data.tar.gz: 48e3b41e7f3a4e7c8d953f8ab1c710978b61986a0277966f9dbe36baf3e63380d657b6cb38fd6ba0c3a60ebcc5f219430232c103264826d935d6598e6c3559d9
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
6
+
7
+ group :development, :test do
8
+ gem "rspec", "~> 3.12"
9
+ gem "rake", "~> 13.0"
10
+ end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Accountaim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,148 @@
1
+ # Pdfsink
2
+
3
+ Ruby wrapper for [pdfsink-rs](https://github.com/clark-labs-inc/pdfsink-rs), a
4
+ fast pure-Rust PDF extraction tool. Extract text, words, page objects, tables,
5
+ hyperlinks, and regex matches from PDFs — without a Python runtime.
6
+
7
+ The gem shells out to the bundled `pdfsink-rs` binary and parses its JSON
8
+ output, so there is nothing to load into your Ruby process and no FFI.
9
+
10
+ ## Requirements
11
+
12
+ - Ruby >= 3.2
13
+ - For source installs only: a Rust toolchain (`cargo`) to compile the binary.
14
+ Precompiled platform gems bundle the binary and need no toolchain.
15
+
16
+ ## Installation
17
+
18
+ ```ruby
19
+ # Gemfile
20
+ gem "pdfsink"
21
+ ```
22
+
23
+ ```bash
24
+ bundle install
25
+ # or
26
+ gem install pdfsink
27
+ ```
28
+
29
+ On supported platforms (`x86_64-linux`, `aarch64-linux`, `x86_64-darwin`,
30
+ `arm64-darwin`) a precompiled gem ships the binary. Elsewhere the source gem
31
+ compiles it on install via `cargo install pdfsink-rs`.
32
+
33
+ ### Building from source
34
+
35
+ ```bash
36
+ git clone https://github.com/AccountAim/pdfsink-ruby
37
+ cd pdfsink-ruby
38
+ bundle install
39
+ bundle exec rake cargo:build # compiles the binary into lib/pdfsink/
40
+ bundle exec rspec
41
+ ```
42
+
43
+ To point the gem at a binary you built elsewhere, set `PDFSINK_BIN`:
44
+
45
+ ```bash
46
+ export PDFSINK_BIN=/path/to/pdfsink-rs
47
+ ```
48
+
49
+ ## Usage
50
+
51
+ ### Open a document
52
+
53
+ ```ruby
54
+ doc = Pdfsink.open("report.pdf")
55
+ doc.page_count # => 12
56
+ doc.pages # => [#<Pdfsink::Page number=1 612.0x792.0>, ...]
57
+ doc.each_page { |page| puts page.extract_text }
58
+ ```
59
+
60
+ ### Extract text
61
+
62
+ ```ruby
63
+ page = doc.page(1)
64
+ page.extract_text # => "Quarterly Report\n..."
65
+
66
+ # One-shot, without holding a Document:
67
+ Pdfsink.extract_text("report.pdf", page: 1)
68
+ ```
69
+
70
+ ### Words with positions
71
+
72
+ ```ruby
73
+ page.extract_words
74
+ # => [{"text" => "Quarterly", "x0" => 72.0, "top" => 90.1, ...}, ...]
75
+ ```
76
+
77
+ ### Page objects
78
+
79
+ ```ruby
80
+ page.object_counts # => {"chars" => 812, "lines" => 14, "rects" => 3, ...}
81
+ page.objects # => {"chars" => [...], "lines" => [...], ...}
82
+ ```
83
+
84
+ ### Tables
85
+
86
+ ```ruby
87
+ page.tables # default strategy ("lines")
88
+ page.tables(strategy: :text) # infer from text alignment
89
+ # => [["Name", "Age", "City"], ["Alice", "31", "Oakland"], ...]
90
+ ```
91
+
92
+ Strategies: `:lines`, `:lines_strict`, `:text`, `:explicit`
93
+ (see `Pdfsink::TableStrategy`).
94
+
95
+ ### Hyperlinks
96
+
97
+ ```ruby
98
+ page.links
99
+ # => [{"uri" => "https://example.com", "x0" => 72.0, ...}]
100
+ ```
101
+
102
+ ### Search
103
+
104
+ ```ruby
105
+ page.search(/total:\s*\$\d+/i)
106
+ # => [{"text" => "Total: $420", "x0" => ..., ...}]
107
+ ```
108
+
109
+ ### Binary version
110
+
111
+ ```ruby
112
+ Pdfsink.version # => "0.2.8"
113
+ ```
114
+
115
+ ## Rails Integration
116
+
117
+ ```ruby
118
+ # config/application.rb (or an initializer)
119
+ config.pdfsink.default_table_strategy = :text
120
+ config.pdfsink.binary_path = Rails.root.join("bin/pdfsink-rs").to_s
121
+ ```
122
+
123
+ ## Configuration
124
+
125
+ ```ruby
126
+ Pdfsink.configure do |config|
127
+ config.default_table_strategy = :text # used when Page#tables gets no strategy
128
+ end
129
+ ```
130
+
131
+ The binary is located in this order:
132
+
133
+ 1. `PDFSINK_BIN` environment variable (ignored unless it points to an executable file)
134
+ 2. `lib/pdfsink/pdfsink-rs` inside the gem (bundled / built)
135
+ 3. `ext/pdfsink/bin/pdfsink-rs` (dev location)
136
+ 4. `pdfsink-rs` on `PATH`
137
+
138
+ ## Development
139
+
140
+ ```bash
141
+ bundle install
142
+ bundle exec rake cargo:build
143
+ bundle exec rspec
144
+ ```
145
+
146
+ ## License
147
+
148
+ MIT — see [LICENSE](LICENSE).
data/Rakefile ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rspec/core/rake_task"
4
+
5
+ RSpec::Core::RakeTask.new(:spec)
6
+
7
+ namespace :cargo do
8
+ desc "Build the pdfsink-rs binary into lib/pdfsink/"
9
+ task :build do
10
+ crate_ver = "0.2.8"
11
+ bin_name = "pdfsink-rs"
12
+ stage = File.expand_path("ext/pdfsink/cargo-root", __dir__)
13
+ lib_dir = File.expand_path("lib/pdfsink", __dir__)
14
+
15
+ sh "cargo", "install", "pdfsink-rs",
16
+ "--version", crate_ver, "--bin", bin_name,
17
+ "--root", stage, "--force"
18
+
19
+ built = File.join(stage, "bin", bin_name)
20
+ abort "ERROR: binary not found at #{built}" unless File.exist?(built)
21
+
22
+ mkdir_p lib_dir
23
+ cp built, lib_dir, verbose: true
24
+ chmod 0o755, File.join(lib_dir, bin_name)
25
+ end
26
+ end
27
+
28
+ task default: :spec
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ # extconf.rb -- invoked by `gem install` or `bundle install` to build the
4
+ # pdfsink-rs CLI binary from the published crate.
5
+ #
6
+ # Requirements:
7
+ # - cargo / rustc (Rust toolchain, 1.88+)
8
+ #
9
+ # Precompiled platform gems ship the binary in lib/pdfsink/ and strip this
10
+ # extension, so this script only runs for source installs.
11
+
12
+ require "fileutils"
13
+
14
+ CRATE = "pdfsink-rs"
15
+ CRATE_VER = "0.2.8"
16
+ BIN_NAME = "pdfsink-rs"
17
+ EXT_DIR = __dir__
18
+ LIB_DIR = File.expand_path("../../lib/pdfsink", EXT_DIR)
19
+
20
+ def write_dummy_makefile
21
+ File.write(File.join(EXT_DIR, "Makefile"), "all:\ninstall:\nclean:\n")
22
+ end
23
+
24
+ # ── Skip build if the binary already exists ───────────────────────────
25
+
26
+ if File.executable?(File.join(LIB_DIR, BIN_NAME))
27
+ puts "#{BIN_NAME} already exists in #{LIB_DIR}, skipping Rust build."
28
+ write_dummy_makefile
29
+ exit 0
30
+ end
31
+
32
+ # ── Pre-flight checks ─────────────────────────────────────────────────
33
+
34
+ unless system("command -v cargo > /dev/null 2>&1")
35
+ abort <<~MSG
36
+ ERROR: `cargo` not found on PATH.
37
+
38
+ The pdfsink gem requires the Rust toolchain to compile the native binary.
39
+ Install Rust via https://rustup.rs and ensure `cargo` is on your PATH,
40
+ then run `gem install pdfsink` again.
41
+ MSG
42
+ end
43
+
44
+ # ── Build the binary into a staging root ──────────────────────────────
45
+
46
+ stage = File.join(EXT_DIR, "cargo-root")
47
+ FileUtils.mkdir_p(stage)
48
+
49
+ puts "Installing #{CRATE} v#{CRATE_VER} (release)..."
50
+ ok = system("cargo", "install", CRATE,
51
+ "--version", CRATE_VER,
52
+ "--bin", BIN_NAME,
53
+ "--root", stage,
54
+ "--force")
55
+ abort "ERROR: cargo install #{CRATE} failed" unless ok
56
+
57
+ # ── Copy the artifact into lib/pdfsink/ ───────────────────────────────
58
+
59
+ built = File.join(stage, "bin", BIN_NAME)
60
+ abort "ERROR: binary not found at #{built}" unless File.exist?(built)
61
+
62
+ FileUtils.mkdir_p(LIB_DIR)
63
+ dest = File.join(LIB_DIR, BIN_NAME)
64
+ FileUtils.cp(built, dest, verbose: true)
65
+ FileUtils.chmod(0o755, dest)
66
+ puts "Installed #{BIN_NAME} into #{LIB_DIR}"
67
+
68
+ write_dummy_makefile
@@ -0,0 +1,146 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "open3"
4
+ require "json"
5
+
6
+ module Pdfsink
7
+ # Low-level runner for the pdfsink-rs CLI binary.
8
+ #
9
+ # This module is not intended for direct use -- see the public API on
10
+ # {Pdfsink}, {Pdfsink::Document}, and {Pdfsink::Page} instead. Every method
11
+ # shells out to the binary, returning either its raw stdout (for +text+) or
12
+ # the parsed JSON it prints.
13
+ #
14
+ # @api private
15
+ module Cli
16
+ BINARY = "pdfsink-rs"
17
+
18
+ class << self
19
+       # Path used to invoke the pdfsink-rs binary (memoized). Search order:
20
+       #   1. PDFSINK_BIN environment variable (only if it is an executable file)
21
+ # 2. lib/pdfsink/ inside the gem (where extconf.rb copies the build)
22
+ # 3. ext/pdfsink/bin/ (dev / cargo-install location)
23
+ # 4. The bare name, resolved against PATH at exec time
24
+ #
25
+ # @return [String]
26
+ def binary
27
+ @binary ||= find_binary
28
+ end
29
+
30
+ # Override the resolved binary path (mainly for tests).
31
+ attr_writer :binary
32
+
33
+       # The wrapped pdfsink-rs crate version string, e.g. "0.2.8".
34
+ #
35
+ # The CLI has no version subcommand, so this reports the crate version
36
+ # the gem was built against.
37
+ #
38
+ # @return [String]
39
+ def version
40
+ Pdfsink::PDFSINK_RS_VERSION
41
+ end
42
+
43
+ # Document-level metadata for every page: dimensions, rotation, bbox,
44
+ # and per-page object counts.
45
+ #
46
+ # @param path [String]
47
+ # @return [Hash]
48
+ def info(path)
49
+ run_json("info", path)
50
+ end
51
+
52
+ # Extracted text for a single page.
53
+ #
54
+ # @param path [String]
55
+ # @param page [Integer] 1-based page number
56
+ # @return [String]
57
+ def text(path, page)
58
+ run("text", path, page.to_s)
59
+ end
60
+
61
+ # Words with positions for a single page.
62
+ #
63
+ # @return [Array<Hash>]
64
+ def words(path, page)
65
+ run_json("words", path, page.to_s)
66
+ end
67
+
68
+ # Regex search matches for a single page.
69
+ #
70
+ # @return [Array<Hash>]
71
+ def search(path, page, pattern)
72
+ run_json("search", path, page.to_s, pattern)
73
+ end
74
+
75
+ # All page objects (chars, lines, rects, curves, images, ...) as a dict.
76
+ #
77
+ # @return [Hash]
78
+ def objects(path, page)
79
+ run_json("objects", path, page.to_s)
80
+ end
81
+
82
+ # Hyperlinks on a single page.
83
+ #
84
+ # @return [Array<Hash>]
85
+ def links(path, page)
86
+ run_json("links", path, page.to_s)
87
+ end
88
+
89
+ # Extracted table for a single page, or nil if none was found.
90
+ #
91
+ # @param strategy [String] one of "lines", "lines_strict", "text", "explicit"
92
+ # @return [Array<Array>, nil]
93
+ def table(path, page, strategy)
94
+ run_json("table", path, page.to_s, strategy)
95
+ end
96
+
97
+ private
98
+
99
+ def find_binary
100
+ if (env = ENV["PDFSINK_BIN"]) && File.executable?(env)
101
+ return env
102
+ end
103
+
104
+ gem_root = File.expand_path("../..", __dir__)
105
+
106
+ candidates = [
107
+ File.join(gem_root, "lib", "pdfsink", BINARY),
108
+ File.join(gem_root, "ext", "pdfsink", "bin", BINARY),
109
+ ]
110
+ candidates.each { |path| return path if File.executable?(path) }
111
+
112
+ # Fall back to PATH resolution at exec time.
113
+ BINARY
114
+ end
115
+
116
+ def run(command, *args)
117
+ argv = [binary, command, *args]
118
+ stdout, stderr, status = Open3.capture3(*argv)
119
+
120
+ unless status.success?
121
+ raise CommandError.new(
122
+ "pdfsink-rs #{command} failed: #{stderr.strip}",
123
+ command: argv, status: status.exitstatus, stderr: stderr
124
+ )
125
+ end
126
+
127
+ stdout
128
+ rescue Errno::ENOENT
129
+ raise BinaryNotFoundError,
130
+ "Could not find the pdfsink-rs binary (looked for #{binary.inspect}).\n\n" \
131
+ "Build it with:\n" \
132
+ " rake cargo:build\n\n" \
133
+ "Or set PDFSINK_BIN to the full path of the binary."
134
+ end
135
+
136
+ def run_json(command, *args)
137
+ out = run(command, *args)
138
+ return nil if out.strip.empty?
139
+
140
+ JSON.parse(out)
141
+ rescue JSON::ParserError => e
142
+ raise ParseError, "pdfsink-rs #{command} returned invalid JSON: #{e.message}"
143
+ end
144
+ end
145
+ end
146
+ end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfsink
4
+ # A PDF document opened from a file on disk.
5
+ #
6
+ # Opening is cheap: the path is validated and the document's +info+ payload
7
+ # (page count and per-page metadata) is fetched lazily on first access.
8
+ # {Page} objects are created on demand and memoized.
9
+ #
10
+ # @example
11
+ # doc = Pdfsink::Document.open("report.pdf")
12
+ # doc.page_count # => 12
13
+ # doc.page(1).extract_text
14
+ # doc.pages.flat_map(&:extract_words)
15
+ class Document
16
+ # @return [String] absolute path to the PDF file
17
+ attr_reader :path
18
+
19
+ # Open a PDF document.
20
+ #
21
+ # @param path [String] path to a PDF file
22
+ # @return [Document]
23
+ # @raise [Errno::ENOENT] if the file does not exist
24
+ def self.open(path)
25
+ new(path)
26
+ end
27
+
28
+ # @param path [String] path to a PDF file
29
+ def initialize(path)
30
+ @path = File.expand_path(path)
31
+ raise Errno::ENOENT, @path unless File.exist?(@path)
32
+
33
+ @pages = {}
34
+ end
35
+
36
+ # Document and per-page metadata as returned by the binary.
37
+ #
38
+ # @return [Hash]
39
+ def info
40
+ @info ||= Cli.info(path)
41
+ end
42
+
43
+ # @return [Integer] number of pages
44
+ def page_count
45
+ info["page_count"]
46
+ end
47
+ alias length page_count
48
+ alias size page_count
49
+
50
+ # Fetch a single page.
51
+ #
52
+ # @param number [Integer] 1-based page number
53
+ # @return [Page]
54
+ # @raise [RangeError] if the page number is out of range
55
+ def page(number)
56
+ unless number.is_a?(Integer) && number >= 1 && number <= page_count
57
+ raise RangeError, "page #{number} out of range (1..#{page_count})"
58
+ end
59
+
60
+ @pages[number] ||= Page.new(self, number, info["pages"][number - 1])
61
+ end
62
+
63
+ # All pages, in order.
64
+ #
65
+ # @return [Array<Page>]
66
+ def pages
67
+ (1..page_count).map { |n| page(n) }
68
+ end
69
+
70
+ # Iterate over each page.
71
+ #
72
+ # @yieldparam page [Page]
73
+ # @return [Enumerator] if no block is given
74
+ def each_page(&block)
75
+ return enum_for(:each_page) unless block
76
+
77
+ pages.each(&block)
78
+ end
79
+
80
+ def inspect
81
+ "#<Pdfsink::Document path=#{path.inspect} pages=#{page_count}>"
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfsink
4
+ # Base error for all Pdfsink errors.
5
+ class Error < StandardError; end
6
+
7
+ # Raised when the pdfsink-rs binary cannot be located.
8
+ class BinaryNotFoundError < Error; end
9
+
10
+ # Raised when the pdfsink-rs binary exits non-zero.
11
+ #
12
+ # Carries the failing command, exit status, and captured stderr.
13
+ class CommandError < Error
14
+ # @return [Array<String>] the argv that was executed
15
+ attr_reader :command
16
+
17
+ # @return [Integer, nil] the process exit status
18
+ attr_reader :status
19
+
20
+ # @return [String] captured standard error
21
+ attr_reader :stderr
22
+
23
+ def initialize(message, command:, status:, stderr:)
24
+ @command = command
25
+ @status = status
26
+ @stderr = stderr
27
+ super(message)
28
+ end
29
+ end
30
+
31
+ # Raised when the binary's stdout is not the JSON we expected.
32
+ class ParseError < Error; end
33
+ end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfsink
4
+ # A single page of a {Document}.
5
+ #
6
+   # Each accessor shells out to the pdfsink-rs binary for that page. The
7
+   # no-argument readers memoize; #search and #tables spawn each call. Page-level
8
+ # metadata (dimensions, rotation, bbox, object counts) comes from the
9
+ # document's +info+ payload and needs no extra spawn.
10
+ #
11
+ # @example
12
+ # doc = Pdfsink::Document.open("report.pdf")
13
+ # page = doc.page(1)
14
+ # page.width # => 612.0
15
+ # page.extract_text # => "Quarterly Report\n..."
16
+   #   page.tables         # => [["Q1", "Q2"], ["10", "20"]]
17
+ class Page
18
+ # @return [Integer] 1-based page number
19
+ attr_reader :number
20
+
21
+ # @param document [Document]
22
+ # @param number [Integer] 1-based page number
23
+ # @param meta [Hash] the per-page slice of the document +info+ payload
24
+ def initialize(document, number, meta)
25
+ @document = document
26
+ @number = number
27
+ @meta = meta
28
+ end
29
+
30
+ # @return [Float] page width in PDF points
31
+ def width = @meta["width"]
32
+
33
+ # @return [Float] page height in PDF points
34
+ def height = @meta["height"]
35
+
36
+ # @return [Integer] clockwise rotation in degrees (0, 90, 180, 270)
37
+ def rotation = @meta["rotation"]
38
+
39
+ # @return [Hash] the page bounding box ({"x0", "top", "x1", "bottom"})
40
+ def bbox = @meta["bbox"]
41
+
42
+ # @return [Hash] counts of each object kind on the page
43
+ def object_counts = @meta["object_counts"]
44
+
45
+ # The page's text in reading order.
46
+ #
47
+ # @return [String]
48
+ def extract_text
49
+ @extract_text ||= Cli.text(path, number)
50
+ end
51
+
52
+ # Words with positions and font metadata.
53
+ #
54
+ # @return [Array<Hash>]
55
+ def extract_words
56
+ @extract_words ||= Cli.words(path, number)
57
+ end
58
+
59
+ # Every page object (chars, lines, rects, curves, images, annots, ...).
60
+ #
61
+ # @return [Hash] keyed by object kind
62
+ def objects
63
+ @objects ||= Cli.objects(path, number)
64
+ end
65
+
66
+ # Hyperlinks on the page.
67
+ #
68
+ # @return [Array<Hash>]
69
+ def links
70
+ @links ||= Cli.links(path, number)
71
+ end
72
+
73
+ # Regex search matches within the page text.
74
+ #
75
+ # @param pattern [String, Regexp] the pattern to search for
76
+ # @return [Array<Hash>]
77
+ def search(pattern)
78
+ Cli.search(path, number, pattern.is_a?(Regexp) ? pattern.source : pattern.to_s)
79
+ end
80
+
81
+ # The page's largest detected table, or nil if none is found.
82
+ #
83
+ # @param strategy [Symbol, String, nil] table-detection strategy
84
+ # @return [Array<Array>, nil] rows of cells
85
+ def tables(strategy: nil)
86
+ Cli.table(path, number, TableStrategy.resolve(strategy))
87
+ end
88
+
89
+ def inspect
90
+ "#<Pdfsink::Page number=#{number} #{width}x#{height}>"
91
+ end
92
+
93
+ private
94
+
95
+ def path = @document.path
96
+ end
97
+ end
Binary file
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfsink
4
+ # Rails integration. Loaded automatically when +Rails::Railtie+ is
5
+ # defined (see +lib/pdfsink.rb+).
6
+ #
7
+ # @example config/application.rb
8
+ # config.pdfsink.default_table_strategy = :text
9
+ # config.pdfsink.binary_path = Rails.root.join("bin/pdfsink-rs").to_s
10
+ class Railtie < Rails::Railtie
11
+ config.pdfsink = ActiveSupport::OrderedOptions.new
12
+
13
+ initializer "pdfsink.configure" do |app|
14
+ cfg = app.config.pdfsink
15
+
16
+ Pdfsink.configure do |c|
17
+ c.default_table_strategy = cfg.default_table_strategy if cfg.default_table_strategy
18
+ end
19
+
20
+ Cli.binary = cfg.binary_path if cfg.binary_path
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfsink
4
+ # The table-detection strategies supported by pdfsink-rs.
5
+ #
6
+ # Each constant holds the string the CLI's +table+ command expects.
7
+ # Use symbols or strings interchangeably in the public API --
8
+ # {TableStrategy.resolve} normalizes them.
9
+ #
10
+ # @example
11
+ # page.tables(strategy: Pdfsink::TableStrategy::TEXT)
12
+ # page.tables(strategy: :text) # same thing
13
+ module TableStrategy
14
+ # Detect cell boundaries from ruling lines.
15
+ LINES = "lines"
16
+
17
+ # Like LINES, but only lines that meet at corners delimit cells.
18
+ LINES_STRICT = "lines_strict"
19
+
20
+ # Infer boundaries from text alignment when there are no ruling lines.
21
+ TEXT = "text"
22
+
23
+ # Use caller-supplied explicit vertical/horizontal lines.
24
+ EXPLICIT = "explicit"
25
+
26
+ # All known strategy names.
27
+ ALL = [LINES, LINES_STRICT, TEXT, EXPLICIT].freeze
28
+
29
+ # Normalize a strategy argument to the string the CLI expects.
30
+ #
31
+ # Accepts symbols, strings, or nil (nil -> the configured default,
32
+ # falling back to "lines"). Unknown values raise ArgumentError.
33
+ #
34
+ # @param name [Symbol, String, nil]
35
+ # @return [String]
36
+ def self.resolve(name)
37
+ name = Pdfsink.configuration.default_table_strategy if name.nil?
38
+ name = LINES if name.nil?
39
+
40
+ key = name.to_s.downcase.strip
41
+ return key if ALL.include?(key)
42
+
43
+ raise ArgumentError, "Unknown table strategy: #{name.inspect}. " \
44
+ "Known strategies: #{ALL.join(', ')}"
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pdfsink
4
+ VERSION = "0.1.0"
5
+
6
+ # Version of the pdfsink-rs crate this gem builds and wraps.
7
+ PDFSINK_RS_VERSION = "0.2.8"
8
+ end
data/lib/pdfsink.rb ADDED
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "pdfsink/version"
4
+ require_relative "pdfsink/error"
5
+ require_relative "pdfsink/table_strategy"
6
+ require_relative "pdfsink/cli"
7
+ require_relative "pdfsink/page"
8
+ require_relative "pdfsink/document"
9
+
10
+ # Load the Railtie only when Rails is present.
11
+ require_relative "pdfsink/railtie" if defined?(Rails::Railtie)
12
+
13
+ # Pdfsink wraps the pdfsink-rs CLI, a fast pure-Rust PDF extraction tool,
14
+ # exposing text, word, object, table, link, and search extraction to Ruby.
15
+ #
16
+ # @example Open a document and read a page
17
+ # doc = Pdfsink.open("report.pdf")
18
+ # doc.page_count # => 12
19
+ # doc.page(1).extract_text # => "Quarterly Report\n..."
20
+ #
21
+ # @example One-shot text extraction
22
+ # Pdfsink.extract_text("report.pdf", page: 1)
23
+ #
24
+ # @example Tables
25
+ # Pdfsink.open("invoice.pdf").page(1).tables(strategy: :text)
26
+ module Pdfsink
27
+ # ── Configuration ────────────────────────────────────────────────────
28
+
29
+ class Configuration
30
+ # Strategy used by {Page#tables} when none is given. Defaults to "lines".
31
+ # @return [Symbol, String, nil]
32
+ attr_accessor :default_table_strategy
33
+
34
+ def initialize
35
+ @default_table_strategy = nil
36
+ end
37
+ end
38
+
39
+ class << self
40
+ # @return [Configuration]
41
+ def configuration
42
+ @configuration ||= Configuration.new
43
+ end
44
+
45
+ # Yields the configuration object for modification.
46
+ #
47
+ # @example
48
+ # Pdfsink.configure do |config|
49
+ # config.default_table_strategy = :text
50
+ # end
51
+ def configure
52
+ yield(configuration)
53
+ end
54
+
55
+ # ── Public API ───────────────────────────────────────────────────
56
+
57
+ # Open a PDF document.
58
+ #
59
+ # @param path [String] path to a PDF file
60
+ # @return [Document]
61
+ def open(path)
62
+ Document.open(path)
63
+ end
64
+
65
+ # Extract the text of a single page in one call.
66
+ #
67
+ # @param path [String]
68
+ # @param page [Integer] 1-based page number
69
+ # @return [String]
70
+ def extract_text(path, page: 1)
71
+ Cli.text(File.expand_path(path), page)
72
+ end
73
+
74
+ # The version of the underlying pdfsink-rs binary the gem was built with.
75
+ #
76
+ # @return [String] e.g. "0.2.8"
77
+ def version
78
+ Cli.version
79
+ end
80
+ end
81
+ end
data/pdfsink.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/pdfsink/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "pdfsink"
7
+ spec.version = Pdfsink::VERSION
8
+ spec.authors = ["Accountaim"]
9
+ spec.summary = "Ruby wrapper for pdfsink-rs: fast pure-Rust PDF extraction"
10
+ spec.description = <<~DESC
11
+ A Ruby gem that wraps the pdfsink-rs CLI, a fast pure-Rust PDF extraction
12
+ tool, providing text, word, object, table, link, and regex-search
13
+ extraction from PDFs for use in Rails applications.
14
+ DESC
15
+ spec.homepage = "https://github.com/AccountAim/pdfsink-ruby"
16
+ spec.license = "MIT"
17
+ spec.required_ruby_version = ">= 3.2.0"
18
+
19
+ spec.files = Dir.chdir(__dir__) do
20
+ Dir["{lib,ext}/**/*", "Gemfile", "Rakefile", "pdfsink.gemspec", "LICENSE", "README.md"]
21
+ end
22
+
23
+ spec.require_paths = ["lib"]
24
+ spec.extensions = ["ext/pdfsink/extconf.rb"]
25
+
26
+ spec.metadata = {
27
+ "source_code_uri" => "https://github.com/AccountAim/pdfsink-ruby",
28
+ "rubygems_mfa_required" => "true"
29
+ }
30
+ end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdfsink
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: arm64-darwin
6
+ authors:
7
+ - Accountaim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-06-05 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: |
14
+ A Ruby gem that wraps the pdfsink-rs CLI, a fast pure-Rust PDF extraction
15
+ tool, providing text, word, object, table, link, and regex-search
16
+ extraction from PDFs for use in Rails applications.
17
+ email:
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - Gemfile
23
+ - LICENSE
24
+ - README.md
25
+ - Rakefile
26
+ - ext/pdfsink/extconf.rb
27
+ - lib/pdfsink.rb
28
+ - lib/pdfsink/cli.rb
29
+ - lib/pdfsink/document.rb
30
+ - lib/pdfsink/error.rb
31
+ - lib/pdfsink/page.rb
32
+ - lib/pdfsink/pdfsink-rs
33
+ - lib/pdfsink/railtie.rb
34
+ - lib/pdfsink/table_strategy.rb
35
+ - lib/pdfsink/version.rb
36
+ - pdfsink.gemspec
37
+ homepage: https://github.com/AccountAim/pdfsink-ruby
38
+ licenses:
39
+ - MIT
40
+ metadata:
41
+ source_code_uri: https://github.com/AccountAim/pdfsink-ruby
42
+ rubygems_mfa_required: 'true'
43
+ post_install_message:
44
+ rdoc_options: []
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: 3.2.0
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ requirements: []
58
+ rubygems_version: 3.5.22
59
+ signing_key:
60
+ specification_version: 4
61
+ summary: 'Ruby wrapper for pdfsink-rs: fast pure-Rust PDF extraction'
62
+ test_files: []