rpdfium 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +1870 -0
- data/LICENSE +19 -0
- data/README.md +599 -0
- data/lib/rpdfium/annotation/annotation.rb +114 -0
- data/lib/rpdfium/document.rb +226 -0
- data/lib/rpdfium/errors.rb +55 -0
- data/lib/rpdfium/form/form.rb +121 -0
- data/lib/rpdfium/image/embedded.rb +145 -0
- data/lib/rpdfium/io/png.rb +65 -0
- data/lib/rpdfium/page.rb +1623 -0
- data/lib/rpdfium/raw.rb +982 -0
- data/lib/rpdfium/search/search.rb +101 -0
- data/lib/rpdfium/structure/attachment.rb +40 -0
- data/lib/rpdfium/structure/element.rb +330 -0
- data/lib/rpdfium/structure/outline.rb +48 -0
- data/lib/rpdfium/structure/tree.rb +202 -0
- data/lib/rpdfium/table/cells.rb +137 -0
- data/lib/rpdfium/table/debugger.rb +122 -0
- data/lib/rpdfium/table/edges.rb +225 -0
- data/lib/rpdfium/table/extractor.rb +246 -0
- data/lib/rpdfium/table/table.rb +184 -0
- data/lib/rpdfium/util/cluster.rb +143 -0
- data/lib/rpdfium/util/column_inference.rb +139 -0
- data/lib/rpdfium/util/label_matcher.rb +214 -0
- data/lib/rpdfium/util/text_extraction.rb +49 -0
- data/lib/rpdfium/util/word_extractor.rb +151 -0
- data/lib/rpdfium/util/word_merger.rb +102 -0
- data/lib/rpdfium/version.rb +5 -0
- data/lib/rpdfium.rb +92 -0
- metadata +134 -0
data/lib/rpdfium.rb
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "rpdfium/version"
|
|
4
|
+
require_relative "rpdfium/errors"
|
|
5
|
+
|
|
6
|
+
# Carica la gemma companion rpdfium-binary se presente: deve avvenire PRIMA
|
|
7
|
+
# di raw.rb, che chiama ffi_lib al momento del require e interroga
|
|
8
|
+
# Rpdfium::Binary.library_path per trovare il path assoluto al .so/.dylib.
|
|
9
|
+
begin
|
|
10
|
+
require "rpdfium/binary"
|
|
11
|
+
rescue LoadError
|
|
12
|
+
nil
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
require_relative "rpdfium/raw"
|
|
16
|
+
|
|
17
|
+
require_relative "rpdfium/io/png"
|
|
18
|
+
|
|
19
|
+
require_relative "rpdfium/structure/outline"
|
|
20
|
+
require_relative "rpdfium/structure/attachment"
|
|
21
|
+
require_relative "rpdfium/structure/element"
|
|
22
|
+
require_relative "rpdfium/structure/tree"
|
|
23
|
+
|
|
24
|
+
require_relative "rpdfium/image/embedded"
|
|
25
|
+
require_relative "rpdfium/annotation/annotation"
|
|
26
|
+
require_relative "rpdfium/form/form"
|
|
27
|
+
require_relative "rpdfium/search/search"
|
|
28
|
+
|
|
29
|
+
require_relative "rpdfium/document"
|
|
30
|
+
require_relative "rpdfium/page"
|
|
31
|
+
|
|
32
|
+
require_relative "rpdfium/util/cluster"
|
|
33
|
+
require_relative "rpdfium/util/word_extractor"
|
|
34
|
+
require_relative "rpdfium/util/text_extraction"
|
|
35
|
+
require_relative "rpdfium/util/word_merger"
|
|
36
|
+
require_relative "rpdfium/util/column_inference"
|
|
37
|
+
require_relative "rpdfium/util/label_matcher"
|
|
38
|
+
|
|
39
|
+
require_relative "rpdfium/table/edges"
|
|
40
|
+
require_relative "rpdfium/table/cells"
|
|
41
|
+
require_relative "rpdfium/table/table"
|
|
42
|
+
require_relative "rpdfium/table/extractor"
|
|
43
|
+
require_relative "rpdfium/table/debugger"
|
|
44
|
+
|
|
45
|
+
# rpdfium - Ruby bindings to PDFium with table extraction.
|
|
46
|
+
#
|
|
47
|
+
# Top-level API:
|
|
48
|
+
# Rpdfium.open(path_or_io_or_bytes) { |doc| ... }
|
|
49
|
+
# Rpdfium.extract_text(path)
|
|
50
|
+
# Rpdfium.extract_tables(path)
|
|
51
|
+
# Rpdfium.render_to_pngs(path, output_dir:)
|
|
52
|
+
module Rpdfium
  # Opens a PDF document. Thin wrapper that forwards all arguments and the
  # block to Document.open.
  #
  # @param input the PDF source (path, IO or raw bytes)
  # @param password [String, nil] forwarded unchanged to Document.open
  # @yield forwarded unchanged to Document.open
  # @return whatever Document.open returns
  def self.open(input, password: nil, &block)
    Document.open(input, password: password, &block)
  end

  # Extracts the text of every page, one string per page.
  #
  # @param input the PDF source, passed to Rpdfium.open
  # @param password [String, nil] passed to Rpdfium.open
  # @return the result of mapping each page to its `text`
  #   (assumes Document.open returns the block's value -- confirm in document.rb)
  def self.extract_text(input, password: nil)
    open(input, password: password) { |doc| doc.map(&:text) }
  end

  # Extracts every table found on every page.
  # Returns Array<{ page: Integer, rows: Array<Array<String>> }>.
  #
  # `keep_blank_rows: false` (default) drops the completely empty rows that
  # the `:text` strategy of words_to_edges_h produces by construction (each
  # visual row yields two edges, top + bottom, and between adjacent pairs of
  # edges "spurious rows" appear whose height equals the line gap).
  # With `keep_blank_rows: true` you get the raw output of Table#extract.
  #
  # @param input the PDF source, passed to Rpdfium.open
  # @param password [String, nil] passed to Rpdfium.open
  # @param keep_blank_rows [Boolean] keep rows whose cells are all nil/empty
  # @param opts extra keyword options forwarded to Table::Extractor.new
  def self.extract_tables(input, password: nil, keep_blank_rows: false, **opts)
    open(input, password: password) do |doc|
      doc.flat_map do |page|
        Table::Extractor.new(page, **opts).extract.map do |rows|
          # A row counts as blank only when every cell is nil or an empty string.
          rows = rows.reject { |r| r.all? { |c| c.nil? || c.empty? } } unless keep_blank_rows
          { page: page.index, rows: rows }
        end
      end
    end
  end

  # Renders every page to a PNG file inside output_dir.
  #
  # Files are named "page_0001.png", "page_0002.png", ... using
  # `page.index + 1` as the number.
  #
  # @param input the PDF source, passed to Rpdfium.open
  # @param output_dir [String] destination directory; created (including any
  #   missing parent directories) when it does not exist yet
  # @param scale [Numeric] forwarded to Page#render_to_png
  # @param password [String, nil] passed to Rpdfium.open
  # @return the list of written file paths, in page order
  #   (assumes Document.open returns the block's value -- confirm in document.rb)
  # @raise [SystemCallError] when output_dir cannot be created (e.g. the path
  #   exists but is a regular file)
  def self.render_to_pngs(input, output_dir:, scale: 2.0, password: nil)
    # Required here so this method does not depend on load order elsewhere;
    # `require` is a no-op once the library has been loaded.
    require "fileutils"

    # mkdir_p creates missing parents and tolerates an already existing
    # directory, so nested output paths work and there is no window between
    # an existence check and the directory creation.
    FileUtils.mkdir_p(output_dir)

    open(input, password: password) do |doc|
      doc.map do |page|
        path = File.join(output_dir, format("page_%04d.png", page.index + 1))
        page.render_to_png(path, scale: scale)
        path
      end
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: rpdfium
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.4.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Roberto Scinocca
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: ffi
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '1.16'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '1.16'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: rspec
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '3.13'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '3.13'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: rake
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '13.0'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '13.0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: rubocop
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '1.60'
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '1.60'
|
|
68
|
+
description: |
|
|
69
|
+
FFI bindings to Google's PDFium library, the same engine that powers
|
|
70
|
+
Chrome's PDF viewer. Provides text extraction with character-level
|
|
71
|
+
metadata (font, weight, origin, angle), vector path access, image
|
|
72
|
+
extraction, annotations, AcroForm fields, page rendering, and
|
|
73
|
+
pdfplumber-style table detection. Inspired by pypdfium2 and pdfplumber.
|
|
74
|
+
email:
|
|
75
|
+
- roberto.scinocca@hey.com
|
|
76
|
+
executables: []
|
|
77
|
+
extensions: []
|
|
78
|
+
extra_rdoc_files: []
|
|
79
|
+
files:
|
|
80
|
+
- CHANGELOG.md
|
|
81
|
+
- LICENSE
|
|
82
|
+
- README.md
|
|
83
|
+
- lib/rpdfium.rb
|
|
84
|
+
- lib/rpdfium/annotation/annotation.rb
|
|
85
|
+
- lib/rpdfium/document.rb
|
|
86
|
+
- lib/rpdfium/errors.rb
|
|
87
|
+
- lib/rpdfium/form/form.rb
|
|
88
|
+
- lib/rpdfium/image/embedded.rb
|
|
89
|
+
- lib/rpdfium/io/png.rb
|
|
90
|
+
- lib/rpdfium/page.rb
|
|
91
|
+
- lib/rpdfium/raw.rb
|
|
92
|
+
- lib/rpdfium/search/search.rb
|
|
93
|
+
- lib/rpdfium/structure/attachment.rb
|
|
94
|
+
- lib/rpdfium/structure/element.rb
|
|
95
|
+
- lib/rpdfium/structure/outline.rb
|
|
96
|
+
- lib/rpdfium/structure/tree.rb
|
|
97
|
+
- lib/rpdfium/table/cells.rb
|
|
98
|
+
- lib/rpdfium/table/debugger.rb
|
|
99
|
+
- lib/rpdfium/table/edges.rb
|
|
100
|
+
- lib/rpdfium/table/extractor.rb
|
|
101
|
+
- lib/rpdfium/table/table.rb
|
|
102
|
+
- lib/rpdfium/util/cluster.rb
|
|
103
|
+
- lib/rpdfium/util/column_inference.rb
|
|
104
|
+
- lib/rpdfium/util/label_matcher.rb
|
|
105
|
+
- lib/rpdfium/util/text_extraction.rb
|
|
106
|
+
- lib/rpdfium/util/word_extractor.rb
|
|
107
|
+
- lib/rpdfium/util/word_merger.rb
|
|
108
|
+
- lib/rpdfium/version.rb
|
|
109
|
+
homepage: https://github.com/retsef/rpdfium
|
|
110
|
+
licenses:
|
|
111
|
+
- Apache-2.0
|
|
112
|
+
metadata:
|
|
113
|
+
source_code_uri: https://github.com/retsef/rpdfium
|
|
114
|
+
changelog_uri: https://github.com/retsef/rpdfium/blob/main/CHANGELOG.md
|
|
115
|
+
bug_tracker_uri: https://github.com/retsef/rpdfium/issues
|
|
116
|
+
rubygems_mfa_required: 'true'
|
|
117
|
+
rdoc_options: []
|
|
118
|
+
require_paths:
|
|
119
|
+
- lib
|
|
120
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
121
|
+
requirements:
|
|
122
|
+
- - ">="
|
|
123
|
+
- !ruby/object:Gem::Version
|
|
124
|
+
version: 3.0.0
|
|
125
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
126
|
+
requirements:
|
|
127
|
+
- - ">="
|
|
128
|
+
- !ruby/object:Gem::Version
|
|
129
|
+
version: '0'
|
|
130
|
+
requirements: []
|
|
131
|
+
rubygems_version: 4.0.6
|
|
132
|
+
specification_version: 4
|
|
133
|
+
summary: Ruby bindings for PDFium with table extraction
|
|
134
|
+
test_files: []
|