rpdfium 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rpdfium.rb ADDED
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "rpdfium/version"
4
+ require_relative "rpdfium/errors"
5
+
6
+ # Load the companion gem rpdfium-binary if it is installed: this must happen
7
+ # BEFORE raw.rb, which calls ffi_lib at require time and queries
8
+ # Rpdfium::Binary.library_path to find the absolute path to the .so/.dylib.
9
+ begin
10
+ require "rpdfium/binary"
11
+ rescue LoadError
12
+ nil
13
+ end
14
+
15
+ require_relative "rpdfium/raw"
16
+
17
+ require_relative "rpdfium/io/png"
18
+
19
+ require_relative "rpdfium/structure/outline"
20
+ require_relative "rpdfium/structure/attachment"
21
+ require_relative "rpdfium/structure/element"
22
+ require_relative "rpdfium/structure/tree"
23
+
24
+ require_relative "rpdfium/image/embedded"
25
+ require_relative "rpdfium/annotation/annotation"
26
+ require_relative "rpdfium/form/form"
27
+ require_relative "rpdfium/search/search"
28
+
29
+ require_relative "rpdfium/document"
30
+ require_relative "rpdfium/page"
31
+
32
+ require_relative "rpdfium/util/cluster"
33
+ require_relative "rpdfium/util/word_extractor"
34
+ require_relative "rpdfium/util/text_extraction"
35
+ require_relative "rpdfium/util/word_merger"
36
+ require_relative "rpdfium/util/column_inference"
37
+ require_relative "rpdfium/util/label_matcher"
38
+
39
+ require_relative "rpdfium/table/edges"
40
+ require_relative "rpdfium/table/cells"
41
+ require_relative "rpdfium/table/table"
42
+ require_relative "rpdfium/table/extractor"
43
+ require_relative "rpdfium/table/debugger"
44
+
45
require "fileutils"

# rpdfium - Ruby bindings to PDFium with table extraction.
#
# Top-level API:
#   Rpdfium.open(path_or_io_or_bytes) { |doc| ... }
#   Rpdfium.extract_text(path)
#   Rpdfium.extract_tables(path)
#   Rpdfium.render_to_pngs(path, output_dir:)
module Rpdfium
  # Opens a document by delegating to Document.open, forwarding the optional
  # password and the caller's block unchanged. The return value is whatever
  # Document.open returns.
  def self.open(input, password: nil, &block)
    Document.open(input, password: password, &block)
  end

  # Extracts the text of every page, one string per page.
  def self.extract_text(input, password: nil)
    open(input, password: password) { |doc| doc.map(&:text) }
  end

  # Extracts every table from every page.
  # Returns Array<{ page: Integer, rows: Array<Array<String>> }>.
  #
  # `keep_blank_rows: false` (default) drops the completely empty rows that
  # the `:text` strategy of words_to_edges_h produces by construction (every
  # visual row yields two edges, top + bottom, and between adjacent pairs of
  # edges "spurious rows" appear whose height equals the inter-line gap).
  # With `keep_blank_rows: true` you get the raw output of Table#extract.
  #
  # Any additional keyword options are passed through to Table::Extractor.new.
  def self.extract_tables(input, password: nil, keep_blank_rows: false, **opts)
    open(input, password: password) do |doc|
      doc.flat_map do |page|
        Table::Extractor.new(page, **opts).extract.map do |rows|
          # A row counts as blank when every cell is nil or an empty string.
          rows = rows.reject { |row| row.all? { |cell| cell.nil? || cell.empty? } } unless keep_blank_rows
          { page: page.index, rows: rows }
        end
      end
    end
  end

  # Renders every page to a PNG file inside output_dir and returns the list of
  # written paths, in page order. Files are named "page_0001.png",
  # "page_0002.png", ... (1-based, derived from page.index + 1).
  #
  # output_dir is created when missing, including any missing parent
  # directories; an already existing directory is reused as is.
  def self.render_to_pngs(input, output_dir:, scale: 2.0, password: nil)
    # mkdir_p instead of a Dir.exist?/Dir.mkdir pair: it handles nested paths
    # and does not fail if the directory appears between the check and the
    # creation.
    FileUtils.mkdir_p(output_dir)
    open(input, password: password) do |doc|
      doc.map do |page|
        path = File.join(output_dir, format("page_%04d.png", page.index + 1))
        page.render_to_png(path, scale: scale)
        path
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rpdfium
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.1
5
+ platform: ruby
6
+ authors:
7
+ - Roberto Scinocca
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: ffi
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '1.16'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '1.16'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rspec
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '3.13'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '3.13'
40
+ - !ruby/object:Gem::Dependency
41
+ name: rake
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '13.0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '13.0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: rubocop
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.60'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.60'
68
+ description: |
69
+ FFI bindings to Google's PDFium library, the same engine that powers
70
+ Chrome's PDF viewer. Provides text extraction with character-level
71
+ metadata (font, weight, origin, angle), vector path access, image
72
+ extraction, annotations, AcroForm fields, page rendering, and
73
+ pdfplumber-style table detection. Inspired by pypdfium2 and pdfplumber.
74
+ email:
75
+ - roberto.scinocca@hey.com
76
+ executables: []
77
+ extensions: []
78
+ extra_rdoc_files: []
79
+ files:
80
+ - CHANGELOG.md
81
+ - LICENSE
82
+ - README.md
83
+ - lib/rpdfium.rb
84
+ - lib/rpdfium/annotation/annotation.rb
85
+ - lib/rpdfium/document.rb
86
+ - lib/rpdfium/errors.rb
87
+ - lib/rpdfium/form/form.rb
88
+ - lib/rpdfium/image/embedded.rb
89
+ - lib/rpdfium/io/png.rb
90
+ - lib/rpdfium/page.rb
91
+ - lib/rpdfium/raw.rb
92
+ - lib/rpdfium/search/search.rb
93
+ - lib/rpdfium/structure/attachment.rb
94
+ - lib/rpdfium/structure/element.rb
95
+ - lib/rpdfium/structure/outline.rb
96
+ - lib/rpdfium/structure/tree.rb
97
+ - lib/rpdfium/table/cells.rb
98
+ - lib/rpdfium/table/debugger.rb
99
+ - lib/rpdfium/table/edges.rb
100
+ - lib/rpdfium/table/extractor.rb
101
+ - lib/rpdfium/table/table.rb
102
+ - lib/rpdfium/util/cluster.rb
103
+ - lib/rpdfium/util/column_inference.rb
104
+ - lib/rpdfium/util/label_matcher.rb
105
+ - lib/rpdfium/util/text_extraction.rb
106
+ - lib/rpdfium/util/word_extractor.rb
107
+ - lib/rpdfium/util/word_merger.rb
108
+ - lib/rpdfium/version.rb
109
+ homepage: https://github.com/retsef/rpdfium
110
+ licenses:
111
+ - Apache-2.0
112
+ metadata:
113
+ source_code_uri: https://github.com/retsef/rpdfium
114
+ changelog_uri: https://github.com/retsef/rpdfium/blob/main/CHANGELOG.md
115
+ bug_tracker_uri: https://github.com/retsef/rpdfium/issues
116
+ rubygems_mfa_required: 'true'
117
+ rdoc_options: []
118
+ require_paths:
119
+ - lib
120
+ required_ruby_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: 3.0.0
125
+ required_rubygems_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ requirements: []
131
+ rubygems_version: 4.0.6
132
+ specification_version: 4
133
+ summary: Ruby bindings for PDFium with table extraction
134
+ test_files: []