qpdf_ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ #include "struct_node.hpp"
2
+
3
+ McrNode::McrNode(int mcid, int pageObj, int pageGen, int pageNumber)
4
+ : mcid(mcid), pageObj(pageObj), pageGen(pageGen), pageNumber(pageNumber) {}
5
+
6
+ std::string McrNode::to_string(int level, PDFStructWalker& walker) {
7
+ std::ostringstream oss;
8
+
9
+ IndentHelper::indent(oss, level);
10
+ oss << "[MCR: MCID=" << mcid << " PageObj=" << pageObj << " Gen=" << pageGen;
11
+ if (pageNumber > 0) {
12
+ oss << " PageNumber=" << pageNumber;
13
+ }
14
+ oss << "]" << std::endl;
15
+
16
+ return oss.str();
17
+ }
18
+
19
+ int McrNode::getMcid() const { return mcid; }
20
+ void McrNode::setPageNumber(int pageNum) { pageNumber = pageNum; }
@@ -0,0 +1,294 @@
1
+ #include "pdf_image_mapper.hpp"
2
+ #include <qpdf/BufferInputSource.hh>
3
+ #include <qpdf/QPDFTokenizer.hh>
4
+
5
+ #include <algorithm>
6
+ #include <optional>
7
+
8
// Six coefficients [a b c d e f] of a 2-D affine transform.
using Matrix = std::array<double, 6>;

// Maps the point (x, y) through `m`:
//   x' = a*x + c*y + e
//   y' = b*x + d*y + f
static std::pair<double, double> apply_matrix(const Matrix& m, double x, double y) {
  return {m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]};
}

// Transforms the four corners of the rectangle (0,0)-(width,height) through
// `matrix` and returns their axis-aligned bounding box as
// [min_x, min_y, max_x, max_y].
static std::array<double, 4> compute_bbox(double width, double height, const Matrix& matrix) {
  const std::array<std::pair<double, double>, 4> corners = {apply_matrix(matrix, 0, 0), apply_matrix(matrix, width, 0),
                                                            apply_matrix(matrix, width, height),
                                                            apply_matrix(matrix, 0, height)};

  // Seed the extremes with the first corner, then fold in the other three.
  double lo_x = corners[0].first;
  double hi_x = lo_x;
  double lo_y = corners[0].second;
  double hi_y = lo_y;

  for (std::size_t k = 1; k < corners.size(); ++k) {
    const auto& [cx, cy] = corners[k];
    lo_x = std::min(lo_x, cx);
    hi_x = std::max(hi_x, cx);
    lo_y = std::min(lo_y, cy);
    hi_y = std::max(hi_y, cy);
  }

  return {lo_x, lo_y, hi_x, hi_y};
}
35
+
36
+ static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype) {
37
+ // Do this is a case statement instead of a lookup so the compiler
38
+ // will warn if we miss any.
39
+ switch (ttype) {
40
+ case QPDFTokenizer::tt_bad:
41
+ return "bad";
42
+ case QPDFTokenizer::tt_array_close:
43
+ return "array_close";
44
+ case QPDFTokenizer::tt_array_open:
45
+ return "array_open";
46
+ case QPDFTokenizer::tt_brace_close:
47
+ return "brace_close";
48
+ case QPDFTokenizer::tt_brace_open:
49
+ return "brace_open";
50
+ case QPDFTokenizer::tt_dict_close:
51
+ return "dict_close";
52
+ case QPDFTokenizer::tt_dict_open:
53
+ return "dict_open";
54
+ case QPDFTokenizer::tt_integer:
55
+ return "integer";
56
+ case QPDFTokenizer::tt_name:
57
+ return "name";
58
+ case QPDFTokenizer::tt_real:
59
+ return "real";
60
+ case QPDFTokenizer::tt_string:
61
+ return "string";
62
+ case QPDFTokenizer::tt_null:
63
+ return "null";
64
+ case QPDFTokenizer::tt_bool:
65
+ return "bool";
66
+ case QPDFTokenizer::tt_word:
67
+ return "word";
68
+ case QPDFTokenizer::tt_eof:
69
+ return "eof";
70
+ case QPDFTokenizer::tt_space:
71
+ return "space";
72
+ case QPDFTokenizer::tt_comment:
73
+ return "comment";
74
+ case QPDFTokenizer::tt_inline_image:
75
+ return "inline-image";
76
+ }
77
+ return nullptr;
78
+ }
79
+
80
+ static std::optional<ImageInfo> get_image_info(QPDFObjectHandle resources, const std::string& name) {
81
+ if (resources.isNull() || !resources.isDictionary()) {
82
+ return std::nullopt;
83
+ }
84
+
85
+ QPDFObjectHandle xobjects = resources.getKey("/XObject");
86
+ if (xobjects.isNull() || !xobjects.isDictionary()) {
87
+ return std::nullopt;
88
+ }
89
+
90
+ if (!xobjects.hasKey(name)) {
91
+ return std::nullopt;
92
+ }
93
+
94
+ QPDFObjectHandle xobject = xobjects.getKey(name);
95
+ if (!xobject.isStream()) {
96
+ return std::nullopt;
97
+ }
98
+
99
+ QPDFObjectHandle dict = xobject.getDict();
100
+ bool is_image =
101
+ (dict.hasKey("/Subtype") && dict.getKey("/Subtype").isName() && dict.getKey("/Subtype").getName() == "/Image");
102
+
103
+ if (is_image) {
104
+ ImageInfo image_info;
105
+
106
+ if (dict.hasKey("/Height") && dict.getKey("/Height").isInteger()) {
107
+ image_info.height = dict.getKey("/Height").getIntValue();
108
+ }
109
+
110
+ if (dict.hasKey("/Width") && dict.getKey("/Width").isInteger()) {
111
+ image_info.width = dict.getKey("/Width").getIntValue();
112
+ }
113
+
114
+ return image_info;
115
+ }
116
+
117
+ return std::nullopt;
118
+ ;
119
+ }
120
+
121
+ PDFImageMapper::PDFImageMapper(int target_mcid) : target_mcid(target_mcid), in_target_mcid(false) {}
122
+
123
+ void PDFImageMapper::push_cm(double value) {
124
+ if (cm_fixed_size_queue.size() == 6) {
125
+ cm_fixed_size_queue.pop_front(); // Remove oldest element
126
+ }
127
+ cm_fixed_size_queue.push_back(value);
128
+ }
129
+
130
+ void PDFImageMapper::find(QPDF& pdf) {
131
+ QPDFPageDocumentHelper doc_helper(pdf);
132
+ std::vector<QPDFPageObjectHelper> pages = doc_helper.getAllPages();
133
+ for (auto& page : pages) {
134
+ QPDFPageObjectHelper poh(page);
135
+
136
+ auto crop_box = poh.getCropBox();
137
+
138
+ this->find(poh);
139
+ }
140
+ }
141
+
142
// Static description of one content-stream operator.
struct OperatorInfo {
  const char* name;         // operator keyword as it appears in the content stream
  const char* description;  // human-readable summary
  int operand_count;        // number of operands the operator consumes
};

// Operators recognised by the content-stream scanner in this file.
static constexpr OperatorInfo OPERATOR_CM = {"cm", "Concatenate matrix (set CTM)", 6};
static constexpr OperatorInfo OPERATOR_Q = {"q", "Save graphics state", 0};
static constexpr OperatorInfo OPERATOR_Q_UPPER = {"Q", "Restore graphics state", 0};
static constexpr OperatorInfo OPERATOR_DO = {"Do", "Invoke named XObject (draw image/form)", 1};
static constexpr OperatorInfo OPERATOR_BDC = {"BDC", "Begin Marked Content sequence with property list", 2};
static constexpr OperatorInfo OPERATOR_EMC = {"EMC", "End Marked Content sequence", 0};
154
+
155
+ class CMDoExtractor : public QPDFObjectHandle::ParserCallbacks {
156
+ public:
157
+ explicit CMDoExtractor(QPDFPageObjectHelper& page_ref) : page(page_ref) {
158
+ // You can add any other initialization logic here if needed
159
+ }
160
+
161
+ std::vector<double> current_matrix = {1, 0, 0, 1, 0, 0};
162
+ std::stack<std::vector<double>> matrix_stack;
163
+ std::vector<QPDFObjectHandle> operand_stack;
164
+
165
+ std::stack<int> mcid_stack;
166
+ int current_mcid;
167
+
168
+ void handleEOF() override {
169
+ // No action needed for this example
170
+ }
171
+
172
+ std::optional<ImageInfo> image_info(std::string imgName) {
173
+ QPDFObjectHandle resources = page.getObjectHandle().getKey("/Resources");
174
+
175
+ return get_image_info(resources, imgName);
176
+ }
177
+
178
+ void handleObject(QPDFObjectHandle obj, size_t offset, size_t length) override {
179
+ if (obj.isOperator()) {
180
+ std::string op = obj.getOperatorValue();
181
+
182
+ if (op == OPERATOR_CM.name && operand_stack.size() >= OPERATOR_CM.operand_count) {
183
+ auto numeric_at = [&](std::size_t distance_from_top) -> double {
184
+ auto& obj = operand_stack[operand_stack.size() - distance_from_top];
185
+ auto t = obj.getTypeName(); // "integer", "real", …
186
+ if (std::strcmp(t, "integer") != 0 && std::strcmp(t, "real") != 0) {
187
+ std::stringstream ss;
188
+ ss << "numeric operand expected (got " << t << ")";
189
+ throw std::runtime_error(ss.str());
190
+ }
191
+ return obj.getNumericValue();
192
+ };
193
+
194
+ // pull operands (top of stack is distance 1)
195
+ double f = numeric_at(1);
196
+ double e = numeric_at(2);
197
+ double d = numeric_at(3);
198
+ double c = numeric_at(4);
199
+ double b = numeric_at(5);
200
+ double a = numeric_at(6);
201
+ current_matrix = {a, b, c, d, e, f};
202
+
203
+ operand_stack.resize(operand_stack.size() - 6);
204
+ } else if (op == OPERATOR_Q.name) {
205
+ matrix_stack.push(current_matrix);
206
+ } else if (op == OPERATOR_Q_UPPER.name) {
207
+ if (!matrix_stack.empty()) {
208
+ current_matrix = matrix_stack.top();
209
+ matrix_stack.pop();
210
+ }
211
+ } else if (op == OPERATOR_BDC.name && operand_stack.size() >= OPERATOR_BDC.operand_count) {
212
+ mcid_stack.push(current_mcid); // Save current MCID
213
+
214
+ QPDFObjectHandle properties_obj = operand_stack.back(); // Properties operand
215
+ // QPDFObjectHandle tag_obj = operand_stack[operand_stack.size() - 2]; // Tag operand
216
+
217
+ int mcid_val = -1; // Default if MCID not found or not an integer
218
+ if (properties_obj.isDictionary()) {
219
+ if (properties_obj.hasKey("/MCID") && properties_obj.getKey("/MCID").isInteger()) {
220
+ mcid_val = properties_obj.getKey("/MCID").getIntValue();
221
+ }
222
+ } else if (properties_obj.isName()) {
223
+ // Properties is a name, look it up in Resources /Properties dictionary
224
+ QPDFObjectHandle resources = page.getObjectHandle().getKey("/Resources");
225
+ if (resources.isDictionary() && resources.hasKey("/Properties")) {
226
+ QPDFObjectHandle properties_dict = resources.getKey("/Properties");
227
+ if (properties_dict.isDictionary() && properties_dict.hasKey(properties_obj.getName())) {
228
+ QPDFObjectHandle actual_props = properties_dict.getKey(properties_obj.getName());
229
+ if (actual_props.isDictionary() && actual_props.hasKey("/MCID") &&
230
+ actual_props.getKey("/MCID").isInteger()) {
231
+ mcid_val = actual_props.getKey("/MCID").getIntValue();
232
+ }
233
+ }
234
+ }
235
+ }
236
+ current_mcid = mcid_val;
237
+ operand_stack.resize(operand_stack.size() - OPERATOR_BDC.operand_count);
238
+ } else if (op == OPERATOR_EMC.name) { // EMC takes 0 operands
239
+ if (!mcid_stack.empty()) {
240
+ current_mcid = mcid_stack.top();
241
+ mcid_stack.pop();
242
+ } else {
243
+ current_mcid = -1; // Reset to default if stack underflow (should be balanced)
244
+ }
245
+ // EMC takes 0 operands, so no change to operand_stack based on operand_count
246
+ } else if (op == OPERATOR_DO.name && !operand_stack.empty()) {
247
+ std::string imgName = operand_stack.back().getName();
248
+ auto image_info = this->image_info(imgName);
249
+
250
+ std::copy(current_matrix.begin(), current_matrix.end(), image_info->cm_matrix.begin());
251
+
252
+ image_info->mcid = current_mcid;
253
+
254
+ image_info->bbox = compute_bbox(image_info->width, image_info->height, image_info->cm_matrix);
255
+
256
+ QPDFObjectHandle mb = page.getObjectHandle().getKey("/MediaBox");
257
+
258
+ double px0 = mb.getArrayItem(0).getNumericValue();
259
+ double py0 = mb.getArrayItem(1).getNumericValue();
260
+ double px1 = mb.getArrayItem(2).getNumericValue();
261
+ double py1 = mb.getArrayItem(3).getNumericValue();
262
+
263
+ auto& bb = image_info->bbox; // shorthand reference
264
+ bb[0] = std::max(bb[0], px0); // left edge
265
+ bb[1] = std::max(bb[1], py0); // bottom edge
266
+ bb[2] = std::min(bb[2], px1); // right edge
267
+ bb[3] = std::min(bb[3], py1); // top edge
268
+
269
+ image_to_mcid[imgName] = *image_info;
270
+
271
+ operand_stack.pop_back();
272
+ } else {
273
+ // For other operators, just clear the operand stack
274
+ operand_stack.clear();
275
+ }
276
+ } else {
277
+ operand_stack.push_back(obj);
278
+ }
279
+ }
280
+
281
+ const std::map<std::string, ImageInfo>& getImageMap() const { return image_to_mcid; }
282
+
283
+ private:
284
+ QPDFPageObjectHelper& page;
285
+ std::map<std::string, ImageInfo> image_to_mcid;
286
+ };
287
+
288
+ void PDFImageMapper::find(QPDFPageObjectHelper& page) {
289
+ CMDoExtractor cb(page);
290
+ page.parseContents(&cb);
291
+
292
+ const auto& extracted_map = cb.getImageMap();
293
+ image_to_mcid.insert(extracted_map.begin(), extracted_map.end());
294
+ }
@@ -0,0 +1,66 @@
1
+ // x_object_finder.hpp
2
+ #pragma once
3
+
4
+ #include <qpdf/QPDF.hh>
5
+ #include <qpdf/QPDFPageDocumentHelper.hh>
6
+ #include <qpdf/QPDFPageObjectHelper.hh>
7
+ #include <qpdf/QPDFObjectHandle.hh>
8
+ #include <qpdf/QIntC.hh>
9
+ #include <qpdf/QPDFExc.hh>
10
+ #include <qpdf/QTC.hh>
11
+ #include <qpdf/QUtil.hh>
12
+ #include <qpdf/BufferInputSource.hh>
13
+ #include <qpdf/QPDFTokenizer.hh>
14
+ #include <qpdf/Buffer.hh>
15
+
16
+ #include <string>
17
+ #include <vector>
18
+ #include <iostream>
19
+ #include <map>
20
+ #include <stack>
21
+ #include <regex>
22
+ #include <sstream>
23
+ #include <deque>
24
+
25
// Placement record for one image: its marked-content id, its width and
// height, the transformation matrix it was recorded with and its bounding box.
struct ImageInfo {
  int mcid = -1;                              // -1 means "no MCID"
  double width = 0;
  double height = 0;
  std::array<double, 6> cm_matrix{};          // stores the transformation matrix values
  std::array<double, 4> bbox = {0, 0, 0, 0};  // user-space bounding box [llx, lly, urx, ury]

  // All-defaults record: no MCID, zero size, zero matrix, zero bbox.
  ImageInfo() = default;

  // Record with explicit MCID, size and matrix; the bbox stays all zeros.
  ImageInfo(int m, double w, double h, const std::array<double, 6>& cm)
      : mcid(m), width(w), height(h), cm_matrix(cm) {}

  // Formats the record as
  // "ImageInfo(mcid=…, width=…, height=…, cm_matrix=[a, b, c, d, e, f])".
  // The bbox is not part of the text.
  std::string to_string() const {
    std::ostringstream text;
    text << "ImageInfo(mcid=" << mcid << ", width=" << width << ", height=" << height << ", cm_matrix=[";
    const char* separator = "";
    for (const double coefficient : cm_matrix) {
      text << separator << coefficient;
      separator = ", ";
    }
    text << "])";
    return text.str();
  }
};
47
+
48
// Collects ImageInfo records, keyed by a string name, from the pages of a PDF.
class PDFImageMapper {
 public:
  // `target_mcid` is stored in the member of the same name.
  explicit PDFImageMapper(int target_mcid);

  // Scans a whole document.
  void find(QPDF& pdf);
  // Parses the page's content stream to find the XObject name for the MCID
  void find(QPDFPageObjectHelper& page);

  // Feeds one value into the fixed-size queue of matrix operands.
  void push_cm(double value);

  // Expose internal map for external use
  const std::map<std::string, ImageInfo>& getImageMap() const { return image_to_mcid; }

 private:
  int target_mcid;      // MCID passed to the constructor
  bool in_target_mcid;  // presumably "currently inside the target sequence" — TODO confirm usage
  std::map<std::string, ImageInfo> image_to_mcid;  // results returned by getImageMap()
  std::deque<double> cm_fixed_size_queue;          // values pushed through push_cm()
};
@@ -0,0 +1,46 @@
1
+ #include "pdf_struct_walker.hpp"
2
+ #include "struct_node.hpp"
3
+
4
// Stores the output stream and the MCID -> bounding-box table passed in.
// NOTE(review): the const_cast strips constness from the caller's map before
// it reaches the member initializer. If the member is declared as a reference,
// the argument (including a temporary created for a default argument) must
// outlive the walker — confirm against the class declaration.
PDFStructWalker::PDFStructWalker(std::ostream& out, const std::unordered_map<int, std::array<double, 4>>& mcid2bbox)
    : out(out), mcid2bbox(const_cast<std::unordered_map<int, std::array<double, 4>>&>(mcid2bbox)) {}
6
+
7
+ std::string PDFStructWalker::get_structure_as_string(QPDFObjectHandle const& node) {
8
+ std::unique_ptr<StructNode> structNode = StructNode::fromQPDF(node);
9
+
10
+ return structNode->to_string(0, *this);
11
+ }
12
+
13
+ void PDFStructWalker::ensureLayoutBBox(QPDFObjectHandle const& node) {
14
+ std::unique_ptr<StructNode> structNode = StructNode::fromQPDF(node);
15
+
16
+ structNode->ensureLayoutBBox(*this);
17
+ }
18
+
19
+ void PDFStructWalker::buildPageObjectMap(QPDF& pdf) {
20
+ pageObjToNumMap.clear();
21
+ std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
22
+ for (int i = 0; i < pages.size(); ++i) {
23
+ // Map the page's object ID to its 1-based page number
24
+ pageObjToNumMap[pages.at(i).getObjGen()] = i + 1;
25
+ }
26
+ }
27
+
28
+ const std::map<QPDFObjGen, int>& PDFStructWalker::getPageObjectMap() const { return pageObjToNumMap; }
29
+
30
+ std::array<double, 4> PDFStructWalker::getPageCropBoxFor(QPDFObjectHandle const& page_oh) const {
31
+ auto inherited = [](QPDFObjectHandle node, char const* key) -> QPDFObjectHandle {
32
+ while (!node.isNull()) {
33
+ if (auto val = node.getKey(key); !val.isNull()) return val;
34
+ node = node.getKey("/Parent");
35
+ }
36
+ return QPDFObjectHandle(); // null ⇒ not found
37
+ };
38
+
39
+ QPDFObjectHandle crop = inherited(page_oh, "/CropBox");
40
+ if (crop.isNull()) crop = inherited(page_oh, "/MediaBox"); // spec default
41
+
42
+ std::array<double, 4> r;
43
+ for (size_t i = 0; i < 4; ++i) r[i] = crop.getArrayItem(i).getNumericValue();
44
+
45
+ return r;
46
+ }
@@ -0,0 +1,34 @@
1
+ // PDFStructWalker.h
2
+
3
+ #pragma once
4
+
5
+ #include <qpdf/QPDF.hh>
6
+ #include <qpdf/QPDFWriter.hh>
7
+ #include <qpdf/QPDFPageObjectHelper.hh>
8
+ #include <qpdf/QPDFObjectHandle.hh>
9
+
10
+ #include <iostream>
11
+ #include <stdexcept> // For std::exception (if you add try-catch)
12
+ #include <vector> // For std::vector
13
+ #include <string> // For std::string
14
+ #include <regex>
15
+ #include <map>
16
+
17
+ class PDFStructWalker {
18
+ private:
19
+ std::ostream& out;
20
+ std::map<QPDFObjGen, int> pageObjToNumMap;
21
+ std::unordered_map<int, std::array<double, 4>>& mcid2bbox;
22
+
23
+ public:
24
+ PDFStructWalker(std::ostream& out = std::cout, const std::unordered_map<int, std::array<double, 4>>& mcid2bbox = {});
25
+
26
+ void buildPageObjectMap(QPDF& pdf);
27
+ std::string get_structure_as_string(QPDFObjectHandle const& node);
28
+ void ensureLayoutBBox(QPDFObjectHandle const& node);
29
+
30
+ const std::map<QPDFObjGen, int>& getPageObjectMap() const;
31
+ std::array<double, 4> getPageCropBoxFor(QPDFObjectHandle const& elem) const;
32
+
33
+ const std::unordered_map<int, std::array<double, 4>>& getMcidBboxMap() const { return mcid2bbox; }
34
+ };
@@ -0,0 +1,204 @@
1
#include "qpdf_ruby.hpp"
#include "pdf_struct_walker.hpp"
#include "struct_node.hpp"
#include "pdf_image_mapper.hpp"
#include "document_handle.hpp"

#include <qpdf/QPDF.hh>
#include <qpdf/QPDFWriter.hh>
#include <qpdf/QPDFPageObjectHelper.hh>
#include <qpdf/QPDFObjectHandle.hh>

#include <cerrno>     // For errno (checked before rb_sys_fail)
#include <iostream>
#include <stdexcept>  // For std::exception (if you add try-catch)
#include <vector>     // For std::vector
#include <string>     // For std::string
#include <regex>
17
+
18
// Ruby module QpdfRuby and class QpdfRuby::Document; both are assigned in
// Init_qpdf_ruby().
VALUE rb_mQpdfRuby;
VALUE rb_cDocument;
20
+
21
+ using namespace qpdf_ruby;
22
+
23
+ VALUE rb_qpdf_get_structure_string(VALUE self) {
24
+ DocumentHandle* h;
25
+ Data_Get_Struct(self, DocumentHandle, h);
26
+
27
+ try {
28
+ QPDF& pdf = h->qpdf();
29
+
30
+ QPDFObjectHandle catalog = pdf.getRoot();
31
+ QPDFObjectHandle struct_root = catalog.getKey("/StructTreeRoot");
32
+ if (!struct_root.isDictionary()) {
33
+ rb_raise(rb_eRuntimeError, "No StructTreeRoot found");
34
+ }
35
+ QPDFObjectHandle topKids = struct_root.getKey("/K");
36
+
37
+ PDFStructWalker walker(std::cout); // For now, std::cout, unless you pass another stream
38
+ walker.buildPageObjectMap(pdf);
39
+
40
+ std::string result;
41
+ if (topKids.isArray()) {
42
+ for (int i = 0; i < topKids.getArrayNItems(); ++i) {
43
+ result += walker.get_structure_as_string(topKids.getArrayItem(i));
44
+ }
45
+ } else {
46
+ result = walker.get_structure_as_string(topKids);
47
+ }
48
+
49
+ return rb_str_new(result.c_str(), result.length());
50
+ } catch (const std::exception& e) {
51
+ rb_raise(rb_eRuntimeError, "Error: %s", e.what());
52
+ }
53
+ return Qnil;
54
+ }
55
+
56
+ VALUE rb_qpdf_mark_paths_as_artifacts(VALUE self) {
57
+ DocumentHandle* h;
58
+ Data_Get_Struct(self, DocumentHandle, h);
59
+ try {
60
+ QPDF& pdf = h->qpdf();
61
+
62
+ std::vector<QPDFObjectHandle> pages = pdf.getAllPages();
63
+ std::regex path_regex(R"((?:[-+]?\d*\.?\d+(?:e[-+]?\d+)?\s+){4}re\s+(?:S|s|f\*?|F|B\*?|b\*?|n))",
64
+ std::regex::ECMAScript | std::regex::optimize);
65
+
66
+ for (auto& page_obj : pages) {
67
+ QPDFPageObjectHelper poh(page_obj);
68
+ std::vector<QPDFObjectHandle> contents = poh.getPageContents();
69
+ std::vector<QPDFObjectHandle> new_contents_array;
70
+
71
+ for (auto& content_stream : contents) {
72
+ if (content_stream.isStream()) {
73
+ // Use std::shared_ptr instead of PointerHolder
74
+ std::shared_ptr<Buffer> stream_buffer = content_stream.getStreamData();
75
+ std::string stream_data_str(reinterpret_cast<char*>(stream_buffer->getBuffer()), stream_buffer->getSize());
76
+
77
+ std::string new_stream_data_str = std::regex_replace(stream_data_str, path_regex, "/Artifact BMC\n$&\nEMC");
78
+
79
+ // Create a new Buffer with std::shared_ptr
80
+ std::shared_ptr<Buffer> new_buffer = std::make_shared<Buffer>(new_stream_data_str.length());
81
+ memcpy(new_buffer->getBuffer(), new_stream_data_str.data(), new_stream_data_str.length());
82
+
83
+ QPDFObjectHandle new_stream = QPDFObjectHandle::newStream(&pdf, new_buffer);
84
+ new_contents_array.push_back(new_stream);
85
+ } else {
86
+ new_contents_array.push_back(content_stream);
87
+ }
88
+ }
89
+
90
+ if (new_contents_array.size() == 1) {
91
+ page_obj.replaceKey("/Contents", new_contents_array[0]);
92
+ } else {
93
+ page_obj.replaceKey("/Contents", QPDFObjectHandle::newArray(new_contents_array));
94
+ }
95
+ }
96
+ } catch (const QPDFExc& e) { // Catching specific QPDF exceptions is good
97
+ rb_raise(rb_eRuntimeError, "QPDF Error: %s (filename: %s)", e.what(), e.getFilename().c_str());
98
+ } catch (const std::exception& e) { // Fallback for other standard exceptions
99
+ rb_raise(rb_eRuntimeError, "Error: %s", e.what());
100
+ }
101
+
102
+ return Qnil;
103
+ }
104
+
105
+ VALUE rb_qpdf_ensure_bboxs(VALUE self) {
106
+ DocumentHandle* h;
107
+ Data_Get_Struct(self, DocumentHandle, h);
108
+
109
+ try {
110
+ QPDF& pdf = h->qpdf();
111
+
112
+ QPDFObjectHandle catalog = pdf.getRoot();
113
+ QPDFObjectHandle struct_root = catalog.getKey("/StructTreeRoot");
114
+ if (!struct_root.isDictionary()) {
115
+ rb_raise(rb_eRuntimeError, "No StructTreeRoot found");
116
+ }
117
+ QPDFObjectHandle topKids = struct_root.getKey("/K");
118
+
119
+ PDFImageMapper finder(0);
120
+
121
+ finder.find(pdf);
122
+
123
+ std::unordered_map<int, std::array<double, 4>> mcid2bbox;
124
+ for (const auto& kv : finder.getImageMap()) {
125
+ if (kv.second.mcid >= 0) mcid2bbox[kv.second.mcid] = kv.second.bbox;
126
+ }
127
+
128
+ PDFStructWalker walker(std::cout, mcid2bbox); // For now, std::cout, unless you pass another stream
129
+
130
+ if (topKids.isArray()) {
131
+ for (int i = 0; i < topKids.getArrayNItems(); ++i) {
132
+ walker.ensureLayoutBBox(topKids.getArrayItem(i));
133
+ }
134
+ } else {
135
+ walker.ensureLayoutBBox(topKids);
136
+ }
137
+ } catch (const std::exception& e) {
138
+ rb_raise(rb_eRuntimeError, "Error: %s", e.what());
139
+ }
140
+ return Qnil;
141
+ }
142
+
143
+ static void doc_free(void* ptr) { qpdf_ruby::qpdf_ruby_close(static_cast<qpdf_ruby::DocumentHandle*>(ptr)); }
144
+
145
+ static VALUE doc_alloc(VALUE klass) { return Data_Wrap_Struct(klass, /* mark */ 0, doc_free, nullptr); }
146
+
147
+ static VALUE doc_initialize(VALUE self, VALUE filename) {
148
+ Check_Type(filename, T_STRING);
149
+ DocumentHandle* h = qpdf_ruby::qpdf_ruby_open(StringValueCStr(filename));
150
+ if (!h) rb_sys_fail("qpdf_ruby_open");
151
+ DATA_PTR(self) = h;
152
+
153
+ return self;
154
+ }
155
+
156
+ static VALUE doc_write(VALUE self, VALUE out_filename) {
157
+ DocumentHandle* h;
158
+ Data_Get_Struct(self, DocumentHandle, h);
159
+ if (qpdf_ruby::qpdf_ruby_write(h, StringValueCStr(out_filename)) == -1) rb_sys_fail("qpdf_ruby_write");
160
+
161
+ return Qnil;
162
+ }
163
+
164
+ VALUE qpdf_ruby_write_memory(DocumentHandle* h) {
165
+ if (!h) rb_sys_fail("Bad handle");
166
+ std::string bytes = h->write_to_memory();
167
+ return rb_str_new(bytes.data(), bytes.size());
168
+ }
169
+
170
+ static VALUE doc_from_memory(VALUE klass, VALUE str, VALUE password) {
171
+ Check_Type(str, T_STRING);
172
+ Check_Type(password, T_STRING);
173
+
174
+ DocumentHandle* h = qpdf_ruby_open_memory("ruby-memory", reinterpret_cast<unsigned char const*>(RSTRING_PTR(str)),
175
+ RSTRING_LEN(str), StringValueCStr(password));
176
+
177
+ if (!h) rb_sys_fail("qpdf_ruby_open_memory");
178
+
179
+ VALUE obj = Data_Wrap_Struct(klass, 0, doc_free, h);
180
+ return obj;
181
+ }
182
+
183
+ static VALUE doc_to_memory(VALUE self) {
184
+ DocumentHandle* h;
185
+ Data_Get_Struct(self, DocumentHandle, h);
186
+ return qpdf_ruby_write_memory(h); // returns a Ruby ::String
187
+ }
188
+
189
// Extension entry point: defines module QpdfRuby and class
// QpdfRuby::Document and registers the Document methods.
// NOTE(review): `RUBY_FUNC_EXPORTED "C"` only forms a valid declaration if the
// macro expansion ends in `extern` — confirm for the supported Ruby versions.
RUBY_FUNC_EXPORTED "C" void Init_qpdf_ruby(void) {
  rb_mQpdfRuby = rb_define_module("QpdfRuby");
  rb_cDocument = rb_define_class_under(rb_mQpdfRuby, "Document", rb_cObject);

  // Objects start with a null handle; #initialize attaches the real one.
  rb_define_alloc_func(rb_cDocument, doc_alloc);

  // Construction.
  rb_define_method(rb_cDocument, "initialize", RUBY_METHOD_FUNC(doc_initialize), 1);
  rb_define_singleton_method(rb_cDocument, "from_memory", RUBY_METHOD_FUNC(doc_from_memory), 2);

  // Output.
  rb_define_method(rb_cDocument, "write", RUBY_METHOD_FUNC(doc_write), 1);
  rb_define_method(rb_cDocument, "to_memory", RUBY_METHOD_FUNC(doc_to_memory), 0);

  // Structure-tree operations. Note that the Ruby name "ensure_bbox" maps to
  // rb_qpdf_ensure_bboxs and "show_structure" to rb_qpdf_get_structure_string.
  rb_define_method(rb_cDocument, "mark_paths_as_artifacts", RUBY_METHOD_FUNC(rb_qpdf_mark_paths_as_artifacts), 0);
  rb_define_method(rb_cDocument, "ensure_bbox", RUBY_METHOD_FUNC(rb_qpdf_ensure_bboxs), 0);
  rb_define_method(rb_cDocument, "show_structure", RUBY_METHOD_FUNC(rb_qpdf_get_structure_string), 0);
}
@@ -0,0 +1,10 @@
1
+ #ifndef QPDF_RUBY_H
2
+ #define QPDF_RUBY_H 1
3
+
4
+ #include "ruby.h"
5
+
6
// Method implementations for the Ruby Document class; each takes the
// receiver as `self`.
VALUE rb_qpdf_mark_paths_as_artifacts(VALUE self);
VALUE rb_qpdf_ensure_bboxs(VALUE self);
VALUE rb_qpdf_get_structure_string(VALUE self);
9
+
10
+ #endif /* QPDF_RUBY_H */
@@ -0,0 +1,10 @@
1
+ #include "struct_node.hpp"
2
+
3
+ std::string StreamNode::to_string(int level, PDFStructWalker& walker) {
4
+ std::ostringstream oss;
5
+ IndentHelper::indent(oss, level);
6
+ oss << "[Stream: length=" << size;
7
+
8
+ oss << "]" << std::endl;
9
+ return oss.str();
10
+ }