rxerces 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/CHANGES.md +17 -0
- data/README.md +29 -1
- data/benchmarks/xpath_validation_cache_benchmark.rb +157 -0
- data/benchmarks/xpath_validation_micro_benchmark.rb +168 -0
- data/e +0 -0
- data/ext/rxerces/rxerces.cpp +497 -22
- data/lib/rxerces/version.rb +1 -1
- data/lib/rxerces.rb +3 -2
- data/rxerces.gemspec +2 -1
- data/spec/document_spec.rb +184 -17
- data/spec/node_spec.rb +230 -58
- data/spec/nodeset_spec.rb +90 -0
- data/spec/rxerces_shared.rb +1 -1
- data/spec/rxerces_spec.rb +58 -0
- data/spec/schema_spec.rb +28 -1
- data/spec/spec_helper.rb +5 -0
- data/spec/xpath_cache_spec.rb +409 -0
- data/spec/xpath_spec.rb +306 -18
- data/tmp/arm64-darwin24/rxerces/3.4.8/rxerces.bundle.dSYM/Contents/Info.plist +20 -0
- data/tmp/arm64-darwin24/rxerces/3.4.8/rxerces.bundle.dSYM/Contents/Resources/Relocations/aarch64/rxerces.bundle.yml +5 -0
- data.tar.gz.sig +0 -0
- metadata +25 -4
- metadata.gz.sig +0 -0
- /data/{tmp/arm64-darwin24/rxerces/3.4.7 → ext/rxerces}/rxerces.bundle.dSYM/Contents/Info.plist +0 -0
- /data/{tmp/arm64-darwin24/rxerces/3.4.7 → ext/rxerces}/rxerces.bundle.dSYM/Contents/Resources/Relocations/aarch64/rxerces.bundle.yml +0 -0
data/ext/rxerces/rxerces.cpp
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
#include "rxerces.h"
|
|
2
|
+
#include <ruby/encoding.h>
|
|
2
3
|
#include <xercesc/util/PlatformUtils.hpp>
|
|
3
4
|
#include <xercesc/parsers/XercesDOMParser.hpp>
|
|
4
5
|
#include <xercesc/dom/DOM.hpp>
|
|
5
6
|
#include <xercesc/util/XMLString.hpp>
|
|
7
|
+
#include <xercesc/util/XMLUni.hpp>
|
|
6
8
|
#include <xercesc/framework/MemBufInputSource.hpp>
|
|
7
9
|
#include <xercesc/framework/MemBufFormatTarget.hpp>
|
|
8
10
|
#include <xercesc/util/XercesDefs.hpp>
|
|
@@ -10,8 +12,12 @@
|
|
|
10
12
|
#include <xercesc/dom/DOMXPathExpression.hpp>
|
|
11
13
|
#include <xercesc/sax/ErrorHandler.hpp>
|
|
12
14
|
#include <xercesc/sax/SAXParseException.hpp>
|
|
15
|
+
#include <xercesc/sax/SAXException.hpp>
|
|
13
16
|
#include <sstream>
|
|
14
17
|
#include <vector>
|
|
18
|
+
#include <mutex>
|
|
19
|
+
#include <list>
|
|
20
|
+
#include <unordered_map>
|
|
15
21
|
|
|
16
22
|
#ifdef HAVE_XALAN
|
|
17
23
|
#include <xalanc/XPath/XPathEvaluator.hpp>
|
|
@@ -50,6 +56,17 @@ static bool xerces_initialized = false;
|
|
|
50
56
|
#ifdef HAVE_XALAN
|
|
51
57
|
static bool xalan_initialized = false;
|
|
52
58
|
#endif
|
|
59
|
+
static std::mutex init_mutex;
|
|
60
|
+
|
|
61
|
+
// XPath validation cache with LRU eviction
|
|
62
|
+
// Uses a list for LRU ordering (front = most recently used)
|
|
63
|
+
// and a map for O(1) lookup of list iterators
|
|
64
|
+
static std::list<std::string>* xpath_cache_lru_list = nullptr;
|
|
65
|
+
static std::unordered_map<std::string, std::list<std::string>::iterator>* xpath_cache_map = nullptr;
|
|
66
|
+
static std::mutex xpath_cache_mutex;
|
|
67
|
+
static bool cache_xpath_validation = true; // Default: enabled
|
|
68
|
+
static size_t xpath_cache_max_size = 10000; // Max cached expressions
|
|
69
|
+
static size_t xpath_max_length = 10000; // Max XPath expression length
|
|
53
70
|
|
|
54
71
|
// Forward declarations
|
|
55
72
|
static std::string css_to_xpath(const char* css);
|
|
@@ -63,6 +80,12 @@ static void ensure_xerces_initialized() {
|
|
|
63
80
|
return;
|
|
64
81
|
}
|
|
65
82
|
|
|
83
|
+
std::lock_guard<std::mutex> lock(init_mutex);
|
|
84
|
+
|
|
85
|
+
if (xerces_initialized) {
|
|
86
|
+
return;
|
|
87
|
+
}
|
|
88
|
+
|
|
66
89
|
try {
|
|
67
90
|
XMLPlatformUtils::Initialize();
|
|
68
91
|
#ifdef HAVE_XALAN
|
|
@@ -80,6 +103,16 @@ static void ensure_xerces_initialized() {
|
|
|
80
103
|
|
|
81
104
|
// Cleanup function called at exit
|
|
82
105
|
static void cleanup_xerces() {
|
|
106
|
+
// Clean up XPath validation cache (LRU)
|
|
107
|
+
if (xpath_cache_lru_list) {
|
|
108
|
+
delete xpath_cache_lru_list;
|
|
109
|
+
xpath_cache_lru_list = nullptr;
|
|
110
|
+
}
|
|
111
|
+
if (xpath_cache_map) {
|
|
112
|
+
delete xpath_cache_map;
|
|
113
|
+
xpath_cache_map = nullptr;
|
|
114
|
+
}
|
|
115
|
+
|
|
83
116
|
#ifdef HAVE_XALAN
|
|
84
117
|
if (xalan_initialized) {
|
|
85
118
|
XPathEvaluator::terminate();
|
|
@@ -92,6 +125,167 @@ static void cleanup_xerces() {
|
|
|
92
125
|
}
|
|
93
126
|
}
|
|
94
127
|
|
|
128
|
+
// Validate XPath expression to prevent XPath injection attacks
|
|
129
|
+
static void validate_xpath_expression(const char* xpath_str) {
|
|
130
|
+
if (!xpath_str || strlen(xpath_str) == 0) {
|
|
131
|
+
rb_raise(rb_eArgError, "XPath expression cannot be empty");
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
std::string xpath(xpath_str);
|
|
135
|
+
|
|
136
|
+
// Check cache first if caching is enabled (LRU cache)
|
|
137
|
+
if (cache_xpath_validation) {
|
|
138
|
+
std::lock_guard<std::mutex> lock(xpath_cache_mutex);
|
|
139
|
+
if (!xpath_cache_lru_list) {
|
|
140
|
+
xpath_cache_lru_list = new std::list<std::string>();
|
|
141
|
+
}
|
|
142
|
+
if (!xpath_cache_map) {
|
|
143
|
+
xpath_cache_map = new std::unordered_map<std::string, std::list<std::string>::iterator>();
|
|
144
|
+
}
|
|
145
|
+
auto it = xpath_cache_map->find(xpath);
|
|
146
|
+
if (it != xpath_cache_map->end()) {
|
|
147
|
+
// Cache hit: move to front (most recently used)
|
|
148
|
+
xpath_cache_lru_list->splice(xpath_cache_lru_list->begin(), *xpath_cache_lru_list, it->second);
|
|
149
|
+
return; // Already validated
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
size_t len = xpath.length();
|
|
153
|
+
|
|
154
|
+
// Check for excessively long XPath expressions (potential DoS)
|
|
155
|
+
if (xpath_max_length > 0 && len > xpath_max_length) {
|
|
156
|
+
rb_raise(rb_eArgError, "XPath expression is too long (max %zu characters)", xpath_max_length);
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
// Check for dangerous patterns that could indicate XPath injection
|
|
160
|
+
// These patterns are commonly used in XPath injection attacks
|
|
161
|
+
|
|
162
|
+
// 1. Check for unbalanced quotes which could break out of string literals
|
|
163
|
+
int single_quotes = 0;
|
|
164
|
+
int double_quotes = 0;
|
|
165
|
+
bool in_single_quote = false;
|
|
166
|
+
bool in_double_quote = false;
|
|
167
|
+
|
|
168
|
+
for (size_t i = 0; i < len; i++) {
|
|
169
|
+
char c = xpath[i];
|
|
170
|
+
|
|
171
|
+
// Track quote state
|
|
172
|
+
if (c == '\'' && !in_double_quote) {
|
|
173
|
+
in_single_quote = !in_single_quote;
|
|
174
|
+
single_quotes++;
|
|
175
|
+
} else if (c == '"' && !in_single_quote) {
|
|
176
|
+
in_double_quote = !in_double_quote;
|
|
177
|
+
double_quotes++;
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// Unbalanced quotes are suspicious
|
|
182
|
+
if (single_quotes % 2 != 0 || double_quotes % 2 != 0) {
|
|
183
|
+
rb_raise(rb_eArgError, "XPath expression contains unbalanced quotes");
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// 2. Check for suspicious comment patterns that could be used to bypass validation
|
|
187
|
+
if (xpath.find("(:") != std::string::npos || xpath.find(":)") != std::string::npos) {
|
|
188
|
+
rb_raise(rb_eArgError, "XPath expression contains suspicious comment patterns");
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// 3. Check for null bytes which could truncate validation
|
|
192
|
+
if (xpath.find('\0') != std::string::npos) {
|
|
193
|
+
rb_raise(rb_eArgError, "XPath expression contains null bytes");
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// 4. Check for excessive nesting which could cause stack overflow
|
|
197
|
+
int bracket_depth = 0;
|
|
198
|
+
int paren_depth = 0;
|
|
199
|
+
const int MAX_DEPTH = 100;
|
|
200
|
+
|
|
201
|
+
for (size_t i = 0; i < len; i++) {
|
|
202
|
+
char c = xpath[i];
|
|
203
|
+
|
|
204
|
+
if (c == '[') bracket_depth++;
|
|
205
|
+
else if (c == ']') bracket_depth--;
|
|
206
|
+
else if (c == '(') paren_depth++;
|
|
207
|
+
else if (c == ')') paren_depth--;
|
|
208
|
+
|
|
209
|
+
if (bracket_depth > MAX_DEPTH || paren_depth > MAX_DEPTH) {
|
|
210
|
+
rb_raise(rb_eArgError, "XPath expression has excessive nesting depth");
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
if (bracket_depth < 0 || paren_depth < 0) {
|
|
214
|
+
rb_raise(rb_eArgError, "XPath expression has unbalanced brackets or parentheses");
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
if (bracket_depth != 0 || paren_depth != 0) {
|
|
219
|
+
rb_raise(rb_eArgError, "XPath expression has unbalanced brackets or parentheses");
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// 5. Check for suspicious function calls that could access system functions
|
|
223
|
+
// or perform dangerous operations
|
|
224
|
+
std::vector<std::string> dangerous_patterns = {
|
|
225
|
+
"document(", // Can access external documents
|
|
226
|
+
"doc(", // Can access external documents
|
|
227
|
+
"collection(", // Can access external collections
|
|
228
|
+
"unparsed-text(", // Can read arbitrary files
|
|
229
|
+
"system-property(", // Can leak system information
|
|
230
|
+
"environment-variable(", // Can leak environment variables
|
|
231
|
+
};
|
|
232
|
+
|
|
233
|
+
for (const auto& pattern : dangerous_patterns) {
|
|
234
|
+
if (xpath.find(pattern) != std::string::npos) {
|
|
235
|
+
rb_raise(rb_eArgError, "XPath expression contains potentially dangerous function: %s", pattern.c_str());
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
// 6. Check for encoded characters that could bypass validation
|
|
240
|
+
// Use specific patterns to avoid false positives (e.g., "Q&A" in text)
|
|
241
|
+
if (xpath.find("&#") != std::string::npos || // Numeric character reference (<)
|
|
242
|
+
xpath.find("&#x") != std::string::npos || // Hex character reference (<)
|
|
243
|
+
xpath.find("&#") != std::string::npos) { // Encoded entity reference
|
|
244
|
+
rb_raise(rb_eArgError, "XPath expression contains encoded characters");
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// 7. Detect potential boolean-based blind XPath injection patterns
|
|
248
|
+
// These patterns use 'or' with always-true conditions
|
|
249
|
+
std::vector<std::string> injection_patterns = {
|
|
250
|
+
"or 1=1",
|
|
251
|
+
"or '1'='1'",
|
|
252
|
+
"or \"1\"=\"1\"",
|
|
253
|
+
"or true()",
|
|
254
|
+
"and 1=0",
|
|
255
|
+
"and false()",
|
|
256
|
+
"or 'a'='a'",
|
|
257
|
+
"or \"a\"=\"a\"",
|
|
258
|
+
};
|
|
259
|
+
|
|
260
|
+
// Convert to lowercase for case-insensitive comparison
|
|
261
|
+
std::string xpath_lower = xpath;
|
|
262
|
+
std::transform(xpath_lower.begin(), xpath_lower.end(), xpath_lower.begin(), ::tolower);
|
|
263
|
+
|
|
264
|
+
for (const auto& pattern : injection_patterns) {
|
|
265
|
+
if (xpath_lower.find(pattern) != std::string::npos) {
|
|
266
|
+
rb_raise(rb_eArgError, "XPath expression contains suspicious injection pattern");
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
// Add to cache if caching is enabled (LRU eviction)
|
|
271
|
+
if (cache_xpath_validation) {
|
|
272
|
+
std::lock_guard<std::mutex> lock(xpath_cache_mutex);
|
|
273
|
+
if (xpath_cache_lru_list && xpath_cache_map) {
|
|
274
|
+
// If cache is full, evict least recently used (back of list)
|
|
275
|
+
if (xpath_cache_max_size > 0 && xpath_cache_map->size() >= xpath_cache_max_size) {
|
|
276
|
+
std::string& lru = xpath_cache_lru_list->back();
|
|
277
|
+
xpath_cache_map->erase(lru);
|
|
278
|
+
xpath_cache_lru_list->pop_back();
|
|
279
|
+
}
|
|
280
|
+
// Add new entry to front (most recently used)
|
|
281
|
+
if (xpath_cache_max_size > 0) {
|
|
282
|
+
xpath_cache_lru_list->push_front(xpath);
|
|
283
|
+
(*xpath_cache_map)[xpath] = xpath_cache_lru_list->begin();
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
95
289
|
// Helper class to manage XMLCh strings
|
|
96
290
|
class XStr {
|
|
97
291
|
public:
|
|
@@ -353,35 +547,110 @@ static VALUE wrap_node(DOMNode* node, VALUE doc_ref) {
|
|
|
353
547
|
return Qnil;
|
|
354
548
|
}
|
|
355
549
|
|
|
356
|
-
|
|
357
|
-
wrapper->node = node;
|
|
358
|
-
wrapper->doc_ref = doc_ref;
|
|
359
|
-
|
|
360
|
-
VALUE rb_node;
|
|
361
|
-
|
|
550
|
+
VALUE rb_class;
|
|
362
551
|
switch (node->getNodeType()) {
|
|
363
552
|
case DOMNode::ELEMENT_NODE:
|
|
364
|
-
|
|
553
|
+
rb_class = rb_cElement;
|
|
365
554
|
break;
|
|
366
555
|
case DOMNode::TEXT_NODE:
|
|
367
|
-
|
|
556
|
+
rb_class = rb_cText;
|
|
368
557
|
break;
|
|
369
558
|
default:
|
|
370
|
-
|
|
559
|
+
rb_class = rb_cNode;
|
|
371
560
|
break;
|
|
372
561
|
}
|
|
373
562
|
|
|
563
|
+
VALUE rb_node = TypedData_Wrap_Struct(rb_class, &node_type, NULL);
|
|
564
|
+
NodeWrapper* wrapper = ALLOC(NodeWrapper);
|
|
565
|
+
wrapper->node = node;
|
|
566
|
+
wrapper->doc_ref = doc_ref;
|
|
567
|
+
DATA_PTR(rb_node) = wrapper;
|
|
568
|
+
|
|
374
569
|
return rb_node;
|
|
375
570
|
}
|
|
376
571
|
|
|
377
|
-
// RXerces::XML::Document.parse(string)
|
|
378
|
-
|
|
572
|
+
// RXerces::XML::Document.parse(string, options = {})
|
|
573
|
+
// Validate options hash for document_parse - only allow known keys
|
|
574
|
+
static void validate_parse_options(VALUE options) {
|
|
575
|
+
if (NIL_P(options)) {
|
|
576
|
+
return;
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
Check_Type(options, T_HASH);
|
|
580
|
+
|
|
581
|
+
// Define allowed option keys
|
|
582
|
+
std::vector<const char*> allowed_keys = {
|
|
583
|
+
"allow_external_entities"
|
|
584
|
+
};
|
|
585
|
+
|
|
586
|
+
// Get all keys from the provided options hash
|
|
587
|
+
VALUE keys = rb_funcall(options, rb_intern("keys"), 0);
|
|
588
|
+
long keys_len = RARRAY_LEN(keys);
|
|
589
|
+
|
|
590
|
+
// Check each key against the allowed list
|
|
591
|
+
for (long i = 0; i < keys_len; i++) {
|
|
592
|
+
VALUE key = rb_ary_entry(keys, i);
|
|
593
|
+
|
|
594
|
+
// Convert symbol or string key to string for comparison
|
|
595
|
+
VALUE key_str;
|
|
596
|
+
if (TYPE(key) == T_SYMBOL) {
|
|
597
|
+
key_str = rb_sym_to_s(key);
|
|
598
|
+
} else if (TYPE(key) == T_STRING) {
|
|
599
|
+
key_str = key;
|
|
600
|
+
} else {
|
|
601
|
+
rb_raise(rb_eArgError, "Option keys must be symbols or strings");
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
const char* key_cstr = StringValueCStr(key_str);
|
|
605
|
+
bool found = false;
|
|
606
|
+
|
|
607
|
+
for (const auto& allowed : allowed_keys) {
|
|
608
|
+
if (strcmp(key_cstr, allowed) == 0) {
|
|
609
|
+
found = true;
|
|
610
|
+
break;
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
if (!found) {
|
|
615
|
+
rb_raise(rb_eArgError, "Unknown option: %s. Allowed options are: allow_external_entities", key_cstr);
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
static VALUE document_parse(int argc, VALUE* argv, VALUE klass) {
|
|
621
|
+
VALUE str, options;
|
|
622
|
+
rb_scan_args(argc, argv, "11", &str, &options);
|
|
623
|
+
|
|
379
624
|
ensure_xerces_initialized();
|
|
380
625
|
|
|
381
626
|
Check_Type(str, T_STRING);
|
|
382
627
|
const char* xml_str = StringValueCStr(str);
|
|
383
628
|
|
|
629
|
+
// Validate options hash before processing
|
|
630
|
+
validate_parse_options(options);
|
|
631
|
+
|
|
384
632
|
XercesDOMParser* parser = new XercesDOMParser();
|
|
633
|
+
|
|
634
|
+
// Check if external entities should be allowed (default: false for security)
|
|
635
|
+
bool allow_external = false;
|
|
636
|
+
if (!NIL_P(options)) {
|
|
637
|
+
VALUE allow_key = rb_intern("allow_external_entities");
|
|
638
|
+
VALUE allow_val = rb_hash_aref(options, ID2SYM(allow_key));
|
|
639
|
+
if (RTEST(allow_val)) {
|
|
640
|
+
allow_external = true;
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
if (allow_external) {
|
|
645
|
+
// Allow external entities (less secure)
|
|
646
|
+
parser->setLoadExternalDTD(true);
|
|
647
|
+
parser->setDisableDefaultEntityResolution(false);
|
|
648
|
+
} else {
|
|
649
|
+
// Security: Disable external entity processing to prevent XXE attacks
|
|
650
|
+
parser->setLoadExternalDTD(false);
|
|
651
|
+
parser->setDisableDefaultEntityResolution(true);
|
|
652
|
+
}
|
|
653
|
+
|
|
385
654
|
parser->setValidationScheme(XercesDOMParser::Val_Never);
|
|
386
655
|
parser->setDoNamespaces(true);
|
|
387
656
|
parser->setDoSchema(false);
|
|
@@ -485,8 +754,16 @@ static VALUE document_to_s(VALUE self) {
|
|
|
485
754
|
serializer->release();
|
|
486
755
|
|
|
487
756
|
return result;
|
|
757
|
+
} catch (const DOMException& e) {
|
|
758
|
+
CharStr message(e.getMessage());
|
|
759
|
+
rb_raise(rb_eRuntimeError, "Failed to serialize document: %s", message.localForm());
|
|
760
|
+
} catch (const XMLException& e) {
|
|
761
|
+
CharStr message(e.getMessage());
|
|
762
|
+
rb_raise(rb_eRuntimeError, "Failed to serialize document (XMLException): %s", message.localForm());
|
|
763
|
+
} catch (const std::exception& e) {
|
|
764
|
+
rb_raise(rb_eRuntimeError, "Failed to serialize document (std::exception): %s", e.what());
|
|
488
765
|
} catch (...) {
|
|
489
|
-
rb_raise(rb_eRuntimeError, "Failed to serialize document");
|
|
766
|
+
rb_raise(rb_eRuntimeError, "Failed to serialize document (unknown exception type)");
|
|
490
767
|
}
|
|
491
768
|
|
|
492
769
|
return Qnil;
|
|
@@ -702,6 +979,9 @@ static VALUE document_last_element_child(VALUE self) {
|
|
|
702
979
|
#ifdef HAVE_XALAN
|
|
703
980
|
// Helper function to execute XPath using Xalan for full XPath 1.0 support
|
|
704
981
|
static VALUE execute_xpath_with_xalan(DOMNode* context_node, const char* xpath_str, VALUE doc_ref) {
|
|
982
|
+
// Validate XPath expression before execution
|
|
983
|
+
validate_xpath_expression(xpath_str);
|
|
984
|
+
|
|
705
985
|
ensure_xerces_initialized();
|
|
706
986
|
|
|
707
987
|
try {
|
|
@@ -809,6 +1089,9 @@ static VALUE document_xpath(VALUE self, VALUE path) {
|
|
|
809
1089
|
Check_Type(path, T_STRING);
|
|
810
1090
|
const char* xpath_str = StringValueCStr(path);
|
|
811
1091
|
|
|
1092
|
+
// Validate XPath expression before execution
|
|
1093
|
+
validate_xpath_expression(xpath_str);
|
|
1094
|
+
|
|
812
1095
|
#ifdef HAVE_XALAN
|
|
813
1096
|
// Use Xalan for full XPath 1.0 support
|
|
814
1097
|
DOMElement* root = doc_wrapper->doc->getDocumentElement();
|
|
@@ -1394,6 +1677,37 @@ static VALUE node_attributes(VALUE self) {
|
|
|
1394
1677
|
return hash;
|
|
1395
1678
|
}
|
|
1396
1679
|
|
|
1680
|
+
// node.attribute_nodes - returns array of attribute nodes (only for element nodes)
|
|
1681
|
+
static VALUE node_attribute_nodes(VALUE self) {
|
|
1682
|
+
NodeWrapper* wrapper;
|
|
1683
|
+
TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
|
|
1684
|
+
|
|
1685
|
+
VALUE nodes_array = rb_ary_new();
|
|
1686
|
+
|
|
1687
|
+
if (!wrapper->node || wrapper->node->getNodeType() != DOMNode::ELEMENT_NODE) {
|
|
1688
|
+
return nodes_array;
|
|
1689
|
+
}
|
|
1690
|
+
|
|
1691
|
+
DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
|
|
1692
|
+
DOMNamedNodeMap* attributes = element->getAttributes();
|
|
1693
|
+
|
|
1694
|
+
if (!attributes) {
|
|
1695
|
+
return nodes_array;
|
|
1696
|
+
}
|
|
1697
|
+
|
|
1698
|
+
VALUE doc_ref = wrapper->doc_ref;
|
|
1699
|
+
XMLSize_t length = attributes->getLength();
|
|
1700
|
+
|
|
1701
|
+
for (XMLSize_t i = 0; i < length; i++) {
|
|
1702
|
+
DOMNode* attr = attributes->item(i);
|
|
1703
|
+
if (attr) {
|
|
1704
|
+
rb_ary_push(nodes_array, wrap_node(attr, doc_ref));
|
|
1705
|
+
}
|
|
1706
|
+
}
|
|
1707
|
+
|
|
1708
|
+
return nodes_array;
|
|
1709
|
+
}
|
|
1710
|
+
|
|
1397
1711
|
// node.next_sibling
|
|
1398
1712
|
static VALUE node_next_sibling(VALUE self) {
|
|
1399
1713
|
NodeWrapper* wrapper;
|
|
@@ -1493,7 +1807,7 @@ static VALUE node_add_child(VALUE self, VALUE child) {
|
|
|
1493
1807
|
}
|
|
1494
1808
|
|
|
1495
1809
|
DOMNode* child_node = NULL;
|
|
1496
|
-
|
|
1810
|
+
VALUE doc_ref = wrapper->doc_ref; // Keep track of the Ruby document reference
|
|
1497
1811
|
|
|
1498
1812
|
// Check if child is a string or a node
|
|
1499
1813
|
if (TYPE(child) == T_STRING) {
|
|
@@ -1507,13 +1821,27 @@ static VALUE node_add_child(VALUE self, VALUE child) {
|
|
|
1507
1821
|
NodeWrapper* child_wrapper;
|
|
1508
1822
|
if (rb_obj_is_kind_of(child, rb_cNode)) {
|
|
1509
1823
|
TypedData_Get_Struct(child, NodeWrapper, &node_type, child_wrapper);
|
|
1510
|
-
|
|
1824
|
+
DOMNode* original_child = child_wrapper->node;
|
|
1511
1825
|
|
|
1512
1826
|
// Check if child belongs to a different document
|
|
1513
|
-
DOMDocument* child_doc =
|
|
1827
|
+
DOMDocument* child_doc = original_child->getOwnerDocument();
|
|
1514
1828
|
if (child_doc && child_doc != doc) {
|
|
1515
|
-
|
|
1516
|
-
|
|
1829
|
+
// Automatically import the node from the other document
|
|
1830
|
+
// The second parameter 'true' means deep copy (include all descendants)
|
|
1831
|
+
try {
|
|
1832
|
+
child_node = doc->importNode(original_child, true);
|
|
1833
|
+
|
|
1834
|
+
// Update the child wrapper to point to the imported node
|
|
1835
|
+
// and the new document reference
|
|
1836
|
+
child_wrapper->node = child_node;
|
|
1837
|
+
child_wrapper->doc_ref = doc_ref;
|
|
1838
|
+
} catch (const DOMException& e) {
|
|
1839
|
+
CharStr message(e.getMessage());
|
|
1840
|
+
rb_raise(rb_eRuntimeError, "Failed to import node from different document: %s",
|
|
1841
|
+
message.localForm());
|
|
1842
|
+
}
|
|
1843
|
+
} else {
|
|
1844
|
+
child_node = original_child;
|
|
1517
1845
|
}
|
|
1518
1846
|
} else {
|
|
1519
1847
|
rb_raise(rb_eTypeError, "Argument must be a String or Node");
|
|
@@ -1752,6 +2080,9 @@ static VALUE node_xpath(VALUE self, VALUE path) {
|
|
|
1752
2080
|
const char* xpath_str = StringValueCStr(path);
|
|
1753
2081
|
VALUE doc_ref = node_wrapper->doc_ref;
|
|
1754
2082
|
|
|
2083
|
+
// Validate XPath expression before execution
|
|
2084
|
+
validate_xpath_expression(xpath_str);
|
|
2085
|
+
|
|
1755
2086
|
#ifdef HAVE_XALAN
|
|
1756
2087
|
// Use Xalan for full XPath 1.0 support
|
|
1757
2088
|
return execute_xpath_with_xalan(node_wrapper->node, xpath_str, doc_ref);
|
|
@@ -2121,6 +2452,31 @@ static VALUE nodeset_text(VALUE self) {
|
|
|
2121
2452
|
}
|
|
2122
2453
|
|
|
2123
2454
|
// nodeset.inspect / nodeset.to_s - human-readable representation
|
|
2455
|
+
// Helper function to safely truncate UTF-8 strings using Ruby's built-in UTF-8 handling
|
|
2456
|
+
// Ruby's rb_str_substr operates on CHARACTER positions, not byte positions
|
|
2457
|
+
static std::string safe_truncate_utf8(const std::string& str, long max_chars) {
|
|
2458
|
+
if (str.empty()) {
|
|
2459
|
+
return str;
|
|
2460
|
+
}
|
|
2461
|
+
|
|
2462
|
+
// Create a Ruby string with UTF-8 encoding
|
|
2463
|
+
VALUE rb_str = rb_enc_str_new(str.c_str(), str.length(), rb_utf8_encoding());
|
|
2464
|
+
|
|
2465
|
+
// Get the character length (not byte length)
|
|
2466
|
+
long char_len = RSTRING_LEN(rb_str);
|
|
2467
|
+
|
|
2468
|
+
if (char_len <= max_chars) {
|
|
2469
|
+
return str;
|
|
2470
|
+
}
|
|
2471
|
+
|
|
2472
|
+
// Use Ruby's rb_str_substr which correctly handles multi-byte characters
|
|
2473
|
+
// Parameters: string, start position (in characters), length (in characters)
|
|
2474
|
+
VALUE truncated = rb_str_substr(rb_str, 0, max_chars);
|
|
2475
|
+
|
|
2476
|
+
// Convert back to C++ string
|
|
2477
|
+
return std::string(RSTRING_PTR(truncated), RSTRING_LEN(truncated));
|
|
2478
|
+
}
|
|
2479
|
+
|
|
2124
2480
|
static VALUE nodeset_inspect(VALUE self) {
|
|
2125
2481
|
NodeSetWrapper* wrapper;
|
|
2126
2482
|
TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
|
|
@@ -2191,7 +2547,7 @@ static VALUE nodeset_inspect(VALUE self) {
|
|
|
2191
2547
|
textStr = textStr.substr(start, end - start + 1);
|
|
2192
2548
|
|
|
2193
2549
|
if (textStr.length() > 30) {
|
|
2194
|
-
textStr = textStr
|
|
2550
|
+
textStr = safe_truncate_utf8(textStr, 27) + "...";
|
|
2195
2551
|
}
|
|
2196
2552
|
|
|
2197
2553
|
result += ">";
|
|
@@ -2219,7 +2575,7 @@ static VALUE nodeset_inspect(VALUE self) {
|
|
|
2219
2575
|
textStr = textStr.substr(start, end - start + 1);
|
|
2220
2576
|
|
|
2221
2577
|
if (textStr.length() > 30) {
|
|
2222
|
-
textStr = textStr
|
|
2578
|
+
textStr = safe_truncate_utf8(textStr, 27) + "...";
|
|
2223
2579
|
}
|
|
2224
2580
|
|
|
2225
2581
|
result += "text(\"";
|
|
@@ -2241,7 +2597,10 @@ static VALUE nodeset_inspect(VALUE self) {
|
|
|
2241
2597
|
}
|
|
2242
2598
|
|
|
2243
2599
|
result += "]>";
|
|
2244
|
-
|
|
2600
|
+
VALUE rb_result = rb_str_new_cstr(result.c_str());
|
|
2601
|
+
// Ensure the string is marked as UTF-8 encoded
|
|
2602
|
+
rb_enc_associate(rb_result, rb_utf8_encoding());
|
|
2603
|
+
return rb_result;
|
|
2245
2604
|
}
|
|
2246
2605
|
|
|
2247
2606
|
// Schema.from_document(schema_doc) or Schema.from_string(xsd_string)
|
|
@@ -2282,6 +2641,18 @@ static VALUE schema_from_document(int argc, VALUE* argv, VALUE klass) {
|
|
|
2282
2641
|
|
|
2283
2642
|
try {
|
|
2284
2643
|
schemaParser->parse(schemaInput);
|
|
2644
|
+
} catch (const XMLException& e) {
|
|
2645
|
+
delete schemaParser;
|
|
2646
|
+
delete wrapper->schemaContent;
|
|
2647
|
+
xfree(wrapper);
|
|
2648
|
+
CharStr message(e.getMessage());
|
|
2649
|
+
rb_raise(rb_eRuntimeError, "Schema parsing failed: %s", message.localForm());
|
|
2650
|
+
} catch (const SAXException& e) {
|
|
2651
|
+
delete schemaParser;
|
|
2652
|
+
delete wrapper->schemaContent;
|
|
2653
|
+
xfree(wrapper);
|
|
2654
|
+
CharStr message(e.getMessage());
|
|
2655
|
+
rb_raise(rb_eRuntimeError, "Schema parsing failed: %s", message.localForm());
|
|
2285
2656
|
} catch (...) {
|
|
2286
2657
|
delete schemaParser;
|
|
2287
2658
|
delete wrapper->schemaContent;
|
|
@@ -2363,6 +2734,12 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
|
|
|
2363
2734
|
validator->loadGrammar(schemaSource, Grammar::SchemaGrammarType, true);
|
|
2364
2735
|
validator->setExternalNoNamespaceSchemaLocation("schema.xsd");
|
|
2365
2736
|
validator->useCachedGrammarInParse(true);
|
|
2737
|
+
} catch (const XMLException& e) {
|
|
2738
|
+
CharStr message(e.getMessage());
|
|
2739
|
+
errorHandler.errors.push_back(std::string("Warning: Schema grammar could not be loaded: ") + message.localForm());
|
|
2740
|
+
} catch (const SAXException& e) {
|
|
2741
|
+
CharStr message(e.getMessage());
|
|
2742
|
+
errorHandler.errors.push_back(std::string("Warning: Schema grammar could not be loaded: ") + message.localForm());
|
|
2366
2743
|
} catch (...) {
|
|
2367
2744
|
// If grammar loading fails, just note it
|
|
2368
2745
|
errorHandler.errors.push_back("Warning: Schema grammar could not be loaded");
|
|
@@ -2414,13 +2791,110 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
|
|
|
2414
2791
|
}
|
|
2415
2792
|
|
|
2416
2793
|
return Qnil;
|
|
2417
|
-
}
|
|
2794
|
+
}
|
|
2795
|
+
|
|
2796
|
+
// RXerces.cache_xpath_validation? - check if XPath validation caching is enabled
|
|
2797
|
+
static VALUE rxerces_cache_xpath_validation_p(VALUE self) {
|
|
2798
|
+
return cache_xpath_validation ? Qtrue : Qfalse;
|
|
2799
|
+
}
|
|
2800
|
+
|
|
2801
|
+
// RXerces.cache_xpath_validation = bool - enable/disable XPath validation caching
|
|
2802
|
+
static VALUE rxerces_set_cache_xpath_validation(VALUE self, VALUE val) {
|
|
2803
|
+
cache_xpath_validation = RTEST(val);
|
|
2804
|
+
return val;
|
|
2805
|
+
}
|
|
2806
|
+
|
|
2807
|
+
// RXerces.clear_xpath_validation_cache - clear the XPath validation cache
|
|
2808
|
+
static VALUE rxerces_clear_xpath_validation_cache(VALUE self) {
|
|
2809
|
+
std::lock_guard<std::mutex> lock(xpath_cache_mutex);
|
|
2810
|
+
if (xpath_cache_lru_list) {
|
|
2811
|
+
xpath_cache_lru_list->clear();
|
|
2812
|
+
}
|
|
2813
|
+
if (xpath_cache_map) {
|
|
2814
|
+
xpath_cache_map->clear();
|
|
2815
|
+
}
|
|
2816
|
+
return Qnil;
|
|
2817
|
+
}
|
|
2818
|
+
|
|
2819
|
+
// RXerces.xpath_validation_cache_size - return number of cached expressions
|
|
2820
|
+
static VALUE rxerces_xpath_validation_cache_size(VALUE self) {
|
|
2821
|
+
std::lock_guard<std::mutex> lock(xpath_cache_mutex);
|
|
2822
|
+
if (!xpath_cache_map) {
|
|
2823
|
+
return LONG2NUM(0);
|
|
2824
|
+
}
|
|
2825
|
+
return LONG2NUM((long)xpath_cache_map->size());
|
|
2826
|
+
}
|
|
2827
|
+
|
|
2828
|
+
// RXerces.xpath_validation_cache_max_size - get max cache size
|
|
2829
|
+
static VALUE rxerces_xpath_validation_cache_max_size(VALUE self) {
|
|
2830
|
+
return LONG2NUM((long)xpath_cache_max_size);
|
|
2831
|
+
}
|
|
2832
|
+
|
|
2833
|
+
// RXerces.xpath_validation_cache_max_size = n - set max cache size
|
|
2834
|
+
static VALUE rxerces_set_xpath_validation_cache_max_size(VALUE self, VALUE val) {
|
|
2835
|
+
// Validate input: must be a non-negative integer
|
|
2836
|
+
if (!RB_INTEGER_TYPE_P(val)) {
|
|
2837
|
+
rb_raise(rb_eTypeError, "xpath_validation_cache_max_size must be an Integer");
|
|
2838
|
+
}
|
|
2839
|
+
|
|
2840
|
+
long size = NUM2LONG(val);
|
|
2841
|
+
if (size < 0) {
|
|
2842
|
+
rb_raise(rb_eArgError, "xpath_validation_cache_max_size must be non-negative");
|
|
2843
|
+
}
|
|
2844
|
+
|
|
2845
|
+
xpath_cache_max_size = (size_t)size;
|
|
2846
|
+
return val;
|
|
2847
|
+
}
|
|
2848
|
+
|
|
2849
|
+
// RXerces.xalan_enabled? - check if Xalan is available
|
|
2850
|
+
static VALUE rxerces_xalan_enabled_p(VALUE self) {
|
|
2851
|
+
#ifdef HAVE_XALAN
|
|
2852
|
+
return Qtrue;
|
|
2853
|
+
#else
|
|
2854
|
+
return Qfalse;
|
|
2855
|
+
#endif
|
|
2856
|
+
}
|
|
2857
|
+
|
|
2858
|
+
// RXerces.xpath_max_length - get max XPath expression length
|
|
2859
|
+
static VALUE rxerces_xpath_max_length(VALUE self) {
|
|
2860
|
+
return LONG2NUM((long)xpath_max_length);
|
|
2861
|
+
}
|
|
2862
|
+
|
|
2863
|
+
// RXerces.xpath_max_length = n - set max XPath expression length (0 = no limit)
|
|
2864
|
+
static VALUE rxerces_set_xpath_max_length(VALUE self, VALUE val) {
|
|
2865
|
+
// Validate input: must be a non-negative integer
|
|
2866
|
+
if (!RB_INTEGER_TYPE_P(val)) {
|
|
2867
|
+
rb_raise(rb_eTypeError, "xpath_max_length must be an Integer");
|
|
2868
|
+
}
|
|
2869
|
+
|
|
2870
|
+
long size = NUM2LONG(val);
|
|
2871
|
+
if (size < 0) {
|
|
2872
|
+
rb_raise(rb_eArgError, "xpath_max_length must be non-negative");
|
|
2873
|
+
}
|
|
2874
|
+
|
|
2875
|
+
xpath_max_length = (size_t)size;
|
|
2876
|
+
return val;
|
|
2877
|
+
}
|
|
2878
|
+
|
|
2879
|
+
extern "C" void Init_rxerces(void) {
|
|
2418
2880
|
rb_mRXerces = rb_define_module("RXerces");
|
|
2881
|
+
|
|
2882
|
+
// Module-level configuration methods for XPath validation caching
|
|
2883
|
+
rb_define_singleton_method(rb_mRXerces, "cache_xpath_validation?", RUBY_METHOD_FUNC(rxerces_cache_xpath_validation_p), 0);
|
|
2884
|
+
rb_define_singleton_method(rb_mRXerces, "cache_xpath_validation=", RUBY_METHOD_FUNC(rxerces_set_cache_xpath_validation), 1);
|
|
2885
|
+
rb_define_singleton_method(rb_mRXerces, "clear_xpath_validation_cache", RUBY_METHOD_FUNC(rxerces_clear_xpath_validation_cache), 0);
|
|
2886
|
+
rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_size", RUBY_METHOD_FUNC(rxerces_xpath_validation_cache_size), 0);
|
|
2887
|
+
rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_max_size", RUBY_METHOD_FUNC(rxerces_xpath_validation_cache_max_size), 0);
|
|
2888
|
+
rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_max_size=", RUBY_METHOD_FUNC(rxerces_set_xpath_validation_cache_max_size), 1);
|
|
2889
|
+
rb_define_singleton_method(rb_mRXerces, "xpath_max_length", RUBY_METHOD_FUNC(rxerces_xpath_max_length), 0);
|
|
2890
|
+
rb_define_singleton_method(rb_mRXerces, "xpath_max_length=", RUBY_METHOD_FUNC(rxerces_set_xpath_max_length), 1);
|
|
2891
|
+
rb_define_singleton_method(rb_mRXerces, "xalan_enabled?", RUBY_METHOD_FUNC(rxerces_xalan_enabled_p), 0);
|
|
2892
|
+
|
|
2419
2893
|
rb_mXML = rb_define_module_under(rb_mRXerces, "XML");
|
|
2420
2894
|
|
|
2421
2895
|
rb_cDocument = rb_define_class_under(rb_mXML, "Document", rb_cObject);
|
|
2422
2896
|
rb_undef_alloc_func(rb_cDocument);
|
|
2423
|
-
rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), 1);
|
|
2897
|
+
rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), -1);
|
|
2424
2898
|
rb_define_method(rb_cDocument, "root", RUBY_METHOD_FUNC(document_root), 0);
|
|
2425
2899
|
rb_define_method(rb_cDocument, "errors", RUBY_METHOD_FUNC(document_errors), 0);
|
|
2426
2900
|
rb_define_method(rb_cDocument, "to_s", RUBY_METHOD_FUNC(document_to_s), 0);
|
|
@@ -2464,6 +2938,7 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
|
|
|
2464
2938
|
rb_define_method(rb_cNode, "parent", RUBY_METHOD_FUNC(node_parent), 0);
|
|
2465
2939
|
rb_define_method(rb_cNode, "ancestors", RUBY_METHOD_FUNC(node_ancestors), -1);
|
|
2466
2940
|
rb_define_method(rb_cNode, "attributes", RUBY_METHOD_FUNC(node_attributes), 0);
|
|
2941
|
+
rb_define_method(rb_cNode, "attribute_nodes", RUBY_METHOD_FUNC(node_attribute_nodes), 0);
|
|
2467
2942
|
rb_define_method(rb_cNode, "next_sibling", RUBY_METHOD_FUNC(node_next_sibling), 0);
|
|
2468
2943
|
rb_define_method(rb_cNode, "next_element", RUBY_METHOD_FUNC(node_next_element), 0);
|
|
2469
2944
|
rb_define_method(rb_cNode, "previous_sibling", RUBY_METHOD_FUNC(node_previous_sibling), 0);
|