rxerces 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,10 @@
1
1
  #include "rxerces.h"
2
+ #include <ruby/encoding.h>
2
3
  #include <xercesc/util/PlatformUtils.hpp>
3
4
  #include <xercesc/parsers/XercesDOMParser.hpp>
4
5
  #include <xercesc/dom/DOM.hpp>
5
6
  #include <xercesc/util/XMLString.hpp>
7
+ #include <xercesc/util/XMLUni.hpp>
6
8
  #include <xercesc/framework/MemBufInputSource.hpp>
7
9
  #include <xercesc/framework/MemBufFormatTarget.hpp>
8
10
  #include <xercesc/util/XercesDefs.hpp>
@@ -10,8 +12,12 @@
10
12
  #include <xercesc/dom/DOMXPathExpression.hpp>
11
13
  #include <xercesc/sax/ErrorHandler.hpp>
12
14
  #include <xercesc/sax/SAXParseException.hpp>
15
+ #include <xercesc/sax/SAXException.hpp>
13
16
  #include <sstream>
14
17
  #include <vector>
18
+ #include <mutex>
19
+ #include <list>
20
+ #include <unordered_map>
15
21
 
16
22
  #ifdef HAVE_XALAN
17
23
  #include <xalanc/XPath/XPathEvaluator.hpp>
@@ -50,6 +56,17 @@ static bool xerces_initialized = false;
50
56
  #ifdef HAVE_XALAN
51
57
  static bool xalan_initialized = false;
52
58
  #endif
59
+ static std::mutex init_mutex;
60
+
61
+ // XPath validation cache with LRU eviction
62
+ // Uses a list for LRU ordering (front = most recently used)
63
+ // and a map for O(1) lookup of list iterators
64
+ static std::list<std::string>* xpath_cache_lru_list = nullptr;
65
+ static std::unordered_map<std::string, std::list<std::string>::iterator>* xpath_cache_map = nullptr;
66
+ static std::mutex xpath_cache_mutex;
67
+ static bool cache_xpath_validation = true; // Default: enabled
68
+ static size_t xpath_cache_max_size = 10000; // Max cached expressions
69
+ static size_t xpath_max_length = 10000; // Max XPath expression length
53
70
 
54
71
  // Forward declarations
55
72
  static std::string css_to_xpath(const char* css);
@@ -63,6 +80,12 @@ static void ensure_xerces_initialized() {
63
80
  return;
64
81
  }
65
82
 
83
+ std::lock_guard<std::mutex> lock(init_mutex);
84
+
85
+ if (xerces_initialized) {
86
+ return;
87
+ }
88
+
66
89
  try {
67
90
  XMLPlatformUtils::Initialize();
68
91
  #ifdef HAVE_XALAN
@@ -80,6 +103,16 @@ static void ensure_xerces_initialized() {
80
103
 
81
104
  // Cleanup function called at exit
82
105
  static void cleanup_xerces() {
106
+ // Clean up XPath validation cache (LRU)
107
+ if (xpath_cache_lru_list) {
108
+ delete xpath_cache_lru_list;
109
+ xpath_cache_lru_list = nullptr;
110
+ }
111
+ if (xpath_cache_map) {
112
+ delete xpath_cache_map;
113
+ xpath_cache_map = nullptr;
114
+ }
115
+
83
116
  #ifdef HAVE_XALAN
84
117
  if (xalan_initialized) {
85
118
  XPathEvaluator::terminate();
@@ -92,6 +125,167 @@ static void cleanup_xerces() {
92
125
  }
93
126
  }
94
127
 
128
+ // Validate XPath expression to prevent XPath injection attacks
129
+ static void validate_xpath_expression(const char* xpath_str) {
130
+ if (!xpath_str || strlen(xpath_str) == 0) {
131
+ rb_raise(rb_eArgError, "XPath expression cannot be empty");
132
+ }
133
+
134
+ std::string xpath(xpath_str);
135
+
136
+ // Check cache first if caching is enabled (LRU cache)
137
+ if (cache_xpath_validation) {
138
+ std::lock_guard<std::mutex> lock(xpath_cache_mutex);
139
+ if (!xpath_cache_lru_list) {
140
+ xpath_cache_lru_list = new std::list<std::string>();
141
+ }
142
+ if (!xpath_cache_map) {
143
+ xpath_cache_map = new std::unordered_map<std::string, std::list<std::string>::iterator>();
144
+ }
145
+ auto it = xpath_cache_map->find(xpath);
146
+ if (it != xpath_cache_map->end()) {
147
+ // Cache hit: move to front (most recently used)
148
+ xpath_cache_lru_list->splice(xpath_cache_lru_list->begin(), *xpath_cache_lru_list, it->second);
149
+ return; // Already validated
150
+ }
151
+ }
152
+ size_t len = xpath.length();
153
+
154
+ // Check for excessively long XPath expressions (potential DoS)
155
+ if (xpath_max_length > 0 && len > xpath_max_length) {
156
+ rb_raise(rb_eArgError, "XPath expression is too long (max %zu characters)", xpath_max_length);
157
+ }
158
+
159
+ // Check for dangerous patterns that could indicate XPath injection
160
+ // These patterns are commonly used in XPath injection attacks
161
+
162
+ // 1. Check for unbalanced quotes which could break out of string literals
163
+ int single_quotes = 0;
164
+ int double_quotes = 0;
165
+ bool in_single_quote = false;
166
+ bool in_double_quote = false;
167
+
168
+ for (size_t i = 0; i < len; i++) {
169
+ char c = xpath[i];
170
+
171
+ // Track quote state
172
+ if (c == '\'' && !in_double_quote) {
173
+ in_single_quote = !in_single_quote;
174
+ single_quotes++;
175
+ } else if (c == '"' && !in_single_quote) {
176
+ in_double_quote = !in_double_quote;
177
+ double_quotes++;
178
+ }
179
+ }
180
+
181
+ // Unbalanced quotes are suspicious
182
+ if (single_quotes % 2 != 0 || double_quotes % 2 != 0) {
183
+ rb_raise(rb_eArgError, "XPath expression contains unbalanced quotes");
184
+ }
185
+
186
+ // 2. Check for suspicious comment patterns that could be used to bypass validation
187
+ if (xpath.find("(:") != std::string::npos || xpath.find(":)") != std::string::npos) {
188
+ rb_raise(rb_eArgError, "XPath expression contains suspicious comment patterns");
189
+ }
190
+
191
+ // 3. Defensive check for null bytes; xpath was built from a NUL-terminated C string, so none can be present here
192
+ if (xpath.find('\0') != std::string::npos) {
193
+ rb_raise(rb_eArgError, "XPath expression contains null bytes");
194
+ }
195
+
196
+ // 4. Check for excessive nesting which could cause stack overflow
197
+ int bracket_depth = 0;
198
+ int paren_depth = 0;
199
+ const int MAX_DEPTH = 100;
200
+
201
+ for (size_t i = 0; i < len; i++) {
202
+ char c = xpath[i];
203
+
204
+ if (c == '[') bracket_depth++;
205
+ else if (c == ']') bracket_depth--;
206
+ else if (c == '(') paren_depth++;
207
+ else if (c == ')') paren_depth--;
208
+
209
+ if (bracket_depth > MAX_DEPTH || paren_depth > MAX_DEPTH) {
210
+ rb_raise(rb_eArgError, "XPath expression has excessive nesting depth");
211
+ }
212
+
213
+ if (bracket_depth < 0 || paren_depth < 0) {
214
+ rb_raise(rb_eArgError, "XPath expression has unbalanced brackets or parentheses");
215
+ }
216
+ }
217
+
218
+ if (bracket_depth != 0 || paren_depth != 0) {
219
+ rb_raise(rb_eArgError, "XPath expression has unbalanced brackets or parentheses");
220
+ }
221
+
222
+ // 5. Check for suspicious function calls that could access system functions
223
+ // or perform dangerous operations
224
+ std::vector<std::string> dangerous_patterns = {
225
+ "document(", // Can access external documents
226
+ "doc(", // Can access external documents
227
+ "collection(", // Can access external collections
228
+ "unparsed-text(", // Can read arbitrary files
229
+ "system-property(", // Can leak system information
230
+ "environment-variable(", // Can leak environment variables
231
+ };
232
+
233
+ for (const auto& pattern : dangerous_patterns) {
234
+ if (xpath.find(pattern) != std::string::npos) {
235
+ rb_raise(rb_eArgError, "XPath expression contains potentially dangerous function: %s", pattern.c_str());
236
+ }
237
+ }
238
+
239
+ // 6. Check for encoded characters that could bypass validation
240
+ // Use specific patterns to avoid false positives (e.g., "Q&A" in text)
241
+ if (xpath.find("&#") != std::string::npos || // Numeric character reference (&#60;)
242
+ xpath.find("&#x") != std::string::npos || // Hex character reference (&#x3C;)
243
+ xpath.find("&amp;#") != std::string::npos) { // Encoded entity reference
244
+ rb_raise(rb_eArgError, "XPath expression contains encoded characters");
245
+ }
246
+
247
+ // 7. Detect potential boolean-based blind XPath injection patterns
248
+ // These patterns use 'or' with always-true conditions
249
+ std::vector<std::string> injection_patterns = {
250
+ "or 1=1",
251
+ "or '1'='1'",
252
+ "or \"1\"=\"1\"",
253
+ "or true()",
254
+ "and 1=0",
255
+ "and false()",
256
+ "or 'a'='a'",
257
+ "or \"a\"=\"a\"",
258
+ };
259
+
260
+ // Convert to lowercase for case-insensitive comparison
261
+ std::string xpath_lower = xpath;
262
+ std::transform(xpath_lower.begin(), xpath_lower.end(), xpath_lower.begin(), ::tolower);
263
+
264
+ for (const auto& pattern : injection_patterns) {
265
+ if (xpath_lower.find(pattern) != std::string::npos) {
266
+ rb_raise(rb_eArgError, "XPath expression contains suspicious injection pattern");
267
+ }
268
+ }
269
+
270
+ // Add to cache if caching is enabled (LRU eviction)
271
+ if (cache_xpath_validation) {
272
+ std::lock_guard<std::mutex> lock(xpath_cache_mutex);
273
+ if (xpath_cache_lru_list && xpath_cache_map) {
274
+ // If cache is full, evict least recently used (back of list)
275
+ if (xpath_cache_max_size > 0 && xpath_cache_map->size() >= xpath_cache_max_size) {
276
+ std::string& lru = xpath_cache_lru_list->back();
277
+ xpath_cache_map->erase(lru);
278
+ xpath_cache_lru_list->pop_back();
279
+ }
280
+ // Add new entry to front (most recently used)
281
+ if (xpath_cache_max_size > 0) {
282
+ xpath_cache_lru_list->push_front(xpath);
283
+ (*xpath_cache_map)[xpath] = xpath_cache_lru_list->begin();
284
+ }
285
+ }
286
+ }
287
+ }
288
+
95
289
  // Helper class to manage XMLCh strings
96
290
  class XStr {
97
291
  public:
@@ -353,35 +547,110 @@ static VALUE wrap_node(DOMNode* node, VALUE doc_ref) {
353
547
  return Qnil;
354
548
  }
355
549
 
356
- NodeWrapper* wrapper = ALLOC(NodeWrapper);
357
- wrapper->node = node;
358
- wrapper->doc_ref = doc_ref;
359
-
360
- VALUE rb_node;
361
-
550
+ VALUE rb_class;
362
551
  switch (node->getNodeType()) {
363
552
  case DOMNode::ELEMENT_NODE:
364
- rb_node = TypedData_Wrap_Struct(rb_cElement, &node_type, wrapper);
553
+ rb_class = rb_cElement;
365
554
  break;
366
555
  case DOMNode::TEXT_NODE:
367
- rb_node = TypedData_Wrap_Struct(rb_cText, &node_type, wrapper);
556
+ rb_class = rb_cText;
368
557
  break;
369
558
  default:
370
- rb_node = TypedData_Wrap_Struct(rb_cNode, &node_type, wrapper);
559
+ rb_class = rb_cNode;
371
560
  break;
372
561
  }
373
562
 
563
+ VALUE rb_node = TypedData_Wrap_Struct(rb_class, &node_type, NULL);
564
+ NodeWrapper* wrapper = ALLOC(NodeWrapper);
565
+ wrapper->node = node;
566
+ wrapper->doc_ref = doc_ref;
567
+ DATA_PTR(rb_node) = wrapper;
568
+
374
569
  return rb_node;
375
570
  }
376
571
 
377
- // RXerces::XML::Document.parse(string)
378
- static VALUE document_parse(VALUE klass, VALUE str) {
572
+ // Helper for RXerces::XML::Document.parse(string, options = {}):
573
+ // validates the options hash, raising ArgumentError for any key other than allow_external_entities
574
+ static void validate_parse_options(VALUE options) {
575
+ if (NIL_P(options)) {
576
+ return;
577
+ }
578
+
579
+ Check_Type(options, T_HASH);
580
+
581
+ // Define allowed option keys
582
+ std::vector<const char*> allowed_keys = {
583
+ "allow_external_entities"
584
+ };
585
+
586
+ // Get all keys from the provided options hash
587
+ VALUE keys = rb_funcall(options, rb_intern("keys"), 0);
588
+ long keys_len = RARRAY_LEN(keys);
589
+
590
+ // Check each key against the allowed list
591
+ for (long i = 0; i < keys_len; i++) {
592
+ VALUE key = rb_ary_entry(keys, i);
593
+
594
+ // Convert symbol or string key to string for comparison
595
+ VALUE key_str;
596
+ if (TYPE(key) == T_SYMBOL) {
597
+ key_str = rb_sym_to_s(key);
598
+ } else if (TYPE(key) == T_STRING) {
599
+ key_str = key;
600
+ } else {
601
+ rb_raise(rb_eArgError, "Option keys must be symbols or strings");
602
+ }
603
+
604
+ const char* key_cstr = StringValueCStr(key_str);
605
+ bool found = false;
606
+
607
+ for (const auto& allowed : allowed_keys) {
608
+ if (strcmp(key_cstr, allowed) == 0) {
609
+ found = true;
610
+ break;
611
+ }
612
+ }
613
+
614
+ if (!found) {
615
+ rb_raise(rb_eArgError, "Unknown option: %s. Allowed options are: allow_external_entities", key_cstr);
616
+ }
617
+ }
618
+ }
619
+
620
+ static VALUE document_parse(int argc, VALUE* argv, VALUE klass) {
621
+ VALUE str, options;
622
+ rb_scan_args(argc, argv, "11", &str, &options);
623
+
379
624
  ensure_xerces_initialized();
380
625
 
381
626
  Check_Type(str, T_STRING);
382
627
  const char* xml_str = StringValueCStr(str);
383
628
 
629
+ // Validate options hash before processing
630
+ validate_parse_options(options);
631
+
384
632
  XercesDOMParser* parser = new XercesDOMParser();
633
+
634
+ // Check if external entities should be allowed (default: false for security)
635
+ bool allow_external = false;
636
+ if (!NIL_P(options)) {
637
+ VALUE allow_key = rb_intern("allow_external_entities");
638
+ VALUE allow_val = rb_hash_aref(options, ID2SYM(allow_key));
639
+ if (RTEST(allow_val)) {
640
+ allow_external = true;
641
+ }
642
+ }
643
+
644
+ if (allow_external) {
645
+ // Allow external entities (less secure)
646
+ parser->setLoadExternalDTD(true);
647
+ parser->setDisableDefaultEntityResolution(false);
648
+ } else {
649
+ // Security: Disable external entity processing to prevent XXE attacks
650
+ parser->setLoadExternalDTD(false);
651
+ parser->setDisableDefaultEntityResolution(true);
652
+ }
653
+
385
654
  parser->setValidationScheme(XercesDOMParser::Val_Never);
386
655
  parser->setDoNamespaces(true);
387
656
  parser->setDoSchema(false);
@@ -485,8 +754,16 @@ static VALUE document_to_s(VALUE self) {
485
754
  serializer->release();
486
755
 
487
756
  return result;
757
+ } catch (const DOMException& e) {
758
+ CharStr message(e.getMessage());
759
+ rb_raise(rb_eRuntimeError, "Failed to serialize document: %s", message.localForm());
760
+ } catch (const XMLException& e) {
761
+ CharStr message(e.getMessage());
762
+ rb_raise(rb_eRuntimeError, "Failed to serialize document (XMLException): %s", message.localForm());
763
+ } catch (const std::exception& e) {
764
+ rb_raise(rb_eRuntimeError, "Failed to serialize document (std::exception): %s", e.what());
488
765
  } catch (...) {
489
- rb_raise(rb_eRuntimeError, "Failed to serialize document");
766
+ rb_raise(rb_eRuntimeError, "Failed to serialize document (unknown exception type)");
490
767
  }
491
768
 
492
769
  return Qnil;
@@ -702,6 +979,9 @@ static VALUE document_last_element_child(VALUE self) {
702
979
  #ifdef HAVE_XALAN
703
980
  // Helper function to execute XPath using Xalan for full XPath 1.0 support
704
981
  static VALUE execute_xpath_with_xalan(DOMNode* context_node, const char* xpath_str, VALUE doc_ref) {
982
+ // Validate XPath expression before execution
983
+ validate_xpath_expression(xpath_str);
984
+
705
985
  ensure_xerces_initialized();
706
986
 
707
987
  try {
@@ -809,6 +1089,9 @@ static VALUE document_xpath(VALUE self, VALUE path) {
809
1089
  Check_Type(path, T_STRING);
810
1090
  const char* xpath_str = StringValueCStr(path);
811
1091
 
1092
+ // Validate XPath expression before execution
1093
+ validate_xpath_expression(xpath_str);
1094
+
812
1095
  #ifdef HAVE_XALAN
813
1096
  // Use Xalan for full XPath 1.0 support
814
1097
  DOMElement* root = doc_wrapper->doc->getDocumentElement();
@@ -1394,6 +1677,37 @@ static VALUE node_attributes(VALUE self) {
1394
1677
  return hash;
1395
1678
  }
1396
1679
 
1680
+ // node.attribute_nodes - returns array of attribute nodes (only for element nodes)
1681
+ static VALUE node_attribute_nodes(VALUE self) {
1682
+ NodeWrapper* wrapper;
1683
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1684
+
1685
+ VALUE nodes_array = rb_ary_new();
1686
+
1687
+ if (!wrapper->node || wrapper->node->getNodeType() != DOMNode::ELEMENT_NODE) {
1688
+ return nodes_array;
1689
+ }
1690
+
1691
+ DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
1692
+ DOMNamedNodeMap* attributes = element->getAttributes();
1693
+
1694
+ if (!attributes) {
1695
+ return nodes_array;
1696
+ }
1697
+
1698
+ VALUE doc_ref = wrapper->doc_ref;
1699
+ XMLSize_t length = attributes->getLength();
1700
+
1701
+ for (XMLSize_t i = 0; i < length; i++) {
1702
+ DOMNode* attr = attributes->item(i);
1703
+ if (attr) {
1704
+ rb_ary_push(nodes_array, wrap_node(attr, doc_ref));
1705
+ }
1706
+ }
1707
+
1708
+ return nodes_array;
1709
+ }
1710
+
1397
1711
  // node.next_sibling
1398
1712
  static VALUE node_next_sibling(VALUE self) {
1399
1713
  NodeWrapper* wrapper;
@@ -1493,7 +1807,7 @@ static VALUE node_add_child(VALUE self, VALUE child) {
1493
1807
  }
1494
1808
 
1495
1809
  DOMNode* child_node = NULL;
1496
- bool needs_import = false;
1810
+ VALUE doc_ref = wrapper->doc_ref; // Keep track of the Ruby document reference
1497
1811
 
1498
1812
  // Check if child is a string or a node
1499
1813
  if (TYPE(child) == T_STRING) {
@@ -1507,13 +1821,27 @@ static VALUE node_add_child(VALUE self, VALUE child) {
1507
1821
  NodeWrapper* child_wrapper;
1508
1822
  if (rb_obj_is_kind_of(child, rb_cNode)) {
1509
1823
  TypedData_Get_Struct(child, NodeWrapper, &node_type, child_wrapper);
1510
- child_node = child_wrapper->node;
1824
+ DOMNode* original_child = child_wrapper->node;
1511
1825
 
1512
1826
  // Check if child belongs to a different document
1513
- DOMDocument* child_doc = child_node->getOwnerDocument();
1827
+ DOMDocument* child_doc = original_child->getOwnerDocument();
1514
1828
  if (child_doc && child_doc != doc) {
1515
- rb_raise(rb_eRuntimeError,
1516
- "Node belongs to a different document. Use importNode to adopt nodes from other documents.");
1829
+ // Automatically import the node from the other document
1830
+ // The second parameter 'true' means deep copy (include all descendants)
1831
+ try {
1832
+ child_node = doc->importNode(original_child, true);
1833
+
1834
+ // Update the child wrapper to point to the imported node
1835
+ // and the new document reference
1836
+ child_wrapper->node = child_node;
1837
+ child_wrapper->doc_ref = doc_ref;
1838
+ } catch (const DOMException& e) {
1839
+ CharStr message(e.getMessage());
1840
+ rb_raise(rb_eRuntimeError, "Failed to import node from different document: %s",
1841
+ message.localForm());
1842
+ }
1843
+ } else {
1844
+ child_node = original_child;
1517
1845
  }
1518
1846
  } else {
1519
1847
  rb_raise(rb_eTypeError, "Argument must be a String or Node");
@@ -1752,6 +2080,9 @@ static VALUE node_xpath(VALUE self, VALUE path) {
1752
2080
  const char* xpath_str = StringValueCStr(path);
1753
2081
  VALUE doc_ref = node_wrapper->doc_ref;
1754
2082
 
2083
+ // Validate XPath expression before execution
2084
+ validate_xpath_expression(xpath_str);
2085
+
1755
2086
  #ifdef HAVE_XALAN
1756
2087
  // Use Xalan for full XPath 1.0 support
1757
2088
  return execute_xpath_with_xalan(node_wrapper->node, xpath_str, doc_ref);
@@ -2121,6 +2452,31 @@ static VALUE nodeset_text(VALUE self) {
2121
2452
  }
2122
2453
 
2123
2454
  // nodeset.inspect / nodeset.to_s - human-readable representation
2455
+ // Helper function to safely truncate UTF-8 strings using Ruby's built-in UTF-8 handling
2456
+ // Ruby's rb_str_substr operates on CHARACTER positions, not byte positions
2457
+ static std::string safe_truncate_utf8(const std::string& str, long max_chars) {
2458
+ if (str.empty()) {
2459
+ return str;
2460
+ }
2461
+
2462
+ // Create a Ruby string with UTF-8 encoding
2463
+ VALUE rb_str = rb_enc_str_new(str.c_str(), str.length(), rb_utf8_encoding());
2464
+
2465
+ // Get the character length (not byte length)
2466
+ long char_len = RSTRING_LEN(rb_str);
2467
+
2468
+ if (char_len <= max_chars) {
2469
+ return str;
2470
+ }
2471
+
2472
+ // Use Ruby's rb_str_substr which correctly handles multi-byte characters
2473
+ // Parameters: string, start position (in characters), length (in characters)
2474
+ VALUE truncated = rb_str_substr(rb_str, 0, max_chars);
2475
+
2476
+ // Convert back to C++ string
2477
+ return std::string(RSTRING_PTR(truncated), RSTRING_LEN(truncated));
2478
+ }
2479
+
2124
2480
  static VALUE nodeset_inspect(VALUE self) {
2125
2481
  NodeSetWrapper* wrapper;
2126
2482
  TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
@@ -2191,7 +2547,7 @@ static VALUE nodeset_inspect(VALUE self) {
2191
2547
  textStr = textStr.substr(start, end - start + 1);
2192
2548
 
2193
2549
  if (textStr.length() > 30) {
2194
- textStr = textStr.substr(0, 27) + "...";
2550
+ textStr = safe_truncate_utf8(textStr, 27) + "...";
2195
2551
  }
2196
2552
 
2197
2553
  result += ">";
@@ -2219,7 +2575,7 @@ static VALUE nodeset_inspect(VALUE self) {
2219
2575
  textStr = textStr.substr(start, end - start + 1);
2220
2576
 
2221
2577
  if (textStr.length() > 30) {
2222
- textStr = textStr.substr(0, 27) + "...";
2578
+ textStr = safe_truncate_utf8(textStr, 27) + "...";
2223
2579
  }
2224
2580
 
2225
2581
  result += "text(\"";
@@ -2241,7 +2597,10 @@ static VALUE nodeset_inspect(VALUE self) {
2241
2597
  }
2242
2598
 
2243
2599
  result += "]>";
2244
- return rb_str_new_cstr(result.c_str());
2600
+ VALUE rb_result = rb_str_new_cstr(result.c_str());
2601
+ // Ensure the string is marked as UTF-8 encoded
2602
+ rb_enc_associate(rb_result, rb_utf8_encoding());
2603
+ return rb_result;
2245
2604
  }
2246
2605
 
2247
2606
  // Schema.from_document(schema_doc) or Schema.from_string(xsd_string)
@@ -2282,6 +2641,18 @@ static VALUE schema_from_document(int argc, VALUE* argv, VALUE klass) {
2282
2641
 
2283
2642
  try {
2284
2643
  schemaParser->parse(schemaInput);
2644
+ } catch (const XMLException& e) {
2645
+ delete schemaParser;
2646
+ delete wrapper->schemaContent;
2647
+ xfree(wrapper);
2648
+ CharStr message(e.getMessage());
2649
+ rb_raise(rb_eRuntimeError, "Schema parsing failed: %s", message.localForm());
2650
+ } catch (const SAXException& e) {
2651
+ delete schemaParser;
2652
+ delete wrapper->schemaContent;
2653
+ xfree(wrapper);
2654
+ CharStr message(e.getMessage());
2655
+ rb_raise(rb_eRuntimeError, "Schema parsing failed: %s", message.localForm());
2285
2656
  } catch (...) {
2286
2657
  delete schemaParser;
2287
2658
  delete wrapper->schemaContent;
@@ -2363,6 +2734,12 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
2363
2734
  validator->loadGrammar(schemaSource, Grammar::SchemaGrammarType, true);
2364
2735
  validator->setExternalNoNamespaceSchemaLocation("schema.xsd");
2365
2736
  validator->useCachedGrammarInParse(true);
2737
+ } catch (const XMLException& e) {
2738
+ CharStr message(e.getMessage());
2739
+ errorHandler.errors.push_back(std::string("Warning: Schema grammar could not be loaded: ") + message.localForm());
2740
+ } catch (const SAXException& e) {
2741
+ CharStr message(e.getMessage());
2742
+ errorHandler.errors.push_back(std::string("Warning: Schema grammar could not be loaded: ") + message.localForm());
2366
2743
  } catch (...) {
2367
2744
  // If grammar loading fails, just note it
2368
2745
  errorHandler.errors.push_back("Warning: Schema grammar could not be loaded");
@@ -2414,13 +2791,110 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
2414
2791
  }
2415
2792
 
2416
2793
  return Qnil;
2417
- }extern "C" void Init_rxerces(void) {
2794
+ }
2795
+
2796
+ // RXerces.cache_xpath_validation? - check if XPath validation caching is enabled
2797
+ static VALUE rxerces_cache_xpath_validation_p(VALUE self) {
2798
+ return cache_xpath_validation ? Qtrue : Qfalse;
2799
+ }
2800
+
2801
+ // RXerces.cache_xpath_validation = bool - enable/disable XPath validation caching
2802
+ static VALUE rxerces_set_cache_xpath_validation(VALUE self, VALUE val) {
2803
+ cache_xpath_validation = RTEST(val);
2804
+ return val;
2805
+ }
2806
+
2807
+ // RXerces.clear_xpath_validation_cache - clear the XPath validation cache
2808
+ static VALUE rxerces_clear_xpath_validation_cache(VALUE self) {
2809
+ std::lock_guard<std::mutex> lock(xpath_cache_mutex);
2810
+ if (xpath_cache_lru_list) {
2811
+ xpath_cache_lru_list->clear();
2812
+ }
2813
+ if (xpath_cache_map) {
2814
+ xpath_cache_map->clear();
2815
+ }
2816
+ return Qnil;
2817
+ }
2818
+
2819
+ // RXerces.xpath_validation_cache_size - return number of cached expressions
2820
+ static VALUE rxerces_xpath_validation_cache_size(VALUE self) {
2821
+ std::lock_guard<std::mutex> lock(xpath_cache_mutex);
2822
+ if (!xpath_cache_map) {
2823
+ return LONG2NUM(0);
2824
+ }
2825
+ return LONG2NUM((long)xpath_cache_map->size());
2826
+ }
2827
+
2828
+ // RXerces.xpath_validation_cache_max_size - get max cache size
2829
+ static VALUE rxerces_xpath_validation_cache_max_size(VALUE self) {
2830
+ return LONG2NUM((long)xpath_cache_max_size);
2831
+ }
2832
+
2833
+ // RXerces.xpath_validation_cache_max_size = n - set max cache size
2834
+ static VALUE rxerces_set_xpath_validation_cache_max_size(VALUE self, VALUE val) {
2835
+ // Validate input: must be a non-negative integer
2836
+ if (!RB_INTEGER_TYPE_P(val)) {
2837
+ rb_raise(rb_eTypeError, "xpath_validation_cache_max_size must be an Integer");
2838
+ }
2839
+
2840
+ long size = NUM2LONG(val);
2841
+ if (size < 0) {
2842
+ rb_raise(rb_eArgError, "xpath_validation_cache_max_size must be non-negative");
2843
+ }
2844
+
2845
+ xpath_cache_max_size = (size_t)size;
2846
+ return val;
2847
+ }
2848
+
2849
+ // RXerces.xalan_enabled? - check if Xalan is available
2850
+ static VALUE rxerces_xalan_enabled_p(VALUE self) {
2851
+ #ifdef HAVE_XALAN
2852
+ return Qtrue;
2853
+ #else
2854
+ return Qfalse;
2855
+ #endif
2856
+ }
2857
+
2858
+ // RXerces.xpath_max_length - get max XPath expression length
2859
+ static VALUE rxerces_xpath_max_length(VALUE self) {
2860
+ return LONG2NUM((long)xpath_max_length);
2861
+ }
2862
+
2863
+ // RXerces.xpath_max_length = n - set max XPath expression length (0 = no limit)
2864
+ static VALUE rxerces_set_xpath_max_length(VALUE self, VALUE val) {
2865
+ // Validate input: must be a non-negative integer
2866
+ if (!RB_INTEGER_TYPE_P(val)) {
2867
+ rb_raise(rb_eTypeError, "xpath_max_length must be an Integer");
2868
+ }
2869
+
2870
+ long size = NUM2LONG(val);
2871
+ if (size < 0) {
2872
+ rb_raise(rb_eArgError, "xpath_max_length must be non-negative");
2873
+ }
2874
+
2875
+ xpath_max_length = (size_t)size;
2876
+ return val;
2877
+ }
2878
+
2879
+ extern "C" void Init_rxerces(void) {
2418
2880
  rb_mRXerces = rb_define_module("RXerces");
2881
+
2882
+ // Module-level configuration methods for XPath validation caching
2883
+ rb_define_singleton_method(rb_mRXerces, "cache_xpath_validation?", RUBY_METHOD_FUNC(rxerces_cache_xpath_validation_p), 0);
2884
+ rb_define_singleton_method(rb_mRXerces, "cache_xpath_validation=", RUBY_METHOD_FUNC(rxerces_set_cache_xpath_validation), 1);
2885
+ rb_define_singleton_method(rb_mRXerces, "clear_xpath_validation_cache", RUBY_METHOD_FUNC(rxerces_clear_xpath_validation_cache), 0);
2886
+ rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_size", RUBY_METHOD_FUNC(rxerces_xpath_validation_cache_size), 0);
2887
+ rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_max_size", RUBY_METHOD_FUNC(rxerces_xpath_validation_cache_max_size), 0);
2888
+ rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_max_size=", RUBY_METHOD_FUNC(rxerces_set_xpath_validation_cache_max_size), 1);
2889
+ rb_define_singleton_method(rb_mRXerces, "xpath_max_length", RUBY_METHOD_FUNC(rxerces_xpath_max_length), 0);
2890
+ rb_define_singleton_method(rb_mRXerces, "xpath_max_length=", RUBY_METHOD_FUNC(rxerces_set_xpath_max_length), 1);
2891
+ rb_define_singleton_method(rb_mRXerces, "xalan_enabled?", RUBY_METHOD_FUNC(rxerces_xalan_enabled_p), 0);
2892
+
2419
2893
  rb_mXML = rb_define_module_under(rb_mRXerces, "XML");
2420
2894
 
2421
2895
  rb_cDocument = rb_define_class_under(rb_mXML, "Document", rb_cObject);
2422
2896
  rb_undef_alloc_func(rb_cDocument);
2423
- rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), 1);
2897
+ rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), -1);
2424
2898
  rb_define_method(rb_cDocument, "root", RUBY_METHOD_FUNC(document_root), 0);
2425
2899
  rb_define_method(rb_cDocument, "errors", RUBY_METHOD_FUNC(document_errors), 0);
2426
2900
  rb_define_method(rb_cDocument, "to_s", RUBY_METHOD_FUNC(document_to_s), 0);
@@ -2464,6 +2938,7 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
2464
2938
  rb_define_method(rb_cNode, "parent", RUBY_METHOD_FUNC(node_parent), 0);
2465
2939
  rb_define_method(rb_cNode, "ancestors", RUBY_METHOD_FUNC(node_ancestors), -1);
2466
2940
  rb_define_method(rb_cNode, "attributes", RUBY_METHOD_FUNC(node_attributes), 0);
2941
+ rb_define_method(rb_cNode, "attribute_nodes", RUBY_METHOD_FUNC(node_attribute_nodes), 0);
2467
2942
  rb_define_method(rb_cNode, "next_sibling", RUBY_METHOD_FUNC(node_next_sibling), 0);
2468
2943
  rb_define_method(rb_cNode, "next_element", RUBY_METHOD_FUNC(node_next_element), 0);
2469
2944
  rb_define_method(rb_cNode, "previous_sibling", RUBY_METHOD_FUNC(node_previous_sibling), 0);