rxerces 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,10 @@
1
1
  #include "rxerces.h"
2
+ #include <ruby/encoding.h>
2
3
  #include <xercesc/util/PlatformUtils.hpp>
3
4
  #include <xercesc/parsers/XercesDOMParser.hpp>
4
5
  #include <xercesc/dom/DOM.hpp>
5
6
  #include <xercesc/util/XMLString.hpp>
7
+ #include <xercesc/util/XMLUni.hpp>
6
8
  #include <xercesc/framework/MemBufInputSource.hpp>
7
9
  #include <xercesc/framework/MemBufFormatTarget.hpp>
8
10
  #include <xercesc/util/XercesDefs.hpp>
@@ -10,8 +12,12 @@
10
12
  #include <xercesc/dom/DOMXPathExpression.hpp>
11
13
  #include <xercesc/sax/ErrorHandler.hpp>
12
14
  #include <xercesc/sax/SAXParseException.hpp>
15
+ #include <xercesc/sax/SAXException.hpp>
13
16
  #include <sstream>
14
17
  #include <vector>
18
+ #include <mutex>
19
+ #include <list>
20
+ #include <unordered_map>
15
21
 
16
22
  #ifdef HAVE_XALAN
17
23
  #include <xalanc/XPath/XPathEvaluator.hpp>
@@ -50,6 +56,17 @@ static bool xerces_initialized = false;
50
56
  #ifdef HAVE_XALAN
51
57
  static bool xalan_initialized = false;
52
58
  #endif
59
+ static std::mutex init_mutex;
60
+
61
+ // XPath validation cache with LRU eviction
62
+ // Uses a list for LRU ordering (front = most recently used)
63
+ // and a map for O(1) lookup of list iterators
64
+ static std::list<std::string>* xpath_cache_lru_list = nullptr;
65
+ static std::unordered_map<std::string, std::list<std::string>::iterator>* xpath_cache_map = nullptr;
66
+ static std::mutex xpath_cache_mutex;
67
+ static bool cache_xpath_validation = true; // Default: enabled
68
+ static size_t xpath_cache_max_size = 10000; // Max cached expressions
69
+ static size_t xpath_max_length = 10000; // Max XPath expression length
53
70
 
54
71
  // Forward declarations
55
72
  static std::string css_to_xpath(const char* css);
@@ -63,6 +80,12 @@ static void ensure_xerces_initialized() {
63
80
  return;
64
81
  }
65
82
 
83
+ std::lock_guard<std::mutex> lock(init_mutex);
84
+
85
+ if (xerces_initialized) {
86
+ return;
87
+ }
88
+
66
89
  try {
67
90
  XMLPlatformUtils::Initialize();
68
91
  #ifdef HAVE_XALAN
@@ -80,6 +103,16 @@ static void ensure_xerces_initialized() {
80
103
 
81
104
  // Cleanup function called at exit
82
105
  static void cleanup_xerces() {
106
+ // Clean up XPath validation cache (LRU)
107
+ if (xpath_cache_lru_list) {
108
+ delete xpath_cache_lru_list;
109
+ xpath_cache_lru_list = nullptr;
110
+ }
111
+ if (xpath_cache_map) {
112
+ delete xpath_cache_map;
113
+ xpath_cache_map = nullptr;
114
+ }
115
+
83
116
  #ifdef HAVE_XALAN
84
117
  if (xalan_initialized) {
85
118
  XPathEvaluator::terminate();
@@ -92,6 +125,167 @@ static void cleanup_xerces() {
92
125
  }
93
126
  }
94
127
 
128
+ // Validate XPath expression to prevent XPath injection attacks
129
+ static void validate_xpath_expression(const char* xpath_str) {
130
+ if (!xpath_str || strlen(xpath_str) == 0) {
131
+ rb_raise(rb_eArgError, "XPath expression cannot be empty");
132
+ }
133
+
134
+ std::string xpath(xpath_str);
135
+
136
+ // Check cache first if caching is enabled (LRU cache)
137
+ if (cache_xpath_validation) {
138
+ std::lock_guard<std::mutex> lock(xpath_cache_mutex);
139
+ if (!xpath_cache_lru_list) {
140
+ xpath_cache_lru_list = new std::list<std::string>();
141
+ }
142
+ if (!xpath_cache_map) {
143
+ xpath_cache_map = new std::unordered_map<std::string, std::list<std::string>::iterator>();
144
+ }
145
+ auto it = xpath_cache_map->find(xpath);
146
+ if (it != xpath_cache_map->end()) {
147
+ // Cache hit: move to front (most recently used)
148
+ xpath_cache_lru_list->splice(xpath_cache_lru_list->begin(), *xpath_cache_lru_list, it->second);
149
+ return; // Already validated
150
+ }
151
+ }
152
+ size_t len = xpath.length();
153
+
154
+ // Check for excessively long XPath expressions (potential DoS)
155
+ if (xpath_max_length > 0 && len > xpath_max_length) {
156
+ rb_raise(rb_eArgError, "XPath expression is too long (max %zu characters)", xpath_max_length);
157
+ }
158
+
159
+ // Check for dangerous patterns that could indicate XPath injection
160
+ // These patterns are commonly used in XPath injection attacks
161
+
162
+ // 1. Check for unbalanced quotes which could break out of string literals
163
+ int single_quotes = 0;
164
+ int double_quotes = 0;
165
+ bool in_single_quote = false;
166
+ bool in_double_quote = false;
167
+
168
+ for (size_t i = 0; i < len; i++) {
169
+ char c = xpath[i];
170
+
171
+ // Track quote state
172
+ if (c == '\'' && !in_double_quote) {
173
+ in_single_quote = !in_single_quote;
174
+ single_quotes++;
175
+ } else if (c == '"' && !in_single_quote) {
176
+ in_double_quote = !in_double_quote;
177
+ double_quotes++;
178
+ }
179
+ }
180
+
181
+ // Unbalanced quotes are suspicious
182
+ if (single_quotes % 2 != 0 || double_quotes % 2 != 0) {
183
+ rb_raise(rb_eArgError, "XPath expression contains unbalanced quotes");
184
+ }
185
+
186
+ // 2. Check for suspicious comment patterns that could be used to bypass validation
187
+ if (xpath.find("(:") != std::string::npos || xpath.find(":)") != std::string::npos) {
188
+ rb_raise(rb_eArgError, "XPath expression contains suspicious comment patterns");
189
+ }
190
+
191
+ // 3. Check for null bytes which could truncate validation
192
+ if (xpath.find('\0') != std::string::npos) {
193
+ rb_raise(rb_eArgError, "XPath expression contains null bytes");
194
+ }
195
+
196
+ // 4. Check for excessive nesting which could cause stack overflow
197
+ int bracket_depth = 0;
198
+ int paren_depth = 0;
199
+ const int MAX_DEPTH = 100;
200
+
201
+ for (size_t i = 0; i < len; i++) {
202
+ char c = xpath[i];
203
+
204
+ if (c == '[') bracket_depth++;
205
+ else if (c == ']') bracket_depth--;
206
+ else if (c == '(') paren_depth++;
207
+ else if (c == ')') paren_depth--;
208
+
209
+ if (bracket_depth > MAX_DEPTH || paren_depth > MAX_DEPTH) {
210
+ rb_raise(rb_eArgError, "XPath expression has excessive nesting depth");
211
+ }
212
+
213
+ if (bracket_depth < 0 || paren_depth < 0) {
214
+ rb_raise(rb_eArgError, "XPath expression has unbalanced brackets or parentheses");
215
+ }
216
+ }
217
+
218
+ if (bracket_depth != 0 || paren_depth != 0) {
219
+ rb_raise(rb_eArgError, "XPath expression has unbalanced brackets or parentheses");
220
+ }
221
+
222
+ // 5. Check for suspicious function calls that could access system functions
223
+ // or perform dangerous operations
224
+ std::vector<std::string> dangerous_patterns = {
225
+ "document(", // Can access external documents
226
+ "doc(", // Can access external documents
227
+ "collection(", // Can access external collections
228
+ "unparsed-text(", // Can read arbitrary files
229
+ "system-property(", // Can leak system information
230
+ "environment-variable(", // Can leak environment variables
231
+ };
232
+
233
+ for (const auto& pattern : dangerous_patterns) {
234
+ if (xpath.find(pattern) != std::string::npos) {
235
+ rb_raise(rb_eArgError, "XPath expression contains potentially dangerous function: %s", pattern.c_str());
236
+ }
237
+ }
238
+
239
+ // 6. Check for encoded characters that could bypass validation
240
+ // Use specific patterns to avoid false positives (e.g., "Q&A" in text)
241
+ if (xpath.find("&#") != std::string::npos || // Numeric character reference (&#60;)
242
+ xpath.find("&#x") != std::string::npos || // Hex character reference (&#x3C;)
243
+ xpath.find("&amp;#") != std::string::npos) { // Encoded entity reference
244
+ rb_raise(rb_eArgError, "XPath expression contains encoded characters");
245
+ }
246
+
247
+ // 7. Detect potential boolean-based blind XPath injection patterns
248
+ // These patterns use 'or' with always-true conditions
249
+ std::vector<std::string> injection_patterns = {
250
+ "or 1=1",
251
+ "or '1'='1'",
252
+ "or \"1\"=\"1\"",
253
+ "or true()",
254
+ "and 1=0",
255
+ "and false()",
256
+ "or 'a'='a'",
257
+ "or \"a\"=\"a\"",
258
+ };
259
+
260
+ // Convert to lowercase for case-insensitive comparison
261
+ std::string xpath_lower = xpath;
262
+ std::transform(xpath_lower.begin(), xpath_lower.end(), xpath_lower.begin(), ::tolower);
263
+
264
+ for (const auto& pattern : injection_patterns) {
265
+ if (xpath_lower.find(pattern) != std::string::npos) {
266
+ rb_raise(rb_eArgError, "XPath expression contains suspicious injection pattern");
267
+ }
268
+ }
269
+
270
+ // Add to cache if caching is enabled (LRU eviction)
271
+ if (cache_xpath_validation) {
272
+ std::lock_guard<std::mutex> lock(xpath_cache_mutex);
273
+ if (xpath_cache_lru_list && xpath_cache_map) {
274
+ // If cache is full, evict least recently used (back of list)
275
+ if (xpath_cache_max_size > 0 && xpath_cache_map->size() >= xpath_cache_max_size) {
276
+ std::string& lru = xpath_cache_lru_list->back();
277
+ xpath_cache_map->erase(lru);
278
+ xpath_cache_lru_list->pop_back();
279
+ }
280
+ // Add new entry to front (most recently used)
281
+ if (xpath_cache_max_size > 0) {
282
+ xpath_cache_lru_list->push_front(xpath);
283
+ (*xpath_cache_map)[xpath] = xpath_cache_lru_list->begin();
284
+ }
285
+ }
286
+ }
287
+ }
288
+
95
289
  // Helper class to manage XMLCh strings
96
290
  class XStr {
97
291
  public:
@@ -353,35 +547,110 @@ static VALUE wrap_node(DOMNode* node, VALUE doc_ref) {
353
547
  return Qnil;
354
548
  }
355
549
 
356
- NodeWrapper* wrapper = ALLOC(NodeWrapper);
357
- wrapper->node = node;
358
- wrapper->doc_ref = doc_ref;
359
-
360
- VALUE rb_node;
361
-
550
+ VALUE rb_class;
362
551
  switch (node->getNodeType()) {
363
552
  case DOMNode::ELEMENT_NODE:
364
- rb_node = TypedData_Wrap_Struct(rb_cElement, &node_type, wrapper);
553
+ rb_class = rb_cElement;
365
554
  break;
366
555
  case DOMNode::TEXT_NODE:
367
- rb_node = TypedData_Wrap_Struct(rb_cText, &node_type, wrapper);
556
+ rb_class = rb_cText;
368
557
  break;
369
558
  default:
370
- rb_node = TypedData_Wrap_Struct(rb_cNode, &node_type, wrapper);
559
+ rb_class = rb_cNode;
371
560
  break;
372
561
  }
373
562
 
563
+ VALUE rb_node = TypedData_Wrap_Struct(rb_class, &node_type, NULL);
564
+ NodeWrapper* wrapper = ALLOC(NodeWrapper);
565
+ wrapper->node = node;
566
+ wrapper->doc_ref = doc_ref;
567
+ DATA_PTR(rb_node) = wrapper;
568
+
374
569
  return rb_node;
375
570
  }
376
571
 
377
- // RXerces::XML::Document.parse(string)
378
- static VALUE document_parse(VALUE klass, VALUE str) {
572
+ // RXerces::XML::Document.parse(string, options = {})
573
+ // Validate options hash for document_parse - only allow known keys
574
+ static void validate_parse_options(VALUE options) {
575
+ if (NIL_P(options)) {
576
+ return;
577
+ }
578
+
579
+ Check_Type(options, T_HASH);
580
+
581
+ // Define allowed option keys
582
+ std::vector<const char*> allowed_keys = {
583
+ "allow_external_entities"
584
+ };
585
+
586
+ // Get all keys from the provided options hash
587
+ VALUE keys = rb_funcall(options, rb_intern("keys"), 0);
588
+ long keys_len = RARRAY_LEN(keys);
589
+
590
+ // Check each key against the allowed list
591
+ for (long i = 0; i < keys_len; i++) {
592
+ VALUE key = rb_ary_entry(keys, i);
593
+
594
+ // Convert symbol or string key to string for comparison
595
+ VALUE key_str;
596
+ if (TYPE(key) == T_SYMBOL) {
597
+ key_str = rb_sym_to_s(key);
598
+ } else if (TYPE(key) == T_STRING) {
599
+ key_str = key;
600
+ } else {
601
+ rb_raise(rb_eArgError, "Option keys must be symbols or strings");
602
+ }
603
+
604
+ const char* key_cstr = StringValueCStr(key_str);
605
+ bool found = false;
606
+
607
+ for (const auto& allowed : allowed_keys) {
608
+ if (strcmp(key_cstr, allowed) == 0) {
609
+ found = true;
610
+ break;
611
+ }
612
+ }
613
+
614
+ if (!found) {
615
+ rb_raise(rb_eArgError, "Unknown option: %s. Allowed options are: allow_external_entities", key_cstr);
616
+ }
617
+ }
618
+ }
619
+
620
+ static VALUE document_parse(int argc, VALUE* argv, VALUE klass) {
621
+ VALUE str, options;
622
+ rb_scan_args(argc, argv, "11", &str, &options);
623
+
379
624
  ensure_xerces_initialized();
380
625
 
381
626
  Check_Type(str, T_STRING);
382
627
  const char* xml_str = StringValueCStr(str);
383
628
 
629
+ // Validate options hash before processing
630
+ validate_parse_options(options);
631
+
384
632
  XercesDOMParser* parser = new XercesDOMParser();
633
+
634
+ // Check if external entities should be allowed (default: false for security)
635
+ bool allow_external = false;
636
+ if (!NIL_P(options)) {
637
+ VALUE allow_key = rb_intern("allow_external_entities");
638
+ VALUE allow_val = rb_hash_aref(options, ID2SYM(allow_key));
639
+ if (RTEST(allow_val)) {
640
+ allow_external = true;
641
+ }
642
+ }
643
+
644
+ if (allow_external) {
645
+ // Allow external entities (less secure)
646
+ parser->setLoadExternalDTD(true);
647
+ parser->setDisableDefaultEntityResolution(false);
648
+ } else {
649
+ // Security: Disable external entity processing to prevent XXE attacks
650
+ parser->setLoadExternalDTD(false);
651
+ parser->setDisableDefaultEntityResolution(true);
652
+ }
653
+
385
654
  parser->setValidationScheme(XercesDOMParser::Val_Never);
386
655
  parser->setDoNamespaces(true);
387
656
  parser->setDoSchema(false);
@@ -485,8 +754,16 @@ static VALUE document_to_s(VALUE self) {
485
754
  serializer->release();
486
755
 
487
756
  return result;
757
+ } catch (const DOMException& e) {
758
+ CharStr message(e.getMessage());
759
+ rb_raise(rb_eRuntimeError, "Failed to serialize document: %s", message.localForm());
760
+ } catch (const XMLException& e) {
761
+ CharStr message(e.getMessage());
762
+ rb_raise(rb_eRuntimeError, "Failed to serialize document (XMLException): %s", message.localForm());
763
+ } catch (const std::exception& e) {
764
+ rb_raise(rb_eRuntimeError, "Failed to serialize document (std::exception): %s", e.what());
488
765
  } catch (...) {
489
- rb_raise(rb_eRuntimeError, "Failed to serialize document");
766
+ rb_raise(rb_eRuntimeError, "Failed to serialize document (unknown exception type)");
490
767
  }
491
768
 
492
769
  return Qnil;
@@ -608,9 +885,103 @@ static VALUE document_create_element(VALUE self, VALUE name) {
608
885
  return Qnil;
609
886
  }
610
887
 
888
+ // document.children - returns all children (elements, text, comments, etc.)
889
+ static VALUE document_children(VALUE self) {
890
+ DocumentWrapper* wrapper;
891
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
892
+
893
+ VALUE children = rb_ary_new();
894
+
895
+ if (!wrapper->doc) {
896
+ return children;
897
+ }
898
+
899
+ DOMNodeList* child_nodes = wrapper->doc->getChildNodes();
900
+ XMLSize_t count = child_nodes->getLength();
901
+
902
+ for (XMLSize_t i = 0; i < count; i++) {
903
+ DOMNode* child = child_nodes->item(i);
904
+ rb_ary_push(children, wrap_node(child, self));
905
+ }
906
+
907
+ return children;
908
+ }
909
+
910
+ // document.element_children - returns only element children (no text nodes, comments, etc.)
911
+ static VALUE document_element_children(VALUE self) {
912
+ DocumentWrapper* wrapper;
913
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
914
+
915
+ VALUE children = rb_ary_new();
916
+
917
+ if (!wrapper->doc) {
918
+ return children;
919
+ }
920
+
921
+ DOMNodeList* child_nodes = wrapper->doc->getChildNodes();
922
+ XMLSize_t count = child_nodes->getLength();
923
+
924
+ for (XMLSize_t i = 0; i < count; i++) {
925
+ DOMNode* child = child_nodes->item(i);
926
+ if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
927
+ rb_ary_push(children, wrap_node(child, self));
928
+ }
929
+ }
930
+
931
+ return children;
932
+ }
933
+
934
+ // document.first_element_child - returns first element child
935
+ static VALUE document_first_element_child(VALUE self) {
936
+ DocumentWrapper* wrapper;
937
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
938
+
939
+ if (!wrapper->doc) {
940
+ return Qnil;
941
+ }
942
+
943
+ DOMNodeList* child_nodes = wrapper->doc->getChildNodes();
944
+ XMLSize_t count = child_nodes->getLength();
945
+
946
+ for (XMLSize_t i = 0; i < count; i++) {
947
+ DOMNode* child = child_nodes->item(i);
948
+ if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
949
+ return wrap_node(child, self);
950
+ }
951
+ }
952
+
953
+ return Qnil;
954
+ }
955
+
956
+ // document.last_element_child - returns last element child
957
+ static VALUE document_last_element_child(VALUE self) {
958
+ DocumentWrapper* wrapper;
959
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
960
+
961
+ if (!wrapper->doc) {
962
+ return Qnil;
963
+ }
964
+
965
+ DOMNodeList* child_nodes = wrapper->doc->getChildNodes();
966
+ XMLSize_t count = child_nodes->getLength();
967
+
968
+ // Search backwards for last element
969
+ for (XMLSize_t i = count; i > 0; i--) {
970
+ DOMNode* child = child_nodes->item(i - 1);
971
+ if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
972
+ return wrap_node(child, self);
973
+ }
974
+ }
975
+
976
+ return Qnil;
977
+ }
978
+
611
979
  #ifdef HAVE_XALAN
612
980
  // Helper function to execute XPath using Xalan for full XPath 1.0 support
613
981
  static VALUE execute_xpath_with_xalan(DOMNode* context_node, const char* xpath_str, VALUE doc_ref) {
982
+ // Validate XPath expression before execution
983
+ validate_xpath_expression(xpath_str);
984
+
614
985
  ensure_xerces_initialized();
615
986
 
616
987
  try {
@@ -718,6 +1089,9 @@ static VALUE document_xpath(VALUE self, VALUE path) {
718
1089
  Check_Type(path, T_STRING);
719
1090
  const char* xpath_str = StringValueCStr(path);
720
1091
 
1092
+ // Validate XPath expression before execution
1093
+ validate_xpath_expression(xpath_str);
1094
+
721
1095
  #ifdef HAVE_XALAN
722
1096
  // Use Xalan for full XPath 1.0 support
723
1097
  DOMElement* root = doc_wrapper->doc->getDocumentElement();
@@ -782,6 +1156,19 @@ static VALUE document_xpath(VALUE self, VALUE path) {
782
1156
  #endif
783
1157
  }
784
1158
 
1159
+ // document.at_xpath(path) - returns first matching node or nil
1160
+ static VALUE document_at_xpath(VALUE self, VALUE path) {
1161
+ VALUE nodeset = document_xpath(self, path);
1162
+ NodeSetWrapper* wrapper;
1163
+ TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
1164
+
1165
+ if (RARRAY_LEN(wrapper->nodes_array) == 0) {
1166
+ return Qnil;
1167
+ }
1168
+
1169
+ return rb_ary_entry(wrapper->nodes_array, 0);
1170
+ }
1171
+
785
1172
  // document.css(selector) - Convert CSS to XPath and execute
786
1173
  static VALUE document_css(VALUE self, VALUE selector) {
787
1174
  Check_Type(selector, T_STRING);
@@ -1104,6 +1491,65 @@ static VALUE node_element_children(VALUE self) {
1104
1491
  return children;
1105
1492
  }
1106
1493
 
1494
+ // node.first_element_child - returns first element child
1495
+ static VALUE node_first_element_child(VALUE self) {
1496
+ NodeWrapper* wrapper;
1497
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1498
+
1499
+ if (!wrapper->node) {
1500
+ return Qnil;
1501
+ }
1502
+
1503
+ VALUE doc_ref = wrapper->doc_ref;
1504
+ DOMNodeList* child_nodes = wrapper->node->getChildNodes();
1505
+ XMLSize_t count = child_nodes->getLength();
1506
+
1507
+ for (XMLSize_t i = 0; i < count; i++) {
1508
+ DOMNode* child = child_nodes->item(i);
1509
+ if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
1510
+ return wrap_node(child, doc_ref);
1511
+ }
1512
+ }
1513
+
1514
+ return Qnil;
1515
+ }
1516
+
1517
+ // node.last_element_child - returns last element child
1518
+ static VALUE node_last_element_child(VALUE self) {
1519
+ NodeWrapper* wrapper;
1520
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1521
+
1522
+ if (!wrapper->node) {
1523
+ return Qnil;
1524
+ }
1525
+
1526
+ VALUE doc_ref = wrapper->doc_ref;
1527
+ DOMNodeList* child_nodes = wrapper->node->getChildNodes();
1528
+ XMLSize_t count = child_nodes->getLength();
1529
+
1530
+ // Search backwards for last element
1531
+ for (XMLSize_t i = count; i > 0; i--) {
1532
+ DOMNode* child = child_nodes->item(i - 1);
1533
+ if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
1534
+ return wrap_node(child, doc_ref);
1535
+ }
1536
+ }
1537
+
1538
+ return Qnil;
1539
+ }
1540
+
1541
+ // node.document - returns the document that owns this node
1542
+ static VALUE node_document(VALUE self) {
1543
+ NodeWrapper* wrapper;
1544
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1545
+
1546
+ if (!wrapper->node) {
1547
+ return Qnil;
1548
+ }
1549
+
1550
+ return wrapper->doc_ref;
1551
+ }
1552
+
1107
1553
  // node.parent
1108
1554
  static VALUE node_parent(VALUE self) {
1109
1555
  NodeWrapper* wrapper;
@@ -1231,6 +1677,37 @@ static VALUE node_attributes(VALUE self) {
1231
1677
  return hash;
1232
1678
  }
1233
1679
 
1680
+ // node.attribute_nodes - returns array of attribute nodes (only for element nodes)
1681
+ static VALUE node_attribute_nodes(VALUE self) {
1682
+ NodeWrapper* wrapper;
1683
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1684
+
1685
+ VALUE nodes_array = rb_ary_new();
1686
+
1687
+ if (!wrapper->node || wrapper->node->getNodeType() != DOMNode::ELEMENT_NODE) {
1688
+ return nodes_array;
1689
+ }
1690
+
1691
+ DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
1692
+ DOMNamedNodeMap* attributes = element->getAttributes();
1693
+
1694
+ if (!attributes) {
1695
+ return nodes_array;
1696
+ }
1697
+
1698
+ VALUE doc_ref = wrapper->doc_ref;
1699
+ XMLSize_t length = attributes->getLength();
1700
+
1701
+ for (XMLSize_t i = 0; i < length; i++) {
1702
+ DOMNode* attr = attributes->item(i);
1703
+ if (attr) {
1704
+ rb_ary_push(nodes_array, wrap_node(attr, doc_ref));
1705
+ }
1706
+ }
1707
+
1708
+ return nodes_array;
1709
+ }
1710
+
1234
1711
  // node.next_sibling
1235
1712
  static VALUE node_next_sibling(VALUE self) {
1236
1713
  NodeWrapper* wrapper;
@@ -1330,7 +1807,7 @@ static VALUE node_add_child(VALUE self, VALUE child) {
1330
1807
  }
1331
1808
 
1332
1809
  DOMNode* child_node = NULL;
1333
- bool needs_import = false;
1810
+ VALUE doc_ref = wrapper->doc_ref; // Keep track of the Ruby document reference
1334
1811
 
1335
1812
  // Check if child is a string or a node
1336
1813
  if (TYPE(child) == T_STRING) {
@@ -1344,13 +1821,27 @@ static VALUE node_add_child(VALUE self, VALUE child) {
1344
1821
  NodeWrapper* child_wrapper;
1345
1822
  if (rb_obj_is_kind_of(child, rb_cNode)) {
1346
1823
  TypedData_Get_Struct(child, NodeWrapper, &node_type, child_wrapper);
1347
- child_node = child_wrapper->node;
1824
+ DOMNode* original_child = child_wrapper->node;
1348
1825
 
1349
1826
  // Check if child belongs to a different document
1350
- DOMDocument* child_doc = child_node->getOwnerDocument();
1827
+ DOMDocument* child_doc = original_child->getOwnerDocument();
1351
1828
  if (child_doc && child_doc != doc) {
1352
- rb_raise(rb_eRuntimeError,
1353
- "Node belongs to a different document. Use importNode to adopt nodes from other documents.");
1829
+ // Automatically import the node from the other document
1830
+ // The second parameter 'true' means deep copy (include all descendants)
1831
+ try {
1832
+ child_node = doc->importNode(original_child, true);
1833
+
1834
+ // Update the child wrapper to point to the imported node
1835
+ // and the new document reference
1836
+ child_wrapper->node = child_node;
1837
+ child_wrapper->doc_ref = doc_ref;
1838
+ } catch (const DOMException& e) {
1839
+ CharStr message(e.getMessage());
1840
+ rb_raise(rb_eRuntimeError, "Failed to import node from different document: %s",
1841
+ message.localForm());
1842
+ }
1843
+ } else {
1844
+ child_node = original_child;
1354
1845
  }
1355
1846
  } else {
1356
1847
  rb_raise(rb_eTypeError, "Argument must be a String or Node");
@@ -1589,6 +2080,9 @@ static VALUE node_xpath(VALUE self, VALUE path) {
1589
2080
  const char* xpath_str = StringValueCStr(path);
1590
2081
  VALUE doc_ref = node_wrapper->doc_ref;
1591
2082
 
2083
+ // Validate XPath expression before execution
2084
+ validate_xpath_expression(xpath_str);
2085
+
1592
2086
  #ifdef HAVE_XALAN
1593
2087
  // Use Xalan for full XPath 1.0 support
1594
2088
  return execute_xpath_with_xalan(node_wrapper->node, xpath_str, doc_ref);
@@ -1958,6 +2452,31 @@ static VALUE nodeset_text(VALUE self) {
1958
2452
  }
1959
2453
 
1960
2454
  // nodeset.inspect / nodeset.to_s - human-readable representation
2455
// Returns the number of bytes occupied by the character that starts at p.
//
// A well-formed UTF-8 sequence (shortest form, no surrogates, at most U+10FFFF)
// yields its full length of 1 to 4 bytes. Anything else - a stray continuation
// byte, an invalid lead byte, a malformed or cut-off sequence - counts as a
// single one-byte character so that scanning always makes progress.
//
// @param p          first byte of the character; must be readable
// @param remaining  number of readable bytes starting at p (at least 1)
static size_t utf8_sequence_length(const unsigned char* p, size_t remaining) {
    const unsigned char lead = p[0];
    if (lead < 0x80) {
        return 1;  // ASCII
    }

    size_t needed = 0;
    unsigned char second_min = 0x80;  // allowed range of the second byte
    unsigned char second_max = 0xBF;

    if (lead >= 0xC2 && lead <= 0xDF) {
        needed = 2;
    } else if (lead >= 0xE0 && lead <= 0xEF) {
        needed = 3;
        if (lead == 0xE0) second_min = 0xA0;       // reject overlong forms
        else if (lead == 0xED) second_max = 0x9F;  // reject surrogates
    } else if (lead >= 0xF0 && lead <= 0xF4) {
        needed = 4;
        if (lead == 0xF0) second_min = 0x90;       // reject overlong forms
        else if (lead == 0xF4) second_max = 0x8F;  // reject > U+10FFFF
    } else {
        return 1;  // continuation byte or invalid lead byte
    }

    if (remaining < needed || p[1] < second_min || p[1] > second_max) {
        return 1;
    }
    for (size_t i = 2; i < needed; ++i) {
        if ((p[i] & 0xC0) != 0x80) {
            return 1;
        }
    }
    return needed;
}

// Helper function to safely truncate UTF-8 strings without splitting a
// multi-byte character.
//
// @param str        UTF-8 encoded text
// @param max_chars  maximum number of CHARACTERS (not bytes) to keep
// @return str itself when it has at most max_chars characters, otherwise its
//         first max_chars characters; an empty string when max_chars <= 0
//
// The limit is counted in characters as decoded by utf8_sequence_length(), so
// the cut always falls on a character boundary. No Ruby objects are created.
static std::string safe_truncate_utf8(const std::string& str, long max_chars) {
    if (str.empty()) {
        return str;
    }
    if (max_chars <= 0) {
        return std::string();
    }

    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
    const size_t size = str.size();

    size_t offset = 0;  // byte offset just past the last character kept
    long kept = 0;      // number of characters kept so far
    while (offset < size && kept < max_chars) {
        offset += utf8_sequence_length(bytes + offset, size - offset);
        ++kept;
    }

    return str.substr(0, offset);
}
2479
+
1961
2480
  static VALUE nodeset_inspect(VALUE self) {
1962
2481
  NodeSetWrapper* wrapper;
1963
2482
  TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
@@ -2028,7 +2547,7 @@ static VALUE nodeset_inspect(VALUE self) {
2028
2547
  textStr = textStr.substr(start, end - start + 1);
2029
2548
 
2030
2549
  if (textStr.length() > 30) {
2031
- textStr = textStr.substr(0, 27) + "...";
2550
+ textStr = safe_truncate_utf8(textStr, 27) + "...";
2032
2551
  }
2033
2552
 
2034
2553
  result += ">";
@@ -2056,7 +2575,7 @@ static VALUE nodeset_inspect(VALUE self) {
2056
2575
  textStr = textStr.substr(start, end - start + 1);
2057
2576
 
2058
2577
  if (textStr.length() > 30) {
2059
- textStr = textStr.substr(0, 27) + "...";
2578
+ textStr = safe_truncate_utf8(textStr, 27) + "...";
2060
2579
  }
2061
2580
 
2062
2581
  result += "text(\"";
@@ -2078,7 +2597,10 @@ static VALUE nodeset_inspect(VALUE self) {
2078
2597
  }
2079
2598
 
2080
2599
  result += "]>";
2081
- return rb_str_new_cstr(result.c_str());
2600
+ VALUE rb_result = rb_str_new_cstr(result.c_str());
2601
+ // Ensure the string is marked as UTF-8 encoded
2602
+ rb_enc_associate(rb_result, rb_utf8_encoding());
2603
+ return rb_result;
2082
2604
  }
2083
2605
 
2084
2606
  // Schema.from_document(schema_doc) or Schema.from_string(xsd_string)
@@ -2119,6 +2641,18 @@ static VALUE schema_from_document(int argc, VALUE* argv, VALUE klass) {
2119
2641
 
2120
2642
  try {
2121
2643
  schemaParser->parse(schemaInput);
2644
+ } catch (const XMLException& e) {
2645
+ delete schemaParser;
2646
+ delete wrapper->schemaContent;
2647
+ xfree(wrapper);
2648
+ CharStr message(e.getMessage());
2649
+ rb_raise(rb_eRuntimeError, "Schema parsing failed: %s", message.localForm());
2650
+ } catch (const SAXException& e) {
2651
+ delete schemaParser;
2652
+ delete wrapper->schemaContent;
2653
+ xfree(wrapper);
2654
+ CharStr message(e.getMessage());
2655
+ rb_raise(rb_eRuntimeError, "Schema parsing failed: %s", message.localForm());
2122
2656
  } catch (...) {
2123
2657
  delete schemaParser;
2124
2658
  delete wrapper->schemaContent;
@@ -2200,6 +2734,12 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
2200
2734
  validator->loadGrammar(schemaSource, Grammar::SchemaGrammarType, true);
2201
2735
  validator->setExternalNoNamespaceSchemaLocation("schema.xsd");
2202
2736
  validator->useCachedGrammarInParse(true);
2737
+ } catch (const XMLException& e) {
2738
+ CharStr message(e.getMessage());
2739
+ errorHandler.errors.push_back(std::string("Warning: Schema grammar could not be loaded: ") + message.localForm());
2740
+ } catch (const SAXException& e) {
2741
+ CharStr message(e.getMessage());
2742
+ errorHandler.errors.push_back(std::string("Warning: Schema grammar could not be loaded: ") + message.localForm());
2203
2743
  } catch (...) {
2204
2744
  // If grammar loading fails, just note it
2205
2745
  errorHandler.errors.push_back("Warning: Schema grammar could not be loaded");
@@ -2251,25 +2791,129 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
2251
2791
  }
2252
2792
 
2253
2793
  return Qnil;
2254
- }extern "C" void Init_rxerces(void) {
2794
+ }
2795
+
2796
+ // RXerces.cache_xpath_validation? - check if XPath validation caching is enabled
2797
+ static VALUE rxerces_cache_xpath_validation_p(VALUE self) {
2798
+ return cache_xpath_validation ? Qtrue : Qfalse;
2799
+ }
2800
+
2801
+ // RXerces.cache_xpath_validation = bool - enable/disable XPath validation caching
2802
+ static VALUE rxerces_set_cache_xpath_validation(VALUE self, VALUE val) {
2803
+ cache_xpath_validation = RTEST(val);
2804
+ return val;
2805
+ }
2806
+
2807
+ // RXerces.clear_xpath_validation_cache - clear the XPath validation cache
2808
+ static VALUE rxerces_clear_xpath_validation_cache(VALUE self) {
2809
+ std::lock_guard<std::mutex> lock(xpath_cache_mutex);
2810
+ if (xpath_cache_lru_list) {
2811
+ xpath_cache_lru_list->clear();
2812
+ }
2813
+ if (xpath_cache_map) {
2814
+ xpath_cache_map->clear();
2815
+ }
2816
+ return Qnil;
2817
+ }
2818
+
2819
+ // RXerces.xpath_validation_cache_size - return number of cached expressions
2820
+ static VALUE rxerces_xpath_validation_cache_size(VALUE self) {
2821
+ std::lock_guard<std::mutex> lock(xpath_cache_mutex);
2822
+ if (!xpath_cache_map) {
2823
+ return LONG2NUM(0);
2824
+ }
2825
+ return LONG2NUM((long)xpath_cache_map->size());
2826
+ }
2827
+
2828
+ // RXerces.xpath_validation_cache_max_size - get max cache size
2829
+ static VALUE rxerces_xpath_validation_cache_max_size(VALUE self) {
2830
+ return LONG2NUM((long)xpath_cache_max_size);
2831
+ }
2832
+
2833
+ // RXerces.xpath_validation_cache_max_size = n - set max cache size
2834
+ static VALUE rxerces_set_xpath_validation_cache_max_size(VALUE self, VALUE val) {
2835
+ // Validate input: must be a non-negative integer
2836
+ if (!RB_INTEGER_TYPE_P(val)) {
2837
+ rb_raise(rb_eTypeError, "xpath_validation_cache_max_size must be an Integer");
2838
+ }
2839
+
2840
+ long size = NUM2LONG(val);
2841
+ if (size < 0) {
2842
+ rb_raise(rb_eArgError, "xpath_validation_cache_max_size must be non-negative");
2843
+ }
2844
+
2845
+ xpath_cache_max_size = (size_t)size;
2846
+ return val;
2847
+ }
2848
+
2849
+ // RXerces.xalan_enabled? - check if Xalan is available
2850
+ static VALUE rxerces_xalan_enabled_p(VALUE self) {
2851
+ #ifdef HAVE_XALAN
2852
+ return Qtrue;
2853
+ #else
2854
+ return Qfalse;
2855
+ #endif
2856
+ }
2857
+
2858
+ // RXerces.xpath_max_length - get max XPath expression length
2859
+ static VALUE rxerces_xpath_max_length(VALUE self) {
2860
+ return LONG2NUM((long)xpath_max_length);
2861
+ }
2862
+
2863
+ // RXerces.xpath_max_length = n - set max XPath expression length (0 = no limit)
2864
+ static VALUE rxerces_set_xpath_max_length(VALUE self, VALUE val) {
2865
+ // Validate input: must be a non-negative integer
2866
+ if (!RB_INTEGER_TYPE_P(val)) {
2867
+ rb_raise(rb_eTypeError, "xpath_max_length must be an Integer");
2868
+ }
2869
+
2870
+ long size = NUM2LONG(val);
2871
+ if (size < 0) {
2872
+ rb_raise(rb_eArgError, "xpath_max_length must be non-negative");
2873
+ }
2874
+
2875
+ xpath_max_length = (size_t)size;
2876
+ return val;
2877
+ }
2878
+
2879
+ extern "C" void Init_rxerces(void) {
2255
2880
  rb_mRXerces = rb_define_module("RXerces");
2881
+
2882
+ // Module-level configuration methods for XPath validation caching
2883
+ rb_define_singleton_method(rb_mRXerces, "cache_xpath_validation?", RUBY_METHOD_FUNC(rxerces_cache_xpath_validation_p), 0);
2884
+ rb_define_singleton_method(rb_mRXerces, "cache_xpath_validation=", RUBY_METHOD_FUNC(rxerces_set_cache_xpath_validation), 1);
2885
+ rb_define_singleton_method(rb_mRXerces, "clear_xpath_validation_cache", RUBY_METHOD_FUNC(rxerces_clear_xpath_validation_cache), 0);
2886
+ rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_size", RUBY_METHOD_FUNC(rxerces_xpath_validation_cache_size), 0);
2887
+ rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_max_size", RUBY_METHOD_FUNC(rxerces_xpath_validation_cache_max_size), 0);
2888
+ rb_define_singleton_method(rb_mRXerces, "xpath_validation_cache_max_size=", RUBY_METHOD_FUNC(rxerces_set_xpath_validation_cache_max_size), 1);
2889
+ rb_define_singleton_method(rb_mRXerces, "xpath_max_length", RUBY_METHOD_FUNC(rxerces_xpath_max_length), 0);
2890
+ rb_define_singleton_method(rb_mRXerces, "xpath_max_length=", RUBY_METHOD_FUNC(rxerces_set_xpath_max_length), 1);
2891
+ rb_define_singleton_method(rb_mRXerces, "xalan_enabled?", RUBY_METHOD_FUNC(rxerces_xalan_enabled_p), 0);
2892
+
2256
2893
  rb_mXML = rb_define_module_under(rb_mRXerces, "XML");
2257
2894
 
2258
2895
  rb_cDocument = rb_define_class_under(rb_mXML, "Document", rb_cObject);
2259
2896
  rb_undef_alloc_func(rb_cDocument);
2260
- rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), 1);
2897
+ rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), -1);
2261
2898
  rb_define_method(rb_cDocument, "root", RUBY_METHOD_FUNC(document_root), 0);
2262
2899
  rb_define_method(rb_cDocument, "errors", RUBY_METHOD_FUNC(document_errors), 0);
2263
2900
  rb_define_method(rb_cDocument, "to_s", RUBY_METHOD_FUNC(document_to_s), 0);
2264
2901
  rb_define_alias(rb_cDocument, "to_xml", "to_s");
2265
2902
  rb_define_method(rb_cDocument, "inspect", RUBY_METHOD_FUNC(document_inspect), 0);
2266
2903
  rb_define_method(rb_cDocument, "xpath", RUBY_METHOD_FUNC(document_xpath), 1);
2904
+ rb_define_method(rb_cDocument, "at_xpath", RUBY_METHOD_FUNC(document_at_xpath), 1);
2905
+ rb_define_alias(rb_cDocument, "at", "at_xpath");
2267
2906
  rb_define_method(rb_cDocument, "css", RUBY_METHOD_FUNC(document_css), 1);
2268
2907
  rb_define_method(rb_cDocument, "at_css", RUBY_METHOD_FUNC(document_at_css), 1);
2269
2908
  rb_define_method(rb_cDocument, "encoding", RUBY_METHOD_FUNC(document_encoding), 0);
2270
2909
  rb_define_method(rb_cDocument, "text", RUBY_METHOD_FUNC(document_text), 0);
2271
2910
  rb_define_alias(rb_cDocument, "content", "text");
2272
2911
  rb_define_method(rb_cDocument, "create_element", RUBY_METHOD_FUNC(document_create_element), 1);
2912
+ rb_define_method(rb_cDocument, "children", RUBY_METHOD_FUNC(document_children), 0);
2913
+ rb_define_method(rb_cDocument, "element_children", RUBY_METHOD_FUNC(document_element_children), 0);
2914
+ rb_define_alias(rb_cDocument, "elements", "element_children");
2915
+ rb_define_method(rb_cDocument, "first_element_child", RUBY_METHOD_FUNC(document_first_element_child), 0);
2916
+ rb_define_method(rb_cDocument, "last_element_child", RUBY_METHOD_FUNC(document_last_element_child), 0);
2273
2917
 
2274
2918
  rb_cNode = rb_define_class_under(rb_mXML, "Node", rb_cObject);
2275
2919
  rb_undef_alloc_func(rb_cNode);
@@ -2288,9 +2932,13 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
2288
2932
  rb_define_method(rb_cNode, "children", RUBY_METHOD_FUNC(node_children), 0);
2289
2933
  rb_define_method(rb_cNode, "element_children", RUBY_METHOD_FUNC(node_element_children), 0);
2290
2934
  rb_define_alias(rb_cNode, "elements", "element_children");
2935
+ rb_define_method(rb_cNode, "first_element_child", RUBY_METHOD_FUNC(node_first_element_child), 0);
2936
+ rb_define_method(rb_cNode, "last_element_child", RUBY_METHOD_FUNC(node_last_element_child), 0);
2937
+ rb_define_method(rb_cNode, "document", RUBY_METHOD_FUNC(node_document), 0);
2291
2938
  rb_define_method(rb_cNode, "parent", RUBY_METHOD_FUNC(node_parent), 0);
2292
2939
  rb_define_method(rb_cNode, "ancestors", RUBY_METHOD_FUNC(node_ancestors), -1);
2293
2940
  rb_define_method(rb_cNode, "attributes", RUBY_METHOD_FUNC(node_attributes), 0);
2941
+ rb_define_method(rb_cNode, "attribute_nodes", RUBY_METHOD_FUNC(node_attribute_nodes), 0);
2294
2942
  rb_define_method(rb_cNode, "next_sibling", RUBY_METHOD_FUNC(node_next_sibling), 0);
2295
2943
  rb_define_method(rb_cNode, "next_element", RUBY_METHOD_FUNC(node_next_element), 0);
2296
2944
  rb_define_method(rb_cNode, "previous_sibling", RUBY_METHOD_FUNC(node_previous_sibling), 0);