rxerces 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,9 +8,33 @@
8
8
  #include <xercesc/util/XercesDefs.hpp>
9
9
  #include <xercesc/dom/DOMXPathResult.hpp>
10
10
  #include <xercesc/dom/DOMXPathExpression.hpp>
11
+ #include <xercesc/sax/ErrorHandler.hpp>
12
+ #include <xercesc/sax/SAXParseException.hpp>
11
13
  #include <sstream>
14
+ #include <vector>
15
+
16
+ #ifdef HAVE_XALAN
17
+ #include <xalanc/XPath/XPathEvaluator.hpp>
18
+ #include <xalanc/XPath/NodeRefList.hpp>
19
+ #include <xalanc/XPath/XObject.hpp>
20
+ #include <xalanc/XPath/XObjectFactoryDefault.hpp>
21
+ #include <xalanc/XPath/XPathEnvSupportDefault.hpp>
22
+ #include <xalanc/XPath/XPathExecutionContextDefault.hpp>
23
+ #include <xalanc/XPath/XPathConstructionContextDefault.hpp>
24
+ #include <xalanc/XPath/ElementPrefixResolverProxy.hpp>
25
+ #include <xalanc/XPath/XPathFactoryDefault.hpp>
26
+ #include <xalanc/XPath/XPathProcessorImpl.hpp>
27
+ #include <xalanc/XPath/XPath.hpp>
28
+ #include <xalanc/XercesParserLiaison/XercesParserLiaison.hpp>
29
+ #include <xalanc/XercesParserLiaison/XercesDOMSupport.hpp>
30
+ #include <xalanc/XercesParserLiaison/XercesDocumentWrapper.hpp>
31
+ #include <xalanc/PlatformSupport/XalanMemoryManagerDefault.hpp>
32
+ #endif
12
33
 
13
34
  using namespace xercesc;
35
+ #ifdef HAVE_XALAN
36
+ using namespace xalanc;
37
+ #endif
14
38
 
15
39
  VALUE rb_mRXerces;
16
40
  VALUE rb_mXML;
@@ -19,9 +43,13 @@ VALUE rb_cNode;
19
43
  VALUE rb_cNodeSet;
20
44
  VALUE rb_cElement;
21
45
  VALUE rb_cText;
46
+ VALUE rb_cSchema;
22
47
 
23
- // Xerces initialization flag
48
+ // Initialization flags
24
49
  static bool xerces_initialized = false;
50
+ #ifdef HAVE_XALAN
51
+ static bool xalan_initialized = false;
52
+ #endif
25
53
 
26
54
  // Helper class to manage XMLCh strings
27
55
  class XStr {
@@ -82,6 +110,39 @@ typedef struct {
82
110
  VALUE nodes_array;
83
111
  } NodeSetWrapper;
84
112
 
113
// Native payload of an RXerces::XML::Schema object: the raw XSD text.
// Deliberately a plain aggregate with no constructors, because instances are
// created with ALLOC() and released with xfree(); the string it points to is
// allocated and deleted separately.
struct SchemaWrapper {
    std::string* schemaContent;  // owned XSD source text
};
117
+
118
+ // Error handler for schema validation
119
+ class ValidationErrorHandler : public ErrorHandler {
120
+ public:
121
+ std::vector<std::string> errors;
122
+
123
+ void warning(const SAXParseException& e) {
124
+ char* msg = XMLString::transcode(e.getMessage());
125
+ errors.push_back(std::string("Warning: ") + msg);
126
+ XMLString::release(&msg);
127
+ }
128
+
129
+ void error(const SAXParseException& e) {
130
+ char* msg = XMLString::transcode(e.getMessage());
131
+ errors.push_back(std::string("Error: ") + msg);
132
+ XMLString::release(&msg);
133
+ }
134
+
135
+ void fatalError(const SAXParseException& e) {
136
+ char* msg = XMLString::transcode(e.getMessage());
137
+ errors.push_back(std::string("Fatal: ") + msg);
138
+ XMLString::release(&msg);
139
+ }
140
+
141
+ void resetErrors() {
142
+ errors.clear();
143
+ }
144
+ };
145
+
85
146
  // Memory management functions
86
147
  static void document_free(void* ptr) {
87
148
  DocumentWrapper* wrapper = (DocumentWrapper*)ptr;
@@ -109,6 +170,16 @@ static void nodeset_free(void* ptr) {
109
170
  }
110
171
  }
111
172
 
173
+ static void schema_free(void* ptr) {
174
+ SchemaWrapper* wrapper = (SchemaWrapper*)ptr;
175
+ if (wrapper) {
176
+ if (wrapper->schemaContent) {
177
+ delete wrapper->schemaContent;
178
+ }
179
+ xfree(wrapper);
180
+ }
181
+ }
182
+
112
183
  static size_t document_size(const void* ptr) {
113
184
  return sizeof(DocumentWrapper);
114
185
  }
@@ -121,6 +192,10 @@ static size_t nodeset_size(const void* ptr) {
121
192
  return sizeof(NodeSetWrapper);
122
193
  }
123
194
 
195
+ static size_t schema_size(const void* ptr) {
196
+ return sizeof(SchemaWrapper);
197
+ }
198
+
124
199
  static const rb_data_type_t document_type = {
125
200
  "RXerces::XML::Document",
126
201
  {0, document_free, document_size},
@@ -142,6 +217,13 @@ static const rb_data_type_t nodeset_type = {
142
217
  RUBY_TYPED_FREE_IMMEDIATELY
143
218
  };
144
219
 
220
// Typed-data descriptor used when wrapping and unwrapping SchemaWrapper
// pointers as RXerces::XML::Schema objects.
static const rb_data_type_t schema_type = {
    // Name reported for this data type.
    "RXerces::XML::Schema",
    // Callbacks: no mark function (SchemaWrapper holds no Ruby VALUEs),
    // schema_free to release it, schema_size to report its size.
    {0, schema_free, schema_size},
    // Remaining descriptor fields are left unset.
    0, 0,
    RUBY_TYPED_FREE_IMMEDIATELY
};
226
+
145
227
  // Helper to create Ruby Node object from DOMNode
146
228
  static VALUE wrap_node(DOMNode* node, VALUE doc_ref) {
147
229
  if (!node) {
@@ -188,7 +270,7 @@ static VALUE document_parse(VALUE klass, VALUE str) {
188
270
 
189
271
  XercesDOMParser* parser = new XercesDOMParser();
190
272
  parser->setValidationScheme(XercesDOMParser::Val_Never);
191
- parser->setDoNamespaces(false);
273
+ parser->setDoNamespaces(true);
192
274
  parser->setDoSchema(false);
193
275
 
194
276
  try {
@@ -261,6 +343,161 @@ static VALUE document_to_s(VALUE self) {
261
343
  return Qnil;
262
344
  }
263
345
 
346
+ // document.encoding
347
+ static VALUE document_encoding(VALUE self) {
348
+ DocumentWrapper* wrapper;
349
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
350
+
351
+ if (!wrapper->doc) {
352
+ return Qnil;
353
+ }
354
+
355
+ const XMLCh* encoding = wrapper->doc->getXmlEncoding();
356
+ if (!encoding || XMLString::stringLen(encoding) == 0) {
357
+ // Default to UTF-8 if no encoding is specified
358
+ return rb_str_new_cstr("UTF-8");
359
+ }
360
+
361
+ CharStr utf8_encoding(encoding);
362
+ return rb_str_new_cstr(utf8_encoding.localForm());
363
+ }
364
+
365
+ // document.create_element(name)
366
+ static VALUE document_create_element(VALUE self, VALUE name) {
367
+ DocumentWrapper* doc_wrapper;
368
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, doc_wrapper);
369
+
370
+ if (!doc_wrapper->doc) {
371
+ rb_raise(rb_eRuntimeError, "Cannot create element on null document");
372
+ }
373
+
374
+ Check_Type(name, T_STRING);
375
+ const char* element_name = StringValueCStr(name);
376
+
377
+ try {
378
+ XMLCh* element_name_xml = XMLString::transcode(element_name);
379
+ DOMElement* element = doc_wrapper->doc->createElement(element_name_xml);
380
+ XMLString::release(&element_name_xml);
381
+
382
+ if (!element) {
383
+ rb_raise(rb_eRuntimeError, "Failed to create element");
384
+ }
385
+
386
+ return wrap_node(element, self);
387
+
388
+ } catch (const DOMException& e) {
389
+ char* message = XMLString::transcode(e.getMessage());
390
+ VALUE rb_error = rb_str_new_cstr(message);
391
+ XMLString::release(&message);
392
+ rb_raise(rb_eRuntimeError, "Failed to create element: %s", StringValueCStr(rb_error));
393
+ } catch (...) {
394
+ rb_raise(rb_eRuntimeError, "Unknown error creating element");
395
+ }
396
+
397
+ return Qnil;
398
+ }
399
+
400
#ifdef HAVE_XALAN
// Evaluates an XPath 1.0 expression with Xalan-C and returns the matching
// nodes as a NodeSet.
//
// context_node - node the expression is evaluated against; a null node or a
//                node without a document yields an empty NodeSet.
// xpath_str    - the XPath expression.
// doc_ref      - Ruby Document passed to wrap_node for every result node.
//
// Raises RuntimeError ("XPath error: ...", "XML error: ...", or a generic
// message) when evaluation fails. All raising is done after the try/catch
// so that the Xalan objects have been destroyed and no C++ exception is
// in flight when Ruby longjmps.
static VALUE execute_xpath_with_xalan(DOMNode* context_node, const char* xpath_str, VALUE doc_ref) {
    enum { XPATH_OK, XPATH_NO_WRAPPER, XPATH_XALAN_ERROR, XPATH_XML_ERROR, XPATH_UNKNOWN_ERROR };
    int status = XPATH_OK;
    VALUE error_detail = Qnil;
    VALUE nodes_array = rb_ary_new();

    try {
        // Xerces has to be initialized before Xalan.
        if (!xalan_initialized) {
            XMLPlatformUtils::Initialize();
            XPathEvaluator::initialize();
            xalan_initialized = true;
        }

        // Locate the owning document; a document node is its own owner.
        DOMDocument* domDoc = NULL;
        if (context_node) {
            domDoc = context_node->getOwnerDocument();
            if (!domDoc && context_node->getNodeType() == DOMNode::DOCUMENT_NODE) {
                domDoc = static_cast<DOMDocument*>(context_node);
            }
        }

        if (domDoc) {
            // Xalan support objects
            XercesParserLiaison liaison;
            XercesDOMSupport domSupport(liaison);

            // createDocument returns a XercesDocumentWrapper around domDoc
            XalanDocument* xalanDoc = liaison.createDocument(domDoc, false, false, false);
            if (!xalanDoc) {
                status = XPATH_NO_WRAPPER;
            } else {
                XercesDocumentWrapper* docWrapper = static_cast<XercesDocumentWrapper*>(xalanDoc);

                // Map the context node into Xalan; fall back to the document
                XalanNode* xalanContextNode = docWrapper->mapNode(context_node);
                if (!xalanContextNode) {
                    xalanContextNode = docWrapper;
                }

                // XPath factories and contexts
                XPathEnvSupportDefault envSupport;
                XObjectFactoryDefault objectFactory;
                XPathExecutionContextDefault executionContext(envSupport, domSupport, objectFactory);
                XPathConstructionContextDefault constructionContext;
                XPathFactoryDefault factory;

                XPathProcessorImpl processor;
                XPath* xpath = factory.create();

                // Compile, resolving prefixes against the document element
                ElementPrefixResolverProxy resolver(docWrapper->getDocumentElement(), envSupport, domSupport);
                processor.initXPath(*xpath, constructionContext, XalanDOMString(xpath_str), resolver);

                // Execute
                const XObjectPtr result = xpath->execute(xalanContextNode, resolver, executionContext);

                if (result.get() != 0) {
                    // nodeset() is expected to throw for non-node-set results;
                    // that is reported through the handlers below.
                    const NodeRefListBase& nodeList = result->nodeset();
                    const NodeRefListBase::size_type length = nodeList.getLength();

                    for (NodeRefListBase::size_type i = 0; i < length; ++i) {
                        XalanNode* xalanNode = nodeList.item(i);
                        if (!xalanNode) {
                            continue;
                        }
                        // Map back to the Xerces DOM node
                        const DOMNode* domNode = docWrapper->mapNode(xalanNode);
                        if (domNode) {
                            rb_ary_push(nodes_array, wrap_node(const_cast<DOMNode*>(domNode), doc_ref));
                        }
                    }
                }

                factory.returnObject(xpath);
            }
        }
    } catch (const XalanXPathException& e) {
        CharStr msg(e.getMessage().c_str());
        error_detail = rb_str_new_cstr(msg.localForm());
        status = XPATH_XALAN_ERROR;
    } catch (const XMLException& e) {
        CharStr message(e.getMessage());
        error_detail = rb_str_new_cstr(message.localForm());
        status = XPATH_XML_ERROR;
    } catch (...) {
        status = XPATH_UNKNOWN_ERROR;
    }

    if (status == XPATH_NO_WRAPPER) {
        rb_raise(rb_eRuntimeError, "Failed to create Xalan document wrapper");
    }
    if (status == XPATH_XALAN_ERROR) {
        rb_raise(rb_eRuntimeError, "XPath error: %s", StringValueCStr(error_detail));
    }
    if (status == XPATH_XML_ERROR) {
        rb_raise(rb_eRuntimeError, "XML error: %s", StringValueCStr(error_detail));
    }
    if (status == XPATH_UNKNOWN_ERROR) {
        rb_raise(rb_eRuntimeError, "Unknown XPath error");
    }

    NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
    wrapper->nodes_array = nodes_array;
    return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
}
#endif
500
+
264
501
  // document.xpath(path)
265
502
  static VALUE document_xpath(VALUE self, VALUE path) {
266
503
  DocumentWrapper* doc_wrapper;
@@ -275,6 +512,17 @@ static VALUE document_xpath(VALUE self, VALUE path) {
275
512
  Check_Type(path, T_STRING);
276
513
  const char* xpath_str = StringValueCStr(path);
277
514
 
515
+ #ifdef HAVE_XALAN
516
+ // Use Xalan for full XPath 1.0 support
517
+ DOMElement* root = doc_wrapper->doc->getDocumentElement();
518
+ if (!root) {
519
+ NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
520
+ wrapper->nodes_array = rb_ary_new();
521
+ return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
522
+ }
523
+ return execute_xpath_with_xalan(root, xpath_str, self);
524
+ #else
525
+ // Fall back to Xerces XPath subset
278
526
  try {
279
527
  DOMElement* root = doc_wrapper->doc->getDocumentElement();
280
528
  if (!root) {
@@ -324,6 +572,7 @@ static VALUE document_xpath(VALUE self, VALUE path) {
324
572
  NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
325
573
  wrapper->nodes_array = rb_ary_new();
326
574
  return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
575
+ #endif
327
576
  }
328
577
 
329
578
  // node.name
@@ -341,6 +590,24 @@ static VALUE node_name(VALUE self) {
341
590
  return rb_str_new_cstr(utf8_name.localForm());
342
591
  }
343
592
 
593
+ // node.namespace
594
+ static VALUE node_namespace(VALUE self) {
595
+ NodeWrapper* wrapper;
596
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
597
+
598
+ if (!wrapper->node) {
599
+ return Qnil;
600
+ }
601
+
602
+ const XMLCh* namespaceURI = wrapper->node->getNamespaceURI();
603
+ if (!namespaceURI || XMLString::stringLen(namespaceURI) == 0) {
604
+ return Qnil;
605
+ }
606
+
607
+ CharStr utf8_namespace(namespaceURI);
608
+ return rb_str_new_cstr(utf8_namespace.localForm());
609
+ }
610
+
344
611
  // node.text / node.content
345
612
  static VALUE node_text(VALUE self) {
346
613
  NodeWrapper* wrapper;
@@ -443,6 +710,335 @@ static VALUE node_children(VALUE self) {
443
710
  return children;
444
711
  }
445
712
 
713
+ // node.parent
714
+ static VALUE node_parent(VALUE self) {
715
+ NodeWrapper* wrapper;
716
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
717
+
718
+ if (!wrapper->node) {
719
+ return Qnil;
720
+ }
721
+
722
+ DOMNode* parent = wrapper->node->getParentNode();
723
+ if (!parent) {
724
+ return Qnil;
725
+ }
726
+
727
+ VALUE doc_ref = rb_iv_get(self, "@document");
728
+ return wrap_node(parent, doc_ref);
729
+ }
730
+
731
+ // node.attributes - returns hash of all attributes (only for element nodes)
732
+ static VALUE node_attributes(VALUE self) {
733
+ NodeWrapper* wrapper;
734
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
735
+
736
+ if (!wrapper->node || wrapper->node->getNodeType() != DOMNode::ELEMENT_NODE) {
737
+ return rb_hash_new();
738
+ }
739
+
740
+ DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
741
+ DOMNamedNodeMap* attributes = element->getAttributes();
742
+
743
+ if (!attributes) {
744
+ return rb_hash_new();
745
+ }
746
+
747
+ VALUE hash = rb_hash_new();
748
+ XMLSize_t length = attributes->getLength();
749
+
750
+ for (XMLSize_t i = 0; i < length; i++) {
751
+ DOMNode* attr = attributes->item(i);
752
+ if (attr) {
753
+ const XMLCh* name = attr->getNodeName();
754
+ const XMLCh* value = attr->getNodeValue();
755
+
756
+ CharStr attr_name(name);
757
+ CharStr attr_value(value);
758
+
759
+ rb_hash_aset(hash,
760
+ rb_str_new_cstr(attr_name.localForm()),
761
+ rb_str_new_cstr(attr_value.localForm()));
762
+ }
763
+ }
764
+
765
+ return hash;
766
+ }
767
+
768
+ // node.next_sibling
769
+ static VALUE node_next_sibling(VALUE self) {
770
+ NodeWrapper* wrapper;
771
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
772
+
773
+ if (!wrapper->node) {
774
+ return Qnil;
775
+ }
776
+
777
+ DOMNode* next = wrapper->node->getNextSibling();
778
+ if (!next) {
779
+ return Qnil;
780
+ }
781
+
782
+ VALUE doc_ref = rb_iv_get(self, "@document");
783
+ return wrap_node(next, doc_ref);
784
+ }
785
+
786
+ // node.previous_sibling
787
+ static VALUE node_previous_sibling(VALUE self) {
788
+ NodeWrapper* wrapper;
789
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
790
+
791
+ if (!wrapper->node) {
792
+ return Qnil;
793
+ }
794
+
795
+ DOMNode* prev = wrapper->node->getPreviousSibling();
796
+ if (!prev) {
797
+ return Qnil;
798
+ }
799
+
800
+ VALUE doc_ref = rb_iv_get(self, "@document");
801
+ return wrap_node(prev, doc_ref);
802
+ }
803
+
804
+ // node.add_child(node_or_string) - adds a child node
805
+ static VALUE node_add_child(VALUE self, VALUE child) {
806
+ NodeWrapper* wrapper;
807
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
808
+
809
+ if (!wrapper->node) {
810
+ rb_raise(rb_eRuntimeError, "Cannot add child to null node");
811
+ }
812
+
813
+ DOMDocument* doc = wrapper->node->getOwnerDocument();
814
+ if (!doc) {
815
+ rb_raise(rb_eRuntimeError, "Node has no owner document");
816
+ }
817
+
818
+ DOMNode* child_node = NULL;
819
+
820
+ // Check if child is a string or a node
821
+ if (TYPE(child) == T_STRING) {
822
+ // Create a text node from the string
823
+ const char* text_str = StringValueCStr(child);
824
+ XMLCh* text_content = XMLString::transcode(text_str);
825
+ child_node = doc->createTextNode(text_content);
826
+ XMLString::release(&text_content);
827
+ } else {
828
+ // Assume it's a Node object
829
+ NodeWrapper* child_wrapper;
830
+ if (rb_obj_is_kind_of(child, rb_cNode)) {
831
+ TypedData_Get_Struct(child, NodeWrapper, &node_type, child_wrapper);
832
+ child_node = child_wrapper->node;
833
+ } else {
834
+ rb_raise(rb_eTypeError, "Argument must be a String or Node");
835
+ }
836
+ }
837
+
838
+ if (!child_node) {
839
+ rb_raise(rb_eRuntimeError, "Failed to create child node");
840
+ }
841
+
842
+ try {
843
+ wrapper->node->appendChild(child_node);
844
+ } catch (const DOMException& e) {
845
+ char* message = XMLString::transcode(e.getMessage());
846
+ VALUE rb_error = rb_str_new_cstr(message);
847
+ XMLString::release(&message);
848
+ rb_raise(rb_eRuntimeError, "Failed to add child: %s", StringValueCStr(rb_error));
849
+ }
850
+
851
+ return child;
852
+ }
853
+
854
+ // node.remove / node.unlink - removes node from its parent
855
+ static VALUE node_remove(VALUE self) {
856
+ NodeWrapper* wrapper;
857
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
858
+
859
+ if (!wrapper->node) {
860
+ rb_raise(rb_eRuntimeError, "Cannot remove null node");
861
+ }
862
+
863
+ DOMNode* parent = wrapper->node->getParentNode();
864
+ if (!parent) {
865
+ rb_raise(rb_eRuntimeError, "Node has no parent to remove from");
866
+ }
867
+
868
+ try {
869
+ parent->removeChild(wrapper->node);
870
+ } catch (const DOMException& e) {
871
+ char* message = XMLString::transcode(e.getMessage());
872
+ VALUE rb_error = rb_str_new_cstr(message);
873
+ XMLString::release(&message);
874
+ rb_raise(rb_eRuntimeError, "Failed to remove node: %s", StringValueCStr(rb_error));
875
+ }
876
+
877
+ return self;
878
+ }
879
+
880
+ // node.inner_html / node.inner_xml - returns XML content of children
881
+ static VALUE node_inner_html(VALUE self) {
882
+ NodeWrapper* wrapper;
883
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
884
+
885
+ if (!wrapper->node) {
886
+ return rb_str_new_cstr("");
887
+ }
888
+
889
+ try {
890
+ DOMImplementation* impl = DOMImplementationRegistry::getDOMImplementation(XStr("LS").unicodeForm());
891
+ DOMLSSerializer* serializer = ((DOMImplementationLS*)impl)->createLSSerializer();
892
+
893
+ // Build a string by serializing each child
894
+ std::string result;
895
+ DOMNodeList* children = wrapper->node->getChildNodes();
896
+ XMLSize_t count = children->getLength();
897
+
898
+ for (XMLSize_t i = 0; i < count; i++) {
899
+ DOMNode* child = children->item(i);
900
+ XMLCh* xml_str = serializer->writeToString(child);
901
+ CharStr utf8_str(xml_str);
902
+ result += utf8_str.localForm();
903
+ XMLString::release(&xml_str);
904
+ }
905
+
906
+ serializer->release();
907
+ return rb_str_new_cstr(result.c_str());
908
+ } catch (const DOMException& e) {
909
+ char* message = XMLString::transcode(e.getMessage());
910
+ VALUE rb_error = rb_str_new_cstr(message);
911
+ XMLString::release(&message);
912
+ rb_raise(rb_eRuntimeError, "Failed to serialize inner content: %s", StringValueCStr(rb_error));
913
+ } catch (...) {
914
+ rb_raise(rb_eRuntimeError, "Failed to serialize inner content");
915
+ }
916
+
917
+ return rb_str_new_cstr("");
918
+ }
919
+
920
+ // node.path - returns XPath to the node
921
+ static VALUE node_path(VALUE self) {
922
+ NodeWrapper* wrapper;
923
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
924
+
925
+ if (!wrapper->node) {
926
+ return rb_str_new_cstr("");
927
+ }
928
+
929
+ std::string path = "";
930
+ DOMNode* current = wrapper->node;
931
+
932
+ // Build path from current node to root
933
+ while (current && current->getNodeType() != DOMNode::DOCUMENT_NODE) {
934
+ std::string segment = "";
935
+
936
+ if (current->getNodeType() == DOMNode::ELEMENT_NODE) {
937
+ CharStr name(current->getNodeName());
938
+ segment = std::string(name.localForm());
939
+
940
+ // Count position among siblings with same name
941
+ int position = 1;
942
+ DOMNode* sibling = current->getPreviousSibling();
943
+ while (sibling) {
944
+ if (sibling->getNodeType() == DOMNode::ELEMENT_NODE &&
945
+ XMLString::equals(sibling->getNodeName(), current->getNodeName())) {
946
+ position++;
947
+ }
948
+ sibling = sibling->getPreviousSibling();
949
+ }
950
+
951
+ // Add position predicate
952
+ segment += "[" + std::to_string(position) + "]";
953
+ path = "/" + segment + path;
954
+ } else if (current->getNodeType() == DOMNode::TEXT_NODE) {
955
+ // Count position among text node siblings
956
+ int position = 1;
957
+ DOMNode* sibling = current->getPreviousSibling();
958
+ while (sibling) {
959
+ if (sibling->getNodeType() == DOMNode::TEXT_NODE) {
960
+ position++;
961
+ }
962
+ sibling = sibling->getPreviousSibling();
963
+ }
964
+ path = "/text()[" + std::to_string(position) + "]" + path;
965
+ }
966
+
967
+ current = current->getParentNode();
968
+ }
969
+
970
+ return rb_str_new_cstr(path.c_str());
971
+ }
972
+
973
+ // node.blank? - returns true if node has no meaningful content
974
+ static VALUE node_blank_p(VALUE self) {
975
+ NodeWrapper* wrapper;
976
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
977
+
978
+ if (!wrapper->node) {
979
+ return Qtrue;
980
+ }
981
+
982
+ // Text nodes are blank if they contain only whitespace
983
+ if (wrapper->node->getNodeType() == DOMNode::TEXT_NODE) {
984
+ const XMLCh* text_content = wrapper->node->getNodeValue();
985
+ if (!text_content) {
986
+ return Qtrue;
987
+ }
988
+
989
+ // Check if text contains only whitespace
990
+ CharStr utf8_text(text_content);
991
+ const char* str = utf8_text.localForm();
992
+ while (*str) {
993
+ if (!isspace((unsigned char)*str)) {
994
+ return Qfalse;
995
+ }
996
+ str++;
997
+ }
998
+ return Qtrue;
999
+ }
1000
+
1001
+ // Element nodes are blank if they have no child elements and no non-blank text
1002
+ if (wrapper->node->getNodeType() == DOMNode::ELEMENT_NODE) {
1003
+ DOMNodeList* children = wrapper->node->getChildNodes();
1004
+ XMLSize_t count = children->getLength();
1005
+
1006
+ if (count == 0) {
1007
+ return Qtrue;
1008
+ }
1009
+
1010
+ // Check if all children are blank text nodes
1011
+ for (XMLSize_t i = 0; i < count; i++) {
1012
+ DOMNode* child = children->item(i);
1013
+
1014
+ // If there's an element child, not blank
1015
+ if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
1016
+ return Qfalse;
1017
+ }
1018
+
1019
+ // If there's a non-whitespace text node, not blank
1020
+ if (child->getNodeType() == DOMNode::TEXT_NODE) {
1021
+ const XMLCh* text_content = child->getNodeValue();
1022
+ if (text_content) {
1023
+ CharStr utf8_text(text_content);
1024
+ const char* str = utf8_text.localForm();
1025
+ while (*str) {
1026
+ if (!isspace((unsigned char)*str)) {
1027
+ return Qfalse;
1028
+ }
1029
+ str++;
1030
+ }
1031
+ }
1032
+ }
1033
+ }
1034
+
1035
+ return Qtrue;
1036
+ }
1037
+
1038
+ // Other node types are considered blank
1039
+ return Qtrue;
1040
+ }
1041
+
446
1042
  // node.xpath(path)
447
1043
  static VALUE node_xpath(VALUE self, VALUE path) {
448
1044
  NodeWrapper* node_wrapper;
@@ -458,6 +1054,11 @@ static VALUE node_xpath(VALUE self, VALUE path) {
458
1054
  const char* xpath_str = StringValueCStr(path);
459
1055
  VALUE doc_ref = rb_iv_get(self, "@document");
460
1056
 
1057
+ #ifdef HAVE_XALAN
1058
+ // Use Xalan for full XPath 1.0 support
1059
+ return execute_xpath_with_xalan(node_wrapper->node, xpath_str, doc_ref);
1060
+ #else
1061
+ // Fall back to Xerces XPath subset
461
1062
  try {
462
1063
  DOMDocument* doc = node_wrapper->node->getOwnerDocument();
463
1064
  if (!doc) {
@@ -507,6 +1108,26 @@ static VALUE node_xpath(VALUE self, VALUE path) {
507
1108
  NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
508
1109
  wrapper->nodes_array = rb_ary_new();
509
1110
  return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
1111
+ #endif
1112
+ }
1113
+
1114
+ // node.at_xpath(path) - returns first matching node or nil
1115
+ static VALUE node_at_xpath(VALUE self, VALUE path) {
1116
+ VALUE nodeset = node_xpath(self, path);
1117
+ NodeSetWrapper* wrapper;
1118
+ TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
1119
+
1120
+ if (RARRAY_LEN(wrapper->nodes_array) == 0) {
1121
+ return Qnil;
1122
+ }
1123
+
1124
+ return rb_ary_entry(wrapper->nodes_array, 0);
1125
+ }
1126
+
1127
+ // node.css(selector) - CSS selectors not supported
1128
+ static VALUE node_css(VALUE self, VALUE selector) {
1129
+ rb_raise(rb_eNotImpError, "CSS selectors are not supported. Use xpath() instead. Xerces-C only supports XPath queries.");
1130
+ return Qnil;
510
1131
  }
511
1132
 
512
1133
  // nodeset.length / nodeset.size
@@ -550,7 +1171,188 @@ static VALUE nodeset_to_a(VALUE self) {
550
1171
  return rb_ary_dup(wrapper->nodes_array);
551
1172
  }
552
1173
 
553
- extern "C" void Init_rxerces(void) {
1174
+ // Schema.from_document(schema_doc) or Schema.from_string(xsd_string)
1175
+ static VALUE schema_from_document(int argc, VALUE* argv, VALUE klass) {
1176
+ VALUE schema_source;
1177
+ rb_scan_args(argc, argv, "1", &schema_source);
1178
+
1179
+ // Ensure Xerces is initialized
1180
+ if (!xerces_initialized) {
1181
+ try {
1182
+ XMLPlatformUtils::Initialize();
1183
+ xerces_initialized = true;
1184
+ } catch (const XMLException& e) {
1185
+ char* message = XMLString::transcode(e.getMessage());
1186
+ VALUE rb_error = rb_str_new_cstr(message);
1187
+ XMLString::release(&message);
1188
+ rb_raise(rb_eRuntimeError, "Failed to initialize Xerces-C: %s", StringValueCStr(rb_error));
1189
+ }
1190
+ }
1191
+
1192
+ try {
1193
+ SchemaWrapper* wrapper = ALLOC(SchemaWrapper);
1194
+ wrapper->schemaContent = new std::string();
1195
+
1196
+ // Convert schema source to string
1197
+ std::string xsd_content;
1198
+ if (rb_obj_is_kind_of(schema_source, rb_cString)) {
1199
+ xsd_content = std::string(RSTRING_PTR(schema_source), RSTRING_LEN(schema_source));
1200
+ } else {
1201
+ // Assume it's a Document, call to_s
1202
+ VALUE str = rb_funcall(schema_source, rb_intern("to_s"), 0);
1203
+ xsd_content = std::string(RSTRING_PTR(str), RSTRING_LEN(str));
1204
+ }
1205
+
1206
+ // Store the schema content
1207
+ *wrapper->schemaContent = xsd_content;
1208
+
1209
+ // Validate that it's valid XML by trying to parse it
1210
+ XercesDOMParser* schemaParser = new XercesDOMParser();
1211
+ schemaParser->setValidationScheme(XercesDOMParser::Val_Never);
1212
+ schemaParser->setDoNamespaces(true);
1213
+
1214
+ // Parse the schema using MemBufInputSource
1215
+ MemBufInputSource schemaInput(
1216
+ (const XMLByte*)xsd_content.c_str(),
1217
+ xsd_content.length(),
1218
+ "schema"
1219
+ );
1220
+
1221
+ try {
1222
+ schemaParser->parse(schemaInput);
1223
+ } catch (...) {
1224
+ delete schemaParser;
1225
+ delete wrapper->schemaContent;
1226
+ xfree(wrapper);
1227
+ rb_raise(rb_eRuntimeError, "Schema parsing failed: Invalid XML");
1228
+ }
1229
+
1230
+ delete schemaParser;
1231
+
1232
+ VALUE rb_schema = TypedData_Wrap_Struct(klass, &schema_type, wrapper);
1233
+ return rb_schema;
1234
+
1235
+ } catch (const XMLException& e) {
1236
+ char* message = XMLString::transcode(e.getMessage());
1237
+ VALUE rb_error = rb_str_new_cstr(message);
1238
+ XMLString::release(&message);
1239
+ rb_raise(rb_eRuntimeError, "XMLException: %s", StringValueCStr(rb_error));
1240
+ } catch (const DOMException& e) {
1241
+ char* message = XMLString::transcode(e.getMessage());
1242
+ VALUE rb_error = rb_str_new_cstr(message);
1243
+ XMLString::release(&message);
1244
+ rb_raise(rb_eRuntimeError, "DOMException: %s", StringValueCStr(rb_error));
1245
+ } catch (...) {
1246
+ rb_raise(rb_eRuntimeError, "Unknown exception during schema parsing");
1247
+ }
1248
+
1249
+ return Qnil;
1250
+ }
1251
+
1252
+ // document.validate(schema) - returns array of error messages (empty if valid)
1253
+ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1254
+ DocumentWrapper* doc_wrapper;
1255
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, doc_wrapper);
1256
+
1257
+ SchemaWrapper* schema_wrapper;
1258
+ TypedData_Get_Struct(rb_schema, SchemaWrapper, &schema_type, schema_wrapper);
1259
+
1260
+ try {
1261
+ // Serialize the document to UTF-8 for validation
1262
+ DOMImplementation* impl = DOMImplementationRegistry::getDOMImplementation(XMLString::transcode("LS"));
1263
+ DOMLSSerializer* serializer = ((DOMImplementationLS*)impl)->createLSSerializer();
1264
+
1265
+ // Use a MemBufFormatTarget to get UTF-8 encoded output
1266
+ MemBufFormatTarget target;
1267
+ DOMLSOutput* output = ((DOMImplementationLS*)impl)->createLSOutput();
1268
+ output->setByteStream(&target);
1269
+
1270
+ serializer->write(doc_wrapper->doc, output);
1271
+
1272
+ // Get the UTF-8 content
1273
+ std::string xml_content((const char*)target.getRawBuffer(), target.getLen());
1274
+
1275
+ output->release();
1276
+ serializer->release();
1277
+
1278
+ // Create a validating parser
1279
+ XercesDOMParser* validator = new XercesDOMParser();
1280
+ validator->setValidationScheme(XercesDOMParser::Val_Always);
1281
+ validator->setDoNamespaces(true);
1282
+ validator->setDoSchema(true);
1283
+ validator->setValidationSchemaFullChecking(true);
1284
+
1285
+ ValidationErrorHandler errorHandler;
1286
+ validator->setErrorHandler(&errorHandler);
1287
+
1288
+ // Create a combined input with both the schema and the document
1289
+ // First, we need to add schema location to the document
1290
+ std::string schema_location = "http://example.com/schema";
1291
+
1292
+ // Create memory buffers for both schema and document
1293
+ MemBufInputSource schemaSource(
1294
+ (const XMLByte*)schema_wrapper->schemaContent->c_str(),
1295
+ schema_wrapper->schemaContent->length(),
1296
+ "schema.xsd"
1297
+ );
1298
+
1299
+ // Load the schema grammar
1300
+ try {
1301
+ validator->loadGrammar(schemaSource, Grammar::SchemaGrammarType, true);
1302
+ validator->setExternalNoNamespaceSchemaLocation("schema.xsd");
1303
+ validator->useCachedGrammarInParse(true);
1304
+ } catch (...) {
1305
+ // If grammar loading fails, just note it
1306
+ errorHandler.errors.push_back("Warning: Schema grammar could not be loaded");
1307
+ }
1308
+
1309
+ // Now parse and validate the document
1310
+ MemBufInputSource docSource(
1311
+ (const XMLByte*)xml_content.c_str(),
1312
+ xml_content.length(),
1313
+ "document.xml"
1314
+ );
1315
+
1316
+ try {
1317
+ validator->parse(docSource);
1318
+ } catch (const XMLException& e) {
1319
+ char* message = XMLString::transcode(e.getMessage());
1320
+ errorHandler.errors.push_back(std::string("XMLException: ") + message);
1321
+ XMLString::release(&message);
1322
+ } catch (const DOMException& e) {
1323
+ char* message = XMLString::transcode(e.getMessage());
1324
+ errorHandler.errors.push_back(std::string("DOMException: ") + message);
1325
+ XMLString::release(&message);
1326
+ } catch (...) {
1327
+ errorHandler.errors.push_back("Unknown parsing exception");
1328
+ }
1329
+
1330
+ delete validator;
1331
+
1332
+ // Return array of error messages
1333
+ VALUE errors_array = rb_ary_new();
1334
+ for (const auto& err : errorHandler.errors) {
1335
+ rb_ary_push(errors_array, rb_str_new_cstr(err.c_str()));
1336
+ }
1337
+
1338
+ return errors_array;
1339
+
1340
+ } catch (const XMLException& e) {
1341
+ char* message = XMLString::transcode(e.getMessage());
1342
+ VALUE rb_error = rb_str_new_cstr(message);
1343
+ XMLString::release(&message);
1344
+ rb_raise(rb_eRuntimeError, "XMLException during validation: %s", StringValueCStr(rb_error));
1345
+ } catch (const DOMException& e) {
1346
+ char* message = XMLString::transcode(e.getMessage());
1347
+ VALUE rb_error = rb_str_new_cstr(message);
1348
+ XMLString::release(&message);
1349
+ rb_raise(rb_eRuntimeError, "DOMException during validation: %s", StringValueCStr(rb_error));
1350
+ } catch (...) {
1351
+ rb_raise(rb_eRuntimeError, "Unknown exception during validation");
1352
+ }
1353
+
1354
+ return Qnil;
1355
+ }extern "C" void Init_rxerces(void) {
554
1356
  rb_mRXerces = rb_define_module("RXerces");
555
1357
  rb_mXML = rb_define_module_under(rb_mRXerces, "XML");
556
1358
 
@@ -559,20 +1361,38 @@ extern "C" void Init_rxerces(void) {
559
1361
  rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), 1);
560
1362
  rb_define_method(rb_cDocument, "root", RUBY_METHOD_FUNC(document_root), 0);
561
1363
  rb_define_method(rb_cDocument, "to_s", RUBY_METHOD_FUNC(document_to_s), 0);
562
- rb_define_method(rb_cDocument, "to_xml", RUBY_METHOD_FUNC(document_to_s), 0);
1364
+ rb_define_alias(rb_cDocument, "to_xml", "to_s");
563
1365
  rb_define_method(rb_cDocument, "xpath", RUBY_METHOD_FUNC(document_xpath), 1);
1366
+ rb_define_method(rb_cDocument, "encoding", RUBY_METHOD_FUNC(document_encoding), 0);
1367
+ rb_define_method(rb_cDocument, "create_element", RUBY_METHOD_FUNC(document_create_element), 1);
564
1368
 
565
1369
  rb_cNode = rb_define_class_under(rb_mXML, "Node", rb_cObject);
566
1370
  rb_undef_alloc_func(rb_cNode);
567
1371
  rb_define_method(rb_cNode, "name", RUBY_METHOD_FUNC(node_name), 0);
1372
+ rb_define_method(rb_cNode, "namespace", RUBY_METHOD_FUNC(node_namespace), 0);
568
1373
  rb_define_method(rb_cNode, "text", RUBY_METHOD_FUNC(node_text), 0);
569
- rb_define_method(rb_cNode, "content", RUBY_METHOD_FUNC(node_text), 0);
1374
+ rb_define_alias(rb_cNode, "content", "text");
570
1375
  rb_define_method(rb_cNode, "text=", RUBY_METHOD_FUNC(node_text_set), 1);
571
- rb_define_method(rb_cNode, "content=", RUBY_METHOD_FUNC(node_text_set), 1);
1376
+ rb_define_alias(rb_cNode, "content=", "text=");
572
1377
  rb_define_method(rb_cNode, "[]", RUBY_METHOD_FUNC(node_get_attribute), 1);
573
1378
  rb_define_method(rb_cNode, "[]=", RUBY_METHOD_FUNC(node_set_attribute), 2);
574
1379
  rb_define_method(rb_cNode, "children", RUBY_METHOD_FUNC(node_children), 0);
1380
+ rb_define_method(rb_cNode, "parent", RUBY_METHOD_FUNC(node_parent), 0);
1381
+ rb_define_method(rb_cNode, "attributes", RUBY_METHOD_FUNC(node_attributes), 0);
1382
+ rb_define_method(rb_cNode, "next_sibling", RUBY_METHOD_FUNC(node_next_sibling), 0);
1383
+ rb_define_method(rb_cNode, "previous_sibling", RUBY_METHOD_FUNC(node_previous_sibling), 0);
1384
+ rb_define_method(rb_cNode, "add_child", RUBY_METHOD_FUNC(node_add_child), 1);
1385
+ rb_define_method(rb_cNode, "remove", RUBY_METHOD_FUNC(node_remove), 0);
1386
+ rb_define_alias(rb_cNode, "unlink", "remove");
1387
+ rb_define_method(rb_cNode, "inner_html", RUBY_METHOD_FUNC(node_inner_html), 0);
1388
+ rb_define_alias(rb_cNode, "inner_xml", "inner_html");
1389
+ rb_define_method(rb_cNode, "path", RUBY_METHOD_FUNC(node_path), 0);
1390
+ rb_define_method(rb_cNode, "blank?", RUBY_METHOD_FUNC(node_blank_p), 0);
575
1391
  rb_define_method(rb_cNode, "xpath", RUBY_METHOD_FUNC(node_xpath), 1);
1392
+ rb_define_alias(rb_cNode, "search", "xpath");
1393
+ rb_define_method(rb_cNode, "at_xpath", RUBY_METHOD_FUNC(node_at_xpath), 1);
1394
+ rb_define_alias(rb_cNode, "at", "at_xpath");
1395
+ rb_define_method(rb_cNode, "css", RUBY_METHOD_FUNC(node_css), 1);
576
1396
 
577
1397
  rb_cElement = rb_define_class_under(rb_mXML, "Element", rb_cNode);
578
1398
  rb_undef_alloc_func(rb_cElement);
@@ -583,9 +1403,16 @@ extern "C" void Init_rxerces(void) {
583
1403
  rb_cNodeSet = rb_define_class_under(rb_mXML, "NodeSet", rb_cObject);
584
1404
  rb_undef_alloc_func(rb_cNodeSet);
585
1405
  rb_define_method(rb_cNodeSet, "length", RUBY_METHOD_FUNC(nodeset_length), 0);
586
- rb_define_method(rb_cNodeSet, "size", RUBY_METHOD_FUNC(nodeset_length), 0);
1406
+ rb_define_alias(rb_cNodeSet, "size", "length");
587
1407
  rb_define_method(rb_cNodeSet, "[]", RUBY_METHOD_FUNC(nodeset_at), 1);
588
1408
  rb_define_method(rb_cNodeSet, "each", RUBY_METHOD_FUNC(nodeset_each), 0);
589
1409
  rb_define_method(rb_cNodeSet, "to_a", RUBY_METHOD_FUNC(nodeset_to_a), 0);
590
1410
  rb_include_module(rb_cNodeSet, rb_mEnumerable);
1411
+
1412
+ rb_cSchema = rb_define_class_under(rb_mXML, "Schema", rb_cObject);
1413
+ rb_undef_alloc_func(rb_cSchema);
1414
+ rb_define_singleton_method(rb_cSchema, "from_document", RUBY_METHOD_FUNC(schema_from_document), -1);
1415
+ rb_define_singleton_method(rb_cSchema, "from_string", RUBY_METHOD_FUNC(schema_from_document), -1);
1416
+
1417
+ rb_define_method(rb_cDocument, "validate", RUBY_METHOD_FUNC(document_validate), 1);
591
1418
  }