rxerces 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,6 +51,47 @@ static bool xerces_initialized = false;
51
51
  static bool xalan_initialized = false;
52
52
  #endif
53
53
 
54
+ // Forward declarations
55
+ static std::string css_to_xpath(const char* css);
56
+ static VALUE node_css(VALUE self, VALUE selector);
57
+ static VALUE node_xpath(VALUE self, VALUE path);
58
+ static VALUE document_xpath(VALUE self, VALUE path);
59
+
60
+ // Initialize Xerces (and Xalan if available) exactly once
61
+ static void ensure_xerces_initialized() {
62
+ if (xerces_initialized) {
63
+ return;
64
+ }
65
+
66
+ try {
67
+ XMLPlatformUtils::Initialize();
68
+ #ifdef HAVE_XALAN
69
+ XPathEvaluator::initialize();
70
+ xalan_initialized = true;
71
+ #endif
72
+ xerces_initialized = true;
73
+ } catch (const XMLException& e) {
74
+ char* message = XMLString::transcode(e.getMessage());
75
+ std::string error_msg = std::string("Xerces initialization failed: ") + message;
76
+ XMLString::release(&message);
77
+ rb_raise(rb_eRuntimeError, "%s", error_msg.c_str());
78
+ }
79
+ }
80
+
81
+ // Cleanup function called at exit
82
+ static void cleanup_xerces() {
83
+ #ifdef HAVE_XALAN
84
+ if (xalan_initialized) {
85
+ XPathEvaluator::terminate();
86
+ xalan_initialized = false;
87
+ }
88
+ #endif
89
+ if (xerces_initialized) {
90
+ XMLPlatformUtils::Terminate();
91
+ xerces_initialized = false;
92
+ }
93
+ }
94
+
54
95
  // Helper class to manage XMLCh strings
55
96
  class XStr {
56
97
  public:
@@ -97,6 +138,7 @@ private:
97
138
  typedef struct {
98
139
  DOMDocument* doc;
99
140
  XercesDOMParser* parser;
141
+ std::vector<std::string>* parse_errors;
100
142
  } DocumentWrapper;
101
143
 
102
144
  // Wrapper structure for DOMNode
@@ -122,19 +164,34 @@ public:
122
164
 
123
165
  void warning(const SAXParseException& e) {
124
166
  char* msg = XMLString::transcode(e.getMessage());
125
- errors.push_back(std::string("Warning: ") + msg);
167
+ char buffer[512];
168
+ snprintf(buffer, sizeof(buffer), "Warning at line %lu, column %lu: %s",
169
+ (unsigned long)e.getLineNumber(),
170
+ (unsigned long)e.getColumnNumber(),
171
+ msg);
172
+ errors.push_back(buffer);
126
173
  XMLString::release(&msg);
127
174
  }
128
175
 
129
176
  void error(const SAXParseException& e) {
130
177
  char* msg = XMLString::transcode(e.getMessage());
131
- errors.push_back(std::string("Error: ") + msg);
178
+ char buffer[512];
179
+ snprintf(buffer, sizeof(buffer), "Error at line %lu, column %lu: %s",
180
+ (unsigned long)e.getLineNumber(),
181
+ (unsigned long)e.getColumnNumber(),
182
+ msg);
183
+ errors.push_back(buffer);
132
184
  XMLString::release(&msg);
133
185
  }
134
186
 
135
187
  void fatalError(const SAXParseException& e) {
136
188
  char* msg = XMLString::transcode(e.getMessage());
137
- errors.push_back(std::string("Fatal: ") + msg);
189
+ char buffer[512];
190
+ snprintf(buffer, sizeof(buffer), "Fatal error at line %lu, column %lu: %s",
191
+ (unsigned long)e.getLineNumber(),
192
+ (unsigned long)e.getColumnNumber(),
193
+ msg);
194
+ errors.push_back(buffer);
138
195
  XMLString::release(&msg);
139
196
  }
140
197
 
@@ -143,6 +200,55 @@ public:
143
200
  }
144
201
  };
145
202
 
203
+ // Error handler for parsing - stores errors but doesn't throw
204
+ class ParseErrorHandler : public ErrorHandler {
205
+ public:
206
+ std::vector<std::string>* errors;
207
+ bool has_fatal;
208
+
209
+ ParseErrorHandler(std::vector<std::string>* error_vec)
210
+ : errors(error_vec), has_fatal(false) {}
211
+
212
+ void warning(const SAXParseException& e) {
213
+ char* msg = XMLString::transcode(e.getMessage());
214
+ char buffer[512];
215
+ snprintf(buffer, sizeof(buffer), "Warning at line %lu, column %lu: %s",
216
+ (unsigned long)e.getLineNumber(),
217
+ (unsigned long)e.getColumnNumber(),
218
+ msg);
219
+ errors->push_back(buffer);
220
+ XMLString::release(&msg);
221
+ }
222
+
223
+ void error(const SAXParseException& e) {
224
+ char* msg = XMLString::transcode(e.getMessage());
225
+ char buffer[512];
226
+ snprintf(buffer, sizeof(buffer), "Error at line %lu, column %lu: %s",
227
+ (unsigned long)e.getLineNumber(),
228
+ (unsigned long)e.getColumnNumber(),
229
+ msg);
230
+ errors->push_back(buffer);
231
+ XMLString::release(&msg);
232
+ }
233
+
234
+ void fatalError(const SAXParseException& e) {
235
+ has_fatal = true;
236
+ char* msg = XMLString::transcode(e.getMessage());
237
+ char buffer[512];
238
+ snprintf(buffer, sizeof(buffer), "Fatal error at line %lu, column %lu: %s",
239
+ (unsigned long)e.getLineNumber(),
240
+ (unsigned long)e.getColumnNumber(),
241
+ msg);
242
+ errors->push_back(buffer);
243
+ XMLString::release(&msg);
244
+ }
245
+
246
+ void resetErrors() {
247
+ errors->clear();
248
+ has_fatal = false;
249
+ }
250
+ };
251
+
146
252
  // Memory management functions
147
253
  static void document_free(void* ptr) {
148
254
  DocumentWrapper* wrapper = (DocumentWrapper*)ptr;
@@ -150,6 +256,9 @@ static void document_free(void* ptr) {
150
256
  if (wrapper->parser) {
151
257
  delete wrapper->parser;
152
258
  }
259
+ if (wrapper->parse_errors) {
260
+ delete wrapper->parse_errors;
261
+ }
153
262
  // Document is owned by parser, so don't delete it separately
154
263
  xfree(wrapper);
155
264
  }
@@ -163,6 +272,13 @@ static void node_free(void* ptr) {
163
272
  }
164
273
  }
165
274
 
275
+ static void node_mark(void* ptr) {
276
+ NodeWrapper* wrapper = (NodeWrapper*)ptr;
277
+ if (wrapper) {
278
+ rb_gc_mark(wrapper->doc_ref);
279
+ }
280
+ }
281
+
166
282
  static void nodeset_free(void* ptr) {
167
283
  NodeSetWrapper* wrapper = (NodeSetWrapper*)ptr;
168
284
  if (wrapper) {
@@ -170,6 +286,13 @@ static void nodeset_free(void* ptr) {
170
286
  }
171
287
  }
172
288
 
289
+ static void nodeset_mark(void* ptr) {
290
+ NodeSetWrapper* wrapper = (NodeSetWrapper*)ptr;
291
+ if (wrapper) {
292
+ rb_gc_mark(wrapper->nodes_array);
293
+ }
294
+ }
295
+
173
296
  static void schema_free(void* ptr) {
174
297
  SchemaWrapper* wrapper = (SchemaWrapper*)ptr;
175
298
  if (wrapper) {
@@ -205,14 +328,14 @@ static const rb_data_type_t document_type = {
205
328
 
206
329
  static const rb_data_type_t node_type = {
207
330
  "RXerces::XML::Node",
208
- {0, node_free, node_size},
331
+ {node_mark, node_free, node_size},
209
332
  0, 0,
210
333
  RUBY_TYPED_FREE_IMMEDIATELY
211
334
  };
212
335
 
213
336
  static const rb_data_type_t nodeset_type = {
214
337
  "RXerces::XML::NodeSet",
215
- {0, nodeset_free, nodeset_size},
338
+ {nodeset_mark, nodeset_free, nodeset_size},
216
339
  0, 0,
217
340
  RUBY_TYPED_FREE_IMMEDIATELY
218
341
  };
@@ -248,22 +371,12 @@ static VALUE wrap_node(DOMNode* node, VALUE doc_ref) {
248
371
  break;
249
372
  }
250
373
 
251
- // Keep reference to document to prevent GC
252
- rb_iv_set(rb_node, "@document", doc_ref);
253
-
254
374
  return rb_node;
255
375
  }
256
376
 
257
377
  // RXerces::XML::Document.parse(string)
258
378
  static VALUE document_parse(VALUE klass, VALUE str) {
259
- if (!xerces_initialized) {
260
- try {
261
- XMLPlatformUtils::Initialize();
262
- xerces_initialized = true;
263
- } catch (const XMLException& e) {
264
- rb_raise(rb_eRuntimeError, "Xerces initialization failed");
265
- }
266
- }
379
+ ensure_xerces_initialized();
267
380
 
268
381
  Check_Type(str, T_STRING);
269
382
  const char* xml_str = StringValueCStr(str);
@@ -273,6 +386,11 @@ static VALUE document_parse(VALUE klass, VALUE str) {
273
386
  parser->setDoNamespaces(true);
274
387
  parser->setDoSchema(false);
275
388
 
389
+ // Set up error handler to capture parse errors
390
+ std::vector<std::string>* parse_errors = new std::vector<std::string>();
391
+ ParseErrorHandler error_handler(parse_errors);
392
+ parser->setErrorHandler(&error_handler);
393
+
276
394
  try {
277
395
  MemBufInputSource input((const XMLByte*)xml_str, strlen(xml_str), "memory");
278
396
  parser->parse(input);
@@ -282,18 +400,33 @@ static VALUE document_parse(VALUE klass, VALUE str) {
282
400
  DocumentWrapper* wrapper = ALLOC(DocumentWrapper);
283
401
  wrapper->doc = doc;
284
402
  wrapper->parser = parser;
403
+ wrapper->parse_errors = parse_errors;
285
404
 
286
405
  VALUE rb_doc = TypedData_Wrap_Struct(rb_cDocument, &document_type, wrapper);
406
+
407
+ // If there were fatal errors, raise an exception with details
408
+ if (error_handler.has_fatal && !parse_errors->empty()) {
409
+ std::string all_errors;
410
+ for (const auto& err : *parse_errors) {
411
+ if (!all_errors.empty()) all_errors += "\n";
412
+ all_errors += err;
413
+ }
414
+ rb_raise(rb_eRuntimeError, "XML parsing failed:\n%s", all_errors.c_str());
415
+ }
416
+
287
417
  return rb_doc;
288
418
  } catch (const XMLException& e) {
289
419
  CharStr message(e.getMessage());
420
+ delete parse_errors;
290
421
  delete parser;
291
422
  rb_raise(rb_eRuntimeError, "XML parsing error: %s", message.localForm());
292
423
  } catch (const DOMException& e) {
293
424
  CharStr message(e.getMessage());
425
+ delete parse_errors;
294
426
  delete parser;
295
427
  rb_raise(rb_eRuntimeError, "DOM error: %s", message.localForm());
296
428
  } catch (...) {
429
+ delete parse_errors;
297
430
  delete parser;
298
431
  rb_raise(rb_eRuntimeError, "Unknown XML parsing error");
299
432
  }
@@ -301,6 +434,22 @@ static VALUE document_parse(VALUE klass, VALUE str) {
301
434
  return Qnil;
302
435
  }
303
436
 
437
+ // document.errors - returns array of parse errors (warnings and errors)
438
+ static VALUE document_errors(VALUE self) {
439
+ DocumentWrapper* wrapper;
440
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
441
+
442
+ VALUE errors_array = rb_ary_new();
443
+
444
+ if (wrapper->parse_errors) {
445
+ for (const auto& error : *wrapper->parse_errors) {
446
+ rb_ary_push(errors_array, rb_str_new2(error.c_str()));
447
+ }
448
+ }
449
+
450
+ return errors_array;
451
+ }
452
+
304
453
  // document.root
305
454
  static VALUE document_root(VALUE self) {
306
455
  DocumentWrapper* wrapper;
@@ -343,6 +492,45 @@ static VALUE document_to_s(VALUE self) {
343
492
  return Qnil;
344
493
  }
345
494
 
495
+ // document.inspect - human-readable representation
496
+ static VALUE document_inspect(VALUE self) {
497
+ DocumentWrapper* wrapper;
498
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
499
+
500
+ std::string result = "#<RXerces::XML::Document:0x";
501
+
502
+ // Add object ID
503
+ char buf[32];
504
+ snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
505
+ result += buf;
506
+
507
+ if (!wrapper->doc) {
508
+ result += " (empty)>";
509
+ return rb_str_new_cstr(result.c_str());
510
+ }
511
+
512
+ // Add encoding
513
+ const XMLCh* encoding = wrapper->doc->getXmlEncoding();
514
+ if (encoding && XMLString::stringLen(encoding) > 0) {
515
+ CharStr utf8_encoding(encoding);
516
+ result += " encoding=\"";
517
+ result += utf8_encoding.localForm();
518
+ result += "\"";
519
+ }
520
+
521
+ // Add root element name
522
+ DOMElement* root = wrapper->doc->getDocumentElement();
523
+ if (root) {
524
+ CharStr rootName(root->getNodeName());
525
+ result += " root=<";
526
+ result += rootName.localForm();
527
+ result += ">";
528
+ }
529
+
530
+ result += ">";
531
+ return rb_str_new_cstr(result.c_str());
532
+ }
533
+
346
534
  // document.encoding
347
535
  static VALUE document_encoding(VALUE self) {
348
536
  DocumentWrapper* wrapper;
@@ -362,6 +550,29 @@ static VALUE document_encoding(VALUE self) {
362
550
  return rb_str_new_cstr(utf8_encoding.localForm());
363
551
  }
364
552
 
553
+ // document.text / document.content - returns text content of entire document
554
+ static VALUE document_text(VALUE self) {
555
+ DocumentWrapper* wrapper;
556
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
557
+
558
+ if (!wrapper->doc) {
559
+ return rb_str_new_cstr("");
560
+ }
561
+
562
+ DOMElement* root = wrapper->doc->getDocumentElement();
563
+ if (!root) {
564
+ return rb_str_new_cstr("");
565
+ }
566
+
567
+ const XMLCh* content = root->getTextContent();
568
+ if (!content) {
569
+ return rb_str_new_cstr("");
570
+ }
571
+
572
+ CharStr utf8_content(content);
573
+ return rb_str_new_cstr(utf8_content.localForm());
574
+ }
575
+
365
576
  // document.create_element(name)
366
577
  static VALUE document_create_element(VALUE self, VALUE name) {
367
578
  DocumentWrapper* doc_wrapper;
@@ -400,14 +611,9 @@ static VALUE document_create_element(VALUE self, VALUE name) {
400
611
  #ifdef HAVE_XALAN
401
612
  // Helper function to execute XPath using Xalan for full XPath 1.0 support
402
613
  static VALUE execute_xpath_with_xalan(DOMNode* context_node, const char* xpath_str, VALUE doc_ref) {
403
- try {
404
- // Initialize Xalan if needed
405
- if (!xalan_initialized) {
406
- XPathEvaluator::initialize();
407
- XMLPlatformUtils::Initialize();
408
- xalan_initialized = true;
409
- }
614
+ ensure_xerces_initialized();
410
615
 
616
+ try {
411
617
  // Get the document
412
618
  DOMDocument* domDoc = context_node->getOwnerDocument();
413
619
  if (!domDoc && context_node->getNodeType() == DOMNode::DOCUMENT_NODE) {
@@ -532,8 +738,9 @@ static VALUE document_xpath(VALUE self, VALUE path) {
532
738
  }
533
739
 
534
740
  DOMXPathNSResolver* resolver = doc_wrapper->doc->createNSResolver(root);
741
+ XStr xpath_xstr(xpath_str);
535
742
  DOMXPathExpression* expression = doc_wrapper->doc->createExpression(
536
- XStr(xpath_str).unicodeForm(), resolver);
743
+ xpath_xstr.unicodeForm(), resolver);
537
744
 
538
745
  DOMXPathResult* result = expression->evaluate(
539
746
  doc_wrapper->doc->getDocumentElement(),
@@ -575,6 +782,140 @@ static VALUE document_xpath(VALUE self, VALUE path) {
575
782
  #endif
576
783
  }
577
784
 
785
+ // document.css(selector) - Convert CSS to XPath and execute
786
+ static VALUE document_css(VALUE self, VALUE selector) {
787
+ Check_Type(selector, T_STRING);
788
+ const char* css_str = StringValueCStr(selector);
789
+
790
+ // Convert CSS to XPath
791
+ std::string xpath_str = css_to_xpath(css_str);
792
+
793
+ // Call the xpath method with converted selector
794
+ return document_xpath(self, rb_str_new2(xpath_str.c_str()));
795
+ }
796
+
797
+ // document.at_css(selector) - Returns first matching node
798
+ static VALUE document_at_css(VALUE self, VALUE selector) {
799
+ VALUE nodeset = document_css(self, selector);
800
+
801
+ NodeSetWrapper* wrapper;
802
+ TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
803
+
804
+ if (RARRAY_LEN(wrapper->nodes_array) == 0) {
805
+ return Qnil;
806
+ }
807
+
808
+ return rb_ary_entry(wrapper->nodes_array, 0);
809
+ }
810
+
811
+ // node.inspect - human-readable representation
812
+ static VALUE node_inspect(VALUE self) {
813
+ NodeWrapper* wrapper;
814
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
815
+
816
+ if (!wrapper->node) {
817
+ return rb_str_new_cstr("#<RXerces::XML::Node (nil)>");
818
+ }
819
+
820
+ DOMNode::NodeType nodeType = wrapper->node->getNodeType();
821
+ std::string result;
822
+
823
+ // Add object ID
824
+ char buf[32];
825
+ snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
826
+
827
+ if (nodeType == DOMNode::ELEMENT_NODE) {
828
+ result = "#<RXerces::XML::Element:0x";
829
+ result += buf;
830
+ result += " <";
831
+
832
+ CharStr name(wrapper->node->getNodeName());
833
+ result += name.localForm();
834
+
835
+ // Add attributes
836
+ DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
837
+ if (element) {
838
+ DOMNamedNodeMap* attributes = element->getAttributes();
839
+ if (attributes && attributes->getLength() > 0) {
840
+ XMLSize_t attrLen = attributes->getLength();
841
+ if (attrLen > 3) attrLen = 3;
842
+
843
+ for (XMLSize_t i = 0; i < attrLen; i++) {
844
+ DOMNode* attr = attributes->item(i);
845
+ CharStr attrName(attr->getNodeName());
846
+ CharStr attrValue(attr->getNodeValue());
847
+ result += " ";
848
+ result += attrName.localForm();
849
+ result += "=\"";
850
+ result += attrValue.localForm();
851
+ result += "\"";
852
+ }
853
+ if (attributes->getLength() > 3) {
854
+ result += " ...";
855
+ }
856
+ }
857
+ }
858
+
859
+ result += ">";
860
+
861
+ // Add truncated text content
862
+ const XMLCh* textContent = wrapper->node->getTextContent();
863
+ if (textContent && XMLString::stringLen(textContent) > 0) {
864
+ CharStr text(textContent);
865
+ std::string textStr = text.localForm();
866
+
867
+ size_t start = textStr.find_first_not_of(" \t\n\r");
868
+ if (start != std::string::npos) {
869
+ size_t end = textStr.find_last_not_of(" \t\n\r");
870
+ textStr = textStr.substr(start, end - start + 1);
871
+
872
+ if (textStr.length() > 40) {
873
+ textStr = textStr.substr(0, 37) + "...";
874
+ }
875
+
876
+ result += "\"";
877
+ result += textStr;
878
+ result += "\"";
879
+ }
880
+ }
881
+
882
+ result += ">";
883
+ } else if (nodeType == DOMNode::TEXT_NODE) {
884
+ result = "#<RXerces::XML::Text:0x";
885
+ result += buf;
886
+ result += " \"";
887
+
888
+ const XMLCh* textContent = wrapper->node->getNodeValue();
889
+ if (textContent) {
890
+ CharStr text(textContent);
891
+ std::string textStr = text.localForm();
892
+
893
+ size_t start = textStr.find_first_not_of(" \t\n\r");
894
+ if (start != std::string::npos) {
895
+ size_t end = textStr.find_last_not_of(" \t\n\r");
896
+ textStr = textStr.substr(start, end - start + 1);
897
+
898
+ if (textStr.length() > 40) {
899
+ textStr = textStr.substr(0, 37) + "...";
900
+ }
901
+
902
+ result += textStr;
903
+ }
904
+ }
905
+
906
+ result += "\">";
907
+ } else {
908
+ result = "#<RXerces::XML::Node:0x";
909
+ result += buf;
910
+ result += " ";
911
+ CharStr name(wrapper->node->getNodeName());
912
+ result += name.localForm();
913
+ result += ">";
914
+ }
915
+
916
+ return rb_str_new_cstr(result.c_str());
917
+ }
918
+
578
919
  // node.name
579
920
  static VALUE node_name(VALUE self) {
580
921
  NodeWrapper* wrapper;
@@ -638,7 +979,8 @@ static VALUE node_text_set(VALUE self, VALUE text) {
638
979
  Check_Type(text, T_STRING);
639
980
  const char* text_str = StringValueCStr(text);
640
981
 
641
- wrapper->node->setTextContent(XStr(text_str).unicodeForm());
982
+ XStr text_xstr(text_str);
983
+ wrapper->node->setTextContent(text_xstr.unicodeForm());
642
984
 
643
985
  return text;
644
986
  }
@@ -656,7 +998,8 @@ static VALUE node_get_attribute(VALUE self, VALUE attr_name) {
656
998
  const char* attr_str = StringValueCStr(attr_name);
657
999
 
658
1000
  DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
659
- const XMLCh* value = element->getAttribute(XStr(attr_str).unicodeForm());
1001
+ XStr attr_xstr(attr_str);
1002
+ const XMLCh* value = element->getAttribute(attr_xstr.unicodeForm());
660
1003
 
661
1004
  if (!value || XMLString::stringLen(value) == 0) {
662
1005
  return Qnil;
@@ -682,23 +1025,49 @@ static VALUE node_set_attribute(VALUE self, VALUE attr_name, VALUE attr_value) {
682
1025
  const char* value_str = StringValueCStr(attr_value);
683
1026
 
684
1027
  DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
685
- element->setAttribute(XStr(attr_str).unicodeForm(), XStr(value_str).unicodeForm());
1028
+ XStr attr_xstr(attr_str);
1029
+ XStr value_xstr(value_str);
1030
+ element->setAttribute(attr_xstr.unicodeForm(), value_xstr.unicodeForm());
686
1031
 
687
1032
  return attr_value;
688
1033
  }
689
1034
 
1035
+ // node.has_attribute?(attribute_name)
1036
+ static VALUE node_has_attribute_p(VALUE self, VALUE attr_name) {
1037
+ NodeWrapper* wrapper;
1038
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1039
+
1040
+ if (!wrapper->node || wrapper->node->getNodeType() != DOMNode::ELEMENT_NODE) {
1041
+ return Qfalse;
1042
+ }
1043
+
1044
+ Check_Type(attr_name, T_STRING);
1045
+ const char* attr_str = StringValueCStr(attr_name);
1046
+
1047
+ DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
1048
+ XStr attr_xstr(attr_str);
1049
+ const XMLCh* value = element->getAttribute(attr_xstr.unicodeForm());
1050
+
1051
+ if (!value || XMLString::stringLen(value) == 0) {
1052
+ return Qfalse;
1053
+ }
1054
+
1055
+ return Qtrue;
1056
+ }
1057
+
690
1058
  // node.children
691
1059
  static VALUE node_children(VALUE self) {
692
1060
  NodeWrapper* wrapper;
693
1061
  TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
694
1062
 
695
- VALUE doc_ref = rb_iv_get(self, "@document");
696
1063
  VALUE children = rb_ary_new();
697
1064
 
698
1065
  if (!wrapper->node) {
699
1066
  return children;
700
1067
  }
701
1068
 
1069
+ VALUE doc_ref = wrapper->doc_ref;
1070
+
702
1071
  DOMNodeList* child_nodes = wrapper->node->getChildNodes();
703
1072
  XMLSize_t count = child_nodes->getLength();
704
1073
 
@@ -710,6 +1079,31 @@ static VALUE node_children(VALUE self) {
710
1079
  return children;
711
1080
  }
712
1081
 
1082
+ // node.element_children - returns only element children (no text nodes)
1083
+ static VALUE node_element_children(VALUE self) {
1084
+ NodeWrapper* wrapper;
1085
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1086
+
1087
+ VALUE children = rb_ary_new();
1088
+
1089
+ if (!wrapper->node) {
1090
+ return children;
1091
+ }
1092
+
1093
+ VALUE doc_ref = wrapper->doc_ref;
1094
+ DOMNodeList* child_nodes = wrapper->node->getChildNodes();
1095
+ XMLSize_t count = child_nodes->getLength();
1096
+
1097
+ for (XMLSize_t i = 0; i < count; i++) {
1098
+ DOMNode* child = child_nodes->item(i);
1099
+ if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
1100
+ rb_ary_push(children, wrap_node(child, doc_ref));
1101
+ }
1102
+ }
1103
+
1104
+ return children;
1105
+ }
1106
+
713
1107
  // node.parent
714
1108
  static VALUE node_parent(VALUE self) {
715
1109
  NodeWrapper* wrapper;
@@ -724,10 +1118,82 @@ static VALUE node_parent(VALUE self) {
724
1118
  return Qnil;
725
1119
  }
726
1120
 
727
- VALUE doc_ref = rb_iv_get(self, "@document");
1121
+ VALUE doc_ref = wrapper->doc_ref;
728
1122
  return wrap_node(parent, doc_ref);
729
1123
  }
730
1124
 
1125
+ // node.ancestors(selector = nil) - returns an array of all ancestor nodes, optionally filtered by selector
1126
+ static VALUE node_ancestors(int argc, VALUE* argv, VALUE self) {
1127
+ VALUE selector;
1128
+ rb_scan_args(argc, argv, "01", &selector);
1129
+
1130
+ NodeWrapper* wrapper;
1131
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1132
+
1133
+ VALUE ancestors = rb_ary_new();
1134
+
1135
+ if (!wrapper->node) {
1136
+ return ancestors;
1137
+ }
1138
+
1139
+ VALUE doc_ref = wrapper->doc_ref;
1140
+ DOMNode* current = wrapper->node->getParentNode();
1141
+
1142
+ // Walk up the tree, collecting all ancestors
1143
+ while (current) {
1144
+ // Stop at the document node (don't include it in ancestors)
1145
+ if (current->getNodeType() == DOMNode::DOCUMENT_NODE) {
1146
+ break;
1147
+ }
1148
+ rb_ary_push(ancestors, wrap_node(current, doc_ref));
1149
+ current = current->getParentNode();
1150
+ }
1151
+
1152
+ // If selector is provided, filter the ancestors
1153
+ if (!NIL_P(selector)) {
1154
+ Check_Type(selector, T_STRING);
1155
+ const char* selector_str = StringValueCStr(selector);
1156
+
1157
+ // Convert CSS to XPath if needed (css_to_xpath adds // prefix)
1158
+ std::string xpath_str = css_to_xpath(selector_str);
1159
+
1160
+ // Get all matching nodes from the document
1161
+ VALUE all_matches = document_xpath(doc_ref, rb_str_new2(xpath_str.c_str()));
1162
+
1163
+ NodeSetWrapper* matches_wrapper;
1164
+ TypedData_Get_Struct(all_matches, NodeSetWrapper, &nodeset_type, matches_wrapper);
1165
+
1166
+ VALUE filtered = rb_ary_new();
1167
+ long ancestor_len = RARRAY_LEN(ancestors);
1168
+ long matches_len = RARRAY_LEN(matches_wrapper->nodes_array);
1169
+
1170
+ // For each ancestor, check if it's in the matches
1171
+ for (long i = 0; i < ancestor_len; i++) {
1172
+ VALUE ancestor = rb_ary_entry(ancestors, i);
1173
+
1174
+ NodeWrapper* ancestor_wrapper;
1175
+ TypedData_Get_Struct(ancestor, NodeWrapper, &node_type, ancestor_wrapper);
1176
+
1177
+ // Check if this ancestor node is in the matches
1178
+ for (long j = 0; j < matches_len; j++) {
1179
+ VALUE match = rb_ary_entry(matches_wrapper->nodes_array, j);
1180
+ NodeWrapper* match_wrapper;
1181
+ TypedData_Get_Struct(match, NodeWrapper, &node_type, match_wrapper);
1182
+
1183
+ // Compare the actual DOM nodes
1184
+ if (ancestor_wrapper->node == match_wrapper->node) {
1185
+ rb_ary_push(filtered, ancestor);
1186
+ break;
1187
+ }
1188
+ }
1189
+ }
1190
+
1191
+ return filtered;
1192
+ }
1193
+
1194
+ return ancestors;
1195
+ }
1196
+
731
1197
  // node.attributes - returns hash of all attributes (only for element nodes)
732
1198
  static VALUE node_attributes(VALUE self) {
733
1199
  NodeWrapper* wrapper;
@@ -779,7 +1245,7 @@ static VALUE node_next_sibling(VALUE self) {
779
1245
  return Qnil;
780
1246
  }
781
1247
 
782
- VALUE doc_ref = rb_iv_get(self, "@document");
1248
+ VALUE doc_ref = wrapper->doc_ref;
783
1249
  return wrap_node(next, doc_ref);
784
1250
  }
785
1251
 
@@ -797,7 +1263,55 @@ static VALUE node_previous_sibling(VALUE self) {
797
1263
  return Qnil;
798
1264
  }
799
1265
 
800
- VALUE doc_ref = rb_iv_get(self, "@document");
1266
+ VALUE doc_ref = wrapper->doc_ref;
1267
+ return wrap_node(prev, doc_ref);
1268
+ }
1269
+
1270
+ // node.next_element - next sibling that is an element (skipping text nodes)
1271
+ static VALUE node_next_element(VALUE self) {
1272
+ NodeWrapper* wrapper;
1273
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1274
+
1275
+ if (!wrapper->node) {
1276
+ return Qnil;
1277
+ }
1278
+
1279
+ VALUE doc_ref = wrapper->doc_ref;
1280
+ DOMNode* next = wrapper->node->getNextSibling();
1281
+
1282
+ // Skip non-element nodes
1283
+ while (next && next->getNodeType() != DOMNode::ELEMENT_NODE) {
1284
+ next = next->getNextSibling();
1285
+ }
1286
+
1287
+ if (!next) {
1288
+ return Qnil;
1289
+ }
1290
+
1291
+ return wrap_node(next, doc_ref);
1292
+ }
1293
+
1294
+ // node.previous_element - previous sibling that is an element (skipping text nodes)
1295
+ static VALUE node_previous_element(VALUE self) {
1296
+ NodeWrapper* wrapper;
1297
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1298
+
1299
+ if (!wrapper->node) {
1300
+ return Qnil;
1301
+ }
1302
+
1303
+ VALUE doc_ref = wrapper->doc_ref;
1304
+ DOMNode* prev = wrapper->node->getPreviousSibling();
1305
+
1306
+ // Skip non-element nodes
1307
+ while (prev && prev->getNodeType() != DOMNode::ELEMENT_NODE) {
1308
+ prev = prev->getPreviousSibling();
1309
+ }
1310
+
1311
+ if (!prev) {
1312
+ return Qnil;
1313
+ }
1314
+
801
1315
  return wrap_node(prev, doc_ref);
802
1316
  }
803
1317
 
@@ -816,6 +1330,7 @@ static VALUE node_add_child(VALUE self, VALUE child) {
816
1330
  }
817
1331
 
818
1332
  DOMNode* child_node = NULL;
1333
+ bool needs_import = false;
819
1334
 
820
1335
  // Check if child is a string or a node
821
1336
  if (TYPE(child) == T_STRING) {
@@ -830,6 +1345,13 @@ static VALUE node_add_child(VALUE self, VALUE child) {
830
1345
  if (rb_obj_is_kind_of(child, rb_cNode)) {
831
1346
  TypedData_Get_Struct(child, NodeWrapper, &node_type, child_wrapper);
832
1347
  child_node = child_wrapper->node;
1348
+
1349
+ // Check if child belongs to a different document
1350
+ DOMDocument* child_doc = child_node->getOwnerDocument();
1351
+ if (child_doc && child_doc != doc) {
1352
+ rb_raise(rb_eRuntimeError,
1353
+ "Node belongs to a different document. Use importNode to adopt nodes from other documents.");
1354
+ }
833
1355
  } else {
834
1356
  rb_raise(rb_eTypeError, "Argument must be a String or Node");
835
1357
  }
@@ -840,12 +1362,24 @@ static VALUE node_add_child(VALUE self, VALUE child) {
840
1362
  }
841
1363
 
842
1364
  try {
1365
+ // appendChild will automatically detach the node from its current parent if it has one
843
1366
  wrapper->node->appendChild(child_node);
844
1367
  } catch (const DOMException& e) {
845
1368
  char* message = XMLString::transcode(e.getMessage());
846
1369
  VALUE rb_error = rb_str_new_cstr(message);
847
1370
  XMLString::release(&message);
848
- rb_raise(rb_eRuntimeError, "Failed to add child: %s", StringValueCStr(rb_error));
1371
+
1372
+ // Provide more context for common errors
1373
+ unsigned short code = e.code;
1374
+ if (code == DOMException::WRONG_DOCUMENT_ERR) {
1375
+ rb_raise(rb_eRuntimeError, "Node belongs to a different document: %s", StringValueCStr(rb_error));
1376
+ } else if (code == DOMException::HIERARCHY_REQUEST_ERR) {
1377
+ rb_raise(rb_eRuntimeError, "Invalid hierarchy: cannot add this node as a child: %s", StringValueCStr(rb_error));
1378
+ } else if (code == DOMException::NO_MODIFICATION_ALLOWED_ERR) {
1379
+ rb_raise(rb_eRuntimeError, "Node is read-only: %s", StringValueCStr(rb_error));
1380
+ } else {
1381
+ rb_raise(rb_eRuntimeError, "Failed to add child: %s", StringValueCStr(rb_error));
1382
+ }
849
1383
  }
850
1384
 
851
1385
  return child;
@@ -887,7 +1421,8 @@ static VALUE node_inner_html(VALUE self) {
887
1421
  }
888
1422
 
889
1423
  try {
890
- DOMImplementation* impl = DOMImplementationRegistry::getDOMImplementation(XStr("LS").unicodeForm());
1424
+ XStr ls_name("LS");
1425
+ DOMImplementation* impl = DOMImplementationRegistry::getDOMImplementation(ls_name.unicodeForm());
891
1426
  DOMLSSerializer* serializer = ((DOMImplementationLS*)impl)->createLSSerializer();
892
1427
 
893
1428
  // Build a string by serializing each child
@@ -1052,7 +1587,7 @@ static VALUE node_xpath(VALUE self, VALUE path) {
1052
1587
 
1053
1588
  Check_Type(path, T_STRING);
1054
1589
  const char* xpath_str = StringValueCStr(path);
1055
- VALUE doc_ref = rb_iv_get(self, "@document");
1590
+ VALUE doc_ref = node_wrapper->doc_ref;
1056
1591
 
1057
1592
  #ifdef HAVE_XALAN
1058
1593
  // Use Xalan for full XPath 1.0 support
@@ -1068,8 +1603,9 @@ static VALUE node_xpath(VALUE self, VALUE path) {
1068
1603
  }
1069
1604
 
1070
1605
  DOMXPathNSResolver* resolver = doc->createNSResolver(node_wrapper->node);
1606
+ XStr xpath_xstr(xpath_str);
1071
1607
  DOMXPathExpression* expression = doc->createExpression(
1072
- XStr(xpath_str).unicodeForm(), resolver);
1608
+ xpath_xstr.unicodeForm(), resolver);
1073
1609
 
1074
1610
  DOMXPathResult* result = expression->evaluate(
1075
1611
  node_wrapper->node,
@@ -1124,10 +1660,185 @@ static VALUE node_at_xpath(VALUE self, VALUE path) {
1124
1660
  return rb_ary_entry(wrapper->nodes_array, 0);
1125
1661
  }
1126
1662
 
1127
- // node.css(selector) - CSS selectors not supported
1663
+ // node.at_css(selector) - returns first matching node or nil
1664
+ static VALUE node_at_css(VALUE self, VALUE selector) {
1665
+ VALUE nodeset = node_css(self, selector);
1666
+ NodeSetWrapper* wrapper;
1667
+ TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
1668
+
1669
+ if (RARRAY_LEN(wrapper->nodes_array) == 0) {
1670
+ return Qnil;
1671
+ }
1672
+
1673
+ return rb_ary_entry(wrapper->nodes_array, 0);
1674
+ }
1675
+
1676
// Returns true for the whitespace characters accepted between selector tokens.
static bool css_xpath_is_space(char ch) {
    return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f';
}

// Returns true when ch terminates a tag, id or class name.
static bool css_xpath_ends_name(char ch) {
    return css_xpath_is_space(ch) || ch == '.' || ch == '#' || ch == '[' ||
           ch == '>' || ch == '+' || ch == '~' || ch == ',';
}

// Reads the name that starts at pos and leaves pos on the character that ended it.
static std::string css_xpath_read_name(const std::string& selector, size_t& pos) {
    const size_t begin = pos;
    while (pos < selector.size() && !css_xpath_ends_name(selector[pos])) {
        ++pos;
    }
    return selector.substr(begin, pos - begin);
}

// Quotes text as an XPath 1.0 string literal. XPath literals have no escape
// sequences, so text containing both quote kinds is assembled with concat().
static std::string css_xpath_literal(const std::string& text) {
    if (text.find('\'') == std::string::npos) {
        return "'" + text + "'";
    }
    if (text.find('"') == std::string::npos) {
        return "\"" + text + "\"";
    }

    std::string out = "concat(";
    size_t begin = 0;
    for (;;) {
        const size_t quote = text.find('\'', begin);
        const size_t count = (quote == std::string::npos) ? std::string::npos : quote - begin;
        out += "'" + text.substr(begin, count) + "'";
        if (quote == std::string::npos) {
            break;
        }
        out += ", \"'\", ";  // the apostrophe itself, wrapped in double quotes
        begin = quote + 1;
    }
    out += ")";
    return out;
}

// Parses one attribute selector. On entry pos is just past '['; on exit it is
// just past the matching ']' (or at the end of a truncated selector).
// Returns the XPath predicate, or "" when no attribute name was given.
static std::string css_xpath_attribute(const std::string& selector, size_t& pos) {
    const size_t len = selector.size();

    while (pos < len && css_xpath_is_space(selector[pos])) ++pos;

    // Attribute name: runs up to the operator, the closing bracket or whitespace.
    const size_t name_begin = pos;
    while (pos < len) {
        const char ch = selector[pos];
        if (ch == ']' || ch == '=' || ch == '!' || ch == '~' || ch == '^' ||
            ch == '$' || ch == '*' || css_xpath_is_space(ch)) {
            break;
        }
        // '|' only ends the name when it is the start of the "|=" operator.
        if (ch == '|' && pos + 1 < len && selector[pos + 1] == '=') {
            break;
        }
        ++pos;
    }
    const std::string name = selector.substr(name_begin, pos - name_begin);

    while (pos < len && css_xpath_is_space(selector[pos])) ++pos;

    // Operator: '\0' means a plain presence test.
    char op = '\0';
    if (pos < len && selector[pos] == '=') {
        op = '=';
        ++pos;
    } else if (pos + 1 < len && selector[pos + 1] == '=' &&
               std::string("!~^$*|").find(selector[pos]) != std::string::npos) {
        op = selector[pos];
        pos += 2;
    }

    std::string value;
    if (op != '\0') {
        while (pos < len && css_xpath_is_space(selector[pos])) ++pos;
        if (pos < len && (selector[pos] == '"' || selector[pos] == '\'')) {
            const char quote = selector[pos++];
            const size_t begin = pos;
            while (pos < len && selector[pos] != quote) ++pos;
            value = selector.substr(begin, pos - begin);
            if (pos < len) ++pos;  // closing quote
        } else {
            const size_t begin = pos;
            while (pos < len && selector[pos] != ']') ++pos;
            size_t end = pos;
            while (end > begin && css_xpath_is_space(selector[end - 1])) --end;
            value = selector.substr(begin, end - begin);
        }
    }

    // Anything left before the closing bracket is ignored.
    while (pos < len && selector[pos] != ']') ++pos;
    if (pos < len) ++pos;

    if (name.empty()) {
        return "";
    }

    const std::string attr = "@" + name;

    // The substring and word operators never match an empty value in CSS,
    // whereas the XPath functions used below would match everything.
    if (value.empty() && (op == '^' || op == '$' || op == '*' || op == '~')) {
        return "[false()]";
    }

    switch (op) {
        case '=':
            return "[" + attr + "=" + css_xpath_literal(value) + "]";
        case '!':
            return "[" + attr + "!=" + css_xpath_literal(value) + "]";
        case '^':
            return "[starts-with(" + attr + ", " + css_xpath_literal(value) + ")]";
        case '*':
            return "[contains(" + attr + ", " + css_xpath_literal(value) + ")]";
        case '$':
            // XPath 1.0 has no ends-with(); compare the trailing substring instead.
            return "[substring(" + attr + ", string-length(" + attr + ") - string-length(" +
                   css_xpath_literal(value) + ") + 1) = " + css_xpath_literal(value) + "]";
        case '~':
            return "[contains(concat(' ', " + attr + ", ' '), " +
                   css_xpath_literal(" " + value + " ") + ")]";
        case '|':
            return "[" + attr + "=" + css_xpath_literal(value) + " or starts-with(" + attr +
                   ", " + css_xpath_literal(value + "-") + ")]";
        default:
            return "[" + attr + "]";
    }
}

// Converts a basic CSS selector into an XPath 1.0 expression.
//
// Supported: tag, *, #id, .class, tag.class, tag#id, [attr], [attr=value] and
// the ^= $= *= ~= |= != attribute operators, the descendant (space), child
// (>), adjacent (+) and general sibling (~) combinators, and comma-separated
// groups (joined with the XPath union operator).
//
// Every group is rooted at "//". A NULL, empty or whitespace-only selector
// yields "//*". Combinators with nothing on one side are ignored.
// Pseudo-classes are not interpreted: a ':' stays part of the element name.
static std::string css_to_xpath(const char* css) {
    const std::string selector(css ? css : "");
    const size_t len = selector.size();

    std::string result;     // completed groups, joined with " | "
    std::string group;      // XPath for the group currently being built
    char combinator = ' ';  // how the next compound attaches to the previous one

    size_t pos = 0;
    while (pos < len) {
        const char ch = selector[pos];

        if (css_xpath_is_space(ch)) {
            // Whitespace alone means "descendant", which is already the default.
            ++pos;
            continue;
        }
        if (ch == '>' || ch == '+' || ch == '~') {
            combinator = ch;
            ++pos;
            continue;
        }
        if (ch == ',') {
            if (!group.empty()) {
                if (!result.empty()) result += " | ";
                result += group;
                group.clear();
            }
            combinator = ' ';
            ++pos;
            continue;
        }

        // Compound selector: optional tag name followed by #id / .class / [attr].
        std::string tag = css_xpath_read_name(selector, pos);
        if (tag.empty()) {
            tag = "*";
        }

        std::string predicates;
        while (pos < len) {
            const char lead = selector[pos];
            if (lead == '#') {
                ++pos;
                predicates += "[@id=" + css_xpath_literal(css_xpath_read_name(selector, pos)) + "]";
            } else if (lead == '.') {
                ++pos;
                const std::string cls = css_xpath_read_name(selector, pos);
                // Pad with spaces so only whole class tokens match.
                predicates += "[contains(concat(' ', @class, ' '), " +
                              css_xpath_literal(" " + cls + " ") + ")]";
            } else if (lead == '[') {
                ++pos;
                predicates += css_xpath_attribute(selector, pos);
            } else {
                break;
            }
        }

        if (group.empty()) {
            group = "//" + tag;  // first compound of the group
        } else if (combinator == '>') {
            group += "/" + tag;
        } else if (combinator == '+') {
            group += "/following-sibling::*[1]";
            if (tag != "*") {
                group += "[self::" + tag + "]";
            }
        } else if (combinator == '~') {
            group += "/following-sibling::" + tag;
        } else {
            group += "//" + tag;
        }
        group += predicates;
        combinator = ' ';
    }

    if (!group.empty()) {
        if (!result.empty()) result += " | ";
        result += group;
    }

    return result.empty() ? std::string("//*") : result;
}
1831
+
1832
+ // node.css(selector) - Convert CSS to XPath and execute
1128
1833
  static VALUE node_css(VALUE self, VALUE selector) {
1129
- rb_raise(rb_eNotImpError, "CSS selectors are not supported. Use xpath() instead. Xerces-C only supports XPath queries.");
1130
- return Qnil;
1834
+ Check_Type(selector, T_STRING);
1835
+ const char* css_str = StringValueCStr(selector);
1836
+
1837
+ // Convert CSS to XPath
1838
+ std::string xpath_str = css_to_xpath(css_str);
1839
+
1840
+ // Call the xpath method with converted selector
1841
+ return node_xpath(self, rb_str_new2(xpath_str.c_str()));
1131
1842
  }
1132
1843
 
1133
1844
  // nodeset.length / nodeset.size
@@ -1171,23 +1882,211 @@ static VALUE nodeset_to_a(VALUE self) {
1171
1882
  return rb_ary_dup(wrapper->nodes_array);
1172
1883
  }
1173
1884
 
1885
+ // nodeset.first - returns first node or nil
1886
+ static VALUE nodeset_first(VALUE self) {
1887
+ NodeSetWrapper* wrapper;
1888
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1889
+
1890
+ if (RARRAY_LEN(wrapper->nodes_array) == 0) {
1891
+ return Qnil;
1892
+ }
1893
+
1894
+ return rb_ary_entry(wrapper->nodes_array, 0);
1895
+ }
1896
+
1897
+ // nodeset.last - returns last node or nil
1898
+ static VALUE nodeset_last(VALUE self) {
1899
+ NodeSetWrapper* wrapper;
1900
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1901
+
1902
+ long len = RARRAY_LEN(wrapper->nodes_array);
1903
+ if (len == 0) {
1904
+ return Qnil;
1905
+ }
1906
+
1907
+ return rb_ary_entry(wrapper->nodes_array, len - 1);
1908
+ }
1909
+
1910
+ // nodeset.empty? - returns true if nodeset is empty
1911
+ static VALUE nodeset_empty_p(VALUE self) {
1912
+ NodeSetWrapper* wrapper;
1913
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1914
+
1915
+ return RARRAY_LEN(wrapper->nodes_array) == 0 ? Qtrue : Qfalse;
1916
+ }
1917
+
1918
+ // nodeset.inner_html - returns concatenated inner_html of all nodes
1919
+ static VALUE nodeset_inner_html(VALUE self) {
1920
+ NodeSetWrapper* wrapper;
1921
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1922
+
1923
+ std::string result;
1924
+ long len = RARRAY_LEN(wrapper->nodes_array);
1925
+
1926
+ for (long i = 0; i < len; i++) {
1927
+ VALUE node = rb_ary_entry(wrapper->nodes_array, i);
1928
+ VALUE inner_html = rb_funcall(node, rb_intern("inner_html"), 0);
1929
+ result += StringValueCStr(inner_html);
1930
+ }
1931
+
1932
+ return rb_str_new_cstr(result.c_str());
1933
+ }
1934
+
1935
+ // nodeset.text - returns concatenated text content of all nodes
1936
+ static VALUE nodeset_text(VALUE self) {
1937
+ NodeSetWrapper* wrapper;
1938
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1939
+
1940
+ std::string result;
1941
+ long len = RARRAY_LEN(wrapper->nodes_array);
1942
+
1943
+ for (long i = 0; i < len; i++) {
1944
+ VALUE node = rb_ary_entry(wrapper->nodes_array, i);
1945
+ NodeWrapper* node_wrapper;
1946
+ TypedData_Get_Struct(node, NodeWrapper, &node_type, node_wrapper);
1947
+
1948
+ if (node_wrapper->node) {
1949
+ const XMLCh* content = node_wrapper->node->getTextContent();
1950
+ if (content) {
1951
+ CharStr utf8_content(content);
1952
+ result += utf8_content.localForm();
1953
+ }
1954
+ }
1955
+ }
1956
+
1957
+ return rb_str_new_cstr(result.c_str());
1958
+ }
1959
+
1960
+ // nodeset.inspect / nodeset.to_s - human-readable representation
1961
+ static VALUE nodeset_inspect(VALUE self) {
1962
+ NodeSetWrapper* wrapper;
1963
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1964
+
1965
+ long len = RARRAY_LEN(wrapper->nodes_array);
1966
+ std::string result = "#<RXerces::XML::NodeSet:0x";
1967
+
1968
+ // Add object ID
1969
+ char buf[32];
1970
+ snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
1971
+ result += buf;
1972
+ result += " [";
1973
+
1974
+ for (long i = 0; i < len; i++) {
1975
+ if (i > 0) result += ", ";
1976
+
1977
+ VALUE node = rb_ary_entry(wrapper->nodes_array, i);
1978
+ NodeWrapper* node_wrapper;
1979
+ TypedData_Get_Struct(node, NodeWrapper, &node_type, node_wrapper);
1980
+
1981
+ if (!node_wrapper->node) {
1982
+ result += "nil";
1983
+ continue;
1984
+ }
1985
+
1986
+ DOMNode::NodeType nodeType = node_wrapper->node->getNodeType();
1987
+
1988
+ if (nodeType == DOMNode::ELEMENT_NODE) {
1989
+ // For elements, show: <tag attr="value">content</tag>
1990
+ CharStr name(node_wrapper->node->getNodeName());
1991
+ result += "<";
1992
+ result += name.localForm();
1993
+
1994
+ // Add first few attributes if present
1995
+ DOMElement* element = dynamic_cast<DOMElement*>(node_wrapper->node);
1996
+ if (element) {
1997
+ DOMNamedNodeMap* attributes = element->getAttributes();
1998
+ if (attributes && attributes->getLength() > 0) {
1999
+ XMLSize_t attrLen = attributes->getLength();
2000
+ if (attrLen > 3) attrLen = 3; // Limit to first 3 attributes
2001
+
2002
+ for (XMLSize_t j = 0; j < attrLen; j++) {
2003
+ DOMNode* attr = attributes->item(j);
2004
+ CharStr attrName(attr->getNodeName());
2005
+ CharStr attrValue(attr->getNodeValue());
2006
+ result += " ";
2007
+ result += attrName.localForm();
2008
+ result += "=\"";
2009
+ result += attrValue.localForm();
2010
+ result += "\"";
2011
+ }
2012
+ if (attributes->getLength() > 3) {
2013
+ result += " ...";
2014
+ }
2015
+ }
2016
+ }
2017
+
2018
+ // Show truncated text content
2019
+ const XMLCh* textContent = node_wrapper->node->getTextContent();
2020
+ if (textContent && XMLString::stringLen(textContent) > 0) {
2021
+ CharStr text(textContent);
2022
+ std::string textStr = text.localForm();
2023
+
2024
+ // Trim whitespace and truncate
2025
+ size_t start = textStr.find_first_not_of(" \t\n\r");
2026
+ if (start != std::string::npos) {
2027
+ size_t end = textStr.find_last_not_of(" \t\n\r");
2028
+ textStr = textStr.substr(start, end - start + 1);
2029
+
2030
+ if (textStr.length() > 30) {
2031
+ textStr = textStr.substr(0, 27) + "...";
2032
+ }
2033
+
2034
+ result += ">";
2035
+ result += textStr;
2036
+ result += "</";
2037
+ result += name.localForm();
2038
+ result += ">";
2039
+ } else {
2040
+ result += ">";
2041
+ }
2042
+ } else {
2043
+ result += ">";
2044
+ }
2045
+ } else if (nodeType == DOMNode::TEXT_NODE) {
2046
+ // For text nodes, show: text("content")
2047
+ const XMLCh* textContent = node_wrapper->node->getNodeValue();
2048
+ if (textContent) {
2049
+ CharStr text(textContent);
2050
+ std::string textStr = text.localForm();
2051
+
2052
+ // Trim and truncate
2053
+ size_t start = textStr.find_first_not_of(" \t\n\r");
2054
+ if (start != std::string::npos) {
2055
+ size_t end = textStr.find_last_not_of(" \t\n\r");
2056
+ textStr = textStr.substr(start, end - start + 1);
2057
+
2058
+ if (textStr.length() > 30) {
2059
+ textStr = textStr.substr(0, 27) + "...";
2060
+ }
2061
+
2062
+ result += "text(\"";
2063
+ result += textStr;
2064
+ result += "\")";
2065
+ } else {
2066
+ result += "text()";
2067
+ }
2068
+ } else {
2069
+ result += "text()";
2070
+ }
2071
+ } else {
2072
+ // For other nodes, just show the type
2073
+ CharStr name(node_wrapper->node->getNodeName());
2074
+ result += "#<";
2075
+ result += name.localForm();
2076
+ result += ">";
2077
+ }
2078
+ }
2079
+
2080
+ result += "]>";
2081
+ return rb_str_new_cstr(result.c_str());
2082
+ }
2083
+
1174
2084
  // Schema.from_document(schema_doc) or Schema.from_string(xsd_string)
1175
2085
  static VALUE schema_from_document(int argc, VALUE* argv, VALUE klass) {
1176
2086
  VALUE schema_source;
1177
2087
  rb_scan_args(argc, argv, "1", &schema_source);
1178
2088
 
1179
- // Ensure Xerces is initialized
1180
- if (!xerces_initialized) {
1181
- try {
1182
- XMLPlatformUtils::Initialize();
1183
- xerces_initialized = true;
1184
- } catch (const XMLException& e) {
1185
- char* message = XMLString::transcode(e.getMessage());
1186
- VALUE rb_error = rb_str_new_cstr(message);
1187
- XMLString::release(&message);
1188
- rb_raise(rb_eRuntimeError, "Failed to initialize Xerces-C: %s", StringValueCStr(rb_error));
1189
- }
1190
- }
2089
+ ensure_xerces_initialized();
1191
2090
 
1192
2091
  try {
1193
2092
  SchemaWrapper* wrapper = ALLOC(SchemaWrapper);
@@ -1360,14 +2259,21 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1360
2259
  rb_undef_alloc_func(rb_cDocument);
1361
2260
  rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), 1);
1362
2261
  rb_define_method(rb_cDocument, "root", RUBY_METHOD_FUNC(document_root), 0);
2262
+ rb_define_method(rb_cDocument, "errors", RUBY_METHOD_FUNC(document_errors), 0);
1363
2263
  rb_define_method(rb_cDocument, "to_s", RUBY_METHOD_FUNC(document_to_s), 0);
1364
2264
  rb_define_alias(rb_cDocument, "to_xml", "to_s");
2265
+ rb_define_method(rb_cDocument, "inspect", RUBY_METHOD_FUNC(document_inspect), 0);
1365
2266
  rb_define_method(rb_cDocument, "xpath", RUBY_METHOD_FUNC(document_xpath), 1);
2267
+ rb_define_method(rb_cDocument, "css", RUBY_METHOD_FUNC(document_css), 1);
2268
+ rb_define_method(rb_cDocument, "at_css", RUBY_METHOD_FUNC(document_at_css), 1);
1366
2269
  rb_define_method(rb_cDocument, "encoding", RUBY_METHOD_FUNC(document_encoding), 0);
2270
+ rb_define_method(rb_cDocument, "text", RUBY_METHOD_FUNC(document_text), 0);
2271
+ rb_define_alias(rb_cDocument, "content", "text");
1367
2272
  rb_define_method(rb_cDocument, "create_element", RUBY_METHOD_FUNC(document_create_element), 1);
1368
2273
 
1369
2274
  rb_cNode = rb_define_class_under(rb_mXML, "Node", rb_cObject);
1370
2275
  rb_undef_alloc_func(rb_cNode);
2276
+ rb_define_method(rb_cNode, "inspect", RUBY_METHOD_FUNC(node_inspect), 0);
1371
2277
  rb_define_method(rb_cNode, "name", RUBY_METHOD_FUNC(node_name), 0);
1372
2278
  rb_define_method(rb_cNode, "namespace", RUBY_METHOD_FUNC(node_namespace), 0);
1373
2279
  rb_define_method(rb_cNode, "text", RUBY_METHOD_FUNC(node_text), 0);
@@ -1376,11 +2282,19 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1376
2282
  rb_define_alias(rb_cNode, "content=", "text=");
1377
2283
  rb_define_method(rb_cNode, "[]", RUBY_METHOD_FUNC(node_get_attribute), 1);
1378
2284
  rb_define_method(rb_cNode, "[]=", RUBY_METHOD_FUNC(node_set_attribute), 2);
2285
+ rb_define_alias(rb_cNode, "get_attribute", "[]");
2286
+ rb_define_alias(rb_cNode, "attribute", "[]");
2287
+ rb_define_method(rb_cNode, "has_attribute?", RUBY_METHOD_FUNC(node_has_attribute_p), 1);
1379
2288
  rb_define_method(rb_cNode, "children", RUBY_METHOD_FUNC(node_children), 0);
2289
+ rb_define_method(rb_cNode, "element_children", RUBY_METHOD_FUNC(node_element_children), 0);
2290
+ rb_define_alias(rb_cNode, "elements", "element_children");
1380
2291
  rb_define_method(rb_cNode, "parent", RUBY_METHOD_FUNC(node_parent), 0);
2292
+ rb_define_method(rb_cNode, "ancestors", RUBY_METHOD_FUNC(node_ancestors), -1);
1381
2293
  rb_define_method(rb_cNode, "attributes", RUBY_METHOD_FUNC(node_attributes), 0);
1382
2294
  rb_define_method(rb_cNode, "next_sibling", RUBY_METHOD_FUNC(node_next_sibling), 0);
2295
+ rb_define_method(rb_cNode, "next_element", RUBY_METHOD_FUNC(node_next_element), 0);
1383
2296
  rb_define_method(rb_cNode, "previous_sibling", RUBY_METHOD_FUNC(node_previous_sibling), 0);
2297
+ rb_define_method(rb_cNode, "previous_element", RUBY_METHOD_FUNC(node_previous_element), 0);
1384
2298
  rb_define_method(rb_cNode, "add_child", RUBY_METHOD_FUNC(node_add_child), 1);
1385
2299
  rb_define_method(rb_cNode, "remove", RUBY_METHOD_FUNC(node_remove), 0);
1386
2300
  rb_define_alias(rb_cNode, "unlink", "remove");
@@ -1393,6 +2307,9 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1393
2307
  rb_define_method(rb_cNode, "at_xpath", RUBY_METHOD_FUNC(node_at_xpath), 1);
1394
2308
  rb_define_alias(rb_cNode, "at", "at_xpath");
1395
2309
  rb_define_method(rb_cNode, "css", RUBY_METHOD_FUNC(node_css), 1);
2310
+ rb_define_method(rb_cNode, "at_css", RUBY_METHOD_FUNC(node_at_css), 1);
2311
+ rb_define_alias(rb_cNode, "get_attribute", "[]");
2312
+ rb_define_alias(rb_cNode, "attribute", "[]");
1396
2313
 
1397
2314
  rb_cElement = rb_define_class_under(rb_mXML, "Element", rb_cNode);
1398
2315
  rb_undef_alloc_func(rb_cElement);
@@ -1405,8 +2322,15 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1405
2322
  rb_define_method(rb_cNodeSet, "length", RUBY_METHOD_FUNC(nodeset_length), 0);
1406
2323
  rb_define_alias(rb_cNodeSet, "size", "length");
1407
2324
  rb_define_method(rb_cNodeSet, "[]", RUBY_METHOD_FUNC(nodeset_at), 1);
2325
+ rb_define_method(rb_cNodeSet, "first", RUBY_METHOD_FUNC(nodeset_first), 0);
2326
+ rb_define_method(rb_cNodeSet, "last", RUBY_METHOD_FUNC(nodeset_last), 0);
2327
+ rb_define_method(rb_cNodeSet, "empty?", RUBY_METHOD_FUNC(nodeset_empty_p), 0);
1408
2328
  rb_define_method(rb_cNodeSet, "each", RUBY_METHOD_FUNC(nodeset_each), 0);
1409
2329
  rb_define_method(rb_cNodeSet, "to_a", RUBY_METHOD_FUNC(nodeset_to_a), 0);
2330
+ rb_define_method(rb_cNodeSet, "text", RUBY_METHOD_FUNC(nodeset_text), 0);
2331
+ rb_define_method(rb_cNodeSet, "inner_html", RUBY_METHOD_FUNC(nodeset_inner_html), 0);
2332
+ rb_define_method(rb_cNodeSet, "inspect", RUBY_METHOD_FUNC(nodeset_inspect), 0);
2333
+ rb_define_alias(rb_cNodeSet, "to_s", "inspect");
1410
2334
  rb_include_module(rb_cNodeSet, rb_mEnumerable);
1411
2335
 
1412
2336
  rb_cSchema = rb_define_class_under(rb_mXML, "Schema", rb_cObject);
@@ -1415,4 +2339,7 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1415
2339
  rb_define_singleton_method(rb_cSchema, "from_string", RUBY_METHOD_FUNC(schema_from_document), -1);
1416
2340
 
1417
2341
  rb_define_method(rb_cDocument, "validate", RUBY_METHOD_FUNC(document_validate), 1);
2342
+
2343
+ // Register cleanup handler
2344
+ atexit(cleanup_xerces);
1418
2345
  }