rxerces 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/CHANGES.md +20 -0
- data/README.md +40 -5
- data/ext/rxerces/extconf.rb +42 -0
- data/ext/rxerces/rxerces.cpp +980 -8
- data/lib/rxerces/nokogiri.rb +26 -0
- data/lib/rxerces/version.rb +1 -1
- data/rxerces.gemspec +5 -2
- data/spec/document_spec.rb +78 -0
- data/spec/node_spec.rb +434 -0
- data/spec/nodeset_spec.rb +59 -0
- data/spec/nokogiri_compatibility_spec.rb +44 -0
- data/spec/rxerces_shared.rb +1 -1
- data/spec/xpath_spec.rb +252 -18
- data.tar.gz.sig +0 -0
- metadata +4 -3
- metadata.gz.sig +2 -2
data/ext/rxerces/rxerces.cpp
CHANGED
|
@@ -13,7 +13,28 @@
|
|
|
13
13
|
#include <sstream>
|
|
14
14
|
#include <vector>
|
|
15
15
|
|
|
16
|
+
#ifdef HAVE_XALAN
|
|
17
|
+
#include <xalanc/XPath/XPathEvaluator.hpp>
|
|
18
|
+
#include <xalanc/XPath/NodeRefList.hpp>
|
|
19
|
+
#include <xalanc/XPath/XObject.hpp>
|
|
20
|
+
#include <xalanc/XPath/XObjectFactoryDefault.hpp>
|
|
21
|
+
#include <xalanc/XPath/XPathEnvSupportDefault.hpp>
|
|
22
|
+
#include <xalanc/XPath/XPathExecutionContextDefault.hpp>
|
|
23
|
+
#include <xalanc/XPath/XPathConstructionContextDefault.hpp>
|
|
24
|
+
#include <xalanc/XPath/ElementPrefixResolverProxy.hpp>
|
|
25
|
+
#include <xalanc/XPath/XPathFactoryDefault.hpp>
|
|
26
|
+
#include <xalanc/XPath/XPathProcessorImpl.hpp>
|
|
27
|
+
#include <xalanc/XPath/XPath.hpp>
|
|
28
|
+
#include <xalanc/XercesParserLiaison/XercesParserLiaison.hpp>
|
|
29
|
+
#include <xalanc/XercesParserLiaison/XercesDOMSupport.hpp>
|
|
30
|
+
#include <xalanc/XercesParserLiaison/XercesDocumentWrapper.hpp>
|
|
31
|
+
#include <xalanc/PlatformSupport/XalanMemoryManagerDefault.hpp>
|
|
32
|
+
#endif
|
|
33
|
+
|
|
16
34
|
using namespace xercesc;
|
|
35
|
+
#ifdef HAVE_XALAN
|
|
36
|
+
using namespace xalanc;
|
|
37
|
+
#endif
|
|
17
38
|
|
|
18
39
|
VALUE rb_mRXerces;
|
|
19
40
|
VALUE rb_mXML;
|
|
@@ -24,8 +45,17 @@ VALUE rb_cElement;
|
|
|
24
45
|
VALUE rb_cText;
|
|
25
46
|
VALUE rb_cSchema;
|
|
26
47
|
|
|
27
|
-
//
|
|
48
|
+
// Initialization flags
|
|
28
49
|
static bool xerces_initialized = false;
|
|
50
|
+
#ifdef HAVE_XALAN
|
|
51
|
+
static bool xalan_initialized = false;
|
|
52
|
+
#endif
|
|
53
|
+
|
|
54
|
+
// Forward declarations
|
|
55
|
+
static std::string css_to_xpath(const char* css);
|
|
56
|
+
static VALUE node_css(VALUE self, VALUE selector);
|
|
57
|
+
static VALUE node_xpath(VALUE self, VALUE path);
|
|
58
|
+
static VALUE document_xpath(VALUE self, VALUE path);
|
|
29
59
|
|
|
30
60
|
// Helper class to manage XMLCh strings
|
|
31
61
|
class XStr {
|
|
@@ -246,7 +276,7 @@ static VALUE document_parse(VALUE klass, VALUE str) {
|
|
|
246
276
|
|
|
247
277
|
XercesDOMParser* parser = new XercesDOMParser();
|
|
248
278
|
parser->setValidationScheme(XercesDOMParser::Val_Never);
|
|
249
|
-
parser->setDoNamespaces(
|
|
279
|
+
parser->setDoNamespaces(true);
|
|
250
280
|
parser->setDoSchema(false);
|
|
251
281
|
|
|
252
282
|
try {
|
|
@@ -319,6 +349,87 @@ static VALUE document_to_s(VALUE self) {
|
|
|
319
349
|
return Qnil;
|
|
320
350
|
}
|
|
321
351
|
|
|
352
|
+
// document.inspect - human-readable representation
|
|
353
|
+
static VALUE document_inspect(VALUE self) {
|
|
354
|
+
DocumentWrapper* wrapper;
|
|
355
|
+
TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
|
|
356
|
+
|
|
357
|
+
std::string result = "#<RXerces::XML::Document:0x";
|
|
358
|
+
|
|
359
|
+
// Add object ID
|
|
360
|
+
char buf[32];
|
|
361
|
+
snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
|
|
362
|
+
result += buf;
|
|
363
|
+
|
|
364
|
+
if (!wrapper->doc) {
|
|
365
|
+
result += " (empty)>";
|
|
366
|
+
return rb_str_new_cstr(result.c_str());
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
// Add encoding
|
|
370
|
+
const XMLCh* encoding = wrapper->doc->getXmlEncoding();
|
|
371
|
+
if (encoding && XMLString::stringLen(encoding) > 0) {
|
|
372
|
+
CharStr utf8_encoding(encoding);
|
|
373
|
+
result += " encoding=\"";
|
|
374
|
+
result += utf8_encoding.localForm();
|
|
375
|
+
result += "\"";
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
// Add root element name
|
|
379
|
+
DOMElement* root = wrapper->doc->getDocumentElement();
|
|
380
|
+
if (root) {
|
|
381
|
+
CharStr rootName(root->getNodeName());
|
|
382
|
+
result += " root=<";
|
|
383
|
+
result += rootName.localForm();
|
|
384
|
+
result += ">";
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
result += ">";
|
|
388
|
+
return rb_str_new_cstr(result.c_str());
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
// document.encoding
|
|
392
|
+
static VALUE document_encoding(VALUE self) {
|
|
393
|
+
DocumentWrapper* wrapper;
|
|
394
|
+
TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
|
|
395
|
+
|
|
396
|
+
if (!wrapper->doc) {
|
|
397
|
+
return Qnil;
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
const XMLCh* encoding = wrapper->doc->getXmlEncoding();
|
|
401
|
+
if (!encoding || XMLString::stringLen(encoding) == 0) {
|
|
402
|
+
// Default to UTF-8 if no encoding is specified
|
|
403
|
+
return rb_str_new_cstr("UTF-8");
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
CharStr utf8_encoding(encoding);
|
|
407
|
+
return rb_str_new_cstr(utf8_encoding.localForm());
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
// document.text / document.content - returns text content of entire document
|
|
411
|
+
static VALUE document_text(VALUE self) {
|
|
412
|
+
DocumentWrapper* wrapper;
|
|
413
|
+
TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
|
|
414
|
+
|
|
415
|
+
if (!wrapper->doc) {
|
|
416
|
+
return rb_str_new_cstr("");
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
DOMElement* root = wrapper->doc->getDocumentElement();
|
|
420
|
+
if (!root) {
|
|
421
|
+
return rb_str_new_cstr("");
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
const XMLCh* content = root->getTextContent();
|
|
425
|
+
if (!content) {
|
|
426
|
+
return rb_str_new_cstr("");
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
CharStr utf8_content(content);
|
|
430
|
+
return rb_str_new_cstr(utf8_content.localForm());
|
|
431
|
+
}
|
|
432
|
+
|
|
322
433
|
// document.create_element(name)
|
|
323
434
|
static VALUE document_create_element(VALUE self, VALUE name) {
|
|
324
435
|
DocumentWrapper* doc_wrapper;
|
|
@@ -354,6 +465,107 @@ static VALUE document_create_element(VALUE self, VALUE name) {
|
|
|
354
465
|
return Qnil;
|
|
355
466
|
}
|
|
356
467
|
|
|
468
|
+
#ifdef HAVE_XALAN
|
|
469
|
+
// Helper function to execute XPath using Xalan for full XPath 1.0 support
|
|
470
|
+
static VALUE execute_xpath_with_xalan(DOMNode* context_node, const char* xpath_str, VALUE doc_ref) {
|
|
471
|
+
try {
|
|
472
|
+
// Initialize Xalan if needed
|
|
473
|
+
if (!xalan_initialized) {
|
|
474
|
+
XPathEvaluator::initialize();
|
|
475
|
+
XMLPlatformUtils::Initialize();
|
|
476
|
+
xalan_initialized = true;
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
// Get the document
|
|
480
|
+
DOMDocument* domDoc = context_node->getOwnerDocument();
|
|
481
|
+
if (!domDoc && context_node->getNodeType() == DOMNode::DOCUMENT_NODE) {
|
|
482
|
+
domDoc = static_cast<DOMDocument*>(context_node);
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
if (!domDoc) {
|
|
486
|
+
NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
|
|
487
|
+
wrapper->nodes_array = rb_ary_new();
|
|
488
|
+
return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
// Create Xalan support objects
|
|
492
|
+
XercesParserLiaison liaison;
|
|
493
|
+
XercesDOMSupport domSupport(liaison);
|
|
494
|
+
|
|
495
|
+
// Create Xalan document - this creates and returns a XercesDocumentWrapper
|
|
496
|
+
XalanDocument* xalanDoc = liaison.createDocument(domDoc, false, false, false);
|
|
497
|
+
if (!xalanDoc) {
|
|
498
|
+
rb_raise(rb_eRuntimeError, "Failed to create Xalan document wrapper");
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
// The document IS the wrapper
|
|
502
|
+
XercesDocumentWrapper* docWrapper = static_cast<XercesDocumentWrapper*>(xalanDoc);
|
|
503
|
+
|
|
504
|
+
// Map the context node to Xalan
|
|
505
|
+
XalanNode* xalanContextNode = docWrapper->mapNode(context_node);
|
|
506
|
+
if (!xalanContextNode) {
|
|
507
|
+
xalanContextNode = docWrapper;
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
// Set up XPath factories and contexts
|
|
511
|
+
XPathEnvSupportDefault envSupport;
|
|
512
|
+
XObjectFactoryDefault objectFactory;
|
|
513
|
+
XPathExecutionContextDefault executionContext(envSupport, domSupport, objectFactory);
|
|
514
|
+
XPathConstructionContextDefault constructionContext;
|
|
515
|
+
XPathFactoryDefault factory;
|
|
516
|
+
|
|
517
|
+
// Create XPath
|
|
518
|
+
XPathProcessorImpl processor;
|
|
519
|
+
XPath* xpath = factory.create();
|
|
520
|
+
|
|
521
|
+
// Compile XPath expression
|
|
522
|
+
ElementPrefixResolverProxy resolver(docWrapper->getDocumentElement(), envSupport, domSupport);
|
|
523
|
+
processor.initXPath(*xpath, constructionContext, XalanDOMString(xpath_str), resolver);
|
|
524
|
+
|
|
525
|
+
// Execute XPath query
|
|
526
|
+
const XObjectPtr result = xpath->execute(xalanContextNode, resolver, executionContext);
|
|
527
|
+
|
|
528
|
+
VALUE nodes_array = rb_ary_new();
|
|
529
|
+
|
|
530
|
+
if (result.get() != 0) {
|
|
531
|
+
// Check if result is a node set
|
|
532
|
+
const NodeRefListBase& nodeList = result->nodeset();
|
|
533
|
+
const NodeRefListBase::size_type length = nodeList.getLength();
|
|
534
|
+
|
|
535
|
+
for (NodeRefListBase::size_type i = 0; i < length; ++i) {
|
|
536
|
+
XalanNode* xalanNode = nodeList.item(i);
|
|
537
|
+
if (xalanNode) {
|
|
538
|
+
// Map back to Xerces DOM node
|
|
539
|
+
const DOMNode* domNode = docWrapper->mapNode(xalanNode);
|
|
540
|
+
if (domNode) {
|
|
541
|
+
rb_ary_push(nodes_array, wrap_node(const_cast<DOMNode*>(domNode), doc_ref));
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
factory.returnObject(xpath);
|
|
548
|
+
|
|
549
|
+
NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
|
|
550
|
+
wrapper->nodes_array = nodes_array;
|
|
551
|
+
return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
|
|
552
|
+
|
|
553
|
+
} catch (const XalanXPathException& e) {
|
|
554
|
+
CharStr msg(e.getMessage().c_str());
|
|
555
|
+
rb_raise(rb_eRuntimeError, "XPath error: %s", msg.localForm());
|
|
556
|
+
} catch (const XMLException& e) {
|
|
557
|
+
CharStr message(e.getMessage());
|
|
558
|
+
rb_raise(rb_eRuntimeError, "XML error: %s", message.localForm());
|
|
559
|
+
} catch (...) {
|
|
560
|
+
rb_raise(rb_eRuntimeError, "Unknown XPath error");
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
|
|
564
|
+
wrapper->nodes_array = rb_ary_new();
|
|
565
|
+
return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
|
|
566
|
+
}
|
|
567
|
+
#endif
|
|
568
|
+
|
|
357
569
|
// document.xpath(path)
|
|
358
570
|
static VALUE document_xpath(VALUE self, VALUE path) {
|
|
359
571
|
DocumentWrapper* doc_wrapper;
|
|
@@ -368,6 +580,17 @@ static VALUE document_xpath(VALUE self, VALUE path) {
|
|
|
368
580
|
Check_Type(path, T_STRING);
|
|
369
581
|
const char* xpath_str = StringValueCStr(path);
|
|
370
582
|
|
|
583
|
+
#ifdef HAVE_XALAN
|
|
584
|
+
// Use Xalan for full XPath 1.0 support
|
|
585
|
+
DOMElement* root = doc_wrapper->doc->getDocumentElement();
|
|
586
|
+
if (!root) {
|
|
587
|
+
NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
|
|
588
|
+
wrapper->nodes_array = rb_ary_new();
|
|
589
|
+
return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
|
|
590
|
+
}
|
|
591
|
+
return execute_xpath_with_xalan(root, xpath_str, self);
|
|
592
|
+
#else
|
|
593
|
+
// Fall back to Xerces XPath subset
|
|
371
594
|
try {
|
|
372
595
|
DOMElement* root = doc_wrapper->doc->getDocumentElement();
|
|
373
596
|
if (!root) {
|
|
@@ -417,6 +640,141 @@ static VALUE document_xpath(VALUE self, VALUE path) {
|
|
|
417
640
|
NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
|
|
418
641
|
wrapper->nodes_array = rb_ary_new();
|
|
419
642
|
return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
|
|
643
|
+
#endif
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
// document.css(selector) - Convert CSS to XPath and execute
|
|
647
|
+
static VALUE document_css(VALUE self, VALUE selector) {
|
|
648
|
+
Check_Type(selector, T_STRING);
|
|
649
|
+
const char* css_str = StringValueCStr(selector);
|
|
650
|
+
|
|
651
|
+
// Convert CSS to XPath
|
|
652
|
+
std::string xpath_str = css_to_xpath(css_str);
|
|
653
|
+
|
|
654
|
+
// Call the xpath method with converted selector
|
|
655
|
+
return document_xpath(self, rb_str_new2(xpath_str.c_str()));
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
// document.at_css(selector) - Returns first matching node
|
|
659
|
+
static VALUE document_at_css(VALUE self, VALUE selector) {
|
|
660
|
+
VALUE nodeset = document_css(self, selector);
|
|
661
|
+
|
|
662
|
+
NodeSetWrapper* wrapper;
|
|
663
|
+
TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
|
|
664
|
+
|
|
665
|
+
if (RARRAY_LEN(wrapper->nodes_array) == 0) {
|
|
666
|
+
return Qnil;
|
|
667
|
+
}
|
|
668
|
+
|
|
669
|
+
return rb_ary_entry(wrapper->nodes_array, 0);
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
// node.inspect - human-readable representation
|
|
673
|
+
static VALUE node_inspect(VALUE self) {
|
|
674
|
+
NodeWrapper* wrapper;
|
|
675
|
+
TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
|
|
676
|
+
|
|
677
|
+
if (!wrapper->node) {
|
|
678
|
+
return rb_str_new_cstr("#<RXerces::XML::Node (nil)>");
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
DOMNode::NodeType nodeType = wrapper->node->getNodeType();
|
|
682
|
+
std::string result;
|
|
683
|
+
|
|
684
|
+
// Add object ID
|
|
685
|
+
char buf[32];
|
|
686
|
+
snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
|
|
687
|
+
|
|
688
|
+
if (nodeType == DOMNode::ELEMENT_NODE) {
|
|
689
|
+
result = "#<RXerces::XML::Element:0x";
|
|
690
|
+
result += buf;
|
|
691
|
+
result += " <";
|
|
692
|
+
|
|
693
|
+
CharStr name(wrapper->node->getNodeName());
|
|
694
|
+
result += name.localForm();
|
|
695
|
+
|
|
696
|
+
// Add attributes
|
|
697
|
+
DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
|
|
698
|
+
if (element) {
|
|
699
|
+
DOMNamedNodeMap* attributes = element->getAttributes();
|
|
700
|
+
if (attributes && attributes->getLength() > 0) {
|
|
701
|
+
XMLSize_t attrLen = attributes->getLength();
|
|
702
|
+
if (attrLen > 3) attrLen = 3;
|
|
703
|
+
|
|
704
|
+
for (XMLSize_t i = 0; i < attrLen; i++) {
|
|
705
|
+
DOMNode* attr = attributes->item(i);
|
|
706
|
+
CharStr attrName(attr->getNodeName());
|
|
707
|
+
CharStr attrValue(attr->getNodeValue());
|
|
708
|
+
result += " ";
|
|
709
|
+
result += attrName.localForm();
|
|
710
|
+
result += "=\"";
|
|
711
|
+
result += attrValue.localForm();
|
|
712
|
+
result += "\"";
|
|
713
|
+
}
|
|
714
|
+
if (attributes->getLength() > 3) {
|
|
715
|
+
result += " ...";
|
|
716
|
+
}
|
|
717
|
+
}
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
result += ">";
|
|
721
|
+
|
|
722
|
+
// Add truncated text content
|
|
723
|
+
const XMLCh* textContent = wrapper->node->getTextContent();
|
|
724
|
+
if (textContent && XMLString::stringLen(textContent) > 0) {
|
|
725
|
+
CharStr text(textContent);
|
|
726
|
+
std::string textStr = text.localForm();
|
|
727
|
+
|
|
728
|
+
size_t start = textStr.find_first_not_of(" \t\n\r");
|
|
729
|
+
if (start != std::string::npos) {
|
|
730
|
+
size_t end = textStr.find_last_not_of(" \t\n\r");
|
|
731
|
+
textStr = textStr.substr(start, end - start + 1);
|
|
732
|
+
|
|
733
|
+
if (textStr.length() > 40) {
|
|
734
|
+
textStr = textStr.substr(0, 37) + "...";
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
result += "\"";
|
|
738
|
+
result += textStr;
|
|
739
|
+
result += "\"";
|
|
740
|
+
}
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
result += ">";
|
|
744
|
+
} else if (nodeType == DOMNode::TEXT_NODE) {
|
|
745
|
+
result = "#<RXerces::XML::Text:0x";
|
|
746
|
+
result += buf;
|
|
747
|
+
result += " \"";
|
|
748
|
+
|
|
749
|
+
const XMLCh* textContent = wrapper->node->getNodeValue();
|
|
750
|
+
if (textContent) {
|
|
751
|
+
CharStr text(textContent);
|
|
752
|
+
std::string textStr = text.localForm();
|
|
753
|
+
|
|
754
|
+
size_t start = textStr.find_first_not_of(" \t\n\r");
|
|
755
|
+
if (start != std::string::npos) {
|
|
756
|
+
size_t end = textStr.find_last_not_of(" \t\n\r");
|
|
757
|
+
textStr = textStr.substr(start, end - start + 1);
|
|
758
|
+
|
|
759
|
+
if (textStr.length() > 40) {
|
|
760
|
+
textStr = textStr.substr(0, 37) + "...";
|
|
761
|
+
}
|
|
762
|
+
|
|
763
|
+
result += textStr;
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
|
|
767
|
+
result += "\">";
|
|
768
|
+
} else {
|
|
769
|
+
result = "#<RXerces::XML::Node:0x";
|
|
770
|
+
result += buf;
|
|
771
|
+
result += " ";
|
|
772
|
+
CharStr name(wrapper->node->getNodeName());
|
|
773
|
+
result += name.localForm();
|
|
774
|
+
result += ">";
|
|
775
|
+
}
|
|
776
|
+
|
|
777
|
+
return rb_str_new_cstr(result.c_str());
|
|
420
778
|
}
|
|
421
779
|
|
|
422
780
|
// node.name
|
|
@@ -434,6 +792,24 @@ static VALUE node_name(VALUE self) {
|
|
|
434
792
|
return rb_str_new_cstr(utf8_name.localForm());
|
|
435
793
|
}
|
|
436
794
|
|
|
795
|
+
// node.namespace
|
|
796
|
+
static VALUE node_namespace(VALUE self) {
|
|
797
|
+
NodeWrapper* wrapper;
|
|
798
|
+
TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
|
|
799
|
+
|
|
800
|
+
if (!wrapper->node) {
|
|
801
|
+
return Qnil;
|
|
802
|
+
}
|
|
803
|
+
|
|
804
|
+
const XMLCh* namespaceURI = wrapper->node->getNamespaceURI();
|
|
805
|
+
if (!namespaceURI || XMLString::stringLen(namespaceURI) == 0) {
|
|
806
|
+
return Qnil;
|
|
807
|
+
}
|
|
808
|
+
|
|
809
|
+
CharStr utf8_namespace(namespaceURI);
|
|
810
|
+
return rb_str_new_cstr(utf8_namespace.localForm());
|
|
811
|
+
}
|
|
812
|
+
|
|
437
813
|
// node.text / node.content
|
|
438
814
|
static VALUE node_text(VALUE self) {
|
|
439
815
|
NodeWrapper* wrapper;
|
|
@@ -513,6 +889,28 @@ static VALUE node_set_attribute(VALUE self, VALUE attr_name, VALUE attr_value) {
|
|
|
513
889
|
return attr_value;
|
|
514
890
|
}
|
|
515
891
|
|
|
892
|
+
// node.has_attribute?(attribute_name)
|
|
893
|
+
static VALUE node_has_attribute_p(VALUE self, VALUE attr_name) {
|
|
894
|
+
NodeWrapper* wrapper;
|
|
895
|
+
TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
|
|
896
|
+
|
|
897
|
+
if (!wrapper->node || wrapper->node->getNodeType() != DOMNode::ELEMENT_NODE) {
|
|
898
|
+
return Qfalse;
|
|
899
|
+
}
|
|
900
|
+
|
|
901
|
+
Check_Type(attr_name, T_STRING);
|
|
902
|
+
const char* attr_str = StringValueCStr(attr_name);
|
|
903
|
+
|
|
904
|
+
DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
|
|
905
|
+
const XMLCh* value = element->getAttribute(XStr(attr_str).unicodeForm());
|
|
906
|
+
|
|
907
|
+
if (!value || XMLString::stringLen(value) == 0) {
|
|
908
|
+
return Qfalse;
|
|
909
|
+
}
|
|
910
|
+
|
|
911
|
+
return Qtrue;
|
|
912
|
+
}
|
|
913
|
+
|
|
516
914
|
// node.children
|
|
517
915
|
static VALUE node_children(VALUE self) {
|
|
518
916
|
NodeWrapper* wrapper;
|
|
@@ -536,6 +934,31 @@ static VALUE node_children(VALUE self) {
|
|
|
536
934
|
return children;
|
|
537
935
|
}
|
|
538
936
|
|
|
937
|
+
// node.element_children - returns only element children (no text nodes)
|
|
938
|
+
static VALUE node_element_children(VALUE self) {
|
|
939
|
+
NodeWrapper* wrapper;
|
|
940
|
+
TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
|
|
941
|
+
|
|
942
|
+
VALUE doc_ref = rb_iv_get(self, "@document");
|
|
943
|
+
VALUE children = rb_ary_new();
|
|
944
|
+
|
|
945
|
+
if (!wrapper->node) {
|
|
946
|
+
return children;
|
|
947
|
+
}
|
|
948
|
+
|
|
949
|
+
DOMNodeList* child_nodes = wrapper->node->getChildNodes();
|
|
950
|
+
XMLSize_t count = child_nodes->getLength();
|
|
951
|
+
|
|
952
|
+
for (XMLSize_t i = 0; i < count; i++) {
|
|
953
|
+
DOMNode* child = child_nodes->item(i);
|
|
954
|
+
if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
|
|
955
|
+
rb_ary_push(children, wrap_node(child, doc_ref));
|
|
956
|
+
}
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
return children;
|
|
960
|
+
}
|
|
961
|
+
|
|
539
962
|
// node.parent
|
|
540
963
|
static VALUE node_parent(VALUE self) {
|
|
541
964
|
NodeWrapper* wrapper;
|
|
@@ -554,6 +977,78 @@ static VALUE node_parent(VALUE self) {
|
|
|
554
977
|
return wrap_node(parent, doc_ref);
|
|
555
978
|
}
|
|
556
979
|
|
|
980
|
+
// node.ancestors(selector = nil) - returns an array of all ancestor nodes, optionally filtered by selector
|
|
981
|
+
static VALUE node_ancestors(int argc, VALUE* argv, VALUE self) {
|
|
982
|
+
VALUE selector;
|
|
983
|
+
rb_scan_args(argc, argv, "01", &selector);
|
|
984
|
+
|
|
985
|
+
NodeWrapper* wrapper;
|
|
986
|
+
TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
|
|
987
|
+
|
|
988
|
+
VALUE ancestors = rb_ary_new();
|
|
989
|
+
|
|
990
|
+
if (!wrapper->node) {
|
|
991
|
+
return ancestors;
|
|
992
|
+
}
|
|
993
|
+
|
|
994
|
+
VALUE doc_ref = rb_iv_get(self, "@document");
|
|
995
|
+
DOMNode* current = wrapper->node->getParentNode();
|
|
996
|
+
|
|
997
|
+
// Walk up the tree, collecting all ancestors
|
|
998
|
+
while (current) {
|
|
999
|
+
// Stop at the document node (don't include it in ancestors)
|
|
1000
|
+
if (current->getNodeType() == DOMNode::DOCUMENT_NODE) {
|
|
1001
|
+
break;
|
|
1002
|
+
}
|
|
1003
|
+
rb_ary_push(ancestors, wrap_node(current, doc_ref));
|
|
1004
|
+
current = current->getParentNode();
|
|
1005
|
+
}
|
|
1006
|
+
|
|
1007
|
+
// If selector is provided, filter the ancestors
|
|
1008
|
+
if (!NIL_P(selector)) {
|
|
1009
|
+
Check_Type(selector, T_STRING);
|
|
1010
|
+
const char* selector_str = StringValueCStr(selector);
|
|
1011
|
+
|
|
1012
|
+
// Convert CSS to XPath if needed (css_to_xpath adds // prefix)
|
|
1013
|
+
std::string xpath_str = css_to_xpath(selector_str);
|
|
1014
|
+
|
|
1015
|
+
// Get all matching nodes from the document
|
|
1016
|
+
VALUE all_matches = document_xpath(doc_ref, rb_str_new2(xpath_str.c_str()));
|
|
1017
|
+
|
|
1018
|
+
NodeSetWrapper* matches_wrapper;
|
|
1019
|
+
TypedData_Get_Struct(all_matches, NodeSetWrapper, &nodeset_type, matches_wrapper);
|
|
1020
|
+
|
|
1021
|
+
VALUE filtered = rb_ary_new();
|
|
1022
|
+
long ancestor_len = RARRAY_LEN(ancestors);
|
|
1023
|
+
long matches_len = RARRAY_LEN(matches_wrapper->nodes_array);
|
|
1024
|
+
|
|
1025
|
+
// For each ancestor, check if it's in the matches
|
|
1026
|
+
for (long i = 0; i < ancestor_len; i++) {
|
|
1027
|
+
VALUE ancestor = rb_ary_entry(ancestors, i);
|
|
1028
|
+
|
|
1029
|
+
NodeWrapper* ancestor_wrapper;
|
|
1030
|
+
TypedData_Get_Struct(ancestor, NodeWrapper, &node_type, ancestor_wrapper);
|
|
1031
|
+
|
|
1032
|
+
// Check if this ancestor node is in the matches
|
|
1033
|
+
for (long j = 0; j < matches_len; j++) {
|
|
1034
|
+
VALUE match = rb_ary_entry(matches_wrapper->nodes_array, j);
|
|
1035
|
+
NodeWrapper* match_wrapper;
|
|
1036
|
+
TypedData_Get_Struct(match, NodeWrapper, &node_type, match_wrapper);
|
|
1037
|
+
|
|
1038
|
+
// Compare the actual DOM nodes
|
|
1039
|
+
if (ancestor_wrapper->node == match_wrapper->node) {
|
|
1040
|
+
rb_ary_push(filtered, ancestor);
|
|
1041
|
+
break;
|
|
1042
|
+
}
|
|
1043
|
+
}
|
|
1044
|
+
}
|
|
1045
|
+
|
|
1046
|
+
return filtered;
|
|
1047
|
+
}
|
|
1048
|
+
|
|
1049
|
+
return ancestors;
|
|
1050
|
+
}
|
|
1051
|
+
|
|
557
1052
|
// node.attributes - returns hash of all attributes (only for element nodes)
|
|
558
1053
|
static VALUE node_attributes(VALUE self) {
|
|
559
1054
|
NodeWrapper* wrapper;
|
|
@@ -627,6 +1122,54 @@ static VALUE node_previous_sibling(VALUE self) {
|
|
|
627
1122
|
return wrap_node(prev, doc_ref);
|
|
628
1123
|
}
|
|
629
1124
|
|
|
1125
|
+
// node.next_element - next sibling that is an element (skipping text nodes)
|
|
1126
|
+
static VALUE node_next_element(VALUE self) {
|
|
1127
|
+
NodeWrapper* wrapper;
|
|
1128
|
+
TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
|
|
1129
|
+
|
|
1130
|
+
if (!wrapper->node) {
|
|
1131
|
+
return Qnil;
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
VALUE doc_ref = rb_iv_get(self, "@document");
|
|
1135
|
+
DOMNode* next = wrapper->node->getNextSibling();
|
|
1136
|
+
|
|
1137
|
+
// Skip non-element nodes
|
|
1138
|
+
while (next && next->getNodeType() != DOMNode::ELEMENT_NODE) {
|
|
1139
|
+
next = next->getNextSibling();
|
|
1140
|
+
}
|
|
1141
|
+
|
|
1142
|
+
if (!next) {
|
|
1143
|
+
return Qnil;
|
|
1144
|
+
}
|
|
1145
|
+
|
|
1146
|
+
return wrap_node(next, doc_ref);
|
|
1147
|
+
}
|
|
1148
|
+
|
|
1149
|
+
// node.previous_element - previous sibling that is an element (skipping text nodes)
|
|
1150
|
+
static VALUE node_previous_element(VALUE self) {
|
|
1151
|
+
NodeWrapper* wrapper;
|
|
1152
|
+
TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
|
|
1153
|
+
|
|
1154
|
+
if (!wrapper->node) {
|
|
1155
|
+
return Qnil;
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1158
|
+
VALUE doc_ref = rb_iv_get(self, "@document");
|
|
1159
|
+
DOMNode* prev = wrapper->node->getPreviousSibling();
|
|
1160
|
+
|
|
1161
|
+
// Skip non-element nodes
|
|
1162
|
+
while (prev && prev->getNodeType() != DOMNode::ELEMENT_NODE) {
|
|
1163
|
+
prev = prev->getPreviousSibling();
|
|
1164
|
+
}
|
|
1165
|
+
|
|
1166
|
+
if (!prev) {
|
|
1167
|
+
return Qnil;
|
|
1168
|
+
}
|
|
1169
|
+
|
|
1170
|
+
return wrap_node(prev, doc_ref);
|
|
1171
|
+
}
|
|
1172
|
+
|
|
630
1173
|
// node.add_child(node_or_string) - adds a child node
|
|
631
1174
|
static VALUE node_add_child(VALUE self, VALUE child) {
|
|
632
1175
|
NodeWrapper* wrapper;
|
|
@@ -880,6 +1423,11 @@ static VALUE node_xpath(VALUE self, VALUE path) {
|
|
|
880
1423
|
const char* xpath_str = StringValueCStr(path);
|
|
881
1424
|
VALUE doc_ref = rb_iv_get(self, "@document");
|
|
882
1425
|
|
|
1426
|
+
#ifdef HAVE_XALAN
|
|
1427
|
+
// Use Xalan for full XPath 1.0 support
|
|
1428
|
+
return execute_xpath_with_xalan(node_wrapper->node, xpath_str, doc_ref);
|
|
1429
|
+
#else
|
|
1430
|
+
// Fall back to Xerces XPath subset
|
|
883
1431
|
try {
|
|
884
1432
|
DOMDocument* doc = node_wrapper->node->getOwnerDocument();
|
|
885
1433
|
if (!doc) {
|
|
@@ -929,6 +1477,201 @@ static VALUE node_xpath(VALUE self, VALUE path) {
|
|
|
929
1477
|
NodeSetWrapper* wrapper = ALLOC(NodeSetWrapper);
|
|
930
1478
|
wrapper->nodes_array = rb_ary_new();
|
|
931
1479
|
return TypedData_Wrap_Struct(rb_cNodeSet, &nodeset_type, wrapper);
|
|
1480
|
+
#endif
|
|
1481
|
+
}
|
|
1482
|
+
|
|
1483
|
+
// node.at_xpath(path) - returns first matching node or nil
|
|
1484
|
+
static VALUE node_at_xpath(VALUE self, VALUE path) {
|
|
1485
|
+
VALUE nodeset = node_xpath(self, path);
|
|
1486
|
+
NodeSetWrapper* wrapper;
|
|
1487
|
+
TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
|
|
1488
|
+
|
|
1489
|
+
if (RARRAY_LEN(wrapper->nodes_array) == 0) {
|
|
1490
|
+
return Qnil;
|
|
1491
|
+
}
|
|
1492
|
+
|
|
1493
|
+
return rb_ary_entry(wrapper->nodes_array, 0);
|
|
1494
|
+
}
|
|
1495
|
+
|
|
1496
|
+
// node.at_css(selector) - returns first matching node or nil
|
|
1497
|
+
static VALUE node_at_css(VALUE self, VALUE selector) {
|
|
1498
|
+
VALUE nodeset = node_css(self, selector);
|
|
1499
|
+
NodeSetWrapper* wrapper;
|
|
1500
|
+
TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
|
|
1501
|
+
|
|
1502
|
+
if (RARRAY_LEN(wrapper->nodes_array) == 0) {
|
|
1503
|
+
return Qnil;
|
|
1504
|
+
}
|
|
1505
|
+
|
|
1506
|
+
return rb_ary_entry(wrapper->nodes_array, 0);
|
|
1507
|
+
}
|
|
1508
|
+
|
|
1509
|
+
// Helper function to convert basic CSS selectors to XPath.
//
// Supported: tag, *, .class, #id, tag.class, tag#id, [attr], [attr=value]
// (value optionally quoted), the descendant combinator (whitespace) and the
// child combinator (>), in any combination, e.g. "div.note > a[href]".
//
// Every result is anchored with "//". An empty or all-whitespace selector
// yields "//*". A compound selector without a tag name (".x", "#y", "[z]")
// uses "*" as its node test.
//
// Known limits (unchanged from the previous implementation):
//   - attribute operators other than "=" (~=, ^=, $=, *=, !=) degrade to a
//     plain presence test on the attribute;
//   - the "+" and "~" combinators and "," groups are not interpreted;
//   - id, class and attribute values are copied verbatim into single-quoted
//     XPath literals, so a value containing "'" produces invalid XPath.
static std::string css_to_xpath(const char* css) {
    static const char* const kBlank = " \t\n\r";

    std::string selector(css ? css : "");

    // Trim surrounding whitespace; nothing left means "match everything".
    const size_t first = selector.find_first_not_of(kBlank);
    if (first == std::string::npos) return "//*";
    const size_t last = selector.find_last_not_of(kBlank);
    selector = selector.substr(first, last - first + 1);
    const size_t len = selector.length();

    std::string result = "//";
    std::string pending_name;         // tag name collected for the current step
    bool node_test_written = false;   // current step's node test already emitted?

    auto is_blank = [](char ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    };

    // Emits the node test of the current step exactly once: the collected tag
    // name, or "*" when the step has none.
    auto flush_node_test = [&]() {
        if (!node_test_written) {
            result += pending_name.empty() ? std::string("*") : pending_name;
            pending_name.clear();
            node_test_written = true;
        }
    };

    // True while nothing has been seen for the current step yet.
    auto step_is_empty = [&]() {
        return !node_test_written && pending_name.empty();
    };

    // Finishes the current step and opens a new one on the given axis.
    auto start_step = [&](const char* axis) {
        flush_node_test();
        result += axis;
        node_test_written = false;
    };

    // Copies selector text from pos up to (not including) the first character
    // found in stops, or to the end of the selector; advances pos.
    auto copy_until = [&](size_t& pos, const char* stops) {
        size_t stop = selector.find_first_of(stops, pos);
        if (stop == std::string::npos) stop = len;
        result.append(selector, pos, stop - pos);
        pos = stop;
    };

    size_t i = 0;
    while (i < len) {
        const char c = selector[i];

        // Whitespace: descendant combinator, unless it merely pads a '>'.
        if (is_blank(c)) {
            while (i < len && is_blank(selector[i])) ++i;
            if (i < len && selector[i] != '>' && !step_is_empty()) {
                start_step("//");
            }
            continue;
        }

        // Child combinator; a stray '>' with no step before it is ignored.
        if (c == '>') {
            ++i;
            while (i < len && is_blank(selector[i])) ++i;
            if (!step_is_empty()) {
                start_step("/");
            }
            continue;
        }

        // ID selector.
        if (c == '#') {
            flush_node_test();
            result += "[@id='";
            ++i;
            copy_until(i, " \t\n\r.#[>+~");
            result += "']";
            continue;
        }

        // Class selector: whole-word match inside the class attribute.
        if (c == '.') {
            flush_node_test();
            result += "[contains(concat(' ', @class, ' '), ' ";
            ++i;
            copy_until(i, " \t\n\r.#[>+~");
            result += " ')]";
            continue;
        }

        // Attribute selector: [attr] or [attr=value].
        if (c == '[') {
            flush_node_test();
            result += "[@";
            ++i;
            copy_until(i, "]=!~^$*");

            if (i < len && selector[i] == '=') {
                result += "='";
                ++i;
                if (i < len && (selector[i] == '"' || selector[i] == '\'')) {
                    // Quoted value: copy up to the matching quote.
                    const char quote_stop[2] = { selector[i], '\0' };
                    ++i;
                    copy_until(i, quote_stop);
                    if (i < len) ++i;  // skip closing quote
                } else {
                    // Unquoted value: runs to the closing bracket.
                    copy_until(i, "]");
                }
                result += "'";
            }

            // Consume everything through the closing bracket, so parsing of
            // whatever follows the attribute selector resumes normally.
            const size_t close = selector.find(']', i);
            i = (close == std::string::npos) ? len : close + 1;
            result += ']';
            continue;
        }

        // Any other character belongs to the tag name of the current step;
        // characters after the node test has been written are ignored.
        if (!node_test_written) {
            pending_name += c;
        }
        ++i;
    }

    // Emit the last step's node test if it is still pending.
    flush_node_test();

    return result;
}
|
|
1664
|
+
|
|
1665
|
+
// node.css(selector) - Convert CSS to XPath and execute
|
|
1666
|
+
static VALUE node_css(VALUE self, VALUE selector) {
|
|
1667
|
+
Check_Type(selector, T_STRING);
|
|
1668
|
+
const char* css_str = StringValueCStr(selector);
|
|
1669
|
+
|
|
1670
|
+
// Convert CSS to XPath
|
|
1671
|
+
std::string xpath_str = css_to_xpath(css_str);
|
|
1672
|
+
|
|
1673
|
+
// Call the xpath method with converted selector
|
|
1674
|
+
return node_xpath(self, rb_str_new2(xpath_str.c_str()));
|
|
932
1675
|
}
|
|
933
1676
|
|
|
934
1677
|
// nodeset.length / nodeset.size
|
|
@@ -972,6 +1715,205 @@ static VALUE nodeset_to_a(VALUE self) {
|
|
|
972
1715
|
return rb_ary_dup(wrapper->nodes_array);
|
|
973
1716
|
}
|
|
974
1717
|
|
|
1718
|
+
// nodeset.first - returns first node or nil
|
|
1719
|
+
static VALUE nodeset_first(VALUE self) {
|
|
1720
|
+
NodeSetWrapper* wrapper;
|
|
1721
|
+
TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
|
|
1722
|
+
|
|
1723
|
+
if (RARRAY_LEN(wrapper->nodes_array) == 0) {
|
|
1724
|
+
return Qnil;
|
|
1725
|
+
}
|
|
1726
|
+
|
|
1727
|
+
return rb_ary_entry(wrapper->nodes_array, 0);
|
|
1728
|
+
}
|
|
1729
|
+
|
|
1730
|
+
// nodeset.last - returns last node or nil
|
|
1731
|
+
static VALUE nodeset_last(VALUE self) {
|
|
1732
|
+
NodeSetWrapper* wrapper;
|
|
1733
|
+
TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
|
|
1734
|
+
|
|
1735
|
+
long len = RARRAY_LEN(wrapper->nodes_array);
|
|
1736
|
+
if (len == 0) {
|
|
1737
|
+
return Qnil;
|
|
1738
|
+
}
|
|
1739
|
+
|
|
1740
|
+
return rb_ary_entry(wrapper->nodes_array, len - 1);
|
|
1741
|
+
}
|
|
1742
|
+
|
|
1743
|
+
// nodeset.empty? - returns true if nodeset is empty
|
|
1744
|
+
static VALUE nodeset_empty_p(VALUE self) {
|
|
1745
|
+
NodeSetWrapper* wrapper;
|
|
1746
|
+
TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
|
|
1747
|
+
|
|
1748
|
+
return RARRAY_LEN(wrapper->nodes_array) == 0 ? Qtrue : Qfalse;
|
|
1749
|
+
}
|
|
1750
|
+
|
|
1751
|
+
// nodeset.inner_html - returns concatenated inner_html of all nodes
|
|
1752
|
+
static VALUE nodeset_inner_html(VALUE self) {
|
|
1753
|
+
NodeSetWrapper* wrapper;
|
|
1754
|
+
TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
|
|
1755
|
+
|
|
1756
|
+
std::string result;
|
|
1757
|
+
long len = RARRAY_LEN(wrapper->nodes_array);
|
|
1758
|
+
|
|
1759
|
+
for (long i = 0; i < len; i++) {
|
|
1760
|
+
VALUE node = rb_ary_entry(wrapper->nodes_array, i);
|
|
1761
|
+
VALUE inner_html = rb_funcall(node, rb_intern("inner_html"), 0);
|
|
1762
|
+
result += StringValueCStr(inner_html);
|
|
1763
|
+
}
|
|
1764
|
+
|
|
1765
|
+
return rb_str_new_cstr(result.c_str());
|
|
1766
|
+
}
|
|
1767
|
+
|
|
1768
|
+
// nodeset.text - returns concatenated text content of all nodes
|
|
1769
|
+
static VALUE nodeset_text(VALUE self) {
|
|
1770
|
+
NodeSetWrapper* wrapper;
|
|
1771
|
+
TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
|
|
1772
|
+
|
|
1773
|
+
std::string result;
|
|
1774
|
+
long len = RARRAY_LEN(wrapper->nodes_array);
|
|
1775
|
+
|
|
1776
|
+
for (long i = 0; i < len; i++) {
|
|
1777
|
+
VALUE node = rb_ary_entry(wrapper->nodes_array, i);
|
|
1778
|
+
NodeWrapper* node_wrapper;
|
|
1779
|
+
TypedData_Get_Struct(node, NodeWrapper, &node_type, node_wrapper);
|
|
1780
|
+
|
|
1781
|
+
if (node_wrapper->node) {
|
|
1782
|
+
const XMLCh* content = node_wrapper->node->getTextContent();
|
|
1783
|
+
if (content) {
|
|
1784
|
+
CharStr utf8_content(content);
|
|
1785
|
+
result += utf8_content.localForm();
|
|
1786
|
+
}
|
|
1787
|
+
}
|
|
1788
|
+
}
|
|
1789
|
+
|
|
1790
|
+
return rb_str_new_cstr(result.c_str());
|
|
1791
|
+
}
|
|
1792
|
+
|
|
1793
|
+
// nodeset.inspect / nodeset.to_s - human-readable representation
|
|
1794
|
+
static VALUE nodeset_inspect(VALUE self) {
|
|
1795
|
+
NodeSetWrapper* wrapper;
|
|
1796
|
+
TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
|
|
1797
|
+
|
|
1798
|
+
long len = RARRAY_LEN(wrapper->nodes_array);
|
|
1799
|
+
std::string result = "#<RXerces::XML::NodeSet:0x";
|
|
1800
|
+
|
|
1801
|
+
// Add object ID
|
|
1802
|
+
char buf[32];
|
|
1803
|
+
snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
|
|
1804
|
+
result += buf;
|
|
1805
|
+
result += " [";
|
|
1806
|
+
|
|
1807
|
+
for (long i = 0; i < len; i++) {
|
|
1808
|
+
if (i > 0) result += ", ";
|
|
1809
|
+
|
|
1810
|
+
VALUE node = rb_ary_entry(wrapper->nodes_array, i);
|
|
1811
|
+
NodeWrapper* node_wrapper;
|
|
1812
|
+
TypedData_Get_Struct(node, NodeWrapper, &node_type, node_wrapper);
|
|
1813
|
+
|
|
1814
|
+
if (!node_wrapper->node) {
|
|
1815
|
+
result += "nil";
|
|
1816
|
+
continue;
|
|
1817
|
+
}
|
|
1818
|
+
|
|
1819
|
+
DOMNode::NodeType nodeType = node_wrapper->node->getNodeType();
|
|
1820
|
+
|
|
1821
|
+
if (nodeType == DOMNode::ELEMENT_NODE) {
|
|
1822
|
+
// For elements, show: <tag attr="value">content</tag>
|
|
1823
|
+
CharStr name(node_wrapper->node->getNodeName());
|
|
1824
|
+
result += "<";
|
|
1825
|
+
result += name.localForm();
|
|
1826
|
+
|
|
1827
|
+
// Add first few attributes if present
|
|
1828
|
+
DOMElement* element = dynamic_cast<DOMElement*>(node_wrapper->node);
|
|
1829
|
+
if (element) {
|
|
1830
|
+
DOMNamedNodeMap* attributes = element->getAttributes();
|
|
1831
|
+
if (attributes && attributes->getLength() > 0) {
|
|
1832
|
+
XMLSize_t attrLen = attributes->getLength();
|
|
1833
|
+
if (attrLen > 3) attrLen = 3; // Limit to first 3 attributes
|
|
1834
|
+
|
|
1835
|
+
for (XMLSize_t j = 0; j < attrLen; j++) {
|
|
1836
|
+
DOMNode* attr = attributes->item(j);
|
|
1837
|
+
CharStr attrName(attr->getNodeName());
|
|
1838
|
+
CharStr attrValue(attr->getNodeValue());
|
|
1839
|
+
result += " ";
|
|
1840
|
+
result += attrName.localForm();
|
|
1841
|
+
result += "=\"";
|
|
1842
|
+
result += attrValue.localForm();
|
|
1843
|
+
result += "\"";
|
|
1844
|
+
}
|
|
1845
|
+
if (attributes->getLength() > 3) {
|
|
1846
|
+
result += " ...";
|
|
1847
|
+
}
|
|
1848
|
+
}
|
|
1849
|
+
}
|
|
1850
|
+
|
|
1851
|
+
// Show truncated text content
|
|
1852
|
+
const XMLCh* textContent = node_wrapper->node->getTextContent();
|
|
1853
|
+
if (textContent && XMLString::stringLen(textContent) > 0) {
|
|
1854
|
+
CharStr text(textContent);
|
|
1855
|
+
std::string textStr = text.localForm();
|
|
1856
|
+
|
|
1857
|
+
// Trim whitespace and truncate
|
|
1858
|
+
size_t start = textStr.find_first_not_of(" \t\n\r");
|
|
1859
|
+
if (start != std::string::npos) {
|
|
1860
|
+
size_t end = textStr.find_last_not_of(" \t\n\r");
|
|
1861
|
+
textStr = textStr.substr(start, end - start + 1);
|
|
1862
|
+
|
|
1863
|
+
if (textStr.length() > 30) {
|
|
1864
|
+
textStr = textStr.substr(0, 27) + "...";
|
|
1865
|
+
}
|
|
1866
|
+
|
|
1867
|
+
result += ">";
|
|
1868
|
+
result += textStr;
|
|
1869
|
+
result += "</";
|
|
1870
|
+
result += name.localForm();
|
|
1871
|
+
result += ">";
|
|
1872
|
+
} else {
|
|
1873
|
+
result += ">";
|
|
1874
|
+
}
|
|
1875
|
+
} else {
|
|
1876
|
+
result += ">";
|
|
1877
|
+
}
|
|
1878
|
+
} else if (nodeType == DOMNode::TEXT_NODE) {
|
|
1879
|
+
// For text nodes, show: text("content")
|
|
1880
|
+
const XMLCh* textContent = node_wrapper->node->getNodeValue();
|
|
1881
|
+
if (textContent) {
|
|
1882
|
+
CharStr text(textContent);
|
|
1883
|
+
std::string textStr = text.localForm();
|
|
1884
|
+
|
|
1885
|
+
// Trim and truncate
|
|
1886
|
+
size_t start = textStr.find_first_not_of(" \t\n\r");
|
|
1887
|
+
if (start != std::string::npos) {
|
|
1888
|
+
size_t end = textStr.find_last_not_of(" \t\n\r");
|
|
1889
|
+
textStr = textStr.substr(start, end - start + 1);
|
|
1890
|
+
|
|
1891
|
+
if (textStr.length() > 30) {
|
|
1892
|
+
textStr = textStr.substr(0, 27) + "...";
|
|
1893
|
+
}
|
|
1894
|
+
|
|
1895
|
+
result += "text(\"";
|
|
1896
|
+
result += textStr;
|
|
1897
|
+
result += "\")";
|
|
1898
|
+
} else {
|
|
1899
|
+
result += "text()";
|
|
1900
|
+
}
|
|
1901
|
+
} else {
|
|
1902
|
+
result += "text()";
|
|
1903
|
+
}
|
|
1904
|
+
} else {
|
|
1905
|
+
// For other nodes, just show the type
|
|
1906
|
+
CharStr name(node_wrapper->node->getNodeName());
|
|
1907
|
+
result += "#<";
|
|
1908
|
+
result += name.localForm();
|
|
1909
|
+
result += ">";
|
|
1910
|
+
}
|
|
1911
|
+
}
|
|
1912
|
+
|
|
1913
|
+
result += "]>";
|
|
1914
|
+
return rb_str_new_cstr(result.c_str());
|
|
1915
|
+
}
|
|
1916
|
+
|
|
975
1917
|
// Schema.from_document(schema_doc) or Schema.from_string(xsd_string)
|
|
976
1918
|
static VALUE schema_from_document(int argc, VALUE* argv, VALUE klass) {
|
|
977
1919
|
VALUE schema_source;
|
|
@@ -1162,32 +2104,55 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
|
|
|
1162
2104
|
rb_define_singleton_method(rb_cDocument, "parse", RUBY_METHOD_FUNC(document_parse), 1);
|
|
1163
2105
|
rb_define_method(rb_cDocument, "root", RUBY_METHOD_FUNC(document_root), 0);
|
|
1164
2106
|
rb_define_method(rb_cDocument, "to_s", RUBY_METHOD_FUNC(document_to_s), 0);
|
|
1165
|
-
|
|
2107
|
+
rb_define_alias(rb_cDocument, "to_xml", "to_s");
|
|
2108
|
+
rb_define_method(rb_cDocument, "inspect", RUBY_METHOD_FUNC(document_inspect), 0);
|
|
1166
2109
|
rb_define_method(rb_cDocument, "xpath", RUBY_METHOD_FUNC(document_xpath), 1);
|
|
2110
|
+
rb_define_method(rb_cDocument, "css", RUBY_METHOD_FUNC(document_css), 1);
|
|
2111
|
+
rb_define_method(rb_cDocument, "at_css", RUBY_METHOD_FUNC(document_at_css), 1);
|
|
2112
|
+
rb_define_method(rb_cDocument, "encoding", RUBY_METHOD_FUNC(document_encoding), 0);
|
|
2113
|
+
rb_define_method(rb_cDocument, "text", RUBY_METHOD_FUNC(document_text), 0);
|
|
2114
|
+
rb_define_alias(rb_cDocument, "content", "text");
|
|
1167
2115
|
rb_define_method(rb_cDocument, "create_element", RUBY_METHOD_FUNC(document_create_element), 1);
|
|
1168
2116
|
|
|
1169
2117
|
rb_cNode = rb_define_class_under(rb_mXML, "Node", rb_cObject);
|
|
1170
2118
|
rb_undef_alloc_func(rb_cNode);
|
|
2119
|
+
rb_define_method(rb_cNode, "inspect", RUBY_METHOD_FUNC(node_inspect), 0);
|
|
1171
2120
|
rb_define_method(rb_cNode, "name", RUBY_METHOD_FUNC(node_name), 0);
|
|
2121
|
+
rb_define_method(rb_cNode, "namespace", RUBY_METHOD_FUNC(node_namespace), 0);
|
|
1172
2122
|
rb_define_method(rb_cNode, "text", RUBY_METHOD_FUNC(node_text), 0);
|
|
1173
|
-
|
|
2123
|
+
rb_define_alias(rb_cNode, "content", "text");
|
|
1174
2124
|
rb_define_method(rb_cNode, "text=", RUBY_METHOD_FUNC(node_text_set), 1);
|
|
1175
|
-
|
|
2125
|
+
rb_define_alias(rb_cNode, "content=", "text=");
|
|
1176
2126
|
rb_define_method(rb_cNode, "[]", RUBY_METHOD_FUNC(node_get_attribute), 1);
|
|
1177
2127
|
rb_define_method(rb_cNode, "[]=", RUBY_METHOD_FUNC(node_set_attribute), 2);
|
|
2128
|
+
rb_define_alias(rb_cNode, "get_attribute", "[]");
|
|
2129
|
+
rb_define_alias(rb_cNode, "attribute", "[]");
|
|
2130
|
+
rb_define_method(rb_cNode, "has_attribute?", RUBY_METHOD_FUNC(node_has_attribute_p), 1);
|
|
1178
2131
|
rb_define_method(rb_cNode, "children", RUBY_METHOD_FUNC(node_children), 0);
|
|
2132
|
+
rb_define_method(rb_cNode, "element_children", RUBY_METHOD_FUNC(node_element_children), 0);
|
|
2133
|
+
rb_define_alias(rb_cNode, "elements", "element_children");
|
|
1179
2134
|
rb_define_method(rb_cNode, "parent", RUBY_METHOD_FUNC(node_parent), 0);
|
|
2135
|
+
rb_define_method(rb_cNode, "ancestors", RUBY_METHOD_FUNC(node_ancestors), -1);
|
|
1180
2136
|
rb_define_method(rb_cNode, "attributes", RUBY_METHOD_FUNC(node_attributes), 0);
|
|
1181
2137
|
rb_define_method(rb_cNode, "next_sibling", RUBY_METHOD_FUNC(node_next_sibling), 0);
|
|
2138
|
+
rb_define_method(rb_cNode, "next_element", RUBY_METHOD_FUNC(node_next_element), 0);
|
|
1182
2139
|
rb_define_method(rb_cNode, "previous_sibling", RUBY_METHOD_FUNC(node_previous_sibling), 0);
|
|
2140
|
+
rb_define_method(rb_cNode, "previous_element", RUBY_METHOD_FUNC(node_previous_element), 0);
|
|
1183
2141
|
rb_define_method(rb_cNode, "add_child", RUBY_METHOD_FUNC(node_add_child), 1);
|
|
1184
2142
|
rb_define_method(rb_cNode, "remove", RUBY_METHOD_FUNC(node_remove), 0);
|
|
1185
|
-
|
|
2143
|
+
rb_define_alias(rb_cNode, "unlink", "remove");
|
|
1186
2144
|
rb_define_method(rb_cNode, "inner_html", RUBY_METHOD_FUNC(node_inner_html), 0);
|
|
1187
|
-
|
|
2145
|
+
rb_define_alias(rb_cNode, "inner_xml", "inner_html");
|
|
1188
2146
|
rb_define_method(rb_cNode, "path", RUBY_METHOD_FUNC(node_path), 0);
|
|
1189
2147
|
rb_define_method(rb_cNode, "blank?", RUBY_METHOD_FUNC(node_blank_p), 0);
|
|
1190
2148
|
rb_define_method(rb_cNode, "xpath", RUBY_METHOD_FUNC(node_xpath), 1);
|
|
2149
|
+
rb_define_alias(rb_cNode, "search", "xpath");
|
|
2150
|
+
rb_define_method(rb_cNode, "at_xpath", RUBY_METHOD_FUNC(node_at_xpath), 1);
|
|
2151
|
+
rb_define_alias(rb_cNode, "at", "at_xpath");
|
|
2152
|
+
rb_define_method(rb_cNode, "css", RUBY_METHOD_FUNC(node_css), 1);
|
|
2153
|
+
rb_define_method(rb_cNode, "at_css", RUBY_METHOD_FUNC(node_at_css), 1);
|
|
2154
|
+
rb_define_alias(rb_cNode, "get_attribute", "[]");
|
|
2155
|
+
rb_define_alias(rb_cNode, "attribute", "[]");
|
|
1191
2156
|
|
|
1192
2157
|
rb_cElement = rb_define_class_under(rb_mXML, "Element", rb_cNode);
|
|
1193
2158
|
rb_undef_alloc_func(rb_cElement);
|
|
@@ -1198,10 +2163,17 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
|
|
|
1198
2163
|
rb_cNodeSet = rb_define_class_under(rb_mXML, "NodeSet", rb_cObject);
|
|
1199
2164
|
rb_undef_alloc_func(rb_cNodeSet);
|
|
1200
2165
|
rb_define_method(rb_cNodeSet, "length", RUBY_METHOD_FUNC(nodeset_length), 0);
|
|
1201
|
-
|
|
2166
|
+
rb_define_alias(rb_cNodeSet, "size", "length");
|
|
1202
2167
|
rb_define_method(rb_cNodeSet, "[]", RUBY_METHOD_FUNC(nodeset_at), 1);
|
|
2168
|
+
rb_define_method(rb_cNodeSet, "first", RUBY_METHOD_FUNC(nodeset_first), 0);
|
|
2169
|
+
rb_define_method(rb_cNodeSet, "last", RUBY_METHOD_FUNC(nodeset_last), 0);
|
|
2170
|
+
rb_define_method(rb_cNodeSet, "empty?", RUBY_METHOD_FUNC(nodeset_empty_p), 0);
|
|
1203
2171
|
rb_define_method(rb_cNodeSet, "each", RUBY_METHOD_FUNC(nodeset_each), 0);
|
|
1204
2172
|
rb_define_method(rb_cNodeSet, "to_a", RUBY_METHOD_FUNC(nodeset_to_a), 0);
|
|
2173
|
+
rb_define_method(rb_cNodeSet, "text", RUBY_METHOD_FUNC(nodeset_text), 0);
|
|
2174
|
+
rb_define_method(rb_cNodeSet, "inner_html", RUBY_METHOD_FUNC(nodeset_inner_html), 0);
|
|
2175
|
+
rb_define_method(rb_cNodeSet, "inspect", RUBY_METHOD_FUNC(nodeset_inspect), 0);
|
|
2176
|
+
rb_define_alias(rb_cNodeSet, "to_s", "inspect");
|
|
1205
2177
|
rb_include_module(rb_cNodeSet, rb_mEnumerable);
|
|
1206
2178
|
|
|
1207
2179
|
rb_cSchema = rb_define_class_under(rb_mXML, "Schema", rb_cObject);
|