rxerces 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,6 +51,12 @@ static bool xerces_initialized = false;
51
51
  static bool xalan_initialized = false;
52
52
  #endif
53
53
 
54
+ // Forward declarations: callers that appear earlier in this file (e.g. document_css, node_at_css) use css_to_xpath / node_css, which are defined further down
55
+ static std::string css_to_xpath(const char* css);
56
+ static VALUE node_css(VALUE self, VALUE selector);
57
+ static VALUE node_xpath(VALUE self, VALUE path);
58
+ static VALUE document_xpath(VALUE self, VALUE path);
59
+
54
60
  // Helper class to manage XMLCh strings
55
61
  class XStr {
56
62
  public:
@@ -343,6 +349,45 @@ static VALUE document_to_s(VALUE self) {
343
349
  return Qnil;
344
350
  }
345
351
 
352
+ // document.inspect - human-readable representation
353
+ static VALUE document_inspect(VALUE self) {
354
+ DocumentWrapper* wrapper;
355
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
356
+
357
+ std::string result = "#<RXerces::XML::Document:0x";
358
+
359
+ // Add object ID
360
+ char buf[32];
361
+ snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
362
+ result += buf;
363
+
364
+ if (!wrapper->doc) {
365
+ result += " (empty)>";
366
+ return rb_str_new_cstr(result.c_str());
367
+ }
368
+
369
+ // Add encoding
370
+ const XMLCh* encoding = wrapper->doc->getXmlEncoding();
371
+ if (encoding && XMLString::stringLen(encoding) > 0) {
372
+ CharStr utf8_encoding(encoding);
373
+ result += " encoding=\"";
374
+ result += utf8_encoding.localForm();
375
+ result += "\"";
376
+ }
377
+
378
+ // Add root element name
379
+ DOMElement* root = wrapper->doc->getDocumentElement();
380
+ if (root) {
381
+ CharStr rootName(root->getNodeName());
382
+ result += " root=<";
383
+ result += rootName.localForm();
384
+ result += ">";
385
+ }
386
+
387
+ result += ">";
388
+ return rb_str_new_cstr(result.c_str());
389
+ }
390
+
346
391
  // document.encoding
347
392
  static VALUE document_encoding(VALUE self) {
348
393
  DocumentWrapper* wrapper;
@@ -362,6 +407,29 @@ static VALUE document_encoding(VALUE self) {
362
407
  return rb_str_new_cstr(utf8_encoding.localForm());
363
408
  }
364
409
 
410
+ // document.text / document.content - returns text content of entire document
411
+ static VALUE document_text(VALUE self) {
412
+ DocumentWrapper* wrapper;
413
+ TypedData_Get_Struct(self, DocumentWrapper, &document_type, wrapper);
414
+
415
+ if (!wrapper->doc) {
416
+ return rb_str_new_cstr("");
417
+ }
418
+
419
+ DOMElement* root = wrapper->doc->getDocumentElement();
420
+ if (!root) {
421
+ return rb_str_new_cstr("");
422
+ }
423
+
424
+ const XMLCh* content = root->getTextContent();
425
+ if (!content) {
426
+ return rb_str_new_cstr("");
427
+ }
428
+
429
+ CharStr utf8_content(content);
430
+ return rb_str_new_cstr(utf8_content.localForm());
431
+ }
432
+
365
433
  // document.create_element(name)
366
434
  static VALUE document_create_element(VALUE self, VALUE name) {
367
435
  DocumentWrapper* doc_wrapper;
@@ -575,6 +643,140 @@ static VALUE document_xpath(VALUE self, VALUE path) {
575
643
  #endif
576
644
  }
577
645
 
646
+ // document.css(selector) - Convert CSS to XPath and execute
647
+ static VALUE document_css(VALUE self, VALUE selector) {
648
+ Check_Type(selector, T_STRING);
649
+ const char* css_str = StringValueCStr(selector);
650
+
651
+ // Convert CSS to XPath
652
+ std::string xpath_str = css_to_xpath(css_str);
653
+
654
+ // Call the xpath method with converted selector
655
+ return document_xpath(self, rb_str_new2(xpath_str.c_str()));
656
+ }
657
+
658
+ // document.at_css(selector) - Returns first matching node
659
+ static VALUE document_at_css(VALUE self, VALUE selector) {
660
+ VALUE nodeset = document_css(self, selector);
661
+
662
+ NodeSetWrapper* wrapper;
663
+ TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
664
+
665
+ if (RARRAY_LEN(wrapper->nodes_array) == 0) {
666
+ return Qnil;
667
+ }
668
+
669
+ return rb_ary_entry(wrapper->nodes_array, 0);
670
+ }
671
+
672
+ // node.inspect - human-readable representation
673
+ static VALUE node_inspect(VALUE self) {
674
+ NodeWrapper* wrapper;
675
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
676
+
677
+ if (!wrapper->node) {
678
+ return rb_str_new_cstr("#<RXerces::XML::Node (nil)>");
679
+ }
680
+
681
+ DOMNode::NodeType nodeType = wrapper->node->getNodeType();
682
+ std::string result;
683
+
684
+ // Add object ID
685
+ char buf[32];
686
+ snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
687
+
688
+ if (nodeType == DOMNode::ELEMENT_NODE) {
689
+ result = "#<RXerces::XML::Element:0x";
690
+ result += buf;
691
+ result += " <";
692
+
693
+ CharStr name(wrapper->node->getNodeName());
694
+ result += name.localForm();
695
+
696
+ // Add attributes
697
+ DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
698
+ if (element) {
699
+ DOMNamedNodeMap* attributes = element->getAttributes();
700
+ if (attributes && attributes->getLength() > 0) {
701
+ XMLSize_t attrLen = attributes->getLength();
702
+ if (attrLen > 3) attrLen = 3;
703
+
704
+ for (XMLSize_t i = 0; i < attrLen; i++) {
705
+ DOMNode* attr = attributes->item(i);
706
+ CharStr attrName(attr->getNodeName());
707
+ CharStr attrValue(attr->getNodeValue());
708
+ result += " ";
709
+ result += attrName.localForm();
710
+ result += "=\"";
711
+ result += attrValue.localForm();
712
+ result += "\"";
713
+ }
714
+ if (attributes->getLength() > 3) {
715
+ result += " ...";
716
+ }
717
+ }
718
+ }
719
+
720
+ result += ">";
721
+
722
+ // Add truncated text content
723
+ const XMLCh* textContent = wrapper->node->getTextContent();
724
+ if (textContent && XMLString::stringLen(textContent) > 0) {
725
+ CharStr text(textContent);
726
+ std::string textStr = text.localForm();
727
+
728
+ size_t start = textStr.find_first_not_of(" \t\n\r");
729
+ if (start != std::string::npos) {
730
+ size_t end = textStr.find_last_not_of(" \t\n\r");
731
+ textStr = textStr.substr(start, end - start + 1);
732
+
733
+ if (textStr.length() > 40) {
734
+ textStr = textStr.substr(0, 37) + "...";
735
+ }
736
+
737
+ result += "\"";
738
+ result += textStr;
739
+ result += "\"";
740
+ }
741
+ }
742
+
743
+ result += ">";
744
+ } else if (nodeType == DOMNode::TEXT_NODE) {
745
+ result = "#<RXerces::XML::Text:0x";
746
+ result += buf;
747
+ result += " \"";
748
+
749
+ const XMLCh* textContent = wrapper->node->getNodeValue();
750
+ if (textContent) {
751
+ CharStr text(textContent);
752
+ std::string textStr = text.localForm();
753
+
754
+ size_t start = textStr.find_first_not_of(" \t\n\r");
755
+ if (start != std::string::npos) {
756
+ size_t end = textStr.find_last_not_of(" \t\n\r");
757
+ textStr = textStr.substr(start, end - start + 1);
758
+
759
+ if (textStr.length() > 40) {
760
+ textStr = textStr.substr(0, 37) + "...";
761
+ }
762
+
763
+ result += textStr;
764
+ }
765
+ }
766
+
767
+ result += "\">";
768
+ } else {
769
+ result = "#<RXerces::XML::Node:0x";
770
+ result += buf;
771
+ result += " ";
772
+ CharStr name(wrapper->node->getNodeName());
773
+ result += name.localForm();
774
+ result += ">";
775
+ }
776
+
777
+ return rb_str_new_cstr(result.c_str());
778
+ }
779
+
578
780
  // node.name
579
781
  static VALUE node_name(VALUE self) {
580
782
  NodeWrapper* wrapper;
@@ -687,6 +889,28 @@ static VALUE node_set_attribute(VALUE self, VALUE attr_name, VALUE attr_value) {
687
889
  return attr_value;
688
890
  }
689
891
 
892
+ // node.has_attribute?(attribute_name)
893
+ static VALUE node_has_attribute_p(VALUE self, VALUE attr_name) {
894
+ NodeWrapper* wrapper;
895
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
896
+
897
+ if (!wrapper->node || wrapper->node->getNodeType() != DOMNode::ELEMENT_NODE) {
898
+ return Qfalse;
899
+ }
900
+
901
+ Check_Type(attr_name, T_STRING);
902
+ const char* attr_str = StringValueCStr(attr_name);
903
+
904
+ DOMElement* element = dynamic_cast<DOMElement*>(wrapper->node);
905
+ const XMLCh* value = element->getAttribute(XStr(attr_str).unicodeForm());
906
+
907
+ if (!value || XMLString::stringLen(value) == 0) {
908
+ return Qfalse;
909
+ }
910
+
911
+ return Qtrue;
912
+ }
913
+
690
914
  // node.children
691
915
  static VALUE node_children(VALUE self) {
692
916
  NodeWrapper* wrapper;
@@ -710,6 +934,31 @@ static VALUE node_children(VALUE self) {
710
934
  return children;
711
935
  }
712
936
 
937
+ // node.element_children - returns only element children (no text nodes)
938
+ static VALUE node_element_children(VALUE self) {
939
+ NodeWrapper* wrapper;
940
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
941
+
942
+ VALUE doc_ref = rb_iv_get(self, "@document");
943
+ VALUE children = rb_ary_new();
944
+
945
+ if (!wrapper->node) {
946
+ return children;
947
+ }
948
+
949
+ DOMNodeList* child_nodes = wrapper->node->getChildNodes();
950
+ XMLSize_t count = child_nodes->getLength();
951
+
952
+ for (XMLSize_t i = 0; i < count; i++) {
953
+ DOMNode* child = child_nodes->item(i);
954
+ if (child->getNodeType() == DOMNode::ELEMENT_NODE) {
955
+ rb_ary_push(children, wrap_node(child, doc_ref));
956
+ }
957
+ }
958
+
959
+ return children;
960
+ }
961
+
713
962
  // node.parent
714
963
  static VALUE node_parent(VALUE self) {
715
964
  NodeWrapper* wrapper;
@@ -728,6 +977,78 @@ static VALUE node_parent(VALUE self) {
728
977
  return wrap_node(parent, doc_ref);
729
978
  }
730
979
 
980
+ // node.ancestors(selector = nil) - returns an array of all ancestor nodes, optionally filtered by selector
981
+ static VALUE node_ancestors(int argc, VALUE* argv, VALUE self) {
982
+ VALUE selector;
983
+ rb_scan_args(argc, argv, "01", &selector);
984
+
985
+ NodeWrapper* wrapper;
986
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
987
+
988
+ VALUE ancestors = rb_ary_new();
989
+
990
+ if (!wrapper->node) {
991
+ return ancestors;
992
+ }
993
+
994
+ VALUE doc_ref = rb_iv_get(self, "@document");
995
+ DOMNode* current = wrapper->node->getParentNode();
996
+
997
+ // Walk up the tree, collecting all ancestors
998
+ while (current) {
999
+ // Stop at the document node (don't include it in ancestors)
1000
+ if (current->getNodeType() == DOMNode::DOCUMENT_NODE) {
1001
+ break;
1002
+ }
1003
+ rb_ary_push(ancestors, wrap_node(current, doc_ref));
1004
+ current = current->getParentNode();
1005
+ }
1006
+
1007
+ // If selector is provided, filter the ancestors
1008
+ if (!NIL_P(selector)) {
1009
+ Check_Type(selector, T_STRING);
1010
+ const char* selector_str = StringValueCStr(selector);
1011
+
1012
+ // Convert CSS to XPath if needed (css_to_xpath adds // prefix)
1013
+ std::string xpath_str = css_to_xpath(selector_str);
1014
+
1015
+ // Get all matching nodes from the document
1016
+ VALUE all_matches = document_xpath(doc_ref, rb_str_new2(xpath_str.c_str()));
1017
+
1018
+ NodeSetWrapper* matches_wrapper;
1019
+ TypedData_Get_Struct(all_matches, NodeSetWrapper, &nodeset_type, matches_wrapper);
1020
+
1021
+ VALUE filtered = rb_ary_new();
1022
+ long ancestor_len = RARRAY_LEN(ancestors);
1023
+ long matches_len = RARRAY_LEN(matches_wrapper->nodes_array);
1024
+
1025
+ // For each ancestor, check if it's in the matches
1026
+ for (long i = 0; i < ancestor_len; i++) {
1027
+ VALUE ancestor = rb_ary_entry(ancestors, i);
1028
+
1029
+ NodeWrapper* ancestor_wrapper;
1030
+ TypedData_Get_Struct(ancestor, NodeWrapper, &node_type, ancestor_wrapper);
1031
+
1032
+ // Check if this ancestor node is in the matches
1033
+ for (long j = 0; j < matches_len; j++) {
1034
+ VALUE match = rb_ary_entry(matches_wrapper->nodes_array, j);
1035
+ NodeWrapper* match_wrapper;
1036
+ TypedData_Get_Struct(match, NodeWrapper, &node_type, match_wrapper);
1037
+
1038
+ // Compare the actual DOM nodes
1039
+ if (ancestor_wrapper->node == match_wrapper->node) {
1040
+ rb_ary_push(filtered, ancestor);
1041
+ break;
1042
+ }
1043
+ }
1044
+ }
1045
+
1046
+ return filtered;
1047
+ }
1048
+
1049
+ return ancestors;
1050
+ }
1051
+
731
1052
  // node.attributes - returns hash of all attributes (only for element nodes)
732
1053
  static VALUE node_attributes(VALUE self) {
733
1054
  NodeWrapper* wrapper;
@@ -801,6 +1122,54 @@ static VALUE node_previous_sibling(VALUE self) {
801
1122
  return wrap_node(prev, doc_ref);
802
1123
  }
803
1124
 
1125
+ // node.next_element - next sibling that is an element (skipping text nodes)
1126
+ static VALUE node_next_element(VALUE self) {
1127
+ NodeWrapper* wrapper;
1128
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1129
+
1130
+ if (!wrapper->node) {
1131
+ return Qnil;
1132
+ }
1133
+
1134
+ VALUE doc_ref = rb_iv_get(self, "@document");
1135
+ DOMNode* next = wrapper->node->getNextSibling();
1136
+
1137
+ // Skip non-element nodes
1138
+ while (next && next->getNodeType() != DOMNode::ELEMENT_NODE) {
1139
+ next = next->getNextSibling();
1140
+ }
1141
+
1142
+ if (!next) {
1143
+ return Qnil;
1144
+ }
1145
+
1146
+ return wrap_node(next, doc_ref);
1147
+ }
1148
+
1149
+ // node.previous_element - previous sibling that is an element (skipping text nodes)
1150
+ static VALUE node_previous_element(VALUE self) {
1151
+ NodeWrapper* wrapper;
1152
+ TypedData_Get_Struct(self, NodeWrapper, &node_type, wrapper);
1153
+
1154
+ if (!wrapper->node) {
1155
+ return Qnil;
1156
+ }
1157
+
1158
+ VALUE doc_ref = rb_iv_get(self, "@document");
1159
+ DOMNode* prev = wrapper->node->getPreviousSibling();
1160
+
1161
+ // Skip non-element nodes
1162
+ while (prev && prev->getNodeType() != DOMNode::ELEMENT_NODE) {
1163
+ prev = prev->getPreviousSibling();
1164
+ }
1165
+
1166
+ if (!prev) {
1167
+ return Qnil;
1168
+ }
1169
+
1170
+ return wrap_node(prev, doc_ref);
1171
+ }
1172
+
804
1173
  // node.add_child(node_or_string) - adds a child node
805
1174
  static VALUE node_add_child(VALUE self, VALUE child) {
806
1175
  NodeWrapper* wrapper;
@@ -1124,10 +1493,185 @@ static VALUE node_at_xpath(VALUE self, VALUE path) {
1124
1493
  return rb_ary_entry(wrapper->nodes_array, 0);
1125
1494
  }
1126
1495
 
1127
- // node.css(selector) - CSS selectors not supported
1496
+ // node.at_css(selector) - returns first matching node or nil
1497
+ static VALUE node_at_css(VALUE self, VALUE selector) {
1498
+ VALUE nodeset = node_css(self, selector);
1499
+ NodeSetWrapper* wrapper;
1500
+ TypedData_Get_Struct(nodeset, NodeSetWrapper, &nodeset_type, wrapper);
1501
+
1502
+ if (RARRAY_LEN(wrapper->nodes_array) == 0) {
1503
+ return Qnil;
1504
+ }
1505
+
1506
+ return rb_ary_entry(wrapper->nodes_array, 0);
1507
+ }
1508
+
1509
// Quotes `value` as an XPath 1.0 string literal. XPath literals have no
// escape sequences, so the quote character that does not occur in the value
// is used; a value containing both kinds is assembled with concat().
static std::string css_xpath_literal(const std::string& value) {
    if (value.find('\'') == std::string::npos) {
        return "'" + value + "'";
    }
    if (value.find('"') == std::string::npos) {
        return "\"" + value + "\"";
    }

    // Split on single quotes: plain chunks are single-quoted, each single
    // quote itself is emitted as the double-quoted literal "'".
    std::string out = "concat(";
    bool first = true;
    size_t pos = 0;
    for (;;) {
        const size_t quote = value.find('\'', pos);
        const std::string chunk =
            value.substr(pos, quote == std::string::npos ? std::string::npos : quote - pos);
        if (!chunk.empty()) {
            if (!first) out += ", ";
            out += "'" + chunk + "'";
            first = false;
        }
        if (quote == std::string::npos) {
            break;
        }
        if (!first) out += ", ";
        out += "\"'\"";
        first = false;
        pos = quote + 1;
    }
    out += ")";
    return out;
}

// Builds the XPath predicate for one CSS attribute selector.
// `op` is '=' or the first character of "~=", "^=", "$=", "*=", "|=";
// any other value (including 0) produces a plain presence test.
static std::string css_attribute_predicate(const std::string& name, char op,
                                           const std::string& value) {
    const std::string attr = "@" + name;
    const std::string lit = css_xpath_literal(value);

    // Per CSS, substring/word operators with an empty value match nothing.
    if (value.empty() && (op == '~' || op == '^' || op == '$' || op == '*')) {
        return "[false()]";
    }

    switch (op) {
    case '=':
        return "[" + attr + "=" + lit + "]";
    case '~':  // whitespace-separated word
        return "[contains(concat(' ', " + attr + ", ' '), " +
               css_xpath_literal(" " + value + " ") + ")]";
    case '^':  // prefix
        return "[starts-with(" + attr + ", " + lit + ")]";
    case '$':  // suffix (XPath 1.0 has no ends-with)
        return "[substring(" + attr + ", string-length(" + attr + ") - string-length(" +
               lit + ") + 1)=" + lit + "]";
    case '*':  // substring
        return "[contains(" + attr + ", " + lit + ")]";
    case '|':  // exact value or value followed by '-'
        return "[" + attr + "=" + lit + " or starts-with(" + attr + ", " +
               css_xpath_literal(value + "-") + ")]";
    default:
        return "[" + attr + "]";
    }
}

// Helper function to convert basic CSS selectors to XPath.
//
// Supported: tag, *, .class, #id, tag.class, tag#id, [attr], [attr=value]
// (quoted or unquoted), the operators ~= ^= $= *= |=, the descendant
// (whitespace), child (>), adjacent-sibling (+) and general-sibling (~)
// combinators, and comma-separated selector groups (joined with " | ").
// Pseudo-classes and pseudo-elements are not interpreted.
//
// Every group starts with "//". A NULL, empty or whitespace-only selector
// yields "//*". A combinator with nothing before it is ignored, and a
// trailing combinator is dropped.
static std::string css_to_xpath(const char* css) {
    const std::string selector(css ? css : "");
    const size_t n = selector.length();

    const auto is_space = [](char ch) -> bool {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f';
    };
    const auto in_set = [](char ch, const char* set) -> bool {
        for (; *set; ++set) {
            if (*set == ch) return true;
        }
        return false;
    };
    // Characters that terminate an #id or .class identifier.
    const auto ends_name = [&](char ch) -> bool {
        return is_space(ch) || in_set(ch, ".#[>+~,");
    };

    std::string result;      // finished groups, joined by " | "
    std::string group;       // location steps of the group being built
    std::string tag;         // element name of the compound being built
    std::string predicates;  // predicates of the compound being built
    char combinator = ' ';   // combinator linking the compound to the previous step
    bool in_compound = false;

    // Emits the pending compound selector as one location step.
    const auto flush_compound = [&]() {
        if (!in_compound) {
            return;
        }
        const std::string name = tag.empty() ? "*" : tag;
        switch (combinator) {
        case '>':
            group += "/" + name;
            break;
        case '+':
            group += "/following-sibling::*[1][self::" + name + "]";
            break;
        case '~':
            group += "/following-sibling::" + name;
            break;
        default:
            group += "//" + name;
            break;
        }
        group += predicates;
        tag.clear();
        predicates.clear();
        combinator = ' ';
        in_compound = false;
    };

    // Closes the current comma-separated group.
    const auto flush_group = [&]() {
        flush_compound();
        if (!group.empty()) {
            if (!result.empty()) result += " | ";
            result += group;
            group.clear();
        }
        combinator = ' ';
    };

    size_t i = 0;
    while (i < n) {
        const char c = selector[i];

        // Whitespace ends a compound. It acts as the descendant combinator
        // unless an explicit combinator follows and overrides it.
        if (is_space(c)) {
            flush_compound();
            ++i;
            continue;
        }

        if (c == ',') {
            flush_group();
            ++i;
            continue;
        }

        if (c == '>' || c == '+' || c == '~') {
            flush_compound();
            // A combinator needs a preceding step to attach to.
            if (!group.empty()) {
                combinator = c;
            }
            ++i;
            continue;
        }

        if (c == '#' || c == '.') {
            size_t j = i + 1;
            while (j < n && !ends_name(selector[j])) ++j;
            const std::string ident = selector.substr(i + 1, j - i - 1);
            if (c == '#') {
                predicates += "[@id=" + css_xpath_literal(ident) + "]";
            } else {
                predicates += "[contains(concat(' ', @class, ' '), " +
                              css_xpath_literal(" " + ident + " ") + ")]";
            }
            in_compound = true;
            i = j;
            continue;
        }

        if (c == '[') {
            size_t j = i + 1;
            while (j < n && is_space(selector[j])) ++j;

            // Attribute name.
            const size_t name_start = j;
            while (j < n && !is_space(selector[j]) && !in_set(selector[j], "]=~^$*|!")) ++j;
            const std::string name = selector.substr(name_start, j - name_start);
            while (j < n && is_space(selector[j])) ++j;

            // Operator, if any. Unrecognised operators fall back to a presence test.
            char op = 0;
            if (j < n && selector[j] == '=') {
                op = '=';
                ++j;
            } else if (j + 1 < n && selector[j + 1] == '=' && in_set(selector[j], "~^$*|")) {
                op = selector[j];
                j += 2;
            }

            // Value: quoted, or bare up to the closing bracket.
            std::string value;
            if (op) {
                while (j < n && is_space(selector[j])) ++j;
                if (j < n && (selector[j] == '"' || selector[j] == '\'')) {
                    const char quote = selector[j++];
                    while (j < n && selector[j] != quote) value += selector[j++];
                    if (j < n) ++j;  // closing quote
                } else {
                    while (j < n && selector[j] != ']') value += selector[j++];
                    while (!value.empty() && is_space(value.back())) value.pop_back();
                }
            }

            // Consume through the closing bracket so scanning resumes after it.
            while (j < n && selector[j] != ']') ++j;
            if (j < n) ++j;

            if (!name.empty()) {
                predicates += css_attribute_predicate(name, op, value);
            }
            in_compound = true;
            i = j;
            continue;
        }

        // Anything else is part of the element name ('*' included).
        tag += c;
        in_compound = true;
        ++i;
    }

    flush_group();
    return result.empty() ? std::string("//*") : result;
}
1664
+
1665
+ // node.css(selector) - Convert CSS to XPath and execute
1128
1666
  static VALUE node_css(VALUE self, VALUE selector) {
1129
- rb_raise(rb_eNotImpError, "CSS selectors are not supported. Use xpath() instead. Xerces-C only supports XPath queries.");
1130
- return Qnil;
1667
+ Check_Type(selector, T_STRING);
1668
+ const char* css_str = StringValueCStr(selector);
1669
+
1670
+ // Convert CSS to XPath
1671
+ std::string xpath_str = css_to_xpath(css_str);
1672
+
1673
+ // Call the xpath method with converted selector
1674
+ return node_xpath(self, rb_str_new2(xpath_str.c_str()));
1131
1675
  }
1132
1676
 
1133
1677
  // nodeset.length / nodeset.size
@@ -1171,6 +1715,205 @@ static VALUE nodeset_to_a(VALUE self) {
1171
1715
  return rb_ary_dup(wrapper->nodes_array);
1172
1716
  }
1173
1717
 
1718
+ // nodeset.first - returns first node or nil
1719
+ static VALUE nodeset_first(VALUE self) {
1720
+ NodeSetWrapper* wrapper;
1721
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1722
+
1723
+ if (RARRAY_LEN(wrapper->nodes_array) == 0) {
1724
+ return Qnil;
1725
+ }
1726
+
1727
+ return rb_ary_entry(wrapper->nodes_array, 0);
1728
+ }
1729
+
1730
+ // nodeset.last - returns last node or nil
1731
+ static VALUE nodeset_last(VALUE self) {
1732
+ NodeSetWrapper* wrapper;
1733
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1734
+
1735
+ long len = RARRAY_LEN(wrapper->nodes_array);
1736
+ if (len == 0) {
1737
+ return Qnil;
1738
+ }
1739
+
1740
+ return rb_ary_entry(wrapper->nodes_array, len - 1);
1741
+ }
1742
+
1743
+ // nodeset.empty? - returns true if nodeset is empty
1744
+ static VALUE nodeset_empty_p(VALUE self) {
1745
+ NodeSetWrapper* wrapper;
1746
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1747
+
1748
+ return RARRAY_LEN(wrapper->nodes_array) == 0 ? Qtrue : Qfalse;
1749
+ }
1750
+
1751
+ // nodeset.inner_html - returns concatenated inner_html of all nodes
1752
+ static VALUE nodeset_inner_html(VALUE self) {
1753
+ NodeSetWrapper* wrapper;
1754
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1755
+
1756
+ std::string result;
1757
+ long len = RARRAY_LEN(wrapper->nodes_array);
1758
+
1759
+ for (long i = 0; i < len; i++) {
1760
+ VALUE node = rb_ary_entry(wrapper->nodes_array, i);
1761
+ VALUE inner_html = rb_funcall(node, rb_intern("inner_html"), 0);
1762
+ result += StringValueCStr(inner_html);
1763
+ }
1764
+
1765
+ return rb_str_new_cstr(result.c_str());
1766
+ }
1767
+
1768
+ // nodeset.text - returns concatenated text content of all nodes
1769
+ static VALUE nodeset_text(VALUE self) {
1770
+ NodeSetWrapper* wrapper;
1771
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1772
+
1773
+ std::string result;
1774
+ long len = RARRAY_LEN(wrapper->nodes_array);
1775
+
1776
+ for (long i = 0; i < len; i++) {
1777
+ VALUE node = rb_ary_entry(wrapper->nodes_array, i);
1778
+ NodeWrapper* node_wrapper;
1779
+ TypedData_Get_Struct(node, NodeWrapper, &node_type, node_wrapper);
1780
+
1781
+ if (node_wrapper->node) {
1782
+ const XMLCh* content = node_wrapper->node->getTextContent();
1783
+ if (content) {
1784
+ CharStr utf8_content(content);
1785
+ result += utf8_content.localForm();
1786
+ }
1787
+ }
1788
+ }
1789
+
1790
+ return rb_str_new_cstr(result.c_str());
1791
+ }
1792
+
1793
+ // nodeset.inspect / nodeset.to_s - human-readable representation
1794
+ static VALUE nodeset_inspect(VALUE self) {
1795
+ NodeSetWrapper* wrapper;
1796
+ TypedData_Get_Struct(self, NodeSetWrapper, &nodeset_type, wrapper);
1797
+
1798
+ long len = RARRAY_LEN(wrapper->nodes_array);
1799
+ std::string result = "#<RXerces::XML::NodeSet:0x";
1800
+
1801
+ // Add object ID
1802
+ char buf[32];
1803
+ snprintf(buf, sizeof(buf), "%016lx", (unsigned long)self);
1804
+ result += buf;
1805
+ result += " [";
1806
+
1807
+ for (long i = 0; i < len; i++) {
1808
+ if (i > 0) result += ", ";
1809
+
1810
+ VALUE node = rb_ary_entry(wrapper->nodes_array, i);
1811
+ NodeWrapper* node_wrapper;
1812
+ TypedData_Get_Struct(node, NodeWrapper, &node_type, node_wrapper);
1813
+
1814
+ if (!node_wrapper->node) {
1815
+ result += "nil";
1816
+ continue;
1817
+ }
1818
+
1819
+ DOMNode::NodeType nodeType = node_wrapper->node->getNodeType();
1820
+
1821
+ if (nodeType == DOMNode::ELEMENT_NODE) {
1822
+ // For elements, show: <tag attr="value">content</tag>
1823
+ CharStr name(node_wrapper->node->getNodeName());
1824
+ result += "<";
1825
+ result += name.localForm();
1826
+
1827
+ // Add first few attributes if present
1828
+ DOMElement* element = dynamic_cast<DOMElement*>(node_wrapper->node);
1829
+ if (element) {
1830
+ DOMNamedNodeMap* attributes = element->getAttributes();
1831
+ if (attributes && attributes->getLength() > 0) {
1832
+ XMLSize_t attrLen = attributes->getLength();
1833
+ if (attrLen > 3) attrLen = 3; // Limit to first 3 attributes
1834
+
1835
+ for (XMLSize_t j = 0; j < attrLen; j++) {
1836
+ DOMNode* attr = attributes->item(j);
1837
+ CharStr attrName(attr->getNodeName());
1838
+ CharStr attrValue(attr->getNodeValue());
1839
+ result += " ";
1840
+ result += attrName.localForm();
1841
+ result += "=\"";
1842
+ result += attrValue.localForm();
1843
+ result += "\"";
1844
+ }
1845
+ if (attributes->getLength() > 3) {
1846
+ result += " ...";
1847
+ }
1848
+ }
1849
+ }
1850
+
1851
+ // Show truncated text content
1852
+ const XMLCh* textContent = node_wrapper->node->getTextContent();
1853
+ if (textContent && XMLString::stringLen(textContent) > 0) {
1854
+ CharStr text(textContent);
1855
+ std::string textStr = text.localForm();
1856
+
1857
+ // Trim whitespace and truncate
1858
+ size_t start = textStr.find_first_not_of(" \t\n\r");
1859
+ if (start != std::string::npos) {
1860
+ size_t end = textStr.find_last_not_of(" \t\n\r");
1861
+ textStr = textStr.substr(start, end - start + 1);
1862
+
1863
+ if (textStr.length() > 30) {
1864
+ textStr = textStr.substr(0, 27) + "...";
1865
+ }
1866
+
1867
+ result += ">";
1868
+ result += textStr;
1869
+ result += "</";
1870
+ result += name.localForm();
1871
+ result += ">";
1872
+ } else {
1873
+ result += ">";
1874
+ }
1875
+ } else {
1876
+ result += ">";
1877
+ }
1878
+ } else if (nodeType == DOMNode::TEXT_NODE) {
1879
+ // For text nodes, show: text("content")
1880
+ const XMLCh* textContent = node_wrapper->node->getNodeValue();
1881
+ if (textContent) {
1882
+ CharStr text(textContent);
1883
+ std::string textStr = text.localForm();
1884
+
1885
+ // Trim and truncate
1886
+ size_t start = textStr.find_first_not_of(" \t\n\r");
1887
+ if (start != std::string::npos) {
1888
+ size_t end = textStr.find_last_not_of(" \t\n\r");
1889
+ textStr = textStr.substr(start, end - start + 1);
1890
+
1891
+ if (textStr.length() > 30) {
1892
+ textStr = textStr.substr(0, 27) + "...";
1893
+ }
1894
+
1895
+ result += "text(\"";
1896
+ result += textStr;
1897
+ result += "\")";
1898
+ } else {
1899
+ result += "text()";
1900
+ }
1901
+ } else {
1902
+ result += "text()";
1903
+ }
1904
+ } else {
1905
+ // For other nodes, just show the type
1906
+ CharStr name(node_wrapper->node->getNodeName());
1907
+ result += "#<";
1908
+ result += name.localForm();
1909
+ result += ">";
1910
+ }
1911
+ }
1912
+
1913
+ result += "]>";
1914
+ return rb_str_new_cstr(result.c_str());
1915
+ }
1916
+
1174
1917
  // Schema.from_document(schema_doc) or Schema.from_string(xsd_string)
1175
1918
  static VALUE schema_from_document(int argc, VALUE* argv, VALUE klass) {
1176
1919
  VALUE schema_source;
@@ -1362,12 +2105,18 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1362
2105
  rb_define_method(rb_cDocument, "root", RUBY_METHOD_FUNC(document_root), 0);
1363
2106
  rb_define_method(rb_cDocument, "to_s", RUBY_METHOD_FUNC(document_to_s), 0);
1364
2107
  rb_define_alias(rb_cDocument, "to_xml", "to_s");
2108
+ rb_define_method(rb_cDocument, "inspect", RUBY_METHOD_FUNC(document_inspect), 0);
1365
2109
  rb_define_method(rb_cDocument, "xpath", RUBY_METHOD_FUNC(document_xpath), 1);
2110
+ rb_define_method(rb_cDocument, "css", RUBY_METHOD_FUNC(document_css), 1);
2111
+ rb_define_method(rb_cDocument, "at_css", RUBY_METHOD_FUNC(document_at_css), 1);
1366
2112
  rb_define_method(rb_cDocument, "encoding", RUBY_METHOD_FUNC(document_encoding), 0);
2113
+ rb_define_method(rb_cDocument, "text", RUBY_METHOD_FUNC(document_text), 0);
2114
+ rb_define_alias(rb_cDocument, "content", "text");
1367
2115
  rb_define_method(rb_cDocument, "create_element", RUBY_METHOD_FUNC(document_create_element), 1);
1368
2116
 
1369
2117
  rb_cNode = rb_define_class_under(rb_mXML, "Node", rb_cObject);
1370
2118
  rb_undef_alloc_func(rb_cNode);
2119
+ rb_define_method(rb_cNode, "inspect", RUBY_METHOD_FUNC(node_inspect), 0);
1371
2120
  rb_define_method(rb_cNode, "name", RUBY_METHOD_FUNC(node_name), 0);
1372
2121
  rb_define_method(rb_cNode, "namespace", RUBY_METHOD_FUNC(node_namespace), 0);
1373
2122
  rb_define_method(rb_cNode, "text", RUBY_METHOD_FUNC(node_text), 0);
@@ -1376,11 +2125,19 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1376
2125
  rb_define_alias(rb_cNode, "content=", "text=");
1377
2126
  rb_define_method(rb_cNode, "[]", RUBY_METHOD_FUNC(node_get_attribute), 1);
1378
2127
  rb_define_method(rb_cNode, "[]=", RUBY_METHOD_FUNC(node_set_attribute), 2);
2128
+ rb_define_alias(rb_cNode, "get_attribute", "[]");
2129
+ rb_define_alias(rb_cNode, "attribute", "[]");
2130
+ rb_define_method(rb_cNode, "has_attribute?", RUBY_METHOD_FUNC(node_has_attribute_p), 1);
1379
2131
  rb_define_method(rb_cNode, "children", RUBY_METHOD_FUNC(node_children), 0);
2132
+ rb_define_method(rb_cNode, "element_children", RUBY_METHOD_FUNC(node_element_children), 0);
2133
+ rb_define_alias(rb_cNode, "elements", "element_children");
1380
2134
  rb_define_method(rb_cNode, "parent", RUBY_METHOD_FUNC(node_parent), 0);
2135
+ rb_define_method(rb_cNode, "ancestors", RUBY_METHOD_FUNC(node_ancestors), -1);
1381
2136
  rb_define_method(rb_cNode, "attributes", RUBY_METHOD_FUNC(node_attributes), 0);
1382
2137
  rb_define_method(rb_cNode, "next_sibling", RUBY_METHOD_FUNC(node_next_sibling), 0);
2138
+ rb_define_method(rb_cNode, "next_element", RUBY_METHOD_FUNC(node_next_element), 0);
1383
2139
  rb_define_method(rb_cNode, "previous_sibling", RUBY_METHOD_FUNC(node_previous_sibling), 0);
2140
+ rb_define_method(rb_cNode, "previous_element", RUBY_METHOD_FUNC(node_previous_element), 0);
1384
2141
  rb_define_method(rb_cNode, "add_child", RUBY_METHOD_FUNC(node_add_child), 1);
1385
2142
  rb_define_method(rb_cNode, "remove", RUBY_METHOD_FUNC(node_remove), 0);
1386
2143
  rb_define_alias(rb_cNode, "unlink", "remove");
@@ -1393,6 +2150,9 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1393
2150
  rb_define_method(rb_cNode, "at_xpath", RUBY_METHOD_FUNC(node_at_xpath), 1);
1394
2151
  rb_define_alias(rb_cNode, "at", "at_xpath");
1395
2152
  rb_define_method(rb_cNode, "css", RUBY_METHOD_FUNC(node_css), 1);
2153
+ rb_define_method(rb_cNode, "at_css", RUBY_METHOD_FUNC(node_at_css), 1);
2154
+ rb_define_alias(rb_cNode, "get_attribute", "[]");
2155
+ rb_define_alias(rb_cNode, "attribute", "[]");
1396
2156
 
1397
2157
  rb_cElement = rb_define_class_under(rb_mXML, "Element", rb_cNode);
1398
2158
  rb_undef_alloc_func(rb_cElement);
@@ -1405,8 +2165,15 @@ static VALUE document_validate(VALUE self, VALUE rb_schema) {
1405
2165
  rb_define_method(rb_cNodeSet, "length", RUBY_METHOD_FUNC(nodeset_length), 0);
1406
2166
  rb_define_alias(rb_cNodeSet, "size", "length");
1407
2167
  rb_define_method(rb_cNodeSet, "[]", RUBY_METHOD_FUNC(nodeset_at), 1);
2168
+ rb_define_method(rb_cNodeSet, "first", RUBY_METHOD_FUNC(nodeset_first), 0);
2169
+ rb_define_method(rb_cNodeSet, "last", RUBY_METHOD_FUNC(nodeset_last), 0);
2170
+ rb_define_method(rb_cNodeSet, "empty?", RUBY_METHOD_FUNC(nodeset_empty_p), 0);
1408
2171
  rb_define_method(rb_cNodeSet, "each", RUBY_METHOD_FUNC(nodeset_each), 0);
1409
2172
  rb_define_method(rb_cNodeSet, "to_a", RUBY_METHOD_FUNC(nodeset_to_a), 0);
2173
+ rb_define_method(rb_cNodeSet, "text", RUBY_METHOD_FUNC(nodeset_text), 0);
2174
+ rb_define_method(rb_cNodeSet, "inner_html", RUBY_METHOD_FUNC(nodeset_inner_html), 0);
2175
+ rb_define_method(rb_cNodeSet, "inspect", RUBY_METHOD_FUNC(nodeset_inspect), 0);
2176
+ rb_define_alias(rb_cNodeSet, "to_s", "inspect");
1410
2177
  rb_include_module(rb_cNodeSet, rb_mEnumerable);
1411
2178
 
1412
2179
  rb_cSchema = rb_define_class_under(rb_mXML, "Schema", rb_cObject);