kodexa-document 7.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodexa-document might be problematic. Click here for more details.

@@ -0,0 +1,677 @@
1
+ """Abstract Syntax Tree nodes for parsed XPath expressions.
2
+
3
+ This module contains basic nodes for representing parsed XPath expressions
4
+ created by the ANTLR-based parser. These classes provide the same functionality
5
+ as the original PLY-based parser's AST classes but are designed to work with
6
+ the ANTLR-generated parse tree.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from typing import List, Optional, Any, Dict, Union, Tuple
13
+
14
+ # Import these types but make them optional to avoid circular imports
15
+ # In a real implementation, you'd use proper type annotations
16
+ try:
17
+ from kodexa_document.model import ContentNode, ContentFeature, Document
18
+ except ImportError:
19
+ ContentNode = Any
20
+ ContentFeature = Any
21
+ Document = Any
22
+
23
+ __all__ = [
24
+ "SelectorContext",
25
+ "UnaryExpression",
26
+ "BinaryExpression",
27
+ "PredicatedExpression",
28
+ "PipelineExpression",
29
+ "AbsolutePath",
30
+ "Step",
31
+ "NameTest",
32
+ "NodeType",
33
+ "AbbreviatedStep",
34
+ "VariableReference",
35
+ "FunctionCall",
36
+ ]
37
+
38
+
39
class SelectorContext:
    """Mutable state shared by the AST nodes while a selector is evaluated."""

    def __init__(self, document: Document, first_only=False):
        """Create a context for evaluating a selector.

        Args:
            document: The document being searched
            first_only: Whether evaluation should stop at the first match
        """
        self.document: Document = document
        self.first_only = first_only
        # Last path operator recorded by a path node ("/" or "//"), if any.
        self.last_op = None
        # Counter raised by PipelineExpression while its right side is resolved.
        self.stream = 0
        # Compiled regular expressions keyed by their source string.
        self.pattern_cache = {}

    def cache_pattern(self, pattern: str) -> re.Pattern:
        """Return the compiled form of a regex, compiling it on first use.

        Args:
            pattern: The regex pattern string

        Returns:
            The compiled pattern; repeated calls with the same string return
            the same compiled object
        """
        cache = self.pattern_cache
        if pattern in cache:
            return cache[pattern]

        compiled = re.compile(pattern)
        cache[pattern] = compiled
        return compiled
67
+
68
+
69
class PipelineExpression:
    """A pipeline XPath expression (e.g., a stream b)."""

    def __init__(self, left: Any, op: str, right: Any):
        """Initialize a new PipelineExpression.

        Args:
            left: Left side of the pipeline
            op: The pipeline operator
            right: Right side of the pipeline
        """
        self.left = left
        self.op = op
        self.right = right

    def resolve(self, content_node: ContentNode, variables: Dict, context: SelectorContext) -> List[ContentNode]:
        """Resolve the left side, then resolve the right side against each result.

        While the right side is being resolved ``context.stream`` is raised by
        one. The counter is always restored, even if the right side raises, so
        a context that is reused after a failure is not left in streaming mode.

        Args:
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            The concatenated right-side results, or at most one node when
            ``context.first_only`` is set
        """
        left_nodes = self.left.resolve(content_node, variables, context)

        # With first_only only the first left-hand node is fed to the right side.
        if context.first_only and left_nodes:
            nodes_to_process = left_nodes[:1]
        else:
            nodes_to_process = left_nodes

        result_nodes: List[ContentNode] = []
        context.stream += 1
        try:
            for node in nodes_to_process:
                result_nodes.extend(self.right.resolve(node, variables, context))
                # Stop as soon as a match exists when only one is wanted.
                if context.first_only and result_nodes:
                    break
        finally:
            context.stream -= 1

        return result_nodes[:1] if context.first_only else result_nodes
111
+
112
+
113
class UnaryExpression:
    """A unary XPath expression such as ``-foo``."""

    def __init__(self, op: str, right: Any):
        """Store the operator and its operand expression.

        Args:
            op: The unary operator
            right: The expression the operator applies to
        """
        self.right = right
        self.op = op

    def resolve(self, content_node: ContentNode, variables: Dict, context: SelectorContext) -> Any:
        """Evaluate the operand and apply the operator to it.

        Only ``-`` is implemented. The operand is not evaluated for any other
        operator.

        Args:
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            The negated operand when it is an int or float, otherwise None
        """
        if self.op != "-":
            return None

        operand = self.right.resolve(content_node, variables, context)
        if not isinstance(operand, (int, float)):
            return None
        return -operand
144
+
145
+
146
class BinaryExpression:
    """Any binary XPath expression (e.g., a/b, a and b, a | b)."""

    def __init__(self, left: Any, op: str, right: Any):
        """Initialize a new BinaryExpression.

        Args:
            left: Left side of the expression
            op: The operator
            right: Right side of the expression
        """
        self.left = left
        self.op = op
        self.right = right

    def resolve(self, content_node: ContentNode, variables: Dict, context: SelectorContext) -> Any:
        """Resolve this binary expression.

        Supported operators are ``|``, ``=``, ``!=``, ``intersect``, ``and``,
        ``or``, ``/`` and ``//``.

        Args:
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            A list for ``|``, ``intersect`` and the path operators, a boolean
            for the comparison and logical operators, and None for any other
            operator
        """
        op = self.op

        if op == "|":
            # Union is plain concatenation: left results followed by right results.
            return self.left.resolve(
                content_node, variables, context
            ) + self.right.resolve(content_node, variables, context)

        if op == "=":
            return self.get_value(
                self.left, content_node, variables, context
            ) == self.get_value(self.right, content_node, variables, context)

        if op == "!=":
            return self.get_value(
                self.left, content_node, variables, context
            ) != self.get_value(self.right, content_node, variables, context)

        if op == "intersect":
            left_value = self.get_value(self.left, content_node, variables, context)
            right_value = self.get_value(self.right, content_node, variables, context)
            if isinstance(left_value, list) and isinstance(right_value, list):
                # Keeps left-hand order and any left-hand duplicates.
                return [value for value in left_value if value in right_value]
            return []

        if op == "and":
            # The right side is only evaluated when the left side is truthy.
            return bool(
                self.get_value(self.left, content_node, variables, context)
            ) and bool(self.get_value(self.right, content_node, variables, context))

        if op == "or":
            # The right side is only evaluated when the left side is falsy.
            return bool(
                self.get_value(self.left, content_node, variables, context)
            ) or bool(self.get_value(self.right, content_node, variables, context))

        if op in ("/", "//"):
            # Resolve the left side first, then apply the right side to each result.
            left_results = self.left.resolve(content_node, variables, context)
            context.last_op = op

            all_results = []
            for node in left_results:
                all_results.extend(self.right.resolve(node, variables, context))
                # Stop as soon as a match exists when only one is wanted.
                if context.first_only and all_results:
                    break

            return all_results[:1] if context.first_only else all_results

        return None

    def get_value(self, side: Any, content_node: ContentNode, variables: Dict, context: SelectorContext) -> Any:
        """Get the value of one operand.

        Variable references, function calls, absolute paths and nested
        binary/unary expressions are evaluated; anything else (for example a
        literal) is returned unchanged.

        Args:
            side: The expression to evaluate
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            The evaluated value
        """
        # VariableReference.resolve takes (variables, context) only. Without this
        # branch the reference object itself would be compared or truth-tested.
        if isinstance(side, VariableReference):
            return side.resolve(variables, context)
        if isinstance(side, (FunctionCall, AbsolutePath, BinaryExpression, UnaryExpression)):
            return side.resolve(content_node, variables, context)

        # NOTE(review): other node kinds (e.g. Step, PredicatedExpression) are
        # returned as objects rather than resolved - confirm this is intended.
        return side
240
+
241
+
242
class PredicatedExpression:
    """A filtered XPath expression (e.g., $var[1], (a or b)[foo][@bar])."""

    def __init__(self, base: Any, predicates: Optional[List] = None):
        """Initialize a new PredicatedExpression.

        Args:
            base: The base expression to be filtered
            predicates: List of filter predicates
        """
        self.base = base
        self.predicates = predicates or []

    def append_predicate(self, pred: Any) -> None:
        """Add a predicate to this expression.

        Args:
            pred: The predicate to add
        """
        self.predicates.append(pred)

    def resolve(self, content_node: ContentNode, variables: Dict, context: SelectorContext) -> List[ContentNode]:
        """Resolve the base expression and filter it by every predicate in order.

        Predicates are applied one after another, so a node must satisfy all
        of them and is returned at most once. An integer predicate selects the
        node at that zero-based position of the list produced so far; any
        other predicate keeps the nodes for which its ``resolve`` is truthy.
        With no predicates the base result is returned unfiltered.

        Args:
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            List of content nodes that match the predicates
        """
        nodes = list(self.base.resolve(content_node, variables, context))

        for predicate in self.predicates:
            if isinstance(predicate, int):
                # Out-of-range (including negative) positions select nothing.
                nodes = [nodes[predicate]] if 0 <= predicate < len(nodes) else []
            else:
                nodes = [
                    node for node in nodes
                    if predicate.resolve(node, variables, context)
                ]

            if not nodes:
                # Nothing left to filter; later predicates cannot add nodes back.
                break

        return nodes
286
+
287
+
288
class AbsolutePath:
    """An absolute XPath path (e.g., /a/b/c, //a/ancestor:b/@c)."""

    def __init__(self, op: str = "/", relative: Any = None):
        """Initialize a new AbsolutePath.

        Args:
            op: The operator used to root the expression
            relative: The relative path after the absolute root operator
        """
        self.op = op
        self.relative = relative

    def resolve(self, content_node: ContentNode, variables: Dict, context: SelectorContext) -> List[ContentNode]:
        """Resolve this absolute path starting from the root of the node's tree.

        The operator is recorded in ``context.last_op`` before the relative
        path is resolved. When there is no relative path the root node itself
        is returned, for ``//`` as well as ``/``.

        Args:
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            List of matching content nodes

        Raises:
            Exception: If the operator is neither ``/`` nor ``//``
        """
        if self.op not in ("/", "//"):
            raise Exception(f"Unsupported absolute path operator: {self.op}")

        context.last_op = self.op
        root_node = self._find_root(content_node)

        if self.relative is None:
            return [root_node]

        return self.relative.resolve(root_node, variables, context)

    @staticmethod
    def _find_root(node: ContentNode) -> ContentNode:
        """Follow ``get_parent()`` upwards until a node without a parent is reached."""
        parent = node.get_parent()
        while parent is not None:
            node = parent
            parent = node.get_parent()
        return node
334
+
335
+
336
class Step:
    """A single step in a relative path."""

    def __init__(self, axis: Optional[str], node_test: Any, predicates: List):
        """Initialize a new Step.

        Args:
            axis: The axis for this step
            node_test: The node test to apply
            predicates: List of predicates to filter nodes; None is treated as
                an empty list
        """
        self.axis = axis
        self.node_test = node_test
        # Keep the caller's list object when one is given; only None is replaced.
        self.predicates = [] if predicates is None else predicates

    def resolve(self, content_node: ContentNode, variables: Dict, context: SelectorContext) -> List[ContentNode]:
        """Resolve this step.

        For a ContentNode the step either walks the ``parent`` axis or runs
        the node test and keeps the nodes that satisfy every predicate.

        Args:
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            List of matching content nodes
        """
        if content_node is None:
            return []

        match = True
        if isinstance(content_node, ContentFeature):
            match = self.node_test.test(content_node, variables, context)

        axis_node = None

        if isinstance(content_node, ContentNode):
            axis_node = content_node

            if self.axis == "parent":
                return self._resolve_parent_axis(axis_node, context)

            nodes = self.node_test.test(axis_node, variables, context)
            final_nodes = []

            # Special case for a lone index predicate (like '//p[0]'): every node
            # returned by the node test is kept, regardless of its index.
            direct_node_index_pattern = (
                len(self.predicates) == 1 and isinstance(self.predicates[0], int)
            )

            for node in nodes:
                if self._passes_predicates(node, direct_node_index_pattern, variables, context):
                    final_nodes.append(node)
                    # With first_only the first surviving node is enough.
                    if context.first_only:
                        break

            return final_nodes

        # NOTE(review): axis_node is still None on this path, so a matching
        # non-ContentNode input yields [None] - confirm whether the input
        # object was meant to be returned instead.
        if match:
            return [axis_node]

        if self.axis is not None:
            return self.resolve(axis_node, variables, context)

        return []

    def _resolve_parent_axis(self, axis_node: ContentNode, context: SelectorContext) -> List[ContentNode]:
        """Return the nearest ancestor accepted by the node test, or an empty list."""
        node_test = self.node_test
        has_name = hasattr(node_test, 'name')

        parent = axis_node.get_parent()
        while parent is not None:
            # No node test or a wildcard accepts the closest parent.
            if node_test is None or (has_name and node_test.name == '*'):
                return [parent]

            # Otherwise the ancestor's node type must equal the requested name.
            if has_name and parent.node_type == node_test.name:
                return [parent]

            parent = parent.get_parent()

        # Fallback: ask the persistence layer for candidate nodes of the requested
        # type and accept the first one whose id equals the id of an ancestor.
        # (The meaning of the third get_content_nodes argument is not visible here.)
        if has_name and node_test.name != '*':
            possible_parents = context.document.get_persistence().get_content_nodes(
                node_test.name,
                axis_node,
                True
            )
            for possible_parent in possible_parents:
                current = axis_node
                while current is not None:
                    ancestor = current.get_parent()
                    if ancestor is not None and ancestor.id == possible_parent.id:
                        return [possible_parent]
                    current = ancestor

        return []

    def _passes_predicates(self, node: ContentNode, ignore_index: bool, variables: Dict, context: SelectorContext) -> bool:
        """Return True only when the node satisfies every predicate of this step."""
        for predicate in self.predicates:
            if isinstance(predicate, int):
                # An index predicate must equal the node's index, unless it is the
                # lone index predicate that is deliberately ignored.
                if not ignore_index and predicate != node.index:
                    return False
            elif not predicate.resolve(node, variables, context):
                return False

        # A failed predicate returns early above, so a later matching index
        # predicate can no longer turn a rejected node back into a match.
        return True
443
+
444
+
445
class NameTest:
    """Node test that matches by element name, with ``*`` as a wildcard."""

    def __init__(self, prefix: Optional[str], name: str):
        """Store the prefix and local name to test against.

        Args:
            prefix: The namespace prefix, or None if unspecified
            name: The local element name
        """
        self.name = name
        self.prefix = prefix

    def test(self, obj: Union[ContentNode, ContentFeature], variables: Dict, context: SelectorContext) -> Union[bool, List[ContentNode]]:
        """Apply the name test to a content node or a feature.

        Args:
            obj: The node or feature to test
            variables: Variable bindings
            context: The selector context

        Returns:
            For a content node, the list of matching nodes; for a feature, a
            truth value; for anything else, False
        """
        wildcard = self.name == "*"

        if isinstance(obj, ContentNode):
            if context.stream > 0:
                # Inside a pipeline only the node itself is considered.
                if wildcard or self.name == obj.node_type:
                    return [obj]
                return []

            # Outside a pipeline the candidates come from the persistence layer.
            # The third argument is False only when the last operator was "/".
            found = context.document.get_persistence().get_content_nodes(
                self.name, obj, context.last_op != "/"
            )

            # The current node goes first when it matches the name itself.
            if wildcard or self.name == obj.node_type:
                found = [obj] + found

            # For a concrete name keep only exact node-type matches.
            if not wildcard:
                found = [candidate for candidate in found if candidate.node_type == self.name]

            if context.first_only:
                return found[:1]
            return found

        if isinstance(obj, ContentFeature):
            if wildcard:
                return True
            # A feature matches when its type equals the prefix and its name matches.
            return obj.feature_type == self.prefix and obj.name == self.name

        return False
498
+
499
+
500
class NodeType:
    """Node test that selects by node type rather than by name."""

    def __init__(self, name: str, literal: Optional[str] = None):
        """Record the node type and its optional literal argument.

        Args:
            name: The node type name, such as node or text
            literal: The literal argument (for processing-instruction type)
        """
        self.literal = literal
        self.name = name
512
+
513
+
514
class AbbreviatedStep:
    """An abbreviated XPath step (. or ..)."""

    def __init__(self, abbr: str):
        """Initialize a new AbbreviatedStep.

        Args:
            abbr: The abbreviated step (. or ..)
        """
        self.abbr = abbr

    def resolve(self, content_node: ContentNode, variables: Dict, context: SelectorContext) -> List[ContentNode]:
        """Resolve this abbreviated step.

        Args:
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            ``[content_node]`` for ``.``; for ``..`` a one-element list holding
            the parent, or an empty list when there is no parent

        Raises:
            Exception: If the abbreviation is neither ``.`` nor ``..``
        """
        if self.abbr == ".":
            return [content_node]

        if self.abbr == "..":
            # Look the parent up once and test for None explicitly, so a parent
            # object that happens to be falsy is still returned.
            parent = content_node.get_parent()
            return [parent] if parent is not None else []

        raise Exception(f"Unsupported abbreviated step: {self.abbr}")
541
+
542
+
543
class VariableReference:
    """A reference to an XPath variable such as ``$foo`` or ``$myns:foo``."""

    def __init__(self, name: Tuple[Optional[str], str]):
        """Store the variable name.

        Args:
            name: A tuple (prefix, localname) containing the variable name
        """
        self.name = name

    def resolve(self, variables: Dict, context: SelectorContext) -> Any:
        """Look the variable up by its local name; the prefix is not used.

        Args:
            variables: Variable bindings
            context: The selector context

        Returns:
            The bound value, or None when the variable is not bound
        """
        local_name = self.name[1]
        return variables[local_name] if local_name in variables else None
568
+
569
+
570
class FunctionCall:
    """An XPath function call (e.g., foo(), my:foo(1), foo(1, 'a', $var))."""

    def __init__(self, prefix: Optional[str], name: str, args: List):
        """Initialize a new FunctionCall.

        Args:
            prefix: The namespace prefix, or None if unspecified
            name: The local function name
            args: A list of argument expressions
        """
        self.prefix = prefix
        self.name = name
        self.args = args

    def resolve(self, content_node: "ContentNode", variables: Dict, context: "SelectorContext") -> Any:
        """Resolve this function call.

        Every argument is evaluated first, then the function is selected by
        ``self.name``; the prefix is not consulted.

        Args:
            content_node: The current content node
            variables: Variable bindings
            context: The selector context

        Returns:
            The result of the function call, or an empty list when the
            function name is not recognised
        """
        args = []
        for arg in self.args:
            if isinstance(arg, VariableReference):
                # Variable references resolve without a content node.
                args.append(arg.resolve(variables, context))
            elif hasattr(arg, 'resolve'):
                args.append(arg.resolve(content_node, variables, context))
            else:
                # Anything without a resolve method is passed through as-is.
                args.append(arg)

        if self.name == "true":
            return True

        if self.name == "false":
            return False

        if self.name == "contentRegex":
            compiled_pattern = context.cache_pattern(args[0])

            content_to_test = content_node.content

            # A truthy second argument switches to get_all_content().
            if len(args) > 1:
                if bool(args[1]):
                    content_to_test = content_node.get_all_content()

            # re.match anchors at the start of the text only.
            if content_to_test is not None and compiled_pattern.match(content_to_test):
                return content_to_test

            return None

        if self.name == "typeRegex":
            compiled_pattern = context.cache_pattern(args[0])
            if content_node.node_type is not None and compiled_pattern.match(
                content_node.node_type
            ):
                return content_node.node_type

            return None

        if self.name == "tagRegex":
            compiled_pattern = context.cache_pattern(args[0])
            for feature in content_node.get_features_of_type("tag"):
                if feature.name is not None and compiled_pattern.match(feature.name):
                    return True

            return False

        if self.name == "hasTag":
            if len(args) > 0:
                # Check for a specific tag
                return content_node.has_feature("tag", args[0])

            # No argument: true when the node carries any tag at all.
            return len(content_node.get_tags()) > 0

        if self.name == "hasFeature":
            if len(args) == 0:
                return len(content_node.get_features()) > 0

            # NOTE(review): a single-argument call raises IndexError here -
            # confirm whether hasFeature(type) should be supported.
            return content_node.has_feature(args[0], args[1])

        if self.name == "hasFeatureValue":
            values = content_node.get_feature_values(args[0], args[1])
            if not values:
                return False
            return any(value == args[2] for value in values)

        if self.name == "content":
            return content_node.content

        if self.name == "id":
            return content_node.id

        if self.name == "node_type":
            return content_node.node_type

        if self.name == "index":
            return content_node.index

        return []
@@ -0,0 +1,29 @@
1
+ from antlr4.error.ErrorListener import ErrorListener
2
+
3
class KodexaSyntaxErrorListener(ErrorListener):
    """ANTLR error listener that records syntax errors as formatted strings."""

    def __init__(self):
        super().__init__()
        # One "line L:C message" string per reported syntax error, in order.
        self.errors = []

    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
        """Record one syntax error.

        Only ``line``, ``column`` and ``msg`` are used; the other arguments
        are accepted and ignored.
        """
        self.errors.append(f"line {line}:{column} {msg}")

    def hasErrors(self):
        """Return True when at least one syntax error has been recorded."""
        return bool(self.errors)

    def getErrors(self):
        """Return the list of recorded error messages (the live list, not a copy)."""
        return self.errors