kodexa-document 7.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodexa-document might be problematic. Click here for more details.

@@ -0,0 +1,3642 @@
1
+ """
2
+ The core model provides definitions for all the base objects in the Kodexa Content Model
3
+ """
4
+
5
+ import dataclasses
6
+ import datetime
7
+ import inspect
8
+ import json
9
+ import os
10
+ import re
11
+ import uuid
12
+ from enum import Enum
13
+ from typing import Any, List, Optional, Union
14
+ from addict import Dict
15
+ import deepdiff
16
+ import msgpack
17
+ from pydantic import BaseModel, ConfigDict, Field
18
+ from typing import Optional, Annotated
19
+
20
+ from pydantic import BaseModel, Field, WithJsonSchema, PlainSerializer, ConfigDict
21
+
22
+
23
def to_camel(string: str) -> str:
    """Join the underscore-separated words of ``string`` with each word capitalised.

    Every word is passed through ``str.capitalize`` (first character upper-cased,
    remaining characters lower-cased) and the results are concatenated without a
    separator, so the first word is capitalised as well:
    ``"change_sequence"`` becomes ``"ChangeSequence"``.

    Args:
        string: The underscore-separated text to convert.

    Returns:
        The concatenated, capitalised words.
    """
    return "".join(map(str.capitalize, string.split("_")))
25
+
26
+
27
def _serialize_standard_datetime(v):
    """Render ``v`` as ``YYYY-MM-DDTHH:MM:SS.mmmZ``; strings are returned unchanged."""
    if isinstance(v, str):
        return v
    # ``%f`` yields microseconds (6 digits); dropping the last 3 leaves milliseconds.
    return v.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"


# Annotated type used for timestamp fields: values are serialised to a string by
# ``_serialize_standard_datetime`` and advertised as ``{"type": "datetime"}`` in the
# serialization JSON schema.
# NOTE(review): among the imports visible in this file ``datetime`` is bound by
# ``import datetime`` (the module), not the ``datetime.datetime`` class, so the
# annotated base type is a module object. Confirm whether ``datetime.datetime``
# was intended before changing it, since that would alter validation.
StandardDateTime = Annotated[
    datetime,
    PlainSerializer(_serialize_standard_datetime, return_type=str),
    WithJsonSchema({"type": "datetime"}, mode="serialization"),
]
39
+
40
+
41
class ContentType(Enum):
    """The content type of a content object: a document or a native file.

    Attributes:
        document: Serialised as ``"DOCUMENT"``.
        native: Serialised as ``"NATIVE"``.
    """

    document = "DOCUMENT"
    native = "NATIVE"
46
+
47
+
48
class Label(BaseModel):
    """The labels from the latest content object in the family.

    Fields with a camelCase alias can be populated either by alias or by their
    Python name (``populate_by_name=True``). ``name`` and ``label`` are required;
    every other field defaults to ``None``.
    """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    id: Optional[str] = None
    uuid: Optional[str] = None
    change_sequence: Optional[int] = Field(default=None, alias="changeSequence")
    created_on: Optional[StandardDateTime] = Field(default=None, alias="createdOn")
    updated_on: Optional[StandardDateTime] = Field(default=None, alias="updatedOn")
    name: str
    color: Optional[str] = None
    label: str
70
+
71
+
72
class TaxonValidation(BaseModel):
    """Validation definition made up of a name, description, formulas and flags.

    All fields are optional and default to ``None``. Fields with a camelCase alias
    can be populated by alias or by Python name (``populate_by_name=True``).
    """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    name: Optional[str] = None
    description: Optional[str] = None
    rule_formula: Optional[str] = Field(default=None, alias="ruleFormula")
    message_formula: Optional[str] = Field(default=None, alias="messageFormula")
    detail_formula: Optional[str] = Field(default=None, alias="detailFormula")
    exception_id: Optional[str] = Field(default=None, alias="exceptionId")
    support_article_id: Optional[str] = Field(default=None, alias="supportArticleId")
    overridable: Optional[bool] = None
    disabled: Optional[bool] = None
89
+
90
+
91
class DocumentTaxonValidation(BaseModel):
    """Pairs a ``TaxonValidation`` with a taxonomy reference and a taxon path.

    All fields are optional and default to ``None``; aliased fields accept either
    the camelCase alias or the Python name.
    """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    taxonomy_ref: Optional[str] = Field(default=None, alias="taxonomyRef")
    taxon_path: Optional[str] = Field(default=None, alias="taxonPath")
    validation: Optional[TaxonValidation] = None
102
+
103
+
104
class ContentFeature(BaseModel):
    """Pydantic model of a feature: its type, name, value list and ``single`` flag.

    All fields are optional and default to ``None``.
    """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    feature_type: Optional[str] = Field(default=None, alias="featureType")
    name: Optional[str] = None
    # NOTE(review): ``Dict`` in this module comes from ``from addict import Dict``
    # (the visible ``typing`` import does not include ``Dict``) - confirm this is
    # the intended mapping type.
    value: Optional[List[Dict[str, Any]]] = None
    single: Optional[bool] = None
117
+
118
+
119
class ContentObject(BaseModel):
    """Pydantic model of a content object and its descriptive fields.

    ``content_type`` is the only required field. ``labels`` and ``mixins`` default
    to a fresh empty list per instance; every other field defaults to ``None``.
    Because ``use_enum_values=True`` is set, ``content_type`` holds the enum's
    value rather than the ``ContentType`` member after validation.
    """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    id: Optional[str] = None
    uuid: Optional[str] = None
    change_sequence: Optional[int] = Field(default=None, alias="changeSequence")
    created_on: Optional[StandardDateTime] = Field(default=None, alias="createdOn")
    updated_on: Optional[StandardDateTime] = Field(default=None, alias="updatedOn")
    content_type: ContentType = Field(
        ..., alias="contentType", description="The type of content"
    )
    document_version: Optional[str] = Field(default=None, alias="documentVersion")
    index: Optional[int] = None
    labels: Optional[List[Label]] = Field(default_factory=list)
    # NOTE(review): ``Dict`` here is ``addict.Dict`` (see ``from addict import Dict``).
    metadata: Optional[Dict[str, Any]] = None
    mixins: Optional[List[str]] = Field(default_factory=list)
    created: Optional[StandardDateTime] = None
    modified: Optional[StandardDateTime] = None
    size: Optional[int] = None
    store_ref: Optional[str] = Field(default=None, alias="storeRef")
    document_family_id: Optional[str] = Field(default=None, alias="documentFamilyId")
146
+
147
+
148
class NodeFeatures(BaseModel):
    """Associates a node UUID with a list of ``ContentFeature`` entries.

    Both fields are optional and default to ``None``.
    """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    node_uuid: Optional[str] = Field(default=None, alias="nodeUuid")
    features: Optional[List[ContentFeature]] = None
159
+
160
+
161
class FeatureSet(BaseModel):
    """An owner URI together with a list of per-node feature collections.

    Both fields are optional and default to ``None``.
    """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    owner_uri: Optional[str] = Field(default=None, alias="ownerUri")
    node_features: Optional[List[NodeFeatures]] = Field(
        default=None, alias="nodeFeatures"
    )
172
+
173
+
174
class Ref:
    """
    A parsed reference of the form ``org_slug/slug[:version[/resource]]``.

    Attributes
    ----------
    ref : str
        the original reference string
    version : str, optional
        the text after ``:`` (up to an optional ``/``), default is None
    resource : str, optional
        the text after the ``/`` that follows the version, default is None
    slug : str
        the part after the ``/`` in the leading ``org_slug/slug`` section
    org_slug : str
        the part before the ``/`` in the leading ``org_slug/slug`` section
    object_ref : str
        ``org_slug/slug:version`` when a non-empty version is present,
        otherwise ``org_slug/slug``

    Methods
    -------
    __init__(self, ref: str)
        Parses the reference and populates the attributes above.
    """

    def __init__(self, ref: str):
        """Parse ``ref`` into its components.

        Args:
            ref: The reference text, e.g. ``"org/slug"`` or ``"org/slug:1.0.0/res"``.

        Raises:
            ValueError: If ``ref`` contains more than one ``:``, the version part
                contains more than one ``/``, or the leading part is not exactly
                ``org_slug/slug``. (These inputs already raised ``ValueError``
                from tuple unpacking; the message now names the offending ref.)
        """
        self.ref: str = ref
        self.version: Optional[str] = None
        self.resource: Optional[str] = None
        self.slug: str = ""
        self.org_slug: str = ""

        first_part = ref
        if ":" in ref:
            colon_pieces = ref.split(":")
            if len(colon_pieces) != 2:
                raise ValueError(
                    f"Invalid reference '{ref}': expected at most one ':' separator"
                )
            first_part, self.version = colon_pieces

            # A resource may follow the version, separated by a single '/'.
            if "/" in self.version:
                version_pieces = self.version.split("/")
                if len(version_pieces) != 2:
                    raise ValueError(
                        f"Invalid reference '{ref}': expected 'version/resource' after ':'"
                    )
                self.version, self.resource = version_pieces

        slug_pieces = first_part.split("/")
        if len(slug_pieces) != 2:
            raise ValueError(
                f"Invalid reference '{ref}': expected 'org_slug/slug' before any ':'"
            )
        self.org_slug, self.slug = slug_pieces

        # An empty version (e.g. "org/slug:") is treated the same as no version.
        if self.version:
            self.object_ref = f"{self.org_slug}/{self.slug}:{self.version}"
        else:
            self.object_ref = f"{self.org_slug}/{self.slug}"
220
+
221
+
222
+ import addict
223
+
224
+
225
class DocumentMetadata(addict.Dict):
    """A flexible dict based approach to capturing metadata for the document.

    Subclasses ``addict.Dict`` without adding behaviour: the constructor forwards
    every positional and keyword argument to the parent class unchanged.

    Args:
        *args: Positional arguments passed through to ``addict.Dict``.
        **kwargs: Keyword arguments passed through to ``addict.Dict``.
    """

    def __init__(self, *args, **kwargs):
        super(DocumentMetadata, self).__init__(*args, **kwargs)
240
+
241
+
242
class ContentException(dict):
    """A content exception represents an issue identified during labeling or validation at the document level.

    The class derives from ``dict``, but the values below are stored as instance
    attributes, not as dictionary items; only the extra ``*args``/``**kwargs``
    are forwarded to the ``dict`` constructor.

    Attributes:
        exception_type (str): Type of the exception.
        message (str): Message describing the exception.
        severity (str): Severity level of the exception, default is 'ERROR'.
        tag (Optional[str]): Tag associated with the exception.
        group_uuid (Optional[str]): UUID of the group associated with the exception.
        tag_uuid (Optional[str]): UUID of the tag associated with the exception.
        exception_type_id (Optional[str]): ID of the exception type.
        exception_details (Optional[str]): Detailed information about the exception.
        node_uuid (Optional[str]): UUID of the node associated with the exception.
        value (Optional[str]): Value associated with the exception.
        boolean_value (Optional[bool]): Boolean value associated with the exception.
    """

    def __init__(
        self,
        exception_type: str,
        message: str,
        severity: str = "ERROR",
        tag: Optional[str] = None,
        group_uuid: Optional[str] = None,
        tag_uuid: Optional[str] = None,
        exception_type_id: Optional[str] = None,
        exception_details: Optional[str] = None,
        node_uuid: Optional[str] = None,
        value: Optional[str] = None,
        boolean_value: Optional[bool] = None,
        *args,
        **kwargs,
    ):
        # Anything beyond the named parameters becomes the dict content.
        dict.__init__(self, *args, **kwargs)

        # What happened.
        self.exception_type = exception_type
        self.exception_type_id = exception_type_id
        self.message = message
        self.exception_details = exception_details
        self.severity = severity

        # Where it happened.
        self.tag = tag
        self.tag_uuid = tag_uuid
        self.group_uuid = group_uuid
        self.node_uuid = node_uuid

        # Associated values.
        self.value = value
        self.boolean_value = boolean_value
288
+
289
+
290
class Tag(object):
    """A class to represent the metadata for a label that is applied as a feature on a content node.

    Attributes:
        start (Optional[int]): The start position (zero indexed) of the content within the node. If None, label is applied to the whole node.
        end (Optional[int]): The end position (zero indexed) of the content within the node. If None, label is applied to the whole node.
        value (Optional[str]): A string representing the value that was labelled in the node.
        data (Optional[Any]): Any data object (JSON serializable) that you wish to associate with the label.
        uuid (Optional[str]): The UUID for this tag instance; a random UUID4 string is generated when none (or an empty value) is supplied.
        confidence (Optional[float]): The confidence of the tag in a range of 0-1.
        index (Optional[int]): The tag index, used to order tags.
        bbox (Optional[List[int]]): The optional bounding box that can be used if the label is spatial.
        group_uuid (Optional[str]): The UUID of the group that this tag belongs to.
        parent_group_uuid (Optional[str]): The UUID of the parent group that this tag belongs to.
        cell_index (Optional[int]): The cell index; when not supplied it is taken from ``data["cell_index"]`` if ``data`` is a dict containing that key.
        note (Optional[str]): A note that can be associated with the tag.
        status (Optional[str]): The status of the tag.
        owner_uri (Optional[str]): The URI of the owner (ie. model://kodexa/narrative:1.0.0 or user://pdodds).
        is_dirty (Optional[bool]): Dirty flag stored as given.
    """

    def __init__(
        self,
        start: Optional[int] = None,
        end: Optional[int] = None,
        value: Optional[str] = None,
        uuid: Optional[str] = None,
        data: Any = None,
        *args,
        confidence: Optional[float] = None,
        group_uuid: Optional[str] = None,
        parent_group_uuid: Optional[str] = None,
        cell_index: Optional[int] = None,
        index: Optional[int] = None,
        bbox: Optional[List[int]] = None,
        note: Optional[str] = None,
        status: Optional[str] = None,
        owner_uri: Optional[str] = None,
        is_dirty: Optional[bool] = None,
        **kwargs,
    ):
        # The ``uuid`` parameter shadows the module of the same name, so the
        # module is imported locally under an alias.
        import uuid as uuid_gen

        # Extra positional/keyword arguments (*args, **kwargs) are accepted and ignored.
        self.start = start
        self.end = end
        self.value = value
        self.data = data
        self.uuid = uuid if uuid else str(uuid_gen.uuid4())
        self.confidence = confidence
        self.index = index
        self.bbox = bbox
        self.group_uuid = group_uuid
        self.parent_group_uuid = parent_group_uuid
        self.cell_index = cell_index
        self.note = note
        self.status = status
        self.owner_uri = owner_uri
        self.is_dirty = is_dirty

        # Fall back to the cell index carried in the data payload, if any.
        payload_has_cell_index = (
            bool(data) and isinstance(data, dict) and "cell_index" in data
        )
        if self.cell_index is None and payload_has_cell_index:
            self.cell_index = data["cell_index"]

    def to_dict(self):
        """
        Create a dictionary representing this Tag's structure and content.

        ``uuid`` is always present; every other attribute is included only when
        its value is not ``None``.

        Returns:
            dict: The properties of this Tag structured as a dictionary.
        """
        optional_fields = (
            "start",
            "end",
            "value",
            "data",
            "confidence",
            "index",
            "bbox",
            "group_uuid",
            "parent_group_uuid",
            "cell_index",
            "note",
            "status",
            "owner_uri",
            "is_dirty",
        )

        result = {"uuid": self.uuid}
        for field_name in optional_fields:
            field_value = getattr(self, field_name)
            if field_value is not None:
                result[field_name] = field_value
        return result
427
+
428
+
429
class FindDirection(Enum):
    """
    The direction in which a search moves through a tree structure.

    Attributes:
        CHILDREN (int): Search towards the children nodes (value ``1``).
        PARENT (int): Search towards the parent node (value ``2``).
    """

    CHILDREN = 1
    PARENT = 2
441
+
442
+
443
class Traverse(Enum):
    """
    The kinds of traversal that can be requested.

    Attributes:
        SIBLING (int): Traverse to a sibling (value ``1``).
        CHILDREN (int): Traverse to children (value ``2``).
        PARENT (int): Traverse to a parent (value ``3``).
        ALL (int): Traverse to all types of nodes (value ``4``).
    """

    SIBLING = 1
    CHILDREN = 2
    PARENT = 3
    ALL = 4
459
+
460
+
461
+ class ContentNode(object):
462
+ """A Content Node identifies a section of the document containing logical
463
+ grouping of information.
464
+
465
+ The node will have content and can include any number of features.
466
+
467
+ You should always create a node using the Document's create_node method to
468
+ ensure that the correct mixins are applied.
469
+
470
+ >>> new_page = document.create_node(node_type='page')
471
+ <kodexa_document.model.ContentNode object at 0x7f80605e53c8>
472
+ >>> current_content_node.add_child(new_page)
473
+
474
+ >>> new_page = document.create_node(node_type='page', content='This is page 1')
475
+ <kodexa_document.model.ContentNode object at 0x7f80605e53c8>
476
+ >>> current_content_node.add_child(new_page)
477
+
478
+ """
479
+
480
+ def __init__(
481
+ self,
482
+ document,
483
+ node_type: str,
484
+ id: Optional[int] = None,
485
+ content: Optional[str] = None,
486
+ content_parts: Optional[List[Any]] = None,
487
+ parent=None,
488
+ index: Optional[int] = None,
489
+ virtual: bool = False,
490
+ ):
491
+ self.id: Optional[int] = id
492
+ """The ID of the content node"""
493
+ self.node_type: str = node_type
494
+ """The node type (ie. line, page, cell etc)"""
495
+ self.document: Document = document
496
+ """The document that the node belongs to"""
497
+ self._content_parts: Optional[List[Any]] = content_parts
498
+ """The children of the content node"""
499
+ self._index: Optional[int] = index
500
+ """The index of the content node"""
501
+ self.id: Optional[int] = None
502
+ """The ID of the content node"""
503
+ self.virtual: bool = virtual
504
+ """Is the node virtual (ie. it doesn't actually exist in the document)"""
505
+
506
+ self._parent_id = parent.id if parent else None
507
+
508
+ if content_parts is not None:
509
+ self.set_content_parts(content_parts)
510
+ self._content_parts = self.get_content_parts()
511
+
512
+ if content is not None and len(self._content_parts) == 0:
513
+ self.set_content_parts([content])
514
+
515
@property
def id(self) -> Optional[int]:
    """Return the ID stored for this node (backed by ``_id``)."""
    return self._id


@id.setter
def id(self, value: Optional[int]):
    """Store ``value`` as this node's ID (backed by ``_id``)."""
    self._id = value
524
+
525
@property
def index(self) -> Optional[int]:
    """Return the index stored for this node (backed by ``_index``)."""
    return self._index


@index.setter
def index(self, value: Optional[int]):
    """Store ``value`` as this node's index (backed by ``_index``)."""
    self._index = value
534
+
535
def get_content_parts(self):
    """Return this node's content parts as held by the document's persistence layer."""
    persistence = self.document.get_persistence()
    return persistence.get_content_parts(self)
537
+
538
def set_content_parts(self, content_parts):
    """Hand ``content_parts`` to the document's persistence layer as this node's parts."""
    persistence = self.document.get_persistence()
    persistence.update_content_parts(self, content_parts)
540
+
541
def update(self):
    """
    Ask the document's persistence layer to update this node.

    :return: None
    """
    persistence = self.document.get_persistence()
    persistence.update_node(self)
548
+
549
@property
def content(self):
    """Return the node's string content parts joined by single spaces.

    Non-string parts are skipped. Returns None when the node has no content
    parts at all, and an empty string when it has parts but none are strings.
    """
    # Fetch once: every call to get_content_parts() goes to persistence.
    parts = self.get_content_parts()
    if len(parts) == 0:
        return None

    text = ""
    for part in parts:
        if not isinstance(part, str):
            continue
        # A separator is added only once some text has been accumulated, so
        # leading empty strings do not produce a leading space.
        if text != "":
            text += " "
        text += part

    return text


@content.setter
def content(self, new_content):
    """Replace the node's string content with ``new_content``.

    When the node has no parts, ``[new_content]`` is stored as-is. Otherwise
    all non-``int`` parts are dropped, ``new_content`` is placed at the front
    (unless it is None or empty) and the result is stored.
    """
    # Fetch once and reuse for both the emptiness check and the filtering.
    parts = self.get_content_parts()
    if len(parts) == 0:
        # NOTE(review): a None value is stored as [None] here, unlike the
        # branch below which skips None/"" - confirm this is intended.
        self.set_content_parts([new_content])
        return

    # Keep only the int parts, then put the new string back at the front.
    kept_parts = [part for part in parts if isinstance(part, int)]
    if new_content is not None and new_content != "":
        kept_parts.insert(0, new_content)
    self.set_content_parts(kept_parts)
575
+
576
+ def __eq__(self, other):
577
+ return (
578
+ other is not None
579
+ and self.id == other.id
580
+ and (self.id is not None and other.id is not None)
581
+ )
582
+
583
+ def __hash__(self):
584
+ return hash(self.id)
585
+
586
def get_parent(self) -> Optional["ContentNode"]:
    """Return whatever the document's persistence layer reports as this node's parent."""
    persistence = self.document.get_persistence()
    return persistence.get_parent(self)
588
+
589
+ def __str__(self):
590
+ return (
591
+ f"ContentNode {self.id} [node_type:{self.node_type}] ({len(self.get_features())} features, {len(self.get_children())} children) ["
592
+ + str(self.content)
593
+ + "]"
594
+ )
595
+
596
def to_json(self):
    """Serialise this ContentNode to a JSON string.

    The node is first converted with ``to_dict()`` and the result is passed to
    ``json.dumps`` with default settings.

    Returns:
        str: The JSON formatted string representation of this ContentNode.

    >>> node.to_json()
    """
    as_dict = self.to_dict()
    return json.dumps(as_dict)
607
+
608
def to_dict(self):
    """Build a dictionary describing this ContentNode, its features and its children.

    Features and children are converted by calling ``to_dict()`` on each of
    them; the node's ``id`` is written under the ``"uuid"`` key.

    Returns:
        dict: The properties of this ContentNode and all of its children structured as a dictionary.

    >>> node.to_dict()
    """
    node_dict = {
        "node_type": self.node_type,
        "content": self.content,
        "content_parts": self.get_content_parts(),
        "features": [],
        "index": self.index,
        "children": [],
        "uuid": self.id,
    }
    node_dict["features"].extend(
        feature.to_dict() for feature in self.get_features()
    )
    node_dict["children"].extend(child.to_dict() for child in self.get_children())
    return node_dict
633
+
634
@staticmethod
def from_dict(
    document: "Document",
    content_node_dict: dict,
    parent: Optional["ContentNode"] = None,
):
    """Build a new ContentNode (and, recursively, its children) from a dictionary.

    Args:
        document (Document): The document used to create the node via ``create_node``.
        content_node_dict (dict): The dictionary representation of a ContentNode.
            Must contain ``"index"``, ``"features"`` and ``"children"``; the node
            type is read from ``"type"`` when the document's version equals
            ``Document.PREVIOUS_VERSION`` and from ``"node_type"`` otherwise.
        parent (Optional[ContentNode]): Optionally the parent content node.

    Returns:
        ContentNode: The node created from ``content_node_dict``.

    Raises:
        ValueError: If a tag feature contains a non-empty list as one of its values.
        IndexError: If a feature name does not contain a ``:`` separator.

    >>> ContentNode.from_dict(document, content_node_dict)
    """
    if document.version == Document.PREVIOUS_VERSION:
        node_type = content_node_dict["type"]
    else:
        node_type = content_node_dict["node_type"]

    new_content_node = document.create_node(
        node_type=node_type,
        content=content_node_dict.get("content"),
        index=content_node_dict["index"],
        parent=parent,
    )

    # Truthiness covers both a missing key and an empty/None value.
    content_parts = content_node_dict.get("content_parts")
    if content_parts:
        new_content_node.set_content_parts(content_parts)

    for dict_feature in content_node_dict["features"]:
        # Split on the FIRST colon only, so a feature name that itself
        # contains ':' is kept whole instead of being truncated.
        name_pieces = dict_feature["name"].split(":", 1)
        feature_type = name_pieces[0]
        feature_name = name_pieces[1]
        feature_value = dict_feature["value"]

        # A list means "several values"; anything else is one value.
        if isinstance(feature_value, list):
            values = feature_value
        else:
            values = [feature_value]

        for value in values:
            if feature_type == "tag" and not isinstance(value, Tag):
                if isinstance(value, list):
                    # An empty list stands for a tag with no properties.
                    if len(value) != 0:
                        raise ValueError(
                            f"Tag values cannot be a list of lists {value}"
                        )
                    value = {}
                value = Tag(**value)
            new_content_node.add_feature(feature_type, feature_name, value)

    for dict_child in content_node_dict["children"]:
        ContentNode.from_dict(document, dict_child, new_content_node)

    return new_content_node
725
+
726
def add_child_content(
    self, node_type: str, content: str, index: Optional[int] = None
) -> "ContentNode":
    """Create a node of ``node_type`` holding ``content`` and add it as a child.

    The node is created through the document's ``create_node`` with this node
    as parent, then passed to ``add_child`` together with ``index``.

    Args:
        node_type: the node type
        content: the content
        index: the index (optional) (Default value = None)

    Returns:
        the new ContentNode

    """
    creation_args = {"node_type": node_type, "parent": self, "content": content}
    child_node = self.document.create_node(**creation_args)
    self.add_child(child_node, index)
    return child_node
745
+
746
def add_child(self, child: "ContentNode", index: Optional[int] = None):
    """Add a ContentNode as a child of this ContentNode

    Virtual children are only given an index (``index`` or 0) and this node's
    ID as parent; nothing is written to persistence for them. For other
    children, a None ``index`` appends after the existing children, while an
    explicit ``index`` first increments (and persists) the index of every
    existing child at or after that position.

    Args:
        child (ContentNode): The node that will be added as a child of this node
        index (Optional[int]): The index at which this child node should be added; defaults to None. If None, index is set as the count of child node elements.

    Returns:

    >>> new_page = document.create_node(node_type='page')
    <kodexa_document.model.ContentNode object at 0x7f80605e53c8>
    >>> current_content_node.add_child(new_page)
    """
    if child.virtual:
        # Virtual nodes bypass persistence and index shifting entirely.
        child.index = 0 if index is None else index
        child._parent_id = self.id if self.id else None
        return

    siblings = self.get_children()

    if index is None:
        # Append: the new child goes after all current children.
        target_index = len(siblings)
    else:
        # Insert: assumes a non-negative index.
        target_index = index

        # Shift from the highest index down so two siblings never share an
        # index while the updates are being persisted one at a time.
        displaced = sorted(
            (
                sibling
                for sibling in siblings
                if sibling.index is not None and sibling.index >= target_index
            ),
            key=lambda sibling: sibling.index,
            reverse=True,
        )
        for sibling in displaced:
            sibling.index += 1
            self.document.get_persistence().update_node(sibling)

    # The index must be set before the child is handed to persistence.
    child.index = target_index
    self.document.get_persistence().add_content_node(child, self)
801
+
802
def remove_child(self, content_node):
    """Ask the document's persistence layer to remove ``content_node``."""
    persistence = self.document.get_persistence()
    persistence.remove_content_node(content_node)
804
+
805
def get_children(self) -> List["ContentNode"]:
    """Return this node's children as reported by the document's persistence layer.

    Returns:
        list[ContentNode]: The list of child nodes for this ContentNode.

    >>> node.get_children()
    """
    persistence = self.document.get_persistence()
    return persistence.get_children(self)
814
+
815
def set_feature(self, feature_type, name, value):
    """Set a feature on this ContentNode, first removing any feature with the same type and name.

    Args:
        feature_type (str): The type of feature to be added to the node.
        name (str): The name of the feature.
        value (Any): The value of the feature.

    Returns:
        ContentFeature: The feature that was added to this ContentNode
            (the return value of ``add_feature``).

    >>> new_page = document.create_node(node_type='page')
    <kodexa_document.model.ContentNode object at 0x7f80605e53c8>
    >>> new_page.set_feature('pagination','pageNum',1)
    """
    # Clear the old entry, then add the replacement.
    self.remove_feature(feature_type, name)
    added_feature = self.add_feature(feature_type, name, value)
    return added_feature
832
+
833
def update_feature(self, feature: "ContentFeature"):
    """
    Update a feature on this node in document persistence.

    The persisted feature with the same type and name is removed, then
    ``feature`` is added with ``replace=True``.

    :param feature: the feature to store; its ``feature_type`` and ``name`` are read
    :return: None
    """
    feature_type, feature_name = feature.feature_type, feature.name
    self.document.get_persistence().remove_feature(self, feature_type, feature_name)
    self.document.get_persistence().add_feature(self, feature, replace=True)
844
+
845
def add_feature(self, feature_type, name, value):
    """
    Add a new feature to this ContentNode.

    For ``feature_type == "tag"`` the value is normalised to ``Tag`` objects: a
    non-Tag value is expanded with ``Tag(**value)``, and in a list every non-Tag
    item is expanded the same way. Other feature types are passed through as-is.

    The feature is handed to persistence with ``replace=False``.
    NOTE(review): earlier documentation said non-tag features replace an
    existing feature of the same type and name; whether that happens depends on
    the persistence layer, which is not visible here - confirm.

    Args:
        feature_type (str): The type of feature to be added to the node.
        name (str): The name of the feature.
        value (Any): The value of the feature (e.g., a Tag object for tags, or other data for non-tags).

    Returns:
        ContentFeature: The ContentFeature object that was constructed and passed to persistence.
    """
    if feature_type == "tag" and not isinstance(value, Tag):
        if isinstance(value, list):
            value = [item if isinstance(item, Tag) else Tag(**item) for item in value]
        else:
            value = Tag(**value)

    new_feature = ContentFeature(feature_type, name, value)
    persistence = self.document.get_persistence()
    persistence.add_feature(self, new_feature, replace=False)
    return new_feature
876
+
877
def delete_children(
    self, nodes: Optional[List] = None, exclude_nodes: Optional[List] = None
):
    """Delete children of this node, selected by an include list or an exclude list.

    If ``nodes`` is given, only children whose ID matches one of them are
    deleted and ``exclude_nodes`` is ignored. Otherwise, if ``exclude_nodes`` is
    given, every child whose ID matches none of them is deleted (an empty
    list deletes all children). With neither argument all children are deleted.

    Args:
        nodes: Optional[List[ContentNode]] a list of content nodes that are children to delete
        exclude_nodes: Optional[List[ContentNode]] a list of content node that are children not to delete
    """
    children = self.get_children()

    if nodes is not None:
        wanted_ids = {node.id for node in nodes}
        children_to_delete = [child for child in children if child.id in wanted_ids]
    elif exclude_nodes is not None:
        # A child is kept when it matches ANY excluded node. The previous
        # per-pair comparison queued an excluded child for deletion as soon
        # as a second, different node was also excluded.
        protected_ids = {node.id for node in exclude_nodes}
        children_to_delete = [
            child for child in children if child.id not in protected_ids
        ]
    else:
        children_to_delete = list(children)

    for child_to_delete in children_to_delete:
        # Re-check membership: an earlier removal may have changed the children.
        if child_to_delete in self.get_children():
            self.document.get_persistence().remove_content_node(child_to_delete)
912
+
913
def get_feature(self, feature_type, name):
    """Return the first feature on this node with the given type and name.

    Args:
        feature_type (str): The type of the feature.
        name (str): The name of the feature.

    Returns:
        ContentFeature or None: The feature with the specified type & name. If no feature is found, None is returned.
            Note that if there are more than one instance of the feature you will only get the first one

    >>> new_page.get_feature('pagination','pageNum')
    """
    for candidate in self.get_features():
        if candidate.feature_type == feature_type and candidate.name == name:
            return candidate
    return None
936
+
937
def get_features_of_type(self, feature_type):
    """Return every feature on this node whose ``feature_type`` equals the given type.

    Args:
        feature_type (str): The type of the feature.

    Returns:
        list[ContentFeature]: A list of feature with the specified type. If no features are found, an empty list is returned.

    >>> new_page.get_features_of_type('my_type')
    []
    """
    matching = []
    for feature in self.get_features():
        if feature.feature_type == feature_type:
            matching.append(feature)
    return matching
950
+
951
def has_feature(self, feature_type: str, name: str):
    """Report whether this node carries a feature with the given type and name.

    Args:
        feature_type (str): The type of the feature.
        name (str): The name of the feature.

    Returns:
        bool: True if at least one matching feature is present; else, False.

    >>> new_page.has_feature('pagination','pageNum')
    True
    """
    return any(
        feature.feature_type == feature_type and feature.name == name
        for feature in self.get_features()
    )
974
+
975
def get_features(self) -> List["ContentFeature"]:
    """Fetch all features attached to this ContentNode.

    The lookup is delegated to the persistence layer of the owning document.

    Returns:
        list[ContentFeature]: A list of the features on this ContentNode.

    """
    persistence = self.document.get_persistence()
    return persistence.get_features(self)
983
+
984
def remove_feature(self, feature_type: str, name: str, include_children: bool = False):
    """Remove the feature with the given type and name from this node.

    Args:
        feature_type (str): The type of the feature.
        name (str): The name of the feature.
        include_children (bool): When True, the same removal is applied
            recursively to every child of this node.

    >>> new_page.remove_feature('pagination','pageNum')
    """
    persistence = self.document.get_persistence()
    persistence.remove_feature(self, feature_type, name)

    if not include_children:
        return

    for descendant in self.get_children():
        descendant.remove_feature(feature_type, name, include_children)
1001
+
1002
def get_feature_value(self, feature_type: str, name: str) -> Optional[Any]:
    """Return the value stored on the feature with the given type and name.

    Args:
        feature_type (str): The type of the feature.
        name (str): The name of the feature.

    Returns:
        Any or None: The ``value`` attribute of the first matching feature, or
        None when this ContentNode has no such feature.

    >>> new_page.get_feature_value('pagination','pageNum')
    1
    """
    located = self.get_feature(feature_type, name)
    if located is None:
        return None
    return located.value
1019
+
1020
def get_feature_values(self, feature_type: str, name: str) -> Optional[List[Any]]:
    """Return the stored value(s) of the feature with the given type and name.

    Args:
        feature_type (str): The type of the feature.
        name (str): The name of the feature.

    Returns:
        The ``value`` attribute of the first matching feature, returned exactly
        as stored, or None if there is no such feature.

    >>> new_page.get_feature_values('pagination','pageNum')
    [1]
    """
    located = self.get_feature(feature_type, name)
    if located is None:
        return None

    # The feature's value is handed back untouched
    return located.value
1037
+
1038
def get_content(self):
    """Return the content held directly by this node.

    Returns:
        str: The value of this ContentNode's ``content`` attribute.

    >>> new_page.get_content()
    "This is page one"
    """
    own_content = self.content
    return own_content
1050
+
1051
def get_node_type(self):
    """Return the type of this node.

    Returns:
        str: The value of this ContentNode's ``node_type`` attribute.

    >>> new_page.get_node_type()
    "page"
    """
    kind = self.node_type
    return kind
1063
+
1064
def select_first(self, selector, variables=None) -> Optional["ContentNode"]:
    """Run a selector from this node and return only the first match.

    Args:
        selector (str): The selector (ie. //*)
        variables (dict, optional): A dictionary of variable name/value to use in substitution; defaults to None. Dictionary keys should match a variable specified in the selector.

    Returns:
        Optional[ContentNode]: The first matching node, or None when the selector matches nothing.

    >>> document.get_root().select_first('.')
    ContentNode

    >>> document.get_root().select_first('//*[hasTag($tagName)]', {"tagName": "div"})
    ContentNode
    """
    matches = self.select(selector, variables)
    if len(matches) > 0:
        return matches[0]
    return None
1082
+
1083
def select(self, selector, variables=None, first_only=False):
    """Resolve a selector against this node and return the matching nodes.

    Args:
        selector (str): The selector (ie. //*)
        variables (dict, optional): A dictionary of variable name/value to use in substitution; defaults to None (treated as an empty dictionary). Dictionary keys should match a variable specified in the selector.
        first_only (bool, optional): Passed through to the SelectorContext used for resolution; defaults to False.

    Returns:
        list[ContentNode]: A list of the matching content nodes. If no matches are found, the list will be empty.

    >>> document.get_root().select('.')
    [ContentNode]

    >>> document.get_root().select('//*[hasTag($tagName)]', {"tagName": "div"})
    [ContentNode]
    """
    # Imported inside the method, as in the rest of this class's selector usage
    from kodexa_document.selectors import parse
    from kodexa_document.selectors.parser import SelectorContext

    bindings = {} if variables is None else variables
    resolution_context = SelectorContext(self.document, first_only=first_only)
    return parse(selector).resolve(self, bindings, resolution_context)
1109
+
1110
def get_all_content(self, separator=" ", strip=True):
    """Get this node's content, concatenated with all of its children's content.

    The node's content parts are walked in order: string parts are appended as-is and
    integer parts are replaced with the full content of the child whose ``index`` equals
    that integer. Children that are not referenced by any content part are appended
    afterwards, in the order returned by get_children(). A separator is only inserted
    when text has already been accumulated.

    Args:
        separator(str, optional): The separator to use in joining content together; defaults to " ".
        strip(boolean, optional): Strip leading/trailing whitespace from the result (also
            applied to each child's content, since the flag is passed down); defaults to True.

    Returns:
        str: The complete content for this node concatenated with the content of all child nodes.

    Raises:
        IndexError: If a content part references a child index that no child has.

    >>> document.content_node.get_all_content()

    "This string is made up of multiple nodes"
    """
    # Fetch both collections once; they were previously re-queried inside the loops
    content_parts = self.get_content_parts()
    children = self.get_children()

    # First child wins when several share an index, matching a first-match scan
    children_by_index = {}
    for child in children:
        children_by_index.setdefault(child.index, child)

    s = ""
    for part in content_parts:
        if isinstance(part, str):
            if s != "":
                s += separator
            s += part
        if isinstance(part, int):
            if part not in children_by_index:
                raise IndexError(
                    f"Content part references child index {part} but no such child exists"
                )
            if s != "":
                s += separator
            s += children_by_index[part].get_all_content(separator, strip=strip)

    # We need to determine if we have missing children and add them to the end
    for child in children:
        if child.index not in content_parts:
            if s != "":
                s += separator
            s += child.get_all_content(separator, strip=strip)

    return s.strip() if strip else s
1148
+
1149
def adopt_children(self, nodes_to_adopt, replace=False):
    """This will take a list of content nodes and adopt them under this node, ensuring they are re-parented.

    Existing children that also appear in nodes_to_adopt are re-indexed to their position
    in that list; existing children that do not appear in it are re-indexed with a running
    counter. Nodes in the list that are not yet children are added via add_child at their
    position in the list.

    Args:
        nodes_to_adopt (List[ContentNode]): A list of ContentNodes to adopt under this node
        replace (bool): If True, will remove all current children that are not in nodes_to_adopt; defaults to False

    >>> # select all nodes of type 'line', then the root node 'adopts' them
    >>> # and replaces all its existing children with these 'line' nodes.
    >>> document.get_root().adopt_children(document.select('//line'), replace=True)
    """
    # Running counter, used as the new index for existing children that are not being adopted
    child_idx_base = 0

    # We need to copy this since we might well mutate
    # it as we adopt
    children = nodes_to_adopt.copy()
    for existing_child in self.get_children():
        if existing_child not in children:
            existing_child.index = child_idx_base
            self.document.get_persistence().update_node(existing_child)
        else:
            # Already a child and also in the adoption list: take its position in that list
            existing_child.index = children.index(existing_child)
            existing_child._parent_id = self.id
            self.document.get_persistence().update_node(existing_child)
        child_idx_base += 1

    # Copy to avoid mutation
    for new_child in children.copy():
        if new_child not in self.get_children():
            self.add_child(new_child, children.index(new_child))
            child_idx_base += 1

    if replace:
        # Copy to avoid mutation
        for child in self.get_children().copy():
            if child not in children:
                self.remove_child(child)
1186
+
1187
def remove_tag(self, tag_name):
    """Remove a tag from this content node.

    A tag is stored as a feature of type "tag", so this removes the "tag"
    feature with the given name from this node only.

    Args:
        tag_name (str): The name of the tag that should be removed.

    >>> document.get_root().remove_tag('foo')
    """
    tag_feature_type = "tag"
    self.remove_feature(tag_feature_type, tag_name)
1199
+
1200
def set_statistics(self, statistics):
    """Store the spatial statistics for this node.

    The statistics object is added as the feature ("spatial", "statistics").

    Args:
        statistics: the statistics object

    >>> document.select('//page')[0].set_statistics(NodeStatistics())
    """
    feature_type, feature_name = "spatial", "statistics"
    self.add_feature(feature_type, feature_name, statistics)
1211
+
1212
def get_statistics(self):
    """Fetch the spatial statistics for this node.

    Returns:
        The value of the ("spatial", "statistics") feature, or None if not set.

    >>> document.select('//page')[0].get_statistics()
    <kodexa.spatial.NodeStatistics object at 0x7f80605e53c8>
    """
    feature_type, feature_name = "spatial", "statistics"
    return self.get_feature_value(feature_type, feature_name)
1226
+
1227
def set_bbox(self, bbox):
    """Store the bounding box for the node, structured as:

    [x1,y1,x2,y2]

    The box is written with set_feature as the feature ("spatial", "bbox").

    Args:
        bbox: the bounding box array

    >>> document.select('//page')[0].set_bbox([10,20,50,100])
    """
    feature_type, feature_name = "spatial", "bbox"
    self.set_feature(feature_type, feature_name, bbox)
1239
+
1240
def get_bbox(self):
    """Get the bounding box for the node, this is structured as:

    [x1,y1,x2,y2]

    The value of the ("spatial", "bbox") feature is read once. A value of length 4 is
    returned as-is; a value of length 1 is treated as a wrapper and its single element
    is returned; any other value is returned unchanged.

    Returns:
        The bounding box array, or None when the node has no bbox feature.

    >>> document.select('//page')[0].get_bbox()
    [10,20,50,100]
    """
    bbox_value = self.get_feature_value("spatial", "bbox")
    if bbox_value is None:
        return None

    if len(bbox_value) == 4:
        return bbox_value

    # A single-element value wraps the actual box
    if len(bbox_value) == 1:
        return bbox_value[0]

    # Reuse the value already fetched rather than querying the feature a second time
    return bbox_value
1262
+
1263
def set_bbox_from_children(self):
    """Set the bounding box for this node based on its children.

    The resulting box is the smallest [x1,y1,x2,y2] rectangle that encloses the
    bounding boxes of all children that have one. Children without a bounding box
    (None or empty) are ignored. If no child has a bounding box, this node's
    bounding box is left untouched.
    """
    child_boxes = []
    for child in self.get_children():
        child_bbox = child.get_bbox()
        if child_bbox:
            child_boxes.append(child_bbox)

    if not child_boxes:
        return

    # Use min/max over the collected boxes rather than truthiness checks on the
    # running extremes, so that a coordinate of 0 is treated as a real value
    x_min = min(box[0] for box in child_boxes)
    y_min = min(box[1] for box in child_boxes)
    x_max = max(box[2] for box in child_boxes)
    y_max = max(box[3] for box in child_boxes)

    self.set_bbox([x_min, y_min, x_max, y_max])
1285
+
1286
def set_rotate(self, rotate):
    """Store the rotation of the node.

    The rotation is added as the feature ("spatial", "rotate").

    Args:
        rotate: the rotation of the node

    >>> document.select('//page')[0].set_rotate(90)
    """
    feature_type, feature_name = "spatial", "rotate"
    self.add_feature(feature_type, feature_name, rotate)
1297
+
1298
def get_rotate(self):
    """Fetch the rotation of the node.

    Returns:
        The value of the ("spatial", "rotate") feature, or None if not set.

    >>> document.select('//page')[0].get_rotate()
    90
    """
    feature_type, feature_name = "spatial", "rotate"
    return self.get_feature_value(feature_type, feature_name)
1312
+
1313
def get_x(self):
    """Return the X position of the node.

    Returns:
        The first element (x1) of the node's bounding box, or None when the
        node has no bounding box.

    >>> document.select('//page')[0].get_x()
    10
    """
    bbox = self.get_bbox()
    return bbox[0] if bbox else None
1331
+
1332
def get_y(self):
    """Return the Y position of the node.

    Returns:
        The second element (y1) of the node's bounding box, or None when the
        node has no bounding box.

    >>> document.select('//page')[0].get_y()
    20
    """
    bbox = self.get_bbox()
    return bbox[1] if bbox else None
1350
+
1351
def get_width(self):
    """Return the width of the node.

    Returns:
        x2 - x1 from the node's bounding box, or None when the node has no
        bounding box.

    >>> document.select('//page')[0].get_width()
    40
    """
    bbox = self.get_bbox()
    return bbox[2] - bbox[0] if bbox else None
1369
+
1370
def get_height(self):
    """Return the height of the node.

    Returns:
        y2 - y1 from the node's bounding box, or None when the node has no
        bounding box.

    >>> document.select('//page')[0].get_height()
    80
    """
    bbox = self.get_bbox()
    return bbox[3] - bbox[1] if bbox else None
1388
+
1389
def copy_tag(self, selector=".", existing_tag_name=None, new_tag_name=None):
    """Creates a new tag of 'new_tag_name' on the selected content node(s) with the same information as the tag with 'existing_tag_name'.
    Both existing_tag_name and new_tag_name values are required and must be different from one another. Otherwise, no action is taken.
    If a tag with the 'existing_tag_name' does not exist on a selected node, no action is taken for that node.

    Args:
        selector: The selector to identify the source nodes to work on (default . - the current node)
        existing_tag_name (str): The name of the existing tag whose values will be copied to the new tag. (Default value = None)
        new_tag_name (str): The name of the new tag. This must be different from the existing_tag_name. (Default value = None)

    >>> document.get_root().copy_tag(existing_tag_name='foo', new_tag_name='bar')
    """
    # Both names are mandatory ...
    if existing_tag_name is None or new_tag_name is None:
        return
    # ... and copying a tag onto itself is a no-op
    if existing_tag_name == new_tag_name:
        return

    for node in self.select(selector):
        source_values = node.get_feature_values("tag", existing_tag_name)
        if not source_values:
            continue

        for source in source_values:
            duplicate = Tag(
                start=source.start,
                end=source.end,
                value=source.value,
                uuid=source.uuid,
                data=source.data if source.data else {},
            )
            node.add_feature("tag", new_tag_name, duplicate)
1424
+
1425
def collect_nodes_to(self, end_node):
    """Gather the nodes from this node up to, but not including, end_node.

    Starting at this node, next_node() is followed until a node with the same
    id as end_node is reached or there is no next node.

    Args:
        end_node (ContentNode): The node to end at

    Returns:
        list[ContentNode]: The nodes visited, starting with this node and excluding end_node.
        If end_node is never reached, every node up to the last one is returned.

    >>> document.content_node.get_children()[0].collect_nodes_to(end_node=document.content_node.get_children()[5])
    """
    collected = []
    cursor = self
    while cursor.id != end_node.id:
        collected.append(cursor)
        if not cursor.has_next_node():
            break
        cursor = cursor.next_node()
    return collected
1446
+
1447
def tag_nodes_to(self, end_node, tag_to_apply, tag_uuid: str = None):
    """Tag all the nodes from this node to the end_node with the given tag name

    The nodes are those returned by collect_nodes_to(end_node).

    Args:
        end_node (ContentNode): The node to end with
        tag_to_apply (str): The tag name that will be applied to each node
        tag_uuid (str): The tag uuid used if you want to group them

    >>> document.content_node.get_children()[0].tag_nodes_to(document.content_node.get_children()[5], tag_to_apply='foo')
    """
    for node in self.collect_nodes_to(end_node):
        node.tag(tag_to_apply, tag_uuid=tag_uuid)
1461
+
1462
def tag_range(
    self,
    start_content_re,
    end_content_re,
    tag_to_apply,
    node_type_re=".*",
    use_all_content=False,
):
    """This will tag all the child nodes between the start and end content regular expressions

    All descendant nodes whose type matches node_type_re are selected. Tagging starts at the
    first node whose content matches start_content_re and stops just before the first node,
    at or after the start, whose content matches end_content_re (the end node itself is not
    tagged). An empty start expression means "from the first node"; an empty or unmatched
    end expression means "to the last node". If the start expression is non-empty and
    matches nothing, no node is tagged. Nodes without content never match either
    expression, but are still tagged when they fall inside the range.

    Args:
        start_content_re: The regular expression to match the starting child
        end_content_re: The regular expression to match the ending child
        tag_to_apply: The tag name that will be applied to the nodes in range
        node_type_re: The node type to match (default is all)
        use_all_content: Use full content (including child nodes, default is False)

    >>> document.content_node.tag_range(start_content_re='.*Cheese.*', end_content_re='.*Fish.*', tag_to_apply='foo')
    """

    # Could be line, word, or content-area
    all_nodes = self.select(f"//*[typeRegex('{node_type_re}')]")

    # Compile once instead of once per node
    start_pattern = re.compile(start_content_re)
    end_pattern = re.compile(end_content_re)

    start_index_list = []
    end_index_list = []
    for n_idx, node in enumerate(all_nodes):
        node_text = node.get_all_content() if use_all_content else node.content
        if node_text is None:
            # A node with no content cannot match; passing None to match() would raise
            continue
        if start_pattern.match(node_text):
            start_index_list.append(n_idx)
        if end_pattern.match(node_text):
            end_index_list.append(n_idx)

    if start_content_re == "":
        start_index = 0
    elif start_index_list:
        start_index = start_index_list[0]
    else:
        # No starting node found, so there is no range to tag
        return

    # Only an end marker at or after the start can close the range
    end_index_list = [i for i in end_index_list if i >= start_index]

    if end_content_re == "" or not end_index_list:
        end_index = len(all_nodes)
    else:
        end_index = end_index_list[0]

    for node in all_nodes[start_index:end_index]:
        node.tag(tag_to_apply)
1518
+
1519
def tag(
    self,
    tag_to_apply,
    selector=".",
    content_re=None,
    use_all_content=False,
    node_only=None,
    fixed_position=None,
    data=None,
    separator=" ",
    tag_uuid: str = None,
    confidence=None,
    value=None,
    use_match=True,
    index=None,
    cell_index=None,
    group_uuid=None,
    parent_group_uuid=None,
    note=None,
    status=None,
    owner_uri=None,
    is_dirty=None,
    sort_by_bbox: bool = False,
):
    """
    This will tag (see Feature Tagging) the expression groups identified by the regular expression.

    Note that if you use the flag use_all_content then node_only will default to True if not set, else it
    will default to False

    Args:
        tag_to_apply: The name of tag that will be applied to the node
        selector: The selector to identify the source nodes to work on (default . - the current node)
        content_re: The regular expression that you wish to use to tag, note that we will create a tag for each match (Default value = None)
        use_all_content: Apply the regular expression to the all_content (include content from child nodes) (Default value = False)
        separator: Separator to use for use_all_content (Default value = " ")
        node_only: Ignore the matching groups and tag the whole node (Default value = None)
        fixed_position: Use a fixed position, supplied as a tuple i.e. - (4,10) tag from position 4 to 10 (default None)
        data: A dictionary of data for the given tag (Default value = None)
        tag_uuid: A UUID used to tie tags in order to demonstrate they're related and form a single concept.
            For example, if tagging the two words "Wells" and "Fargo" as an ORGANIZATION, the tag on both words should have the
            same tag_uuid in order to indicate they are both needed to form the single ORGANIZATION. If a tag_uuid is provided, it is used
            on all tags created in this method. This may result in multiple nodes or multiple feature values having the same tag_uuid.
            For example, if the selector provided results in more than one node being selected, each node would be tagged with the same tag_uuid.
            The same holds true if a content_re value is provided, node_only is set to False, and multiple matches are found for the content_re
            pattern. In that case, each feature value would share the same UUID.
            If no tag_uuid is provided, a new uuid is generated for each tag instance. (Default value = None)
        confidence: The confidence in the tag (0-1)
        value: The value you wish to store with the tag, this allows you to provide text that isn't part of the content but represents the data you wish tagged
        use_match: If True (default) every match produced by pattern.finditer on the content is used; if False only the
            first match found by pattern.search is used
        index: The index for the tag
        cell_index: The cell index for the tag
        group_uuid: The group uuid for the tag
        parent_group_uuid: The parent group uuid for the tag
        note: a text note for the tag
        status: a status for the tag, this can be transitioned to an attribute status during extraction
        owner_uri: the uri of the entity that created the tag (model vs user; example: model://cdad-healthcare/cdad-excel-model:1.0.0 or user://pdodds)
        is_dirty: when the model is run, is_dirty = false for all tags. New tags and edited tags, is_dirty = true.
        sort_by_bbox: when True, position-based tagging sorts each node's children by the first element of their
            bounding box before walking the children that are not referenced by a content part (Default value = False)

    >>> document.content_node.tag('is_cheese')
    """

    # node_only defaults to True only when working on all content
    if use_all_content and node_only is None:
        node_only = True
    elif node_only is None:
        node_only = False

    def get_tag_uuid(tag_uuid):
        """
        This function returns the provided tag_uuid if it exists, otherwise it generates a new UUID.

        Args:
            tag_uuid (str): The UUID of the tag.

        Returns:
            str: The provided tag_uuid if it exists, otherwise a newly generated UUID.
        """
        if tag_uuid:
            return tag_uuid

        return str(uuid.uuid4())

    def tag_node_position(
        node_to_check,
        start,
        end,
        node_data,
        tag_uuid,
        offset=0,
        value=None,
        sort_by_bbox: bool = False,
    ):
        """
        This function tags a node position in a given data structure. It iterates over the content parts of the node to check,
        and based on the type of the part (string or integer), it performs different operations. If the part is a string, it
        adjusts the start and end positions and adds a feature to the node. If the part is an integer, it finds the corresponding
        child node and recursively calls the function on the child node. After processing all parts, it checks for any missing
        children and processes them as well. Finally, it checks if the length of all content matches the calculated content length.

        Args:
            node_to_check (Node): The node to check and tag.
            start (int): The start position of the tag.
            end (int): The end position of the tag.
            node_data (dict): The data associated with the node.
            tag_uuid (str): The UUID of the tag.
            offset (int, optional): The offset to apply. Defaults to 0.
            value (str, optional): The value to use for the tag. If None, the part of the content at the start and end positions is used. Defaults to None.
            sort_by_bbox (bool, optional): Sort the children by the first element of their bounding box before
                processing children not referenced by a content part. Defaults to False.

        Raises:
            Exception: If an invalid part is encountered in the content parts of the node to check.
            Exception: If there is a mismatch between the length of all content and the calculated content length.

        Returns:
            int: The calculated content length, or -1 once the end of the tagged range has been reached
            (which stops any further processing in the callers).
        """
        content_length = 0
        # Keep the positions as passed in; start/end are rewritten relative to each part below
        original_start = start
        original_end = end
        for part_idx, part in enumerate(node_to_check.get_content_parts()):
            if isinstance(part, str):
                if len(part) > 0:
                    # It is just content
                    part_length = len(part)
                    if part_idx > 0:
                        # Account for the separator that joins this part to the previous one
                        end = end - len(separator)
                        content_length = content_length + len(separator)
                        offset = offset + len(separator)
                        start = (
                            0
                            if start - len(separator) < 0
                            else start - len(separator)
                        )

                    # The range starts and ends inside this part: tag it and stop
                    if start < part_length and end < part_length:
                        node_to_check.add_feature(
                            "tag",
                            tag_to_apply,
                            Tag(
                                original_start,
                                original_end,
                                part[start:end] if value is None else value,
                                data=node_data,
                                uuid=tag_uuid,
                                confidence=confidence,
                                index=index,
                                parent_group_uuid=parent_group_uuid,
                                group_uuid=group_uuid,
                                cell_index=cell_index,
                                note=note,
                                status=status,
                                owner_uri=owner_uri,
                                is_dirty=is_dirty,
                            ),
                        )
                        return -1
                    # The range starts in this part but runs to (or past) its end: tag the
                    # remainder of the part and carry on with the following parts
                    if start < part_length <= end:
                        node_to_check.add_feature(
                            "tag",
                            tag_to_apply,
                            Tag(
                                original_start,
                                content_length + part_length,
                                value=part[start:] if value is None else value,
                                data=node_data,
                                uuid=tag_uuid,
                                confidence=confidence,
                                index=index,
                                parent_group_uuid=parent_group_uuid,
                                group_uuid=group_uuid,
                                cell_index=cell_index,
                                note=note,
                                status=status,
                                owner_uri=owner_uri,
                                is_dirty=is_dirty,
                            ),
                        )

                    # Shift the positions so they are relative to the next part
                    end = end - part_length
                    content_length = content_length + part_length
                    offset = offset + part_length
                    start = 0 if start - part_length < 0 else start - part_length

            elif isinstance(part, int):
                # An integer part refers to the child with that index
                child_node = [
                    child
                    for child in node_to_check.get_children()
                    if child.index == part
                ][0]

                if part_idx > 0:
                    end = end - len(separator)
                    content_length = content_length + len(separator)
                    offset = offset + len(separator)
                    start = (
                        0 if start - len(separator) < 0 else start - len(separator)
                    )

                result = tag_node_position(
                    child_node,
                    start,
                    end,
                    node_data,
                    tag_uuid,
                    offset=offset,
                    value=value,
                    sort_by_bbox=sort_by_bbox,
                )

                # Either the child finished the tag (-1) or the range ends within it
                if result < 0 or (end - result) <= 0:
                    return -1

                offset = offset + result
                end = end - result
                start = 0 if start - result < 0 else start - result

                content_length = content_length + result
            else:
                raise Exception("Invalid part?")

        # We need to determine if we have missing children and add them to the end
        node_children = node_to_check.get_children()
        if node_children and sort_by_bbox:
            # Sort nodes by x-coordinate if they have bboxes, otherwise use index
            # NOTE(review): this sorts the list returned by get_children() in place — confirm
            # that the returned list is not shared state
            try:
                node_children.sort(
                    key=lambda x: (
                        x.get_bbox()[0]
                        if hasattr(x, "get_bbox")
                        else x.index if hasattr(x, "index") else 0
                    )
                )
            except (AttributeError, TypeError, IndexError):
                # If sorting fails, keep original order
                pass

        for child_idx, child_node in enumerate(node_children):
            if child_node.index not in node_to_check.get_content_parts():
                # Only add a separator when some content precedes this child
                if content_length > 0:
                    end = end - len(separator)
                    content_length = content_length + len(separator)
                    offset = offset + len(separator)
                    start = (
                        0 if start - len(separator) < 0 else start - len(separator)
                    )

                result = tag_node_position(
                    child_node,
                    start,
                    end,
                    node_data,
                    tag_uuid,
                    offset=offset,
                    value=value,
                    sort_by_bbox=sort_by_bbox,
                )

                if result < 0 or (end - result) <= 0:
                    return -1

                offset = offset + result
                end = end - result
                start = 0 if start - result < 0 else start - result

                content_length = content_length + result

        # Sanity check: the length walked here must equal the length of the unstripped content
        if len(node_to_check.get_all_content(strip=False)) != content_length:
            raise Exception(
                f"There is a problem in the structure? (2) Length mismatch ({len(node_to_check.get_all_content(strip=False))} != {content_length})"
            )

        return content_length

    if content_re:
        # When matching across all content with position tagging, a space in the expression
        # is widened to match any run of whitespace
        pattern = re.compile(
            content_re.replace(" ", r"\s+")
            if use_all_content and not node_only
            else content_re
        )

    for node in self.select(selector):
        if fixed_position:
            # An explicit (start, end) position takes precedence over content_re
            tag_node_position(
                node,
                fixed_position[0],
                fixed_position[1],
                data,
                get_tag_uuid(tag_uuid),
                0,
                value=value,
                sort_by_bbox=sort_by_bbox,
            )

        else:
            if not content_re:
                # No expression: tag the whole node
                node.add_feature(
                    "tag",
                    tag_to_apply,
                    Tag(
                        data=data,
                        uuid=get_tag_uuid(tag_uuid),
                        confidence=confidence,
                        value=value,
                        index=index,
                        parent_group_uuid=parent_group_uuid,
                        group_uuid=group_uuid,
                        cell_index=cell_index,
                        note=note,
                        status=status,
                        owner_uri=owner_uri,
                        is_dirty=is_dirty,
                    ),
                )
            else:
                if not use_all_content:
                    # Empty or missing node content is treated as "nothing to match"
                    if node.content:
                        content = node.content
                    else:
                        content = None
                else:
                    # Position tagging needs the unstripped content so offsets line up
                    content = (
                        node.get_all_content(separator=separator, strip=False)
                        if not node_only
                        else node.get_all_content(separator=separator)
                    )

                if content is not None:
                    if use_match:
                        matches = pattern.finditer(content)

                        if node_only:
                            # One tag on the node if the expression matches at least once
                            if any(True for _ in matches):
                                node.add_feature(
                                    "tag",
                                    tag_to_apply,
                                    Tag(
                                        data=data,
                                        uuid=get_tag_uuid(tag_uuid),
                                        confidence=confidence,
                                        value=value,
                                        index=index,
                                        parent_group_uuid=parent_group_uuid,
                                        group_uuid=group_uuid,
                                        cell_index=cell_index,
                                        note=note,
                                        status=status,
                                        owner_uri=owner_uri,
                                        is_dirty=is_dirty,
                                    ),
                                )
                        else:
                            # finditer returns an iterator object, so this check does not by
                            # itself tell us whether anything matched; the loop below does
                            if matches:
                                for match in matches:
                                    start_offset = match.span()[0]
                                    end_offset = match.span()[1]
                                    tag_node_position(
                                        node,
                                        start_offset,
                                        end_offset,
                                        data,
                                        get_tag_uuid(tag_uuid),
                                        value=value,
                                        sort_by_bbox=sort_by_bbox,
                                    )

                    else:
                        # Only the first match is tagged, and always by position
                        search_match = pattern.search(content)
                        if search_match is not None:
                            start_offset = search_match.span()[0]
                            end_offset = search_match.span()[1]
                            tag_node_position(
                                node,
                                start_offset,
                                end_offset,
                                data,
                                get_tag_uuid(tag_uuid),
                                value=value,
                                sort_by_bbox=sort_by_bbox,
                            )
1898
+
1899
def get_tags(self):
    """Returns a list of the names of the tags on the given node

    Returns:
        list: The ``name`` of every feature of type "tag" on this node.

    >>> document.content_node.select('*')[0].get_tags()
    ['is_cheese']
    """
    tag_names = []
    for tag_feature in self.get_features_of_type("tag"):
        tag_names.append(tag_feature.name)
    return tag_names
1913
+
1914
def get_tag_features(self):
    """Returns a list of the features that are tags on the given node

    Returns:
        list[ContentFeature]: A new list holding every feature of type "tag" on this node.

    >>> document.content_node.select('*')[0].get_tag_features()
    [ContentFeature()]
    """
    return list(self.get_features_of_type("tag"))
1928
+
1929
def get_tag_values(self, tag_name, include_children=False):
    """Collect the values stored for a specific tag name

    Args:
        tag_name: tag name
        include_children: also collect, recursively, from the children of this node (Default value = False)

    Returns:
        a list of the tag values, this node's values first followed by those of its children

    """
    collected = [tag.value for tag in self.get_tag(tag_name)]

    if include_children:
        for child in self.get_children():
            collected.extend(child.get_tag_values(tag_name, include_children))

    return collected
1949
+
1950
def get_related_tag_values(
    self,
    tag_name: str,
    include_children: bool = False,
    value_separator: str = " ",
    tag_uuid=None,
):
    """Get the values for a specific tag name, grouped by tag uuid.

    Tag instances that share a uuid are treated as one logical value: their
    individual values are joined with ``value_separator``. An instance with
    no (or a ``None``) ``value`` contributes the content of the node it is
    attached to.

    Args:
        tag_name (str): tag name
        include_children (bool): when True the nodes are looked up through
            ``self.document.get_tagged_nodes``; otherwise only the nodes
            returned by ``self.select(".")`` are inspected
        value_separator (str): the string used to join related tag values
        tag_uuid (Optional[str]): when given, only tag instances with this
            uuid are considered; when None, every uuid is grouped

    Returns:
        list: one joined string per uuid group, in first-seen order. Groups
        whose first value is None are omitted.
    """
    if include_children:
        tagged_nodes = self.document.get_tagged_nodes(tag_name, tag_uuid=tag_uuid)
    else:
        tagged_nodes = self.select(".")

    value_groups = {}
    for tag_node in tagged_nodes:
        tag_feature_vals = tag_node.get_feature_value("tag", tag_name)
        if not tag_feature_vals:
            continue
        if not isinstance(tag_feature_vals, list):
            tag_feature_vals = [tag_feature_vals]

        for feature_val in tag_feature_vals:
            feature_uuid = feature_val["uuid"] if "uuid" in feature_val else None

            # Only filter when a uuid was actually requested. Comparing
            # against the default None used to discard every tagged value.
            if tag_uuid is not None and feature_uuid != tag_uuid:
                continue

            final_value = feature_val["value"] if "value" in feature_val else None
            if final_value is None:
                final_value = tag_node.content

            value_groups.setdefault(feature_uuid, []).append(final_value)

    return [
        value_separator.join(group)
        for group in value_groups.values()
        if group and group[0] is not None
    ]
2026
+
2027
def get_related_tag_nodes(
    self, tag_name: str, everywhere: bool = False, tag_uuid=None
):
    """Group the nodes carrying a tag by the uuid of each tag instance.

    Args:
        tag_name (str): tag name
        everywhere (bool): when True the candidate nodes come from
            ``self.document.get_tagged_nodes(tag_name, tag_uuid)``;
            otherwise only this node is inspected
        tag_uuid (optional(str)): passed through to the document lookup
            when ``everywhere`` is True

    Returns:
        dict: maps each tag uuid to the list of nodes that hold an instance
        with that uuid. Instances without a ``"uuid"`` entry are skipped.
    """
    if everywhere:
        candidates = self.document.get_tagged_nodes(tag_name, tag_uuid)
    else:
        candidates = [self]

    grouped = {}
    for candidate in candidates:
        for instance in candidate.get_tag(tag_name):
            if "uuid" not in instance:
                continue
            grouped.setdefault(instance["uuid"], []).append(candidate)

    return grouped
2061
+
2062
def get_tag(self, tag_name, tag_uuid=None):
    """Return the instances of the named tag on this node, always as a list.

    The value stored under the ``"tag"`` feature with this name is looked up;
    a single value is wrapped in a list and a missing feature yields ``[]``.

    Args:
        tag_name: The name of the tag
        tag_uuid (Optional): Accepted for API compatibility; this method
            does not currently use it to filter the result.

    Returns:
        list: The tag details for this name on this node (possibly empty).

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').get_tag('is_cheese')
    [0,10,'The Cheese Moved']
    """
    details = self.get_feature_value("tag", tag_name)

    if details is None:
        return []

    return details if isinstance(details, list) else [details]
2085
+
2086
def get_all_tags(self):
    """Get the distinct tag names applied to this node or any descendant.

    Returns:
        list[str]: The de-duplicated tag names from this node and, through
        recursion, all of its children. The order is not defined because
        duplicates are removed through a set.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').get_all_tags()
    ['is_cheese']
    """
    names = list(self.get_tags())
    names += [
        name for child in self.get_children() for name in child.get_all_tags()
    ]
    return list(set(names))
2102
+
2103
def has_tags(self):
    """Determine whether this node carries at least one tag feature.

    Returns:
        bool: True if ``get_features_of_type("tag")`` yields anything;
        else, False.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tags()
    True
    """
    tag_values = [feature.value for feature in self.get_features_of_type("tag")]
    return bool(tag_values)
2115
+
2116
def has_tag(self, tag, include_children=False):
    """Determine if this node has a tag with the specified name.

    Args:
        tag(str): The name of the tag.
        include_children(bool): when True, child nodes are searched
            recursively if this node itself does not carry the tag

    Returns:
        bool: True if this node (or, when requested, any descendant) has a
        tag feature with the specified name; else, False.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tag('is_cheese')
    True
    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tag('is_fish')
    False
    """
    for feature in self.get_features():
        if feature.feature_type == "tag" and feature.name == tag:
            return True

    if not include_children:
        return False

    # any() stops at the first matching child, so sibling subtrees after a
    # hit are not walked.
    return any(child.has_tag(tag, True) for child in self.get_children())
2140
+
2141
def is_first_child(self):
    """Determine whether this node is its parent's first child.

    Returns:
        bool: True if this node has no parent or its index is 0; else, False.
    """
    if self.parent:
        return self.index == 0

    return True
2154
+
2155
def is_last_child(self):
    """Determine whether this node is its parent's last child.

    Returns:
        bool: True if this node has no parent, or if its index equals the
        parent's ``get_last_child_index()``; else, False.
    """
    parent = self.get_parent()
    if not parent:
        return True

    return self.index == parent.get_last_child_index()
2167
+
2168
def get_last_child_index(self):
    """Return the largest index found among this node's children.

    Returns:
        int or None: The maximum child index (never less than 0, which is
        the starting value of the comparison), or None when the node has no
        children.
    """
    if not self.get_children():
        return None

    return max([0] + [child.index for child in self.get_children()])
2185
+
2186
def get_node_at_index(self, index):
    """Return the child node at the given index, creating a virtual one for gaps.

    Note: documents allow for sparse representation and child nodes may not
    have consecutive index numbers. When no real child sits at ``index`` but
    the index lies before the first child or before the last child's index,
    a 'virtual' node is created through ``self.document.create_node``. It
    takes the node type of the first child (when the index precedes it) or
    of the closest preceding child.

    Args:
        index (int): The index (zero-based) for the child node.

    Returns:
        ContentNode or None: The real or virtual node at ``index``; None if
        there are no children or the index lies beyond the last child.
    """
    children = self.get_children()
    if not children:
        return None

    # A real node at this index always wins
    for child in children:
        if child.index == index:
            return child

    first_child = children[0]
    if index < first_child.index:
        return self.document.create_node(
            node_type=first_child.node_type,
            virtual=True,
            parent=self,
            index=index,
        )

    # Find the closest child that sits before the requested index
    nearest_before = None
    for position, child in enumerate(children):
        if child.index > index:
            break
        if child.index < index:
            nearest_before = child
            has_following = position + 1 < len(children)
            if has_following and children[position + 1].index > index:
                break

    # Only fill gaps that fall before the last child's index
    if nearest_before and index < children[-1].index:
        return self.document.create_node(
            node_type=nearest_before.node_type,
            virtual=True,
            parent=self,
            index=index,
        )

    return None
2248
+
2249
def has_next_node(self, node_type_re=".*", skip_virtual=False):
    """Determine if ``next_node`` finds a following sibling of a matching type.

    Args:
        node_type_re(str, optional): The regular expression to match against the next sibling node's type; default is '.*'.
        skip_virtual(bool, optional): Skip virtual nodes and only consider real nodes; default is False.

    Returns:
        bool: True if ``next_node`` returns a node for these arguments; else, False.
    """
    following = self.next_node(node_type_re, skip_virtual=skip_virtual)
    return following is not None
2261
+
2262
def has_previous_node(self, node_type_re=".*", skip_virtual=False):
    """Determine if ``previous_node`` finds a preceding sibling of a matching type.

    Args:
        node_type_re(str, optional): The regular expression to match against the previous sibling node's type; default is '.*'.
        skip_virtual(bool, optional): Skip virtual nodes and only consider real nodes; default is False.

    Returns:
        bool: True if ``previous_node`` returns a node for these arguments; else, False.
    """
    preceding = self.previous_node(
        node_type_re=node_type_re, skip_virtual=skip_virtual
    )
    return preceding is not None
2277
+
2278
def next_node(
    self,
    node_type_re=".*",
    skip_virtual=False,
    has_no_content=True,
    traverse=Traverse.SIBLING,
):
    """
    Returns the next sibling content node.

    Note: This logic relies on node indexes. Documents allow for sparse representation and child nodes may not have
    consecutive index numbers. Therefore, the next node might actually be a virtual node that is created to fill a gap
    in the document. You can skip virtual nodes by setting the skip_virtual parameter to True.

    When this node has no index, the node positioned right after it in the parent's child list is returned and the
    filter arguments are not applied.

    Args:
        node_type_re(str, optional): The regular expression to match against the next sibling node's type; default is '.*'.
        skip_virtual(bool, optional): Skip virtual nodes and return the next real node; default is False.
        has_no_content(bool, optional): Allow a node that has no content to be returned; default is True.
        traverse(Traverse, optional): With Traverse.ALL or Traverse.PARENT, once the siblings are exhausted the search
            looks at the children of the grandparent (and, if that lookup raises, the great-grandparent);
            default is Traverse.SIBLING.

    Returns:
        ContentNode or None: The next node or None, if no node exists
    """
    if self.index is None:
        # No index to search by: use this node's position among its siblings
        if self.get_parent():
            siblings = self.get_parent().get_children()
            for position, sibling in enumerate(siblings):
                if sibling.id == self.id and position + 1 < len(siblings):
                    return siblings[position + 1]
        return None

    candidate_index = self.index + 1
    type_pattern = re.compile(node_type_re)

    while True:
        if self.get_parent():
            node = self.get_parent().get_node_at_index(candidate_index)
        else:
            node = None

        if node:
            if type_pattern.match(node.node_type) and not (
                skip_virtual and node.virtual
            ):
                if has_no_content or node.content:
                    return node

            candidate_index += 1
            continue

        # Ran out of siblings: optionally continue one or two levels up
        may_climb = traverse == Traverse.ALL or traverse == Traverse.PARENT
        if may_climb and self.get_parent() and self.get_parent().get_parent():
            # noinspection PyBroadException
            try:
                grandparent = self.get_parent().get_parent()
                parent_successor = grandparent.get_children()[
                    self.get_parent().index + 1
                ]
                if parent_successor:
                    return parent_successor
            except Exception:
                # traverse additional layer
                try:
                    great_grandparent = self.get_parent().get_parent().get_parent()
                    grandparent_successor = great_grandparent.get_children()[
                        self.get_parent().get_parent().index + 1
                    ]
                    if grandparent_successor:
                        return grandparent_successor
                except Exception:
                    pass

        return node
2362
+
2363
def previous_node(
    self,
    node_type_re=".*",
    skip_virtual=False,
    has_no_content=False,
    traverse=Traverse.SIBLING,
):
    """Returns the previous sibling content node.

    Note: This logic relies on node indexes. Documents allow for sparse representation and child nodes may not have
    consecutive index numbers. Therefore, the previous node might actually be a virtual node that is created to fill a
    gap in the document. You can skip virtual nodes by setting the skip_virtual parameter to True.

    Args:
        node_type_re(str, optional): The regular expression to match against the previous node's type; default is '.*'.
        skip_virtual(bool, optional): Skip virtual nodes and return the previous real node; default is False.
        has_no_content(bool, optional): When False (the default) any matching node is returned; when True only a
            matching node whose content is empty is returned.
        traverse(Traverse(enum), optional): With Traverse.ALL or Traverse.PARENT, a node at index 0 continues the
            search from its parent; default is Traverse.SIBLING.

    Returns:
        ContentNode or None: The previous node or None, if no node exists
    """
    parent = self.get_parent()

    # TODO: implement/differentiate traverse logic for CHILDREN and SIBLING
    if self.index == 0:
        # Parenthesised on purpose: the unbracketed `A or B and C` meant
        # Traverse.ALL bypassed the parent check and dereferenced None.
        may_climb = traverse == Traverse.ALL or traverse == Traverse.PARENT
        if may_climb and parent:
            # Lets look for a previous node on the parent
            return parent.previous_node(
                node_type_re, skip_virtual, has_no_content, traverse
            )

        return None

    if not parent:
        # Without a parent there are no siblings to search
        return None

    search_index = self.index - 1
    compiled_node_type_re = re.compile(node_type_re)

    while True:
        node = parent.get_node_at_index(search_index)

        if not node:
            return node

        if compiled_node_type_re.match(node.node_type) and (
            not skip_virtual or not node.virtual
        ):
            # NOTE(review): this content test differs from next_node, where
            # has_no_content=True means "empty nodes are also allowed".
            # Kept as-is because callers may rely on it; confirm the intent.
            if not has_no_content or not node.content:
                return node

        search_index -= 1
2417
+
2418
+
2419
class ContentFeature(object):
    """
    Additional data or metadata associated with a ContentNode.

    A feature is identified by a type and a name. Its 'value' is always held
    as a list, so one feature name (e.g., 'ner') can carry several data
    points (e.g., multiple recognized entities).
    """

    def __init__(self, feature_type: str, name: str, value: Any):
        # Logical name used to group features together (ie. spatial)
        self.feature_type: str = feature_type
        # The name of the feature (ie. bbox)
        self.name: str = name
        # Always a list: a non-list value is wrapped, a list is kept as-is
        self.value: List[Any] = value if isinstance(value, list) else [value]

    def __str__(self):
        # Only the number of values is shown to keep the text short
        return "Feature [type='{}' name='{}' value_count='{}']".format(
            self.feature_type, self.name, len(self.value)
        )

    def to_dict(self):
        """
        Build a dictionary describing this ContentFeature.

        Items exposing a callable ``to_dict`` are serialized through it;
        every other item (None, primitives, ...) is copied unchanged.

        Returns:
            dict: ``{"name": "<feature_type>:<name>", "value": [...]}``.
        """
        serialized = [
            item.to_dict() if callable(getattr(item, "to_dict", None)) else item
            for item in self.value
        ]
        return {
            "name": self.feature_type + ":" + self.name,
            "value": serialized,
        }

    def get_value(self) -> List[Any]:
        """
        Get the list of values for the feature.

        Returns:
            List[Any]: The stored list, even if it holds one item or none.
        """
        return self.value
2472
+
2473
+
2474
class ModelInsight(BaseModel):
    """
    A class used to represent the insights of a model.

    Attributes:
        model_ref (str): The reference to the model.
        insight_type (str): The type of the insight.
        description (Optional[str]): The description of the insight, default is None.
        details (Optional[str]): The details of the insight, default is None.
        properties (Optional[Dict]): The properties of the insight, default is None.
    """

    # The description above must be the first statement of the class body to
    # become the class docstring; placed after model_config it was only a
    # discarded string expression.
    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    model_ref: str
    insight_type: str
    description: Optional[str] = None
    details: Optional[str] = None
    properties: Optional[Dict] = None
2497
+
2498
+
2499
@dataclasses.dataclass()
class SourceMetadata:
    """Class for keeping track of the original source information for a document.

    Attributes:
        original_filename (Optional[str]): The original filename of the document.
        original_path (Optional[str]): The original path of the document.
        checksum (Optional[str]): The checksum of the document.
        cid (Optional[str]): The ID used for internal caching.
        last_modified (Optional[str]): The last modified date of the document.
        created (Optional[str]): The creation date of the document.
        connector (Optional[str]): The connector used for the document.
        mime_type (Optional[str]): The MIME type of the document.
        headers (Optional[Dict]): The headers of the document.
        lineage_document_uuid (Optional[str]): The UUID of the document that this document was derived from.
        source_document_uuid (Optional[str]): The UUID of the original first document.
        pdf_document_uuid (Optional[str]): The UUID of the document in a PDF form (used for archiving and preview).
    """

    original_filename: Optional[str] = None
    original_path: Optional[str] = None
    checksum: Optional[str] = None

    # The ID used for internal caching
    cid: Optional[str] = None
    last_modified: Optional[str] = None
    created: Optional[str] = None
    connector: Optional[str] = None
    mime_type: Optional[str] = None
    headers: Optional[Dict] = None

    # The UUID of the document that this document was derived from
    # noting that multiple documents coming from an original source
    lineage_document_uuid: Optional[str] = None

    # The UUID of the original first document
    source_document_uuid: Optional[str] = None

    # The UUID of the document in a PDF form (used for archiving and preview)
    pdf_document_uuid: Optional[str] = None

    @classmethod
    def from_dict(cls, env):
        """Creates an instance of the class from a dictionary.

        Keys that are not parameters of the class constructor are ignored.

        Args:
            env (dict): A dictionary containing the attributes of the class.

        Returns:
            SourceMetadata: An instance of the class.
        """
        # Resolve the constructor signature once rather than once per key
        accepted = inspect.signature(cls).parameters
        return cls(**{k: v for k, v in env.items() if k in accepted})
2561
+
2562
+
2563
class FeatureSetDiff:
    """
    A utility class that can be used to diff two feature sets.

    Both feature sets are indexed by node UUID and compared with
    ``deepdiff.DeepDiff``; the result is kept as a plain dictionary.
    """

    def __init__(self, first_feature_set: FeatureSet, second_feature_set: FeatureSet):
        self.first_feature_map = self.parse_feature_set(first_feature_set)
        self.second_feature_map = self.parse_feature_set(second_feature_set)
        self._differences = deepdiff.DeepDiff(
            self.first_feature_map,
            self.second_feature_map,
            exclude_obj_callback=self.exclude_callback,
        ).to_dict()

    def get_differences(self):
        """
        Gets the differences between the two feature sets.

        Any "type_changes" entry is removed from the stored differences
        before they are returned.

        Returns:
            dict: A dictionary containing the differences between the two feature sets.
        """
        self._differences.pop("type_changes", None)
        return self._differences

    def get_exclude_paths(self):
        """
        Gets the paths to exclude.

        Returns:
            list: A list of paths to exclude.
        """
        return ["shape", "group_uuid", "uuid", "parent_group_uuid", "single"]

    def exclude_callback(self, path, key):
        """
        Checks if the key is to be excluded from the diff.

        Args:
            path: First argument passed by DeepDiff (not inspected here).
            key (str): Text searched for any of the excluded names.

        Returns:
            bool: True if the key is to be excluded, False otherwise.
        """
        return any(
            re.search(exclude_key, key) for exclude_key in self.get_exclude_paths()
        )

    def parse_feature_set(self, feature_set: FeatureSet):
        """
        Parses the feature set.

        Args:
            feature_set (FeatureSet): The feature set to be parsed.

        Returns:
            dict: A dictionary of features with the key as the nodeUuid.
        """
        return {
            feature.get("nodeUuid"): feature for feature in feature_set.node_features
        }

    def parsed_values_changed(self):
        """
        Drops "values_changed" entries whose key is present in the second feature map.

        Does nothing when the diff has no "values_changed" section.
        """
        values_changed = self._differences.get("values_changed")
        if not values_changed:
            return

        # NOTE(review): the maps are plain dicts keyed by node UUID, so the
        # former `.node_features` lookup and `.remove()` call could only
        # raise. Membership is tested against the map itself; confirm this
        # matches the intended "old value still present" rule.
        # Iterate over a snapshot because entries are removed in the loop.
        for key in list(values_changed):
            if key in self.second_feature_map:
                values_changed.pop(key)

    def is_equal(self) -> bool:
        """
        Checks if the two feature sets are equal to each other.

        Returns:
            bool: True if the feature sets are equal, False otherwise.
        """
        return self._differences == {}

    def get_changed_nodes(self):
        """
        Gets the nodes that were changed.

        Returns:
            An empty list when the feature sets are equal; otherwise a dict
            with the keys "new_added_nodes", "removed_nodes" and
            "existing_modified_nodes", each holding a list of node UUIDs.
        """
        if self.is_equal():
            return []

        # A diff may contain only additions/removals, so the section can be absent
        modified_nodes = [
            self.parsed_node_uuid(key)
            for key in self._differences.get("values_changed", {})
        ]

        # Nodes only present in the second_feature_map
        new_added_nodes = [
            node_uuid
            for node_uuid in self.second_feature_map
            if node_uuid not in self.first_feature_map
        ]

        # Nodes only present in the first_feature_map
        removed_nodes = [
            node_uuid
            for node_uuid in self.first_feature_map
            if node_uuid not in self.second_feature_map
        ]

        return {
            "new_added_nodes": new_added_nodes,
            "removed_nodes": removed_nodes,
            "existing_modified_nodes": modified_nodes,
        }

    def get_difference_count(self):
        """
        Gets the number of difference categories reported for the feature sets.

        Returns:
            int: The number of top-level keys in the differences dictionary.
        """
        # _differences is a dict, not a callable
        return len(self._differences)

    def parsed_item_added(self):
        """
        Annotates added items that belong to newly added nodes.

        Returns:
            An empty dict when the diff reports no added items; otherwise
            the value of ``get_difference_count()``.
        """
        item_added = self._differences.get("iterable_item_added")
        if not item_added:
            return {}

        # Added items exist, so the sets differ and a dict is returned here
        new_added_nodes = self.get_changed_nodes()["new_added_nodes"]

        for key in item_added:
            node = self.parsed_node_uuid(key)
            if node in new_added_nodes:
                item_added[key]["details"] = f"Node: {node} was added"

        return self.get_difference_count()

    def parsed_node_uuid(self, key):
        """
        Parses the node uuid from the key.

        Args:
            key (str): A diff path whose first ``['...']`` segment is the node uuid.

        Returns:
            str: The node uuid from the key.
        """
        node = key.split("['")[1].split("']")[0]
        return node
2730
+
2731
+
2732
class ProcessingStep(BaseModel):
    """A named processing step that links to its child and parent steps."""

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        json_encoders={"ProcessingStep": lambda step: step.to_dict()},
    )

    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    name: str
    metadata: dict = Field(default_factory=dict)
    presentation_metadata: dict = Field(
        default_factory=dict, alias="presentationMetadata"
    )
    children: List["ProcessingStep"] = Field(default_factory=list)
    parents: List["ProcessingStep"] = Field(default_factory=list)

    def add_child(self, child_step: "ProcessingStep"):
        """Link ``child_step`` below this step (both directions are recorded)."""
        self.children.append(child_step)
        child_step.parents.append(self)

    @staticmethod
    def merge_with(*other_steps: "ProcessingStep") -> "ProcessingStep":
        """Create a new step that is a child of every step in ``other_steps``.

        Returns:
            ProcessingStep: The new step, named ``Merged(<names>)``.
        """
        joined_names = ", ".join(step.name for step in other_steps)
        merged_step = ProcessingStep(name=f"Merged({joined_names})")
        for step in other_steps:
            step.children.append(merged_step)
            merged_step.parents.append(step)
        return merged_step

    def to_dict(self, seen=None):
        """Serialize this step and its children into plain dictionaries.

        Args:
            seen (set, optional): Ids of steps already serialized; a step
                met again is reduced to its id and name so that circular
                links do not recurse forever.

        Returns:
            dict: The step data. Parents are listed by id and name only.
        """
        visited = set() if seen is None else seen

        if self.id in visited:
            return {"id": self.id, "name": self.name}

        visited.add(self.id)

        child_dicts = [child.to_dict(visited) for child in self.children]
        parent_refs = [
            {"id": parent.id, "name": parent.name} for parent in self.parents
        ]

        return {
            "id": self.id,
            "name": self.name,
            "metadata": self.metadata,
            "presentationMetadata": self.presentation_metadata,
            "children": child_dicts,
            "parents": parent_refs,
        }

    def to_json(self):
        """Return ``to_dict()`` encoded as a JSON string."""
        return json.dumps(self.to_dict())

    def __repr__(self):
        return "Step(id={}, name={})".format(self.id, self.name)
2787
+
2788
+
2789
+ class Document(object):
2790
+ """A Document is a collection of metadata and a set of content nodes."""
2791
+
2792
+ PREVIOUS_VERSION: str = "1.0.0"
2793
+ CURRENT_VERSION: str = "8.0.0"
2794
+
2795
def __init__(
    self,
    metadata=None,
    source=None,
    ref: str = None,
    kddb_path: str = None,
    delete_on_close=False,
    inmemory=False,
):
    """A Kodexa Document has content nodes and metadata to represent the information.

    Args:
        metadata (DocumentMetadata): The metadata for the document (default is empty)
        source (SourceMetadata): The source metadata for the document (optional)
        ref (str): The reference (if it is a remote document)
        kddb_path (str): If we want to open an existing kddb
        delete_on_close (boolean): Whether to delete on close
        inmemory (boolean): Whether to operate in memory (faster but more memory intensive)
    """
    # The metadata for the document
    if metadata is None:
        metadata = DocumentMetadata()
    self.metadata = metadata

    self._mixins = []
    self._persistence_layer = None

    # A list of document level labels
    self.labels = []
    self.content_node = None

    # The source of the document
    if source is None:
        source = SourceMetadata()
    self.source = source

    # A UUID representing this document
    self.uuid = str(uuid.uuid4())

    self.create_persistence_layer(kddb_path, delete_on_close, inmemory)

    # The ref attribute is only set when a reference was supplied
    if ref is not None:
        self.ref = Ref(ref)

    self.version = self.CURRENT_VERSION
2835
+
2836
def __str__(self):
    """Return the document's UUID formatted as a ``kodexa://`` reference."""
    return "kodexa://{}".format(self.uuid)
2838
+
2839
def get_validations(self) -> list[DocumentTaxonValidation]:
    """Return the validations provided by ``self.get_persistence()``."""
    persistence = self.get_persistence()
    return persistence.get_validations()
2841
+
2842
def set_validations(self, validations: list[DocumentTaxonValidation]):
    """Hand the given validations to ``self.get_persistence()`` for storage."""
    persistence = self.get_persistence()
    persistence.set_validations(validations)
2844
+
2845
def add_exception(self, exception: ContentException):
    """Pass a content exception to the document's persistence layer."""
    layer = self._persistence_layer
    layer.add_exception(exception)
2847
+
2848
def get_exceptions(self) -> List[ContentException]:
    """Return the content exceptions held by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_exceptions()
2850
+
2851
def get_external_data(self, key="default") -> dict:
    """Fetch the external data stored under ``key`` from the persistence layer.

    Args:
        key: the name the external data was stored under (Default value = "default")

    Returns:
        whatever the persistence layer returns for that key
    """
    persistence = self._persistence_layer
    return persistence.get_external_data(key)
2853
+
2854
def get_external_data_keys(self) -> list[str]:
    """Return the external-data keys reported by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_external_data_keys()
2856
+
2857
def set_external_data(self, external_data: dict, key="default"):
    """Store ``external_data`` under ``key`` in the persistence layer.

    Args:
        external_data: the data to store
        key: the name to store it under (Default value = "default")

    Returns:
        the value returned by the persistence layer's set_external_data
    """
    persistence = self._persistence_layer
    outcome = persistence.set_external_data(external_data, key)
    return outcome
2859
+
2860
def get_steps(self) -> list[ProcessingStep]:
    """Return the processing steps held by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_steps()
2862
+
2863
def set_steps(self, steps: list[ProcessingStep]):
    """Hand the given processing steps to the persistence layer.

    Args:
        steps: the processing steps to store
    """
    persistence = self._persistence_layer
    persistence.set_steps(steps)
2865
+
2866
def replace_exceptions(self, exceptions: List[ContentException]):
    """Pass a replacement list of content exceptions to the persistence layer.

    Args:
        exceptions: the exceptions forwarded to replace_exceptions
    """
    persistence = self._persistence_layer
    persistence.replace_exceptions(exceptions)
2868
+
2869
def create_persistence_layer(
    self, kddb_path=None, delete_on_close=False, inmemory=False
):
    """Create and initialize the SQLite persistence layer for this document.

    Args:
        kddb_path: Path to the KDDB file
        delete_on_close: Whether to delete the file on close
        inmemory: Whether to operate in memory
    """
    # Imported here rather than at module level, as in the original code.
    from kodexa_document.persistence import SqliteDocumentPersistence

    persistence = SqliteDocumentPersistence(
        document=self,
        filename=kddb_path,
        delete_on_close=delete_on_close,
        inmemory=inmemory,
    )
    # Attach before initializing so the layer is reachable from the
    # document while initialize() runs.
    self._persistence_layer = persistence
    self._persistence_layer.initialize()
2889
+
2890
def remove_tags_by_owner(self, owner_uri: str):
    """Remove every tag whose tag data names ``owner_uri`` as its owner.

    Walks all tags known to the document; for each tag instance whose data
    has an "owner_uri" entry equal to ``owner_uri``, the tag is removed
    from every node of that instance.

    Args:
        owner_uri: the owner URI to match against the tag data
    """
    for tag_name in self.get_all_tags():
        for instance in self.get_tag_instances(tag_name):
            instance_data: dict = instance.get_data()
            if "owner_uri" not in instance_data:
                continue
            if not (instance_data["owner_uri"] == owner_uri):
                continue
            for tagged_node in instance.nodes:
                tagged_node.remove_tag(tag_name)
2898
+
2899
def get_nodes_by_type(self, node_type: str) -> List[ContentNode]:
    """Look up all nodes of one node type through the persistence layer.

    Args:
        node_type: the type of the node

    Returns:
        the nodes returned by the persistence layer for that type
    """
    persistence = self._persistence_layer
    return persistence.get_nodes_by_type(node_type)
2911
+
2912
def get_node_by_uuid(self, uuid: int) -> ContentNode:
    """Look up a single node by its identifier through the persistence layer.

    Note that the ``uuid`` parameter hides the ``uuid`` module inside this
    method.

    Args:
        uuid: the uuid of the node

    Returns:
        the node returned by the persistence layer
    """
    persistence = self._persistence_layer
    return persistence.get_node_by_uuid(uuid)
2924
+
2925
def add_tag_instance(
    self, tag_to_apply: str, node_list: List[ContentNode], tag_uuid: str = None
):
    """Tag a group of nodes with one shared Tag so they form a single instance.

    A single Tag object is created and added as a "tag" feature, named
    ``tag_to_apply``, to every node in ``node_list``.

    :param tag_to_apply: name of the tag
    :param node_list: the nodes that receive the tag feature
    :param tag_uuid: UUID for the tag; a random UUID is generated when falsy
    :return:
    """
    shared_tag = Tag()
    if tag_uuid:
        shared_tag.uuid = tag_uuid
    else:
        shared_tag.uuid = str(uuid.uuid4())

    for target_node in node_list:
        target_node.add_feature("tag", tag_to_apply, shared_tag)
2939
+
2940
def update_tag_instance(self, tag_uuid):
    """Resolve the tag on every node of the tag instance identified by ``tag_uuid``.

    The previous implementation read ``tag_instance.tag``, an attribute that
    TagInstance does not define (it exposes ``tag_uuid`` and ``nodes``), and
    it passed the UUID where get_tag_instances expects a tag name. This
    version walks every tag name known to the document and matches
    instances on ``tag_uuid``.

    NOTE(review): as before, the result of ``node.get_tag`` is discarded, so
    this only has an effect if get_tag itself has side effects -- confirm the
    intended behaviour of this method.

    Args:
        tag_uuid: the UUID of the tag instance to look for
    """
    for tag_name in self.get_all_tags():
        for tag_instance in self.get_tag_instances(tag_name):
            if tag_instance.tag_uuid != tag_uuid:
                continue
            for node in tag_instance.nodes:
                node.get_tag(tag_name, tag_uuid=tag_uuid)
2946
+
2947
def get_tag_instance(self, tag):
    """Get the tag instances for the given tag.

    The instances come from get_tag_instances, which already selects by
    ``tag``. The previous implementation additionally filtered on
    ``tag_instance.tag``, an attribute TagInstance does not define, so it
    raised AttributeError as soon as one instance was found.

    :param tag: name of the tag
    :return: a list of tag instances
    """
    return list(self.get_tag_instances(tag))
2958
+
2959
def get_persistence(self):
    """Return the persistence layer backing this document."""
    persistence = self._persistence_layer
    return persistence
2961
+
2962
def get_all_tags(self):
    """Return all tags reported by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_all_tags()
2964
+
2965
def add_model_insight(self, model_insight: ModelInsight):
    """Record a model insight against this document.

    Args:
        model_insight: the insight to pass to the persistence layer
    """
    persistence = self._persistence_layer
    persistence.add_model_insight(model_insight)
2967
+
2968
def clear_model_insights(self):
    """Ask the persistence layer to clear its model insights."""
    persistence = self._persistence_layer
    persistence.clear_model_insights()
2970
+
2971
def get_model_insights(self) -> List[ModelInsight]:
    """Return the model insights held by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_model_insights()
2973
+
2974
def get_tagged_nodes(self, tag_name, tag_uuid=None):
    """Look up the nodes carrying a tag through the persistence layer.

    Args:
        tag_name: the name of the tag
        tag_uuid: optional tag UUID forwarded to the lookup (Default value = None)

    Returns:
        the nodes returned by the persistence layer
    """
    persistence = self._persistence_layer
    return persistence.get_tagged_nodes(tag_name, tag_uuid)
2976
+
2977
@property
def content_node(self) -> ContentNode:
    """The root content node of the document."""
    return self._content_node

@content_node.setter
def content_node(self, value):
    """Replace the root content node and keep the persistence layer in step.

    Assigning ``None`` only clears the root. Otherwise the new root's index
    is forced to 0, a different previous root is removed from persistence,
    and the new root is added to persistence with no parent.
    """
    # The first assignment can arrive before the backing attribute exists.
    if not hasattr(self, "_content_node"):
        self._content_node = None

    if value is None:
        self._content_node = None
        return

    # The root always sits at index 0.
    value.index = 0

    previous_root = self._content_node
    if previous_root is not None and value != previous_root:
        self.get_persistence().remove_content_node(previous_root)

    self._content_node = value
    self.get_persistence().add_content_node(value, None)
3006
+
3007
def get_tag_instances(self, tag):
    """Build a TagInstance for each group of nodes related to ``tag``.

    The groups come from the root node's get_related_tag_nodes (searching
    everywhere); each key/value pair becomes one TagInstance.

    Args:
        tag: the tag to look up

    Returns:
        a list of TagInstance objects; empty when the document has no root
        content node (previously this case raised AttributeError)
    """
    root = self.content_node
    if root is None:
        return []

    groups = root.get_related_tag_nodes(tag, everywhere=True)
    return [TagInstance(group_key, nodes) for group_key, nodes in groups.items()]
3013
+
3014
def add_label(self, label: str):
    """Attach a label to the document unless it is already present.

    Args:
        label: str Label to add

    Returns:
        the document
    """
    already_present = label in self.labels
    if not already_present:
        self.labels.append(label)
    return self
3029
+
3030
def remove_label(self, label: str):
    """Remove a label from the document.

    Removing a label the document does not carry is a no-op, mirroring
    add_label, which ignores labels that are already present. Previously
    this raised ValueError.

    Args:
        label: str Label to remove

    Returns:
        the document
    """
    if label in self.labels:
        self.labels.remove(label)
    return self
3043
+
3044
@classmethod
def from_text(cls, text, separator=None, inmemory=False):
    """Creates a new Document from the text provided.

    The document is built through ``cls`` so that subclasses get an instance
    of their own type.

    Args:
        text: str Text to be used as content on the Document's ContentNode(s)
        separator: str If provided, this string will be used to split the text and the resulting
            text will be placed on children of the root ContentNode. (Default value = None)
        inmemory: bool Passed to the constructor; whether the document operates in memory.
            (Default value = False)

    Returns:
        the document
    """
    new_document = cls(inmemory=inmemory)
    new_document.source.original_filename = f"text-{uuid.uuid4()}"
    new_document.content_node = new_document.create_node(node_type="text", index=0)

    if text:
        if separator:
            for part in text.split(separator):
                child_node = new_document.create_node(node_type="text", content=part)
                new_document.content_node.add_child(child_node)
                # Empty segments get no content parts.
                if part:
                    child_node.set_content_parts([part])
        else:
            new_document.content_node.content = text
            new_document.content_node.set_content_parts([text])

    new_document.add_mixin("text")
    return new_document
3075
+
3076
def get_root(self):
    """Return the document's root content node (an alias for ``content_node``)."""
    root_node = self.content_node
    return root_node
3079
+
3080
def to_kdxa(self, file_path: str):
    """Write the document to a file in the kdxa format (msgpack), which can be
    used with the Kodexa platform.

    Args:
        file_path: str the path of the kdxa file to create

    Returns:

    >>> document.to_kdxa('my-document.kdxa')
    """
    with open(file_path, "wb") as kdxa_file:
        document_dict = self.to_dict()
        msgpack.pack(document_dict, kdxa_file, use_bin_type=True)
3094
+
3095
@staticmethod
def open_kddb(file_path):
    """
    Open a Kodexa Document Database (KDDB) file as a Document.

    This is the Kodexa V4 default way to store documents, it provides high-performance
    and also the ability to handle very large document objects

    :param file_path: The file path
    :return: The Document instance
    """
    opened_document = Document(kddb_path=file_path)
    return opened_document
3107
+
3108
def close(self):
    """
    Close the document by closing its persistence layer
    """
    persistence = self.get_persistence()
    persistence.close()
3113
+
3114
def to_kddb(self, path=None):
    """
    Produce the KDDB form of this document.

    Without a path, the KDDB content is returned as the bytes provided by the
    persistence layer. With a path, the same bytes are written to that file
    and nothing is returned.
    """
    if path is not None:
        with open(path, "wb") as output_file:
            output_file.write(self.get_persistence().get_bytes())
        return None

    return self.get_persistence().get_bytes()
3126
+
3127
@staticmethod
def from_kdxa(file_path):
    """Read a .kdxa (msgpack) file from the given path and build a Document from it.

    Args:
        file_path: the path to the kdxa file

    Returns:
        the Document built from the unpacked dictionary

    >>> document = Document.from_kdxa('my-document.kdxa')
    """
    with open(file_path, "rb") as kdxa_file:
        document_dict = msgpack.unpack(kdxa_file, raw=False)
        loaded_document = Document.from_dict(document_dict)
        return loaded_document
3141
+
3142
def to_msgpack(self):
    """Serialize this document's dictionary form to message pack bytes."""
    document_dict = self.to_dict()
    return msgpack.packb(document_dict, use_bin_type=True)
3145
+
3146
def to_json(self):
    """Serialize this Document's dictionary form to a JSON string.

    Non-ASCII characters are written as-is rather than escaped.

    Returns:
        str: The JSON formatted string representation of this Document.

    >>> document.to_json()
    """
    document_dict = self.to_dict()
    return json.dumps(document_dict, ensure_ascii=False)
3157
+
3158
def to_dict(self):
    """Build a dictionary describing this Document's structure and content.

    The "source" entry is the dataclass dictionary of the source metadata
    with None values (and nested dictionaries left empty by that pruning)
    removed.

    Returns:
        dict: A dictionary representation of this Document.

    >>> document.to_dict()
    """

    def clean_none_values(d):
        """Return a copy of ``d`` without None values, applied recursively.

        Args:
            d (dict): The dictionary to clean.

        Returns:
            dict: A new dictionary; nested dictionaries that end up empty are dropped too.
        """
        clean = {}
        for key, value in d.items():
            if isinstance(value, dict):
                nested = clean_none_values(value)
                if len(nested.keys()) > 0:
                    clean[key] = nested
                continue
            if value is not None:
                clean[key] = value
        return clean

    serialized = {}
    serialized["version"] = Document.CURRENT_VERSION
    serialized["metadata"] = self.metadata
    serialized["content_node"] = (
        self.content_node.to_dict() if self.content_node else None
    )
    serialized["source"] = clean_none_values(dataclasses.asdict(self.source))
    serialized["mixins"] = self._mixins
    serialized["labels"] = self.labels
    serialized["uuid"] = self.uuid
    return serialized
3199
+
3200
@staticmethod
def from_dict(doc_dict):
    """Build a new Document from a dictionary.

    Restores version, uuid, content node, source, labels and mixins. The
    mixins are written by to_dict() but were previously dropped here, so a
    to_dict()/from_dict() round trip lost them.

    Args:
        doc_dict: dict A dictionary representation of a Kodexa Document;
            it must contain a "metadata" entry.

    Returns:
        Document: A complete Kodexa Document

    >>> Document.from_dict(doc_dict)
    """
    new_document = Document(DocumentMetadata(doc_dict["metadata"]))

    # Some older docs don't have a version or it's None.
    new_document.version = doc_dict.get("version") or Document.PREVIOUS_VERSION

    # Documents serialized without a uuid all receive the same
    # name-based (uuid5) fallback value.
    new_document.uuid = (
        doc_dict["uuid"]
        if "uuid" in doc_dict
        else str(uuid.uuid5(uuid.NAMESPACE_DNS, "kodexa.com"))
    )

    if doc_dict.get("content_node"):
        new_document.content_node = ContentNode.from_dict(
            new_document, doc_dict["content_node"]
        )

    if doc_dict.get("source"):
        new_document.source = SourceMetadata.from_dict(doc_dict["source"])
    if doc_dict.get("labels"):
        new_document.labels = doc_dict["labels"]
    if doc_dict.get("mixins"):
        # Copy so the document does not share the caller's list.
        new_document._mixins = list(doc_dict["mixins"])

    new_document.get_persistence().update_metadata()
    return new_document
3237
+
3238
@staticmethod
def from_json(json_string):
    """Parse a JSON string and build a Document from the resulting dictionary.

    Args:
        json_string: str A JSON string representation of a Kodexa Document

    Returns:
        Document: A complete Kodexa Document

    >>> Document.from_json(json_string)
    """
    document_dict = json.loads(json_string)
    return Document.from_dict(document_dict)
3252
+
3253
@staticmethod
def from_msgpack(msgpack_bytes):
    """Unpack a message pack byte array and build a Document from the result.

    Args:
        msgpack_bytes: bytes: A message pack byte array.

    Returns:
        Document: A complete Kodexa Document

    >>> Document.from_msgpack(open(os.path.join('news-doc.kdxa'), 'rb').read())
    """
    document_dict = msgpack.unpackb(msgpack_bytes, raw=False)
    return Document.from_dict(document_dict)
3266
+
3267
def get_mixins(self):
    """
    Return the names of the mixins registered on this document

    Returns:
        mixins: list[str] the document's own list of mixin names (not a copy)
    """
    registered = self._mixins
    return registered
3275
+
3276
def add_mixin(self, mixin):
    """
    Register the given mixin name on this document and refresh the persisted metadata.

    A mixin that is already registered is not added a second time (previously
    repeated calls produced duplicate entries). The persistence layer's
    update_metadata is still called on every invocation.

    Args:
        mixin:str the name of the mixin to add

    Returns:
        >>> document = Document()
        >>> document.add_mixin('spatial')
    """
    if mixin not in self._mixins:
        self._mixins.append(mixin)
    self.get_persistence().update_metadata()
3292
+
3293
def create_node(
    self,
    node_type: str,
    content: Optional[str] = None,
    virtual: bool = False,
    parent: ContentNode = None,
    index: Optional[int] = None,
):
    """
    Create a new ContentNode that belongs to this document.

    A non-virtual node is attached immediately: as a child of ``parent`` when
    one is given, otherwise by adding it to the persistence layer. A virtual
    node is not persisted; it only records its parent's id.

    Args:
        node_type (str): The type of node.
        content (str): The content for the node; defaults to None.
        virtual (bool): Indicates if this is a 'real' or 'virtual' node; default is False. 'Real' nodes contain
                        document content. 'Virtual' nodes are synthesized as necessary to fill gaps in between
                        non-consecutively indexed siblings. Such indexing arises when document content is sparse.
        parent (ContentNode): The parent for this newly created node; default is None.
        index (Optional[int]): The index property to be set on this node; default is None.

    Returns:
        ContentNode: This newly created node.

    >>> document.create_node(node_type='page')
    <kodexa_document.model.ContentNode object at 0x7f80605e53c8>
    """
    new_node = ContentNode(
        document=self,
        node_type=node_type,
        content=content,
        parent=parent,
        index=index,
        virtual=virtual,
    )

    if not virtual:
        # Real nodes are attached right away.
        if parent is None:
            self.get_persistence().add_content_node(new_node)
        else:
            parent.add_child(new_node, index)
    elif parent is not None:
        # Virtual nodes only remember their parent; nothing is persisted.
        new_node._parent_id = parent.id if parent.id else None

    # Fallback in case the constructor did not already record the content
    # as content parts.
    if content is not None and len(new_node.get_content_parts()) == 0:
        new_node.set_content_parts([content])

    return new_node
3346
+
3347
@classmethod
def from_kddb(
    cls, source: Union[str, bytes], detached: bool = True, inmemory: bool = False
):
    """
    Create a document from a KDDB (Kodexa Document Database) source. The source can either be a file path or the KDDB bytes.

    Bytes are first written to a temporary ``.kddb`` file, which is removed
    again before this method returns.

    Args:
        source (str or bytes): The KDDB source.
        detached (bool, optional): Whether to create a detached Document. Defaults to True.
        inmemory (bool, optional): Whether to load the document in memory. Defaults to False.

    Returns:
        Document: A new Document instance loaded from the KDDB source.

    Raises:
        Exception: any error raised while loading is re-raised after a
            best-effort close of the persistence layer.

    >>> document = Document.from_kddb('path/to/document.kddb')
    """
    import tempfile

    from kodexa_document.persistence import SqliteDocumentPersistence

    document = cls(
        kddb_path=source if isinstance(source, str) else None, inmemory=inmemory
    )
    temp_path = None

    try:
        if isinstance(source, bytes):
            # Spill the bytes to disk so the persistence layer can open them.
            with tempfile.NamedTemporaryFile(
                suffix=".kddb", delete=False
            ) as temp_file:
                # Recorded first so the cleanup below still runs if the write fails.
                temp_path = temp_file.name
                temp_file.write(source)
            file_name = temp_path
        else:
            file_name = source

        # NOTE(review): the persistence layer created by the constructor above
        # is replaced here without being closed -- confirm whether that leaks.
        # NOTE(review): the positional True presumably maps to delete_on_close,
        # which would also apply to a caller-supplied path -- confirm.
        document._persistence_layer = SqliteDocumentPersistence(
            document, file_name, True, inmemory
        )
        document._persistence_layer.initialize()

        if detached:
            document._detached = True

        # Save the document type for easier checking
        document._document_type = "kddb"

        return document
    except Exception:
        # Release the document's resources if loading failed.
        failed_layer = getattr(document, "_persistence_layer", None)
        if failed_layer:
            try:
                failed_layer.close()
            except Exception:
                # Best effort only; the original error is re-raised below.
                pass
        raise
    finally:
        # NOTE(review): the temporary file is unlinked while the returned
        # document's persistence layer may still reference it -- confirm.
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError:
                pass
3413
+
3414
@classmethod
def from_file(cls, file, unpack: bool = False):
    """Creates a Document that has a 'file-handle' connector to the specified file.

    With ``unpack=True`` the file is instead loaded as a kdxa document and
    that document is returned. Previously the loaded document was discarded
    and None was returned.

    Args:
        file: The path of the file to which the new Document is connected.
        unpack: bool: Load the file with from_kdxa instead of connecting to it. (Default value = False)

    Returns:
        Document: A Document connected to the specified file, or the
        Document unpacked from it.

    """
    if unpack:
        return cls.from_kdxa(file)

    file_document = cls()
    file_document.metadata["connector"] = "file-handle"
    file_document.metadata["connector_options"] = {}
    file_document.metadata["connector_options"]["file"] = file
    file_document.source.connector = "file-handle"
    file_document.source.original_filename = os.path.basename(file)
    file_document.source.original_path = file
    return file_document
3438
+
3439
@classmethod
def from_url(cls, url, headers=None):
    """Creates a Document that has a 'url' connector for the specified url.

    The document is built through ``cls`` so that subclasses get an instance
    of their own type.

    Args:
        url: str The URL to which the new Document is connected.
        headers: dict Headers that should be used when reading from the URL;
            an empty dict is used when omitted. (Default value = None)

    Returns:
        Document: A Document connected to the specified URL with the specified headers (if any).

    """
    if headers is None:
        headers = {}

    url_document = cls()

    # The same URL and headers are recorded on both metadata and source.
    url_document.metadata.connector = "url"
    url_document.metadata.connector_options.base_url = url
    url_document.metadata.connector_options.headers = headers

    url_document.source.connector = "url"
    url_document.source.original_filename = url
    url_document.source.original_path = url
    url_document.source.headers = headers
    return url_document
3464
+
3465
def select_first(self, selector, variables=None) -> Optional[ContentNode]:
    """Run the selector on the document and return only the first matching node.

    Args:
        selector (str): The selector (ie. //*)
        variables (dict, optional): A dictionary of variable name/value to use in substitution; defaults to None.
                                    Dictionary keys should match a variable specified in the selector.

    Returns:
        Optional[ContentNode]: The first matching node, or None when nothing matches

    >>> document.select_first('.')
       ContentNode

    >>> document.select_first('//*[hasTag($tagName)]', {"tagName": "div"})
       ContentNode
    """
    matches = self.select(selector, variables, first_only=True)
    if len(matches) > 0:
        return matches[0]
    return None
3484
+
3485
def select(
    self, selector: str, variables: Optional[dict] = None, first_only=False
) -> List[ContentNode]:
    """Execute a selector on the root node and return the matching nodes as a list.

    Args:
        selector (str): The selector (ie. //*)
        variables (Optional[dict]): A dictionary of variable name/value to use in substitution; defaults to an
                                    empty dictionary. Dictionary keys should match a variable specified in
                                    the selector.
        first_only (bool): If True, only the first matching node is returned; defaults to False.

    Returns:
        list[ContentNodes]: A list of the matching ContentNodes. If no matches found, or the document has
        no root node, the list is empty.

    >>> document.select('.')
       [ContentNode]
    """
    if variables is None:
        variables = {}

    if not self.content_node:
        return []

    result = self.content_node.select(selector, variables, first_only)
    if isinstance(result, list):
        return result
    if isinstance(result, ContentNode):
        return [result]

    # Any other result type: a truthy value selects the root node itself.
    if bool(result):
        return [self.content_node]
    return []
3513
+
3514
def get_labels(self) -> List[str]:
    """Return the document-level labels.

    Returns:
        List[str]: the document's own label list (not a copy)
    """
    current_labels = self.labels
    return current_labels
3524
+
3525
def get_feature_set(self, owner_uri: Optional[str] = None) -> FeatureSet:
    """Collect the tag features of every tagged node into a FeatureSet.

    Each tagged node contributes one entry with its "nodeUuid" and a list of
    its tag features; features of any other type are left out. When
    ``owner_uri`` is given, a tag feature whose first value carries a
    different "owner_uri" is skipped, while features without an "owner_uri"
    are kept.

    Args:
        owner_uri: optional owner URI used to filter the tag features

    Returns:
        FeatureSet: the feature set with ``node_features`` populated
    """
    feature_set = FeatureSet()
    feature_set.node_features = []

    for tagged_node in self.get_all_tagged_nodes():
        node_feature = {"nodeUuid": str(tagged_node.id), "features": []}
        feature_set.node_features.append(node_feature)

        # TODO this needs to be cleaned up, also should it only really
        # be the tag features?
        for feature in tagged_node.get_features():
            if feature.feature_type != "tag":
                continue

            owned_by_someone_else = (
                owner_uri is not None
                and "owner_uri" in feature.value[0]
                and feature.value[0]["owner_uri"] != owner_uri
            )
            if owned_by_someone_else:
                continue

            feature_dict = feature.to_dict()
            feature_dict["featureType"] = feature.feature_type
            feature_dict["name"] = feature.name

            # Tag objects are flattened to plain dictionaries.
            first_value = feature_dict["value"][0]
            if isinstance(first_value, Tag):
                feature_dict["value"] = [first_value.to_dict()]

            node_feature["features"].append(feature_dict)

    return feature_set
3555
+
3556
def get_all_tagged_nodes(self) -> List[ContentNode]:
    """
    Return the tagged nodes reported by the persistence layer

    :return: the list returned by the persistence layer's get_all_tagged_nodes
    """
    persistence = self._persistence_layer
    return persistence.get_all_tagged_nodes()
3563
+
3564
+
3565
class TagInstance:
    """
    One occurrence of a tag: the tag's UUID together with the nodes carrying it.

    Attributes
    ----------
    tag_uuid : str
        the unique identifier of the tag
    nodes : list
        the nodes associated with the tag

    Methods
    -------
    get_value():
        Returns the content of all nodes joined by single spaces.
    get_data():
        Returns the tag feature data whose uuid matches this tag.
    """

    def __init__(self, tag_uuid, nodes):
        self.tag_uuid = tag_uuid
        self.nodes = nodes

    def get_value(self):
        """
        Join the full content of every node with single spaces.

        Returns
        -------
        str
            the combined content of all nodes (empty when there are no nodes)
        """
        return " ".join([node.get_all_content() for node in self.nodes])

    def get_data(self):
        """
        Find the tag feature data that belongs to this tag instance.

        Only the first value of each tag feature is inspected.

        Returns
        -------
        dict
            the first tag feature value whose "uuid" equals ``tag_uuid``,
            or an empty dictionary when none matches
        """
        candidates = (
            tag_feature.value[0]
            for node in self.nodes
            for tag_feature in node.get_tag_features()
        )
        for data in candidates:
            if "uuid" in data and data["uuid"] == self.tag_uuid:
                return data
        return {}
3619
+
3620
+
3621
class ContentObjectReference:
    """Bundle a content object with the store, document and document family it belongs to.

    This is a plain holder: the constructor stores its four arguments
    unchanged.

    Attributes:
        content_object (ContentObject): The content object being referenced.
        store: The store where the document is located.
        document (Document): The document in which the content object is located.
        document_family: The family to which the document belongs.
    """

    def __init__(
        self, content_object: ContentObject, store, document: Document, document_family
    ):
        # What is referenced ...
        self.content_object = content_object
        self.document = document
        # ... and where it lives.
        self.store = store
        self.document_family = document_family