kodexa 7.4.414781565138__py3-none-any.whl → 8.0.14958192442__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kodexa/dataclasses/__init__.py +1 -1
  2. kodexa/model/__init__.py +2 -2
  3. kodexa/model/utils.py +1 -1
  4. kodexa/pipeline/pipeline.py +1 -1
  5. kodexa/platform/client.py +1 -2
  6. kodexa/selectors/__init__.py +1 -1
  7. kodexa/selectors/ast.py +371 -98
  8. kodexa/selectors/error.py +29 -0
  9. kodexa/selectors/kodexa-ast-visitor.py +268 -0
  10. kodexa/selectors/parser.py +91 -0
  11. kodexa/selectors/resources/KodexaSelector.interp +99 -0
  12. kodexa/selectors/resources/KodexaSelector.tokens +56 -0
  13. kodexa/selectors/resources/KodexaSelectorLexer.interp +119 -0
  14. kodexa/selectors/resources/KodexaSelectorLexer.py +204 -0
  15. kodexa/selectors/resources/KodexaSelectorLexer.tokens +56 -0
  16. kodexa/selectors/resources/KodexaSelectorListener.py +570 -0
  17. kodexa/selectors/resources/KodexaSelectorParser.py +3246 -0
  18. kodexa/selectors/resources/KodexaSelectorVisitor.py +323 -0
  19. kodexa/selectors/visitor.py +265 -0
  20. kodexa/steps/__init__.py +4 -2
  21. kodexa/steps/common.py +0 -68
  22. kodexa/testing/test_utils.py +1 -1
  23. {kodexa-7.4.414781565138.dist-info → kodexa-8.0.14958192442.dist-info}/METADATA +3 -1
  24. kodexa-8.0.14958192442.dist-info/RECORD +53 -0
  25. {kodexa-7.4.414781565138.dist-info → kodexa-8.0.14958192442.dist-info}/WHEEL +1 -1
  26. kodexa/model/model.py +0 -3259
  27. kodexa/model/persistence.py +0 -2017
  28. kodexa/selectors/core.py +0 -124
  29. kodexa/selectors/lexrules.py +0 -137
  30. kodexa/selectors/lextab.py +0 -83
  31. kodexa/selectors/lextab.pyi +0 -1
  32. kodexa/selectors/parserules.py +0 -414
  33. kodexa/selectors/parserules.pyi +0 -1
  34. kodexa/selectors/parsetab.py +0 -90
  35. kodexa/selectors/parsetab.pyi +0 -1
  36. kodexa-7.4.414781565138.dist-info/RECORD +0 -51
  37. {kodexa-7.4.414781565138.dist-info → kodexa-8.0.14958192442.dist-info}/LICENSE +0 -0
kodexa/model/model.py DELETED
@@ -1,3259 +0,0 @@
1
- """
2
- The core model provides definitions for all the base objects in the Kodexa Content Model
3
- """
4
- import dataclasses
5
- import inspect
6
- import json
7
- import os
8
- import re
9
- import uuid
10
- from enum import Enum
11
- from typing import Any, List, Optional
12
- from addict import Dict
13
- import deepdiff
14
- import msgpack
15
- from pydantic import BaseModel, ConfigDict, Field
16
-
17
- from kodexa.model.objects import ContentObject, FeatureSet, DocumentTaxonValidation
18
-
19
-
20
- class Ref:
21
- """
22
- A class to represent a reference.
23
-
24
- Attributes
25
- ----------
26
- ref : str
27
- a string reference
28
- version : str, optional
29
- a version of the reference, default is None
30
- resource : str, optional
31
- a resource of the reference, default is None
32
- slug : str
33
- a slug of the reference, default is an empty string
34
- org_slug : str
35
- an organization slug of the reference, default is an empty string
36
- object_ref : str
37
- a formatted string of the reference
38
-
39
- Methods
40
- -------
41
- __init__(self, ref: str)
42
- Constructs all the necessary attributes for the Ref object.
43
- """
44
-
45
- def __init__(self, ref: str):
46
- self.ref: str = ref
47
- first_part = ref
48
- self.version: Optional[str] = None
49
- self.resource: Optional[str] = None
50
- self.slug: str = ""
51
- self.org_slug: str = ""
52
-
53
- if ":" in ref:
54
- (first_part, self.version) = ref.split(":")
55
-
56
- if "/" in self.version:
57
- (self.version, self.resource) = self.version.split("/")
58
-
59
- (self.org_slug, self.slug) = first_part.split("/")
60
-
61
- self.object_ref = (
62
- f"{self.org_slug}/{self.slug}:{self.version}"
63
- if self.version
64
- else f"{self.org_slug}/{self.slug}"
65
- )
66
-
67
-
68
- import addict
69
-
70
-
71
- class DocumentMetadata(addict.Dict):
72
- """A flexible dict based approach to capturing metadata for the document.
73
-
74
- This class extends from Dict to provide a flexible way to store and
75
- manage metadata associated with a document.
76
-
77
- Args:
78
- *args: Variable length argument list.
79
- **kwargs: Arbitrary keyword arguments.
80
- """
81
-
82
- """A flexible dict based approach to capturing metadata for the document"""
83
-
84
- def __init__(self, *args, **kwargs):
85
- super().__init__(*args, **kwargs)
86
-
87
-
88
- class ContentException(dict):
89
- """A content exception represents an issue identified during labeling or validation at the document level.
90
-
91
- Attributes:
92
- tag (Optional[str]): Tag associated with the exception.
93
- message (str): Message describing the exception.
94
- exception_details (Optional[str]): Detailed information about the exception.
95
- group_uuid (Optional[str]): UUID of the group associated with the exception.
96
- tag_uuid (Optional[str]): UUID of the tag associated with the exception.
97
- exception_type (str): Type of the exception.
98
- node_uuid (Optional[str]): UUID of the node associated with the exception.
99
- severity (str): Severity level of the exception, default is 'ERROR'.
100
- value (Optional[str]): Value associated with the exception.
101
- exception_type_id (Optional[str]): ID of the exception type.
102
- """
103
-
104
- """A content exception represents an issue identified during labeling or validation at the document level"""
105
-
106
- def __init__(
107
- self,
108
- exception_type: str,
109
- message: str,
110
- severity: str = "ERROR",
111
- tag: Optional[str] = None,
112
- group_uuid: Optional[str] = None,
113
- tag_uuid: Optional[str] = None,
114
- exception_type_id: Optional[str] = None,
115
- exception_details: Optional[str] = None,
116
- node_uuid: Optional[str] = None,
117
- value: Optional[str] = None,
118
- boolean_value: Optional[bool] = None,
119
- *args,
120
- **kwargs,
121
- ):
122
- super().__init__(*args, **kwargs)
123
- self.tag = tag
124
- self.message = message
125
- self.exception_details = exception_details
126
- self.group_uuid = group_uuid
127
- self.tag_uuid = tag_uuid
128
- self.exception_type = exception_type
129
- self.node_uuid = node_uuid
130
- self.severity = severity
131
- self.value = value
132
- self.exception_type_id = exception_type_id
133
- self.boolean_value = boolean_value
134
-
135
-
136
- class Tag(Dict):
137
- """A class to represent the metadata for a label that is applied as a feature on a content node.
138
-
139
- Attributes:
140
- start (Optional[int]): The start position (zero indexed) of the content within the node. If None, label is applied to the whole node.
141
- end (Optional[int]): The end position (zero indexed) of the content within the node. If None, label is applied to the whole node.
142
- value (Optional[str]): A string representing the value that was labelled in the node.
143
- data (Optional[Any]): Any data object (JSON serializable) that you wish to associate with the label.
144
- uuid (Optional[str]): The UUID for this tag instance. This allows tags that are on different content nodes to be related through the same UUID.
145
- confidence (Optional[float]): The confidence of the tag in a range of 0-1.
146
- index (Optional[int]): The tag index. This is used to allow us to order tags, and understand the ordering of parent child tag relationships.
147
- bbox (Optional[List[int]]): The optional bounding box that can be used if the label is spatial (based on the node as the container).
148
- group_uuid (Optional[str]): The UUID of the group that this tag belongs to. This is used to allow us to group tags together.
149
- parent_group_uuid (Optional[str]): The UUID of the parent group that this tag belongs to. This is used to allow us to group tags together.
150
- cell_index (Optional[int]): The cell index of the cell that this tag belongs to. This is used to allow us to group tags together.
151
- note (Optional[str]): A note that can be associated with the tag.
152
- status (Optional[str]): The status of the tag. This can be passed to an attribute status during extraction.
153
- owner_uri (Optional[str]): The URI of the owner (ie. model://kodexa/narrative:1.0.0 or user://pdodds).
154
- """
155
-
156
- """A tag represents the metadata for a label that is applies as a feature on a content node"""
157
-
158
- def __init__(
159
- self,
160
- start: Optional[int] = None,
161
- end: Optional[int] = None,
162
- value: Optional[str] = None,
163
- uuid: Optional[str] = None,
164
- data: Any = None,
165
- *args,
166
- confidence: Optional[float] = None,
167
- group_uuid: Optional[str] = None,
168
- parent_group_uuid: Optional[str] = None,
169
- cell_index: Optional[int] = None,
170
- index: Optional[int] = None,
171
- bbox: Optional[List[int]] = None,
172
- note: Optional[str] = None,
173
- status: Optional[str] = None,
174
- owner_uri: Optional[str] = None,
175
- is_dirty: Optional[bool] = None,
176
- **kwargs,
177
- ):
178
- super().__init__(*args, **kwargs)
179
-
180
- import uuid as uuid_gen
181
- self.start: Optional[int] = start
182
- """The start position (zero indexed) of the content within the node, if None then label is applied to the whole node"""
183
- self.end: Optional[int] = end
184
- """The end position (zero indexed) of the content within the node, if None then label is applied to the whole node"""
185
- self.value: Optional[str] = value
186
- """A string representing the value that was labelled in the node"""
187
- self.data: Optional[Any] = data
188
- """Any data object (JSON serializable) that you wish to associate with the label"""
189
- self.uuid: Optional[str] = uuid or str(uuid_gen.uuid4())
190
- """The UUID for this tag instance, this allows tags that are on different content nodes to be related through the same UUID"""
191
- self.confidence: Optional[float] = confidence
192
- """The confidence of the tag in a range of 0-1"""
193
- self.index: Optional[int] = index
194
- """The tag index, this is used to allow us to order tags, and understand the ordering of parent child tag relationships"""
195
- self.bbox: Optional[List[int]] = bbox
196
- """The optional bounding box that can be used if the label is spatial (based on the node as the container)"""
197
- self.group_uuid: Optional[str] = group_uuid
198
- """The UUID of the group that this tag belongs to, this is used to allow us to group tags together"""
199
- self.parent_group_uuid: Optional[str] = parent_group_uuid
200
- """The UUID of the parent group that this tag belongs to, this is used to allow us to group tags together"""
201
- self.cell_index: Optional[int] = cell_index
202
- """The cell index of the cell that this tag belongs to, this is used to allow us to group tags together"""
203
- self.note: Optional[str] = note
204
- """A note that can be associated with the tag"""
205
- self.status: Optional[str] = status
206
- """The status of the tag, this can be passed to an attribute status during extraction"""
207
- self.owner_uri: Optional[str] = owner_uri
208
- """The URI of the owner (ie. model://kodexa/narrative:1.0.0 or user://pdodds)"""
209
- self.is_dirty: Optional[bool] = is_dirty
210
- """Whether or not the """
211
- # Pull the cell index from the data to the tag if we have it in the data
212
- if self.cell_index is None:
213
- if data and "cell_index" in data:
214
- self.cell_index = data["cell_index"]
215
-
216
-
217
- class FindDirection(Enum):
218
- """
219
- Enum class for defining the direction of search in a tree structure.
220
-
221
- Attributes:
222
- CHILDREN (int): Represents the direction towards children nodes.
223
- PARENT (int): Represents the direction towards parent node.
224
- """
225
-
226
- """ """
227
- CHILDREN = 1
228
- PARENT = 2
229
-
230
-
231
- class Traverse(Enum):
232
- """
233
- An enumeration class that represents different types of traversals.
234
-
235
- Attributes:
236
- SIBLING (int): Represents traversal to a sibling.
237
- CHILDREN (int): Represents traversal to children.
238
- PARENT (int): Represents traversal to a parent.
239
- ALL (int): Represents traversal to all types of nodes.
240
- """
241
-
242
- """ """
243
- SIBLING = 1
244
- CHILDREN = 2
245
- PARENT = 3
246
- ALL = 4
247
-
248
-
249
- class ContentNode(object):
250
- """A Content Node identifies a section of the document containing logical
251
- grouping of information.
252
-
253
- The node will have content and can include any number of features.
254
-
255
- You should always create a node using the Document's create_node method to
256
- ensure that the correct mixins are applied.
257
-
258
- >>> new_page = document.create_node(node_type='page')
259
- <kodexa.model.model.ContentNode object at 0x7f80605e53c8>
260
- >>> current_content_node.add_child(new_page)
261
-
262
- >>> new_page = document.create_node(node_type='page', content='This is page 1')
263
- <kodexa.model.model.ContentNode object at 0x7f80605e53c8>
264
- >>> current_content_node.add_child(new_page)
265
-
266
- """
267
-
268
- def __init__(
269
- self,
270
- document,
271
- node_type: str,
272
- content: Optional[str] = None,
273
- content_parts: Optional[List[Any]] = None,
274
- parent=None,
275
- index: Optional[int] = None,
276
- virtual: bool = False,
277
- ):
278
- self.node_type: str = node_type
279
- """The node type (ie. line, page, cell etc)"""
280
- self.document: Document = document
281
- """The document that the node belongs to"""
282
- self._content_parts: Optional[List[Any]] = content_parts
283
- """The children of the content node"""
284
- self.index: Optional[int] = index
285
- """The index of the content node"""
286
- self.uuid: Optional[int] = None
287
- """The ID of the content node"""
288
- self.virtual: bool = virtual
289
- """Is the node virtual (ie. it doesn't actually exist in the document)"""
290
-
291
- self._parent_uuid = parent.uuid if parent else None
292
-
293
- if content is not None and len(self.get_content_parts()) == 0:
294
- self.set_content_parts([content])
295
-
296
- def get_content_parts(self):
297
- return self.document.get_persistence().get_content_parts(self)
298
-
299
- def set_content_parts(self, content_parts):
300
- self.document.get_persistence().update_content_parts(self, content_parts)
301
-
302
- def update(self):
303
- """
304
- Update this node in the document persistence
305
-
306
- :return:
307
- """
308
- self.document.get_persistence().update_node(self)
309
-
310
- @property
311
- def content(self):
312
- if len(self.get_content_parts()) == 0:
313
- return None
314
-
315
- s = ""
316
- for part in self.get_content_parts():
317
- if isinstance(part, str):
318
- if s != "":
319
- s += " "
320
- s += part
321
-
322
- return s
323
-
324
- @content.setter
325
- def content(self, new_content):
326
- if len(self.get_content_parts()) == 0:
327
- self.set_content_parts([new_content])
328
- else:
329
- # We need to remove all the strings and add this one
330
- # back at the front
331
- parts = self.get_content_parts()
332
- filtered_parts = list(filter(lambda part: isinstance(part, int), parts))
333
- if new_content is not None and new_content != "":
334
- filtered_parts.insert(0, new_content)
335
- self.set_content_parts(filtered_parts)
336
-
337
- def __eq__(self, other):
338
- return (
339
- other is not None
340
- and self.uuid == other.uuid
341
- and (self.uuid is not None and other.uuid is not None)
342
- )
343
-
344
- def __hash__(self):
345
- return hash(self.uuid)
346
-
347
- def get_parent(self):
348
- return self.document.get_persistence().get_parent(self)
349
-
350
- def __str__(self):
351
- return (
352
- f"ContentNode {self.uuid} [node_type:{self.node_type}] ({len(self.get_features())} features, {len(self.get_children())} children) ["
353
- + str(self.content)
354
- + "]"
355
- )
356
-
357
- def to_json(self):
358
- """Create a JSON string representation of this ContentNode.
359
-
360
- Args:
361
-
362
- Returns:
363
- str: The JSON formatted string representation of this ContentNode.
364
-
365
- >>> node.to_json()
366
- """
367
- return json.dumps(self.to_dict())
368
-
369
- def to_dict(self):
370
- """Create a dictionary representing this ContentNode's structure and content.
371
-
372
- Args:
373
-
374
- Returns:
375
- dict: The properties of this ContentNode and all of its children structured as a dictionary.
376
-
377
- >>> node.to_dict()
378
- """
379
- new_dict = {
380
- "node_type": self.node_type,
381
- "content": self.content,
382
- "content_parts": self.get_content_parts(),
383
- "features": [],
384
- "index": self.index,
385
- "children": [],
386
- "uuid": self.uuid,
387
- }
388
- for feature in self.get_features():
389
- new_dict["features"].append(feature.to_dict())
390
-
391
- for child in self.get_children():
392
- new_dict["children"].append(child.to_dict())
393
- return new_dict
394
-
395
- @staticmethod
396
- def from_dict(document, content_node_dict: dict, parent=None):
397
- """Build a new ContentNode from a dictionary represention.
398
-
399
- Args:
400
- document (Document): The Kodexa document from which the new ContentNode will be created (not added).
401
- content_node_dict (Dict): The dictionary-structured representation of a ContentNode. This value will be unpacked into a ContentNode.
402
- parent (Optional[ContentNode]): Optionally the parent content node
403
- Returns:
404
- ContentNode: A ContentNode containing the unpacked values from the content_node_dict parameter.
405
-
406
- >>> ContentNode.from_dict(document, content_node_dict)
407
- """
408
-
409
- node_type = (
410
- content_node_dict["type"]
411
- if document.version == Document.PREVIOUS_VERSION
412
- else content_node_dict["node_type"]
413
- )
414
-
415
- new_content_node = document.create_node(
416
- node_type=node_type,
417
- content=content_node_dict["content"]
418
- if "content" in content_node_dict
419
- else None,
420
- index=content_node_dict["index"],
421
- parent=parent,
422
- )
423
-
424
- if (
425
- "content_parts" in content_node_dict
426
- and len(content_node_dict["content_parts"]) > 0
427
- ):
428
- new_content_node.set_content_parts(content_node_dict["content_parts"])
429
-
430
- for dict_feature in content_node_dict["features"]:
431
- feature_type = dict_feature["name"].split(":")[0]
432
- if feature_type == "tag":
433
- new_content_node.add_feature(
434
- feature_type,
435
- dict_feature["name"].split(":")[1],
436
- dict_feature["value"],
437
- dict_feature["single"],
438
- True,
439
- )
440
- else:
441
- new_content_node.add_feature(
442
- feature_type,
443
- dict_feature["name"].split(":")[1],
444
- dict_feature["value"],
445
- dict_feature["single"],
446
- True,
447
- )
448
-
449
- for dict_child in content_node_dict["children"]:
450
- ContentNode.from_dict(document, dict_child, new_content_node)
451
-
452
- return new_content_node
453
-
454
- def add_child_content(
455
- self, node_type: str, content: str, index: Optional[int] = None
456
- ) -> "ContentNode":
457
- """Convenience method to allow you to quick add a child node with a type and content
458
-
459
- Args:
460
- node_type: the node type
461
- content: the content
462
- index: the index (optional) (Default value = None)
463
-
464
- Returns:
465
- the new ContentNode
466
-
467
- """
468
- new_node = self.document.create_node(
469
- node_type=node_type, parent=self, content=content
470
- )
471
- self.add_child(new_node, index)
472
- return new_node
473
-
474
- def add_child(self, child, index: Optional[int] = None):
475
- """Add a ContentNode as a child of this ContentNode
476
-
477
- Args:
478
- child (ContentNode): The node that will be added as a child of this node
479
- index (Optional[int]): The index at which this child node should be added; defaults to None. If None, index is set as the count of child node elements.
480
-
481
- Returns:
482
-
483
- >>> new_page = document.create_node(node_type='page')
484
- <kodexa.model.model.ContentNode object at 0x7f80605e53c8>
485
- >>> current_content_node.add_child(new_page)
486
- """
487
- if index is None:
488
- if len(self.get_children()) > 0:
489
- child.index = self.get_children()[-1].index + 1
490
- else:
491
- child.index = 0
492
- else:
493
- child.index = index
494
-
495
- self.document.get_persistence().add_content_node(child, self)
496
-
497
- def remove_child(self, content_node):
498
- self.document.get_persistence().remove_content_node(content_node)
499
-
500
- def get_children(self):
501
- """Returns a list of the children of this node.
502
-
503
- Returns:
504
- list[ContentNode]: The list of child nodes for this ContentNode.
505
-
506
- >>> node.get_children()
507
- """
508
- return self.document.get_persistence().get_children(self)
509
-
510
- def set_feature(self, feature_type, name, value):
511
- """Sets a feature for this ContentNode, replacing the value if a feature by this type and name already exists.
512
-
513
- Args:
514
- feature_type (str): The type of feature to be added to the node.
515
- name (str): The name of the feature.
516
- value (Any): The value of the feature.
517
-
518
- Returns:
519
- ContentFeature: The feature that was added to this ContentNode
520
-
521
- >>> new_page = document.create_node(node_type='page')
522
- <kodexa.model.model.ContentNode object at 0x7f80605e53c8>
523
- >>> new_page.add_feature('pagination','pageNum',1)
524
- """
525
- self.remove_feature(feature_type, name)
526
- return self.add_feature(feature_type, name, value)
527
-
528
- def update_feature(self, feature: "ContentFeature"):
529
- """
530
- Update a feature on this node in document persistence
531
-
532
- :param feature:
533
- :return:
534
- """
535
- self.document.get_persistence().remove_feature(
536
- self, feature.feature_type, feature.name
537
- )
538
- self.document.get_persistence().add_feature(self, feature)
539
-
540
- def add_feature(self, feature_type, name, value, single=True, serialized=False):
541
- """
542
- Add a new feature to this ContentNode.
543
-
544
- Note: if a feature for this feature_type/name already exists, the new value will be added to the existing feature;
545
- therefore the feature value might become a list.
546
-
547
- Args:
548
- feature_type (str): The type of feature to be added to the node.
549
- name (str): The name of the feature.
550
- value (Any): The value of the feature.
551
- single (boolean): Indicates that the value is singular, rather than a collection (ex: str vs list); defaults to True.
552
- serialized (boolean): Indicates that the value is/is not already serialized; defaults to False.
553
-
554
- Returns:
555
- ContentFeature: The feature that was added to this ContentNode.
556
-
557
- >>> new_page = document.create_node(node_type='page')
558
- <kodexa.model.model.ContentNode object at 0x7f80605e53c8>
559
- >>> new_page.add_feature('pagination','pageNum',1)
560
- """
561
- if self.has_feature(feature_type, name):
562
- existing_feature = self.get_feature(feature_type, name)
563
- if isinstance(existing_feature.value, list):
564
- existing_feature.value.append(value)
565
- else:
566
- existing_feature.value = [existing_feature.value, value]
567
- self.update_feature(existing_feature)
568
- return existing_feature
569
-
570
- # Make sure that we treat the value as list all the time
571
- new_feature = ContentFeature(
572
- feature_type,
573
- name,
574
- [value] if single and not serialized else value,
575
- single=single,
576
- )
577
- self.document.get_persistence().add_feature(self, new_feature)
578
- return new_feature
579
-
580
- def delete_children(
581
- self, nodes: Optional[List] = None, exclude_nodes: Optional[List] = None
582
- ):
583
- """Delete the children of this node, you can either supply a list of the nodes to delete
584
- or the nodes to exclude from the delete, if neither are supplied then we delete all the children.
585
-
586
- Note there is precedence in place, if you have provided a list of nodes to delete then the nodes
587
- to exclude is ignored.
588
-
589
- Args:
590
- nodes: Optional[List[ContentNode]] a list of content nodes that are children to delete
591
- exclude_nodes: Optional[List[ContentNode]] a list of content node that are children not to delete
592
- nodes: Optional[List]: (Default value = None)
593
- exclude_nodes: Optional[List]: (Default value = None)
594
- """
595
- children_to_delete = []
596
-
597
- for child_node in self.get_children():
598
- if nodes is not None:
599
- for node_to_delete in nodes:
600
- if node_to_delete.uuid == child_node.uuid:
601
- children_to_delete.append(child_node)
602
- elif exclude_nodes is not None:
603
- if len(exclude_nodes) == 0:
604
- children_to_delete.append(child_node)
605
- else:
606
- for nodes_to_exclude in exclude_nodes:
607
- if nodes_to_exclude.uuid != child_node.uuid:
608
- children_to_delete.append(child_node)
609
- else:
610
- children_to_delete.append(child_node)
611
-
612
- for child_to_delete in children_to_delete:
613
- if child_to_delete in self.get_children():
614
- self.document.get_persistence().remove_content_node(child_to_delete)
615
-
616
- def get_feature(self, feature_type, name):
617
- """Gets the value for the given feature.
618
-
619
- Args:
620
- feature_type (str): The type of the feature.
621
- name (str): The name of the feature.
622
-
623
- Returns:
624
- ContentFeature or None: The feature with the specified type & name. If no feature is found, None is returned.
625
- Note that if there are more than one instance of the feature you will only get the first one
626
-
627
- >>> new_page.get_feature('pagination','pageNum')
628
- 1
629
- """
630
- hits = [
631
- i
632
- for i in self.get_features()
633
- if i.feature_type == feature_type and i.name == name
634
- ]
635
- if len(hits) > 0:
636
-
637
- # We have a situation where the feature isn't a dict since it
638
- # was added as a "Tag", lets turn it back into a dict to be
639
- # consistent
640
- if isinstance(hits[0].value, Tag):
641
- hits[0].value = hits[0].value.to_dict()
642
-
643
- return hits[0]
644
-
645
- return None
646
-
647
- def get_features_of_type(self, feature_type):
648
- """Get all features of a specific type.
649
-
650
- Args:
651
- feature_type (str): The type of the feature.
652
-
653
- Returns:
654
- list[ContentFeature]: A list of feature with the specified type. If no features are found, an empty list is returned.
655
-
656
- >>> new_page.get_features_of_type('my_type')
657
- []
658
- """
659
- return [i for i in self.get_features() if i.feature_type == feature_type]
660
-
661
- def has_feature(self, feature_type: str, name: str):
662
- """Determines if a feature with the given feature and name exists on this content node.
663
-
664
- Args:
665
- feature_type (str): The type of the feature.
666
- name (str): The name of the feature.
667
-
668
- Returns:
669
- bool: True if the feature is present; else, False.
670
-
671
- >>> new_page.has_feature('pagination','pageNum')
672
- True
673
- """
674
- return (
675
- len(
676
- [
677
- i
678
- for i in self.get_features()
679
- if i.feature_type == feature_type and i.name == name
680
- ]
681
- )
682
- > 0
683
- )
684
-
685
- def get_features(self):
686
- """Get all features on this ContentNode.
687
-
688
- Returns:
689
- list[ContentFeature]: A list of the features on this ContentNode.
690
-
691
- """
692
- return self.document.get_persistence().get_features(self)
693
-
694
- def remove_feature(
695
- self, feature_type: str, name: str, include_children: bool = False
696
- ):
697
- """Removes the feature with the given name and type from this node.
698
-
699
- Args:
700
- feature_type (str): The type of the feature.
701
- name (str): The name of the feature.
702
- include_children (bool): also remove the feature from nodes children
703
-
704
- >>> new_page.remove_feature('pagination','pageNum')
705
- """
706
- self.document.get_persistence().remove_feature(self, feature_type, name)
707
-
708
- if include_children:
709
- for child in self.get_children():
710
- child.remove_feature(feature_type, name, include_children)
711
-
712
- def get_feature_value(self, feature_type: str, name: str) -> Optional[Any]:
713
- """Get the value for a feature with the given name and type on this ContentNode.
714
-
715
- Args:
716
- feature_type (str): The type of the feature.
717
- name (str): The name of the feature.
718
-
719
- Returns:
720
- Any or None: The value of the feature if it exists on this ContentNode otherwise, None, note this
721
- only returns the first value (check single to determine if there are multiple)
722
-
723
- >>> new_page.get_feature_value('pagination','pageNum')
724
- 1
725
- """
726
- feature = self.get_feature(feature_type, name)
727
-
728
- # Need to make sure we handle the idea of a single value for a feature
729
- return None if feature is None else feature.value[0]
730
-
731
- def get_feature_values(self, feature_type: str, name: str) -> Optional[List[Any]]:
732
- """Get the value for a feature with the given name and type on this ContentNode.
733
-
734
- Args:
735
- feature_type (str): The type of the feature.
736
- name (str): The name of the feature.
737
-
738
- Returns:
739
- The list of feature values or None if there is no feature
740
-
741
- >>> new_page.get_feature_value('pagination','pageNum')
742
- 1
743
- """
744
- feature = self.get_feature(feature_type, name)
745
-
746
- # Simply return all the feature values
747
- return None if feature is None else feature.value
748
-
749
- def get_content(self):
750
- """Get the content of this node.
751
-
752
- Args:
753
-
754
- Returns:
755
- str: The content of this ContentNode.
756
-
757
- >>> new_page.get_content()
758
- "This is page one"
759
- """
760
- return self.content
761
-
762
- def get_node_type(self):
763
- """Get the type of this node.
764
-
765
- Args:
766
-
767
- Returns:
768
- str: The type of this ContentNode.
769
-
770
- >>> new_page.get_content()
771
- "page"
772
- """
773
- return self.node_type
774
-
775
- def select_first(self, selector, variables=None) -> Optional["ContentNode"]:
776
- """Select and return the first child of this node that match the selector value.
777
-
778
- Args:
779
- selector (str): The selector (ie. //*)
780
- variables (dict, optional): A dictionary of variable name/value to use in substituion; defaults to None. Dictionary keys should match a variable specified in the selector.
781
-
782
- Returns:
783
- Optional[ContentNode]: The first matching node or none
784
-
785
- >>> document.get_root().select_first('.')
786
- ContentNode
787
-
788
- >>> document.get_root().select_first('//*[hasTag($tagName)]', {"tagName": "div"})
789
- ContentNode
790
- """
791
- result = self.select(selector, variables)
792
- return result[0] if len(result) > 0 else None
793
-
794
def select(self, selector, variables=None, first_only=False):
    """Evaluate a selector against this node and return whatever it resolves to.

    Args:
        selector (str): The selector expression (e.g. ``//*``).
        variables (dict, optional): Variable name/value pairs substituted into
            the selector; an empty dict is used when None is passed.
        first_only (bool, optional): Forwarded to the ``SelectorContext``;
            documented as limiting the result to the first match. Defaults to
            False.

    Returns:
        list[ContentNode]: The nodes produced by resolving the parsed selector
        (an empty list when nothing matches, per the selector contract).

    >>> document.get_root().select('.')
    [ContentNode]

    >>> document.get_root().select('//*[hasTag($tagName)]', {"tagName": "div"})
    [ContentNode]
    """
    # Imported at call time rather than module level, as in the original code.
    from kodexa.selectors import parse
    from kodexa.selectors.ast import SelectorContext

    bound_variables = {} if variables is None else variables
    selector_context = SelectorContext(self.document, first_only=first_only)

    # The persistence cache is flushed before the selector is parsed/resolved.
    self.document.get_persistence().flush_cache()
    return parse(selector).resolve(self, bound_variables, selector_context)
821
-
822
def get_all_content(self, separator=" ", strip=True):
    """Get this node's content, concatenated with all of its children's content.

    Content parts are walked in order: string parts are used as-is and integer
    parts are replaced by the full content of the child whose ``index`` equals
    that integer. Children whose index is not referenced by any content part
    are appended afterwards, in ``get_children()`` order. A separator is only
    inserted once some text has already been accumulated.

    Args:
        separator (str, optional): The separator used to join the pieces;
            defaults to " ".
        strip (bool, optional): Strip leading/trailing whitespace from the
            result (also passed down to child nodes); defaults to True.

    Returns:
        str: The content of this node combined with the content of its children.

    Raises:
        IndexError: If a content part references a child index that no child has.

    >>> document.content_node.get_all_content()
    "This string is made up of multiple nodes"
    """
    # Fetch once: both collections were previously re-queried inside the loops.
    content_parts = self.get_content_parts()
    children = self.get_children()

    # Keep the first child seen for each index ("first match wins").
    first_child_by_index = {}
    for child in children:
        first_child_by_index.setdefault(child.index, child)

    s = ""
    for part in content_parts:
        if isinstance(part, str):
            piece = part
        elif isinstance(part, int):
            try:
                referenced = first_child_by_index[part]
            except KeyError:
                raise IndexError(
                    f"Content part references child index {part} but no child has that index"
                ) from None
            piece = referenced.get_all_content(separator, strip=strip)
        else:
            # Parts that are neither text nor a child index are ignored.
            continue

        if s != "":
            s += separator
        s += piece

    # Children that no content part refers to are appended at the end.
    for child in children:
        if child.index not in content_parts:
            if s != "":
                s += separator
            s += child.get_all_content(separator, strip=strip)

    return s.strip() if strip else s
860
-
861
def adopt_children(self, nodes_to_adopt, replace=False):
    """Adopt a list of content nodes under this node, re-parenting them as needed.

    Existing children that also appear in ``nodes_to_adopt`` are re-indexed to
    their position in that list; nodes that are not yet children are added
    with ``add_child`` at their position in the list. Every re-indexed existing
    child is written back through the document's persistence layer.

    Args:
        nodes_to_adopt (List[ContentNode]): The nodes to adopt. The list is
            copied before use because adopting may mutate the original.
        replace (bool): If True, current children that are not in
            ``nodes_to_adopt`` are removed afterwards; defaults to False.

    >>> # select all nodes of type 'line', then the root node 'adopts' them
    >>> # and replaces all its existing children with these 'line' nodes.
    >>> document.get_root().adopt_children(document.select('//line'), replace=True)
    """
    child_idx_base = 0

    # We need to copy this since we might well mutate
    # it as we adopt
    children = nodes_to_adopt.copy()
    for existing_child in self.get_children():
        if existing_child not in children:
            # Not being adopted: gets the running counter as its index.
            existing_child.index = child_idx_base
            self.document.get_persistence().update_node(existing_child)
        else:
            # Already a child and being adopted: take the position it has in
            # the adopted list and make sure it points at this node as parent.
            existing_child.index = children.index(existing_child)
            existing_child._parent_uuid = self.uuid
            self.document.get_persistence().update_node(existing_child)
        # NOTE(review): nesting was reconstructed from a flattened listing;
        # confirm this increment belongs at loop level.
        child_idx_base += 1

    # Copy to avoid mutation
    for new_child in children.copy():
        if new_child not in self.get_children():
            self.add_child(new_child, children.index(new_child))
            child_idx_base += 1

    if replace:
        # Copy to avoid mutation
        for child in self.get_children().copy():
            if child not in children:
                self.remove_child(child)
898
-
899
def remove_tag(self, tag_name):
    """Remove the tag with the given name from this content node.

    The tag is stored as a feature of type ``"tag"``, so this delegates to
    ``remove_feature``.

    Args:
        tag_name (str): The name of the tag that should be removed.

    >>> document.get_root().remove_tag('foo')
    """
    feature_type = "tag"
    self.remove_feature(feature_type, tag_name)
911
-
912
def set_statistics(self, statistics):
    """Store the spatial statistics object on this node.

    The value is recorded as the ``"statistics"`` feature of type ``"spatial"``.

    Args:
        statistics: The statistics object to store.

    >>> document.select('//page')[0].set_statistics(NodeStatistics())
    """
    # NOTE(review): set_bbox stores its spatial value with set_feature while
    # this uses add_feature; confirm the difference is intentional.
    feature_type, feature_name = "spatial", "statistics"
    self.add_feature(feature_type, feature_name, statistics)
923
-
924
def get_statistics(self):
    """Return the spatial statistics stored on this node.

    Returns:
        The value of the ``"statistics"`` feature of type ``"spatial"``, as
        returned by ``get_feature_value``.

    >>> document.select('//page')[0].get_statistics()
    <kodexa.spatial.NodeStatistics object at 0x7f80605e53c8>
    """
    statistics = self.get_feature_value("spatial", "statistics")
    return statistics
938
-
939
def set_bbox(self, bbox):
    """Set the bounding box of this node.

    The box is structured as ``[x1, y1, x2, y2]`` and is stored as the
    ``"bbox"`` feature of type ``"spatial"``.

    Args:
        bbox: The bounding box array.

    >>> document.select('//page')[0].set_bbox([10,20,50,100])
    """
    feature_type, feature_name = "spatial", "bbox"
    self.set_feature(feature_type, feature_name, bbox)
951
-
952
def get_bbox(self):
    """Return the bounding box of this node.

    The box is structured as ``[x1, y1, x2, y2]`` and is read from the
    ``"bbox"`` feature of type ``"spatial"``.

    Returns:
        The bounding box array, as returned by ``get_feature_value``.

    >>> document.select('//page')[0].get_bbox()
    [10,20,50,100]
    """
    bbox = self.get_feature_value("spatial", "bbox")
    return bbox
964
-
965
def set_bbox_from_children(self):
    """Set this node's bounding box to the union of its children's boxes.

    Children without a bounding box (``get_bbox()`` returning a falsy value)
    are ignored. When no child has a bounding box this node is left untouched.
    Boxes are structured as ``[x1, y1, x2, y2]``.

    Coordinates equal to 0 are valid values: the extremes are computed with
    ``min``/``max`` rather than truthiness checks, which previously treated 0
    as "not set yet" and could both overwrite a correct minimum and skip
    setting the box entirely.
    """
    child_boxes = [
        box
        for box in (child.get_bbox() for child in self.get_children())
        if box
    ]
    if not child_boxes:
        return

    self.set_bbox(
        [
            min(box[0] for box in child_boxes),
            min(box[1] for box in child_boxes),
            max(box[2] for box in child_boxes),
            max(box[3] for box in child_boxes),
        ]
    )
987
-
988
def set_rotate(self, rotate):
    """Store the rotation of this node.

    The value is recorded as the ``"rotate"`` feature of type ``"spatial"``.

    Args:
        rotate: The rotation of the node.

    >>> document.select('//page')[0].set_rotate(90)
    """
    # NOTE(review): set_bbox stores its spatial value with set_feature while
    # this uses add_feature; confirm the difference is intentional.
    feature_type, feature_name = "spatial", "rotate"
    self.add_feature(feature_type, feature_name, rotate)
999
-
1000
def get_rotate(self):
    """Return the rotation stored on this node.

    Returns:
        The value of the ``"rotate"`` feature of type ``"spatial"``, as
        returned by ``get_feature_value``.

    >>> document.select('//page')[0].get_rotate()
    90
    """
    rotation = self.get_feature_value("spatial", "rotate")
    return rotation
1014
-
1015
def get_x(self):
    """Return the X position of this node.

    Returns:
        The first element (``x1``) of the node's bounding box, or None when
        the node has no bounding box.

    >>> document.select('//page')[0].set_bbox([10,20,50,100])
    >>> document.select('//page')[0].get_x()
    10
    """
    bbox = self.get_bbox()
    return bbox[0] if bbox else None
1033
-
1034
def get_y(self):
    """Return the Y position of this node.

    Returns:
        The second element (``y1``) of the node's bounding box, or None when
        the node has no bounding box.

    >>> document.select('//page')[0].set_bbox([10,20,50,100])
    >>> document.select('//page')[0].get_y()
    20
    """
    bbox = self.get_bbox()
    return bbox[1] if bbox else None
1052
-
1053
def get_width(self):
    """Return the width of this node.

    Returns:
        ``x2 - x1`` of the node's bounding box, or None when the node has no
        bounding box.

    >>> document.select('//page')[0].set_bbox([10,20,50,100])
    >>> document.select('//page')[0].get_width()
    40
    """
    bbox = self.get_bbox()
    return bbox[2] - bbox[0] if bbox else None
1071
-
1072
def get_height(self):
    """Return the height of this node.

    Returns:
        ``y2 - y1`` of the node's bounding box, or None when the node has no
        bounding box.

    >>> document.select('//page')[0].set_bbox([10,20,50,100])
    >>> document.select('//page')[0].get_height()
    80
    """
    bbox = self.get_bbox()
    return bbox[3] - bbox[1] if bbox else None
1090
-
1091
def copy_tag(self, selector=".", existing_tag_name=None, new_tag_name=None):
    """Copy every value of an existing tag onto a new tag name on the selected nodes.

    Both ``existing_tag_name`` and ``new_tag_name`` are required and must differ
    from one another; otherwise nothing happens. Selected nodes that do not
    carry the existing tag are skipped.

    Args:
        selector: The selector identifying the nodes to work on (default "."
            - the current node).
        existing_tag_name (str): The tag whose values are copied.
        new_tag_name (str): The name of the tag to create; must be different
            from ``existing_tag_name``.

    >>> document.get_root().copy_tag(existing_tag_name='foo', new_tag_name='bar')
    """
    names_missing = existing_tag_name is None or new_tag_name is None
    if names_missing or existing_tag_name == new_tag_name:
        return  # do nothing, just exit function

    for node in self.select(selector):
        source_values = node.get_feature_values("tag", existing_tag_name)
        if not source_values:
            continue

        for source in source_values:
            # Each copy reuses the start/end/value/uuid/data of the source tag.
            copied = Tag(
                start=source["start"],
                end=source["end"],
                value=source["value"],
                uuid=source["uuid"],
                data=source["data"],
            )
            node.add_feature("tag", new_tag_name, copied)
1126
-
1127
def collect_nodes_to(self, end_node):
    """Collect the nodes from this node up to, but not including, ``end_node``.

    Starting at this node, ``next_node()`` is followed until a node whose uuid
    equals ``end_node.uuid`` is reached or there is no next node.

    Args:
        end_node (ContentNode): The node to stop at (it is not part of the result).

    Returns:
        list[ContentNode]: This node followed by the nodes walked before
        ``end_node``; empty when this node is ``end_node``.

    >>> document.content_node.get_children()[0].collect_nodes_to(end_node=document.content_node.get_children()[5])
    """
    collected = []
    cursor = self
    while cursor.uuid != end_node.uuid:
        collected.append(cursor)
        if not cursor.has_next_node():
            break
        cursor = cursor.next_node()
    return collected
1148
-
1149
def tag_nodes_to(self, end_node, tag_to_apply, tag_uuid: str = None):
    """Tag every node returned by ``collect_nodes_to(end_node)`` with the given tag name.

    Args:
        end_node (ContentNode): The node to end with.
        tag_to_apply (str): The tag name that will be applied to each node.
        tag_uuid (str): The tag uuid used if you want to group them.

    >>> document.content_node.get_children()[0].tag_nodes_to(document.content_node.get_children()[5], tag_to_apply='foo')
    """
    for node in self.collect_nodes_to(end_node):
        node.tag(tag_to_apply, tag_uuid=tag_uuid)
1163
-
1164
def tag_range(
    self,
    start_content_re,
    end_content_re,
    tag_to_apply,
    node_type_re=".*",
    use_all_content=False,
):
    """Tag the descendant nodes between a start and an end content match.

    All nodes whose type matches ``node_type_re`` are selected. The range
    starts at the first node whose content matches ``start_content_re`` and
    ends just before the first node, at or after the start, whose content
    matches ``end_content_re``. If the end pattern never matches, the range
    runs to the last selected node. Nothing is tagged when the start pattern
    matches no node.

    Nodes whose content is None never match either pattern (previously they
    made ``Pattern.match`` raise ``TypeError``).

    Args:
        start_content_re: The regular expression to match the starting node;
            an empty string means "start at the first selected node".
        end_content_re: The regular expression to match the ending node; an
            empty string means "run to the last selected node".
        tag_to_apply: The tag name that will be applied to the nodes in range.
        node_type_re: The node type to match (default is all).
        use_all_content: Match against the full content, including child
            nodes (default is False).

    >>> document.content_node.tag_range(start_content_re='.*Cheese.*', end_content_re='.*Fish.*', tag_to_apply='foo')
    """
    # Could be line, word, or content-area
    all_nodes = self.select(f"//*[typeRegex('{node_type_re}')]")
    if not all_nodes:
        return

    # Compile once instead of once per node.
    start_pattern = re.compile(start_content_re)
    end_pattern = re.compile(end_content_re)

    texts = [
        node.get_all_content() if use_all_content else node.content
        for node in all_nodes
    ]

    def matching_indexes(pattern):
        """Return the positions of the nodes whose text matches *pattern*."""
        return [
            idx
            for idx, text in enumerate(texts)
            if text is not None and pattern.match(text)
        ]

    if start_content_re == "":
        start_index = 0
    else:
        start_matches = matching_indexes(start_pattern)
        start_index = start_matches[0] if start_matches else None

    if start_index is None:
        return

    if end_content_re == "":
        end_index = len(all_nodes)
    else:
        # Only end matches at or after the start are considered.
        end_matches = [i for i in matching_indexes(end_pattern) if i >= start_index]
        end_index = end_matches[0] if end_matches else len(all_nodes)

    for node in all_nodes[start_index:end_index]:
        node.tag(tag_to_apply)
1224
-
1225
def tag(
    self,
    tag_to_apply,
    selector=".",
    content_re=None,
    use_all_content=False,
    node_only=None,
    fixed_position=None,
    data=None,
    separator=" ",
    tag_uuid: str = None,
    confidence=None,
    value=None,
    use_match=True,
    index=None,
    cell_index=None,
    group_uuid=None,
    parent_group_uuid=None,
    note=None,
    status=None,
    owner_uri=None,
    is_dirty=None,
    sort_by_bbox: bool=False,
):
    """
    This will tag (see Feature Tagging) the nodes selected by ``selector``, optionally restricted to the parts of
    their content identified by a regular expression or a fixed position.

    Note that if you use the flag use_all_content then node_only will default to True if not set, else it
    will default to False

    For every selected node one of the following happens:

    * ``fixed_position`` is given: the given start/end range is tagged, walking into child nodes as needed.
    * no ``content_re`` is given: the node itself is tagged.
    * ``content_re`` is given and ``use_match`` is True: every match found by ``finditer`` is considered; with
      ``node_only`` the node is tagged once if there is at least one match, otherwise each match span is tagged.
    * ``content_re`` is given and ``use_match`` is False: only the first ``search`` hit is used and its span is
      tagged (``node_only`` is not consulted on this path).

    Args:
        tag_to_apply: The name of tag that will be applied to the node
        selector: The selector to identify the source nodes to work on (default . - the current node)
        content_re: The regular expression that you wish to use to tag (Default value = None). When use_all_content
                is set and node_only is False, spaces in the expression are replaced with ``\\s+`` before compiling.
        use_all_content: Apply the regular expression to the all_content (include content from child nodes) (Default value = False)
        separator: Separator to use for use_all_content (Default value = " ")
        node_only: Ignore the matching groups and tag the whole node (Default value = None)
        fixed_position: Use a fixed position, supplied as a tuple i.e. - (4,10) tag from position 4 to 10 (default None)
        data: A dictionary of data for the given tag (Default value = None)
        tag_uuid: A UUID used to tie tags in order to demonstrate they're related and form a single concept.
                For example, if tagging the two words "Wells" and "Fargo" as an ORGANIZATION, the tag on both words should have the
                same tag_uuid in order to indicate they are both needed to form the single ORGANIZATION.  If a tag_uuid is provided, it is used
                on all tags created in this method.  This may result in multiple nodes or multiple feature values having the same tag_uuid.
                For example, if the selector provided results in more than one node being selected, each node would be tagged with the same tag_uuid.
                The same holds true if a content_re value is provided, node_only is set to False, and multiple matches are found for the content_re
                pattern.  In that case, each feature value would share the same UUID.
                If no tag_uuid is provided, a new uuid is generated for each tag instance.
        confidence: The confidence in the tag (0-1)
        value: The value you wish to store with the tag, this allows you to provide text that isn't part of the content but represents the data you wish tagged
        use_match: If True (default) all matches returned by ``finditer`` are used, if False only the first ``search`` hit is used
        index: The index for the tag
        cell_index: The cell index for the tag
        group_uuid: The group uuid for the tag
        parent_group_uuid: The parent group uuid for the tag
        note: a text note for the tag
        status: a status for the tag, this can be transitioned to an attribute status during extraction
        owner_uri: the uri of the entity that created the tag (model vs user; example: model://cdad-healthcare/cdad-excel-model:1.0.0 or user://pdodds)
        is_dirty: when the model is run, is_dirty = false for all tags.  New tags and edited tags, is_dirty = true.
        sort_by_bbox: If True, when positions are tagged the children of each visited node are sorted by the first
                element of their bounding box before the children missing from the content parts are processed
                (Default value = False)

    >>> document.content_node.tag('is_cheese')
    """

    # node_only defaults to True when matching against all content, else False.
    if use_all_content and node_only is None:
        node_only = True
    elif node_only is None:
        node_only = False

    def get_tag_uuid(tag_uuid):
        """
        This function returns the provided tag_uuid if it exists, otherwise it generates a new UUID.

        Args:
            tag_uuid (str): The UUID of the tag.

        Returns:
            str: The provided tag_uuid if it exists, otherwise a newly generated UUID.
        """
        if tag_uuid:
            return tag_uuid

        return str(uuid.uuid4())

    def tag_node_position(
        node_to_check, start, end, node_data, tag_uuid, offset=0, value=None, sort_by_bbox: bool=False
    ):
        """
        This function tags a node position in a given data structure. It iterates over the content parts of the node to check,
        and based on the type of the part (string or integer), it performs different operations. If the part is a string, it
        adjusts the start and end positions and adds a feature to the node. If the part is an integer, it finds the corresponding
        child node and recursively calls the function on the child node. After processing all parts, it checks for any missing
        children and processes them as well. Finally, it checks if the length of all content matches the calculated content length.

        Args:
            node_to_check (Node): The node to check and tag.
            start (int): The start position of the tag.
            end (int): The end position of the tag.
            node_data (dict): The data associated with the node.
            tag_uuid (str): The UUID of the tag.
            offset (int, optional): The offset to apply. Defaults to 0.
            value (str, optional): The value to use for the tag. If None, the part of the content at the start and end positions is used. Defaults to None.
            sort_by_bbox (bool, optional): If True, the node's children are sorted by the first element of their bounding box
                before children missing from the content parts are processed. Defaults to False.

        Raises:
            Exception: If an invalid part is encountered in the content parts of the node to check.
            Exception: If there is a mismatch between the length of all content and the calculated content length.

        Returns:
            int: The calculated content length, or -1 once the requested range has been fully tagged.
        """
        content_length = 0
        original_start = start
        original_end = end
        for part_idx, part in enumerate(node_to_check.get_content_parts()):
            if isinstance(part, str):
                if len(part) > 0:
                    # It is just content
                    part_length = len(part)
                    if part_idx > 0:
                        # Account for the separator that joins this part to the previous one.
                        end = end - len(separator)
                        content_length = content_length + len(separator)
                        offset = offset + len(separator)
                        start = (
                            0
                            if start - len(separator) < 0
                            else start - len(separator)
                        )

                    # The range starts and ends inside this part: tag it and stop (-1).
                    if start < part_length and end < part_length:
                        node_to_check.add_feature(
                            "tag",
                            tag_to_apply,
                            Tag(
                                original_start,
                                original_end,
                                part[start:end] if value is None else value,
                                data=node_data,
                                uuid=tag_uuid,
                                confidence=confidence,
                                index=index,
                                parent_group_uuid=parent_group_uuid,
                                group_uuid=group_uuid,
                                cell_index=cell_index,
                                note=note,
                                status=status,
                                owner_uri=owner_uri,
                                is_dirty=is_dirty,
                            ),
                        )
                        return -1
                    # The range starts in this part but runs past its end: tag the
                    # tail of the part and keep walking.
                    if start < part_length <= end:
                        node_to_check.add_feature(
                            "tag",
                            tag_to_apply,
                            Tag(
                                original_start,
                                content_length + part_length,
                                value=part[start:] if value is None else value,
                                data=node_data,
                                uuid=tag_uuid,
                                confidence=confidence,
                                index=index,
                                parent_group_uuid=parent_group_uuid,
                                group_uuid=group_uuid,
                                cell_index=cell_index,
                                note=note,
                                status=status,
                                owner_uri=owner_uri,
                                is_dirty=is_dirty,
                            ),
                        )

                    # Shift the range so it is relative to the text after this part.
                    end = end - part_length
                    content_length = content_length + part_length
                    offset = offset + part_length
                    start = 0 if start - part_length < 0 else start - part_length

            elif isinstance(part, int):
                # An integer part refers to the child with that index.
                child_node = [
                    child
                    for child in node_to_check.get_children()
                    if child.index == part
                ][0]

                if part_idx > 0:
                    end = end - len(separator)
                    content_length = content_length + len(separator)
                    offset = offset + len(separator)
                    start = (
                        0 if start - len(separator) < 0 else start - len(separator)
                    )

                result = tag_node_position(
                    child_node,
                    start,
                    end,
                    node_data,
                    tag_uuid,
                    offset=offset,
                    value=value,
                    sort_by_bbox=sort_by_bbox,
                )

                # A negative result, or nothing left of the range, means we are done.
                if result < 0 or (end - result) <= 0:
                    return -1

                offset = offset + result
                end = end - result
                start = 0 if start - result < 0 else start - result

                content_length = content_length + result
            else:
                raise Exception("Invalid part?")

        # We need to determine if we have missing children and add them to the end
        node_children = node_to_check.get_children()
        if node_children and sort_by_bbox:
            # Sort nodes by x-coordinate if they have bboxes, otherwise use index
            try:
                node_children.sort(key=lambda x: x.get_bbox()[0] if hasattr(x, 'get_bbox') else x.index if hasattr(x, 'index') else 0)
            except (AttributeError, TypeError, IndexError):
                # If sorting fails, keep original order
                pass

        for child_idx, child_node in enumerate(node_children):
            if child_node.index not in node_to_check.get_content_parts():
                if content_length > 0:
                    end = end - len(separator)
                    content_length = content_length + len(separator)
                    offset = offset + len(separator)
                    start = (
                        0 if start - len(separator) < 0 else start - len(separator)
                    )

                result = tag_node_position(
                    child_node,
                    start,
                    end,
                    node_data,
                    tag_uuid,
                    offset=offset,
                    value=value,
                    sort_by_bbox=sort_by_bbox,
                )

                if result < 0 or (end - result) <= 0:
                    return -1

                offset = offset + result
                end = end - result
                start = 0 if start - result < 0 else start - result

                content_length = content_length + result

        # Sanity check: the length walked must equal the node's unstripped full content.
        if len(node_to_check.get_all_content(strip=False)) != content_length:
            raise Exception(
                f"There is a problem in the structure? (2) Length mismatch ({len(node_to_check.get_all_content(strip=False))} != {content_length})"
            )

        return content_length

    if content_re:
        # When matching across all content (and tagging positions), a space in the
        # expression is allowed to match any run of whitespace.
        pattern = re.compile(
            content_re.replace(" ", r"\s+")
            if use_all_content and not node_only
            else content_re
        )

    for node in self.select(selector):
        if fixed_position:
            tag_node_position(
                node,
                fixed_position[0],
                fixed_position[1],
                data,
                get_tag_uuid(tag_uuid),
                0,
                value=value,
                sort_by_bbox=sort_by_bbox,
            )

        else:
            if not content_re:
                # No expression: tag the whole node.
                node.add_feature(
                    "tag",
                    tag_to_apply,
                    Tag(
                        data=data,
                        uuid=get_tag_uuid(tag_uuid),
                        confidence=confidence,
                        value=value,
                        index=index,
                        parent_group_uuid=parent_group_uuid,
                        group_uuid=group_uuid,
                        cell_index=cell_index,
                        note=note,
                        status=status,
                        owner_uri=owner_uri,
                        is_dirty=is_dirty,
                    ),
                )
            else:
                # Pick the text the expression is applied to; a node with no
                # content of its own is skipped (content stays None).
                if not use_all_content:
                    if node.content:
                        content = node.content
                    else:
                        content = None
                else:
                    content = (
                        node.get_all_content(separator=separator, strip=False)
                        if not node_only
                        else node.get_all_content(separator=separator)
                    )

                if content is not None:
                    if use_match:
                        matches = pattern.finditer(content)

                        if node_only:
                            # At least one match is enough to tag the node once.
                            if any(True for _ in matches):
                                node.add_feature(
                                    "tag",
                                    tag_to_apply,
                                    Tag(
                                        data=data,
                                        uuid=get_tag_uuid(tag_uuid),
                                        confidence=confidence,
                                        value=value,
                                        index=index,
                                        parent_group_uuid=parent_group_uuid,
                                        group_uuid=group_uuid,
                                        cell_index=cell_index,
                                        note=note,
                                        status=status,
                                        owner_uri=owner_uri,
                                        is_dirty=is_dirty,
                                    ),
                                )
                        else:
                            # NOTE(review): finditer returns an iterator, which is
                            # always truthy, so this check never filters anything.
                            if matches:
                                for match in matches:
                                    start_offset = match.span()[0]
                                    end_offset = match.span()[1]
                                    tag_node_position(
                                        node,
                                        start_offset,
                                        end_offset,
                                        data,
                                        get_tag_uuid(tag_uuid),
                                        value=value,
                                        sort_by_bbox=sort_by_bbox,
                                    )

                    else:
                        # use_match is False: only the first search hit is tagged.
                        search_match = pattern.search(content)
                        if search_match is not None:
                            start_offset = search_match.span()[0]
                            end_offset = search_match.span()[1]
                            tag_node_position(
                                node,
                                start_offset,
                                end_offset,
                                data,
                                get_tag_uuid(tag_uuid),
                                value=value,
                                sort_by_bbox=sort_by_bbox,
                            )
1591
-
1592
def get_tags(self):
    """Return the names of the tags on this node.

    Returns:
        list: The ``name`` of every feature of type ``"tag"`` on this node.

    >>> document.content_node.select_first('*').get_tags()
    ['is_cheese']
    """
    tag_names = []
    for tag_feature in self.get_features_of_type("tag"):
        tag_names.append(tag_feature.name)
    return tag_names
1606
-
1607
def get_tag_features(self):
    """Return the features on this node that are tags.

    Returns:
        list: A new list holding every feature of type ``"tag"`` on this node.

    >>> document.content_node.select_first('*').get_tag_features()
    [ContentFeature()]
    """
    return list(self.get_features_of_type("tag"))
1621
-
1622
def get_tag_values(self, tag_name, include_children=False):
    """Get the values stored for a specific tag name.

    Args:
        tag_name: The tag name.
        include_children: Also collect the values from this node's children,
            recursively (Default value = False).

    Returns:
        list: The ``"value"`` of each instance of the tag on this node,
        followed by the values from the children when requested.
    """
    collected = [instance["value"] for instance in self.get_tag(tag_name)]

    if include_children:
        for child in self.get_children():
            collected += child.get_tag_values(tag_name, include_children)

    return collected
1642
-
1643
def get_related_tag_values(
    self,
    tag_name: str,
    include_children: bool = False,
    value_separator: str = " ",
    tag_uuid=None,
):
    """Get the values for a specific tag name, grouped by tag uuid.

    Tag instances that share a uuid are treated as one concept: their values
    are joined with ``value_separator``. An instance without a value
    contributes the content of the node it sits on.

    When ``tag_uuid`` is None every uuid is grouped. Previously the default of
    None was compared against each instance's uuid, so nothing was returned
    unless a specific ``tag_uuid`` was supplied.

    Args:
        tag_name (str): tag name
        include_children (bool): if True, use the tagged nodes reported by the
            document (``document.get_tagged_nodes``); otherwise only this node
        value_separator (str): the string used to join related tag values
        tag_uuid (str, optional): if given, only instances with this uuid are
            considered

    Returns:
        list[str]: one joined string per uuid group, in first-seen order.
        Groups whose first value is None are omitted.
    """
    if include_children:
        tagged_nodes = self.document.get_tagged_nodes(tag_name, tag_uuid=tag_uuid)
    else:
        tagged_nodes = self.select(".")

    # uuid -> values in encounter order (dicts preserve insertion order).
    value_groups = {}
    for tag_node in tagged_nodes:
        tag_feature_vals = tag_node.get_feature_value("tag", tag_name)
        if not tag_feature_vals:
            continue
        if not isinstance(tag_feature_vals, list):
            tag_feature_vals = [tag_feature_vals]

        for feature_val in tag_feature_vals:
            instance_uuid = feature_val["uuid"]
            # Only filter when the caller actually asked for a specific uuid.
            if tag_uuid is not None and instance_uuid != tag_uuid:
                continue

            final_value = feature_val["value"] if "value" in feature_val else None
            if final_value is None:
                final_value = tag_node.content

            value_groups.setdefault(instance_uuid, []).append(final_value)

    return [
        value_separator.join(group)
        for group in value_groups.values()
        if group and group[0] is not None
    ]
1719
-
1720
def get_related_tag_nodes(
    self, tag_name: str, everywhere: bool = False, tag_uuid=None
):
    """Get the nodes carrying a specific tag name, grouped by tag uuid.

    Args:
        tag_name (str): tag name
        everywhere (bool): if True, use the tagged nodes reported by the
            document (``document.get_tagged_nodes``); otherwise only this node
        tag_uuid (optional(str)): if set we will only get nodes related to this
            tag UUID. The filter is applied to the tag instances as well, so
            groups for other UUIDs on the same node are not returned.

    Returns:
        dict: maps each tag UUID to the list of nodes that carry an instance
        with that UUID. A node appears once per matching instance. Instances
        without a "uuid" key are ignored.
    """
    if everywhere:
        tagged_nodes = self.document.get_tagged_nodes(tag_name, tag_uuid)
    else:
        tagged_nodes = [self]

    # We need to group these nodes together based on the TAG UUID
    node_groups = {}
    for tagged_node in tagged_nodes:
        for tag_instance in tagged_node.get_tag(tag_name, tag_uuid=tag_uuid):
            if "uuid" in tag_instance:
                node_groups.setdefault(tag_instance["uuid"], []).append(tagged_node)

    return node_groups
1754
-
1755
def get_tag(self, tag_name, tag_uuid=None):
    """Return the instances of a tag on this node.

    The stored feature value may be a single instance or a list of instances
    (when several parts of the node's content were tagged); the result is
    always a list.

    Args:
        tag_name: The name of the tag
        tag_uuid (Optional): When given, instances that carry a "uuid" are only
            kept if it equals this value; instances without a "uuid" are
            always kept.

    Returns:
        list: The tag instances for this tag name on this node; empty when the
        node does not have the tag.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').get_tag('is_cheese')
    [{'start': 0, 'end': 10, 'value': 'The Cheese Moved', ...}]
    """
    tag_details = self.get_feature_value("tag", tag_name)
    if tag_details is None:
        return []

    instances = tag_details if isinstance(tag_details, list) else [tag_details]

    def is_wanted(instance):
        """Apply the uuid filter only to instances that actually carry a uuid."""
        if "uuid" in instance and tag_uuid:
            return instance["uuid"] == tag_uuid
        return True

    return [instance for instance in instances if is_wanted(instance)]
1785
-
1786
def get_all_tags(self):
    """Get the names of all tags applied to this node or to any of its descendants.

    Returns:
        list[str]: The distinct tag names found on this node and, recursively,
        on its children. The order of the names is not defined.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').get_all_tags()
    ['is_cheese']
    """
    names = set(self.get_tags())
    for child in self.get_children():
        names.update(child.get_all_tags())
    return list(names)
1802
-
1803
def has_tags(self):
    """Determine whether this node has any tags at all.

    Returns:
        bool: True if the node has at least one feature of type ``"tag"``;
        else, False.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tags()
    True
    """
    tag_features = self.get_features_of_type("tag")
    return any(True for _ in tag_features)
1815
-
1816
def has_tag(self, tag, include_children=False):
    """Determine whether this node has a tag with the given name.

    Args:
        tag (str): The name of the tag.
        include_children (bool): Also search the descendants of this node.

    Returns:
        bool: True if this node (or, when requested, any descendant) has the tag.
    """
    for feature in self.get_features():
        if feature.feature_type == "tag" and feature.name == tag:
            return True

    if not include_children:
        return False

    # any() stops at the first matching subtree instead of walking every child.
    return any(child.has_tag(tag, True) for child in self.get_children())
1840
-
1841
def is_first_child(self):
    """Determine whether this node is the first child of its parent, or has no parent.

    The parent is looked up through ``get_parent()``, the same accessor the
    other navigation methods on this class use.

    Returns:
        bool: True if this node has no parent or its index is 0; otherwise False.
    """
    if not self.get_parent():
        return True

    return self.index == 0
1854
-
1855
def is_last_child(self):
    """Determine whether this node is the last child of its parent, or has no parent.

    Returns:
        bool: True if there is no parent, or this node's index equals the
        parent's last child index; otherwise False.
    """
    parent = self.get_parent()
    if parent:
        return self.index == parent.get_last_child_index()
    return True
1867
-
1868
def get_last_child_index(self):
    """Return the largest index among this node's children.

    Returns:
        int or None: The highest child index (never below 0), or None when the
        node has no children.
    """
    if not self.get_children():
        return None

    # Seeding with 0 keeps the result at 0 or above, as the running maximum did.
    return max([0] + [child.index for child in self.get_children()])
1885
-
1886
def get_node_at_index(self, index):
    """Return the child node at ``index``, creating a virtual node for gaps.

    Documents allow sparse child indexes. When no real child sits at ``index``
    but the index lies before the last child, a 'virtual' node is created
    through ``self.document.create_node``; it takes the node type of the
    nearest preceding sibling (or of the first child when the index precedes it).

    Args:
        index (int): The zero-based index of the child node.

    Returns:
        ContentNode or None: The real or virtual node at ``index``; None when
        the index is negative, beyond the last child, or there are no children.
    """
    # Negative indexes are outside the document. Returning virtual nodes for
    # them lets a backwards scan run without ever reaching a None.
    if index < 0:
        return None

    children = self.get_children()
    if not children:
        return None

    first_child = children[0]
    if index < first_child.index:
        return self.document.create_node(
            node_type=first_child.node_type,
            virtual=True,
            parent=self,
            index=index,
        )

    # The scan relies on children being ordered by index.
    preceding = None
    for child in children:
        if child.index == index:
            return child
        if child.index > index:
            break
        preceding = child

    if preceding and index < children[-1].index:
        return self.document.create_node(
            node_type=preceding.node_type,
            virtual=True,
            parent=self,
            index=index,
        )

    return None
1933
-
1934
def has_next_node(self, node_type_re=".*", skip_virtual=False):
    """Report whether ``next_node`` finds a following node for the given filters.

    Args:
        node_type_re (str, optional): Regular expression matched against the next node's type; default is '.*'.
        skip_virtual (bool, optional): Skip virtual nodes and look for the next real node; default is False.

    Returns:
        bool: True if ``next_node`` returns a node; otherwise False.
    """
    following = self.next_node(node_type_re, skip_virtual=skip_virtual)
    return following is not None
1946
-
1947
def has_previous_node(self, node_type_re=".*", skip_virtual=False):
    """Report whether ``previous_node`` finds a preceding node for the given filters.

    Args:
        node_type_re (str, optional): Regular expression matched against the previous node's type; default is '.*'.
        skip_virtual (bool, optional): Skip virtual nodes and look for a real node; default is False.

    Returns:
        bool: True if ``previous_node`` returns a node; otherwise False.
    """
    preceding = self.previous_node(node_type_re=node_type_re, skip_virtual=skip_virtual)
    return preceding is not None
1962
-
1963
def next_node(
    self,
    node_type_re=".*",
    skip_virtual=False,
    has_no_content=True,
    traverse=Traverse.SIBLING,
):
    """Return the next content node after this one.

    Note: This logic relies on node indexes. Documents allow sparse
    representation, so the next node may be a virtual node created to fill a
    gap. Set ``skip_virtual`` to True to skip virtual nodes.

    Args:
        node_type_re (str, optional): Regular expression matched against the next sibling's type; default is '.*'.
        skip_virtual (bool, optional): Skip virtual nodes and return the next real node; default is False.
        has_no_content (bool, optional): Allow a node that has no content to be returned; default is True.
        traverse (Traverse, optional): With ``Traverse.ALL`` or ``Traverse.PARENT``, once the
            siblings are exhausted the first node under the parent's following
            sibling is returned (looking one further level up if that fails).
            Nodes found this way are not filtered by the other arguments.
            Default is ``Traverse.SIBLING``.

    Returns:
        ContentNode or None: The next node, or None if no node exists.
    """
    search_index = self.index + 1
    compiled_node_type_re = re.compile(node_type_re)

    parent = self.get_parent()
    if not parent:
        # No parent means no siblings and nothing to climb to.
        return None

    while True:
        node = parent.get_node_at_index(search_index)

        if not node:
            grandparent = (
                parent.get_parent()
                if traverse in (Traverse.ALL, Traverse.PARENT)
                else None
            )
            if grandparent:
                # Best-effort lookup: any failure falls through to the
                # deeper attempt below.
                # noinspection PyBroadException
                try:
                    candidate = (
                        grandparent.get_children()[parent.index + 1]
                        .get_children()[0]
                    )
                    if candidate:
                        return candidate
                except Exception:
                    # Traverse one additional layer. A missing ancestor or
                    # sibling here means there is no next node, not an error.
                    try:
                        candidate = (
                            grandparent.get_parent()
                            .get_children()[grandparent.index + 1]
                            .get_children()[0]
                            .get_children()[0]
                        )
                    except (AttributeError, IndexError):
                        candidate = None
                    if candidate:
                        return candidate
            return node

        if compiled_node_type_re.match(node.node_type) and (
            not skip_virtual or not node.virtual
        ):
            if (not has_no_content and node.content) or has_no_content:
                return node

        search_index += 1
2031
-
2032
def previous_node(
    self,
    node_type_re=".*",
    skip_virtual=False,
    has_no_content=False,
    traverse=Traverse.SIBLING,
):
    """Return the previous content node before this one.

    Note: This logic relies on node indexes. Documents allow sparse
    representation, so the previous node may be a virtual node created to
    fill a gap. Set ``skip_virtual`` to True to skip virtual nodes.

    Args:
        node_type_re (str, optional): Regular expression matched against the previous node's type; default is '.*'.
        skip_virtual (bool, optional): Skip virtual nodes and return a real node; default is False.
        has_no_content (bool, optional): When False (default) any matching node is returned;
            when True only nodes without content are returned.
        traverse (Traverse, optional): With ``Traverse.ALL`` or ``Traverse.PARENT`` a node at
            index 0 continues the search from its parent; default is ``Traverse.SIBLING``.

    Returns:
        ContentNode or None: The previous node, or None if no node exists.
    """
    parent = self.get_parent()

    # TODO: implement/differentiate traverse logic for CHILDREN and SIBLING
    if self.index == 0:
        # Parenthesised so the parent check guards both ALL and PARENT.
        if traverse in (Traverse.ALL, Traverse.PARENT) and parent:
            # Look for a previous node on the parent.
            return parent.previous_node(
                node_type_re, skip_virtual, has_no_content, traverse
            )
        return None

    if not parent:
        return None

    search_index = self.index - 1
    compiled_node_type_re = re.compile(node_type_re)

    # Stop at index 0: there is nothing before the first position, and an
    # unbounded scan never terminates when no candidate passes the filters.
    while search_index >= 0:
        node = parent.get_node_at_index(search_index)

        if not node:
            return node

        if compiled_node_type_re.match(node.node_type) and (
            not skip_virtual or not node.virtual
        ):
            # NOTE(review): next_node treats has_no_content differently
            # (False there excludes empty nodes) - confirm which is intended.
            if (not has_no_content) or (has_no_content and not node.content):
                return node

        search_index -= 1

    return None
2086
-
2087
-
2088
class ContentFeature(object):
    """A typed, named piece of data or metadata associated with a ContentNode."""

    def __init__(self, feature_type: str, name: str, value: Any, single: bool = True):
        # Logical name used to group features together (ie. spatial)
        self.feature_type: str = feature_type
        # Name of the feature within its type (ie. bbox)
        self.name: str = name
        # Data carried by the feature; get_value() reads element [0] when single is set
        self.value: Any = value
        # Whether the data is a single instance or an array; adding the same
        # feature to a node more than once is expected to leave this False
        self.single: bool = single

    def __str__(self):
        return f"Feature [type='{self.feature_type}' name='{self.name}' value='{self.value}' single='{self.single}']"

    def to_dict(self):
        """
        Build a dictionary describing this feature.

        Returns:
            dict: ``name`` (the type and name joined by a colon), ``value`` and ``single``.
        """
        qualified_name = ":".join((self.feature_type, self.name))
        return {
            "name": qualified_name,
            "value": self.value,
            "single": self.single,
        }

    def get_value(self):
        """
        Return the feature's value, honouring the single flag.

        Returns:
            Any: The first element of ``value`` when ``single`` is set, otherwise ``value`` itself.
        """
        return self.value[0] if self.single else self.value
2132
-
2133
-
2134
class ModelInsight(BaseModel):
    # Represents an insight produced by a model.
    #
    # Fields:
    #   model_ref:    reference to the model
    #   insight_type: type of the insight
    #   description:  optional description of the insight
    #   details:      optional details of the insight
    #   properties:   optional properties of the insight
    #
    # NOTE(review): protected_namespaces is presumably narrowed because
    # ``model_ref`` starts with pydantic's reserved "model_" prefix - confirm.
    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    model_ref: str
    insight_type: str
    description: Optional[str] = None
    details: Optional[str] = None
    properties: Optional[Dict] = None
2153
-
2154
-
2155
@dataclasses.dataclass()
class SourceMetadata:
    """Records where a document originally came from.

    Attributes:
        original_filename (Optional[str]): The original filename of the document.
        original_path (Optional[str]): The original path of the document.
        checksum (Optional[str]): The checksum of the document.
        cid (Optional[str]): The ID used for internal caching.
        last_modified (Optional[str]): The last modified date of the document.
        created (Optional[str]): The creation date of the document.
        connector (Optional[str]): The connector used for the document.
        mime_type (Optional[str]): The MIME type of the document.
        headers (Optional[Dict]): The headers of the document.
        lineage_document_uuid (Optional[str]): UUID of the document this one was derived from.
        source_document_uuid (Optional[str]): UUID of the original first document.
        pdf_document_uuid (Optional[str]): UUID of the document in PDF form (archiving and preview).
    """

    original_filename: Optional[str] = None
    original_path: Optional[str] = None
    checksum: Optional[str] = None

    # The ID used for internal caching
    cid: Optional[str] = None
    last_modified: Optional[str] = None
    created: Optional[str] = None
    connector: Optional[str] = None
    mime_type: Optional[str] = None
    headers: Optional[Dict] = None

    # The UUID of the document that this document was derived from,
    # noting that multiple documents can come from one original source
    lineage_document_uuid: Optional[str] = None

    # The UUID of the original first document
    source_document_uuid: Optional[str] = None

    # The UUID of the document in a PDF form (used for archiving and preview)
    pdf_document_uuid: Optional[str] = None

    @classmethod
    def from_dict(cls, env):
        """Build an instance from a dictionary, ignoring keys the constructor does not accept.

        Args:
            env (dict): Attribute names mapped to values.

        Returns:
            SourceMetadata: An instance populated from the recognised keys.
        """
        accepted = inspect.signature(cls).parameters
        recognised = {}
        for key, value in env.items():
            if key in accepted:
                recognised[key] = value
        return cls(**recognised)
2217
-
2218
-
2219
class FeatureSetDiff:
    """
    A utility class that can be used to diff two feature sets.

    Each feature set is first indexed by ``nodeUuid``; the two maps are then
    compared with ``deepdiff.DeepDiff`` and the result kept as a plain dict.
    """

    def __init__(self, first_feature_set: "FeatureSet", second_feature_set: "FeatureSet"):
        self.first_feature_map = self.parse_feature_set(first_feature_set)
        self.second_feature_map = self.parse_feature_set(second_feature_set)
        self._differences = deepdiff.DeepDiff(
            self.first_feature_map,
            self.second_feature_map,
            exclude_obj_callback=self.exclude_callback,
        ).to_dict()

    def get_differences(self):
        """
        Gets the differences between the two feature sets.

        Any ``type_changes`` entry is removed (permanently) before returning.

        Returns:
            dict: A dictionary containing the differences between the two feature sets.
        """
        self._differences.pop("type_changes", None)
        return self._differences

    def get_exclude_paths(self):
        """
        Gets the patterns that are excluded from the comparison.

        Returns:
            list: Patterns searched for by ``exclude_callback``.
        """
        return ["shape", "group_uuid", "uuid", "parent_group_uuid", "single"]

    def exclude_callback(self, path, key):
        """
        Checks whether an item should be excluded from the diff.

        Args:
            path: First callback argument; not used here.
            key (str): The text searched for each exclude pattern.

        Returns:
            bool: True if any exclude pattern is found in ``key``, False otherwise.
        """
        return any(
            re.search(exclude_key, key) for exclude_key in self.get_exclude_paths()
        )

    def parse_feature_set(self, feature_set: "FeatureSet"):
        """
        Indexes the node features of a feature set by node UUID.

        Args:
            feature_set (FeatureSet): The feature set to be parsed.

        Returns:
            dict: The entries of ``feature_set.node_features`` keyed by their ``nodeUuid``.
        """
        return {
            feature.get("nodeUuid"): feature for feature in feature_set.node_features
        }

    def parsed_values_changed(self):
        """
        Removes ``values_changed`` entries whose key is present in the second feature map.
        """
        values_changed = self._differences.get("values_changed")
        if not values_changed:
            return

        # Iterate over a snapshot so entries can be deleted safely.
        # NOTE(review): the keys are diff paths while the map is keyed by node
        # UUID, so this test likely never matches - confirm intended comparison.
        for key in list(values_changed):
            if key in self.second_feature_map:
                del values_changed[key]

    def is_equal(self) -> bool:
        """
        Checks if the two feature sets are equal to each other.

        Returns:
            bool: True if no differences were recorded, False otherwise.
        """
        return self._differences == {}

    def get_changed_nodes(self):
        """
        Gets the nodes that were added, removed or modified.

        Returns:
            dict: ``new_added_nodes``, ``removed_nodes`` and
            ``existing_modified_nodes`` lists of node UUIDs. An empty list is
            returned instead when the feature sets are equal.
        """
        if self.is_equal():
            return []

        # The diff may contain no "values_changed" section at all.
        modified_nodes = [
            self.parsed_node_uuid(key)
            for key in self._differences.get("values_changed") or {}
        ]

        new_added_nodes = []
        removed_nodes = []
        merged_node_uuids = set(self.first_feature_map).union(self.second_feature_map)
        for node_uuid in merged_node_uuids:
            if node_uuid not in self.first_feature_map:
                new_added_nodes.append(node_uuid)
            elif node_uuid not in self.second_feature_map:
                removed_nodes.append(node_uuid)

        return {
            "new_added_nodes": new_added_nodes,
            "removed_nodes": removed_nodes,
            "existing_modified_nodes": modified_nodes,
        }

    def get_difference_count(self):
        """
        Gets the number of difference categories recorded between the feature sets.

        Returns:
            int: The number of top-level keys in the differences dictionary.
        """
        return len(self._differences)

    def parsed_item_added(self):
        """
        Annotates added items that belong to newly added nodes.

        Returns:
            dict: The ``iterable_item_added`` entries, with a ``details``
            message set on those whose node is new; empty if nothing was added.
        """
        item_added = self._differences.get("iterable_item_added")
        if not item_added:
            return {}

        # Items were added, so the sets differ and a dict comes back here.
        new_nodes = self.get_changed_nodes()["new_added_nodes"]
        for key in item_added:
            node = self.parsed_node_uuid(key)
            if node in new_nodes:
                item_added[key]["details"] = f"Node: {node} was added"

        return item_added

    def parsed_node_uuid(self, key):
        """
        Parses the node uuid from a diff path.

        Args:
            key (str): A path whose first ``['...']`` segment holds the node uuid.

        Returns:
            str: The node uuid from the key.
        """
        return key.split("['")[1].split("']")[0]
2386
-
2387
-
2388
class ProcessingStep(BaseModel):
    # A named step that can be linked to child and parent steps.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    name: str
    metadata: dict = Field(default_factory=dict)
    presentation_metadata: dict = Field(default_factory=dict, alias='presentationMetadata')
    children: List['ProcessingStep'] = Field(default_factory=list)
    parents: List['ProcessingStep'] = Field(default_factory=list)

    def add_child(self, child_step: 'ProcessingStep'):
        """Link ``child_step`` below this step, recording the relation on both sides."""
        self.children.append(child_step)
        child_step.parents.append(self)

    @staticmethod
    def merge_with(*other_steps: 'ProcessingStep') -> 'ProcessingStep':
        """Create a new step that has every given step as a parent.

        Returns:
            ProcessingStep: A step named ``Merged(<names>)`` and linked as a child of each input.
        """
        joined_names = ', '.join(step.name for step in other_steps)
        merged_step = ProcessingStep(name=f"Merged({joined_names})")
        for parent_step in other_steps:
            parent_step.children.append(merged_step)
            merged_step.parents.append(parent_step)
        return merged_step

    class Config:
        arbitrary_types_allowed = True
        json_encoders = {
            'ProcessingStep': lambda step: step.to_dict()
        }

    def to_dict(self, seen=None):
        """Serialise this step and its descendants to plain dictionaries.

        Args:
            seen (set, optional): Ids of steps already serialised in this pass.

        Returns:
            dict: The full structure, or only ``id`` and ``name`` for a step seen before.
        """
        seen = set() if seen is None else seen

        # Avoid circular references by emitting a stub for already seen steps
        if self.id in seen:
            return {'id': self.id, 'name': self.name}
        seen.add(self.id)

        child_dicts = [child.to_dict(seen) for child in self.children]
        # Parents are kept as stubs; use parent.to_dict(seen) if the full structure is needed
        parent_stubs = [{'id': parent.id, 'name': parent.name} for parent in self.parents]

        return {
            'id': self.id,
            'name': self.name,
            'metadata': self.metadata,
            'presentationMetadata': self.presentation_metadata,
            'children': child_dicts,
            'parents': parent_stubs,
        }

    def to_json(self):
        """Return ``to_dict()`` encoded as a JSON string."""
        return json.dumps(self.to_dict())

    def __repr__(self):
        return f"Step(id={self.id}, name={self.name})"
2438
-
2439
-
2440
- class Document(object):
2441
- """A Document is a collection of metadata and a set of content nodes."""
2442
-
2443
- PREVIOUS_VERSION: str = "1.0.0"
2444
- CURRENT_VERSION: str = "6.0.0"
2445
-
2446
def __str__(self):
    """Return the document's ``kodexa://`` reference built from its uuid."""
    return "kodexa://{}".format(self.uuid)
2448
-
2449
def get_validations(self) -> list[DocumentTaxonValidation]:
    """Return the validations held by the persistence layer."""
    persistence = self.get_persistence()
    return persistence.get_validations()
2451
-
2452
def set_validations(self, validations: list[DocumentTaxonValidation]):
    """Hand the given validations to the persistence layer."""
    persistence = self.get_persistence()
    persistence.set_validations(validations)
2454
-
2455
def add_exception(self, exception: ContentException):
    """Add a content exception through the persistence layer."""
    persistence = self._persistence_layer
    persistence.add_exception(exception)
2457
-
2458
def get_exceptions(self) -> List[ContentException]:
    """Return the content exceptions held by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_exceptions()
2460
-
2461
def get_external_data(self, key="default") -> dict:
    """Return the external data stored under ``key`` in the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_external_data(key)
2463
-
2464
def get_external_data_keys(self) -> list[str]:
    """Return the external-data keys reported by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_external_data_keys()
2466
-
2467
def set_external_data(self, external_data:dict, key="default"):
    """Store ``external_data`` under ``key`` and return the persistence layer's result."""
    persistence = self._persistence_layer
    return persistence.set_external_data(external_data, key)
2469
-
2470
def get_steps(self) -> list[ProcessingStep]:
    """Return the processing steps held by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_steps()
2472
-
2473
def set_steps(self, steps: list[ProcessingStep]):
    """Hand the given processing steps to the persistence layer."""
    persistence = self._persistence_layer
    persistence.set_steps(steps)
2475
-
2476
def replace_exceptions(self, exceptions: List[ContentException]):
    """Pass the given exceptions to the persistence layer's ``replace_exceptions``."""
    persistence = self._persistence_layer
    persistence.replace_exceptions(exceptions)
2478
-
2479
def __init__(
    self,
    metadata=None,
    content_node: ContentNode = None,
    source=None,
    ref: str = None,
    kddb_path: str = None,
    delete_on_close=False,
    inmemory=False,
):
    """Create a document and start its persistence layer.

    Args:
        metadata: Document metadata; a new ``DocumentMetadata`` is created when None.
        content_node: The root content node, if already available.
        source: Source metadata; a new ``SourceMetadata`` is created when None.
        ref: Reference kept when the document was initialized from a remote store (not stored).
        kddb_path: Passed to the persistence manager as its ``filename``.
        delete_on_close: Passed to the persistence manager and kept on the document.
        inmemory: Passed to the persistence manager.
    """
    metadata = DocumentMetadata() if metadata is None else metadata
    source = SourceMetadata() if source is None else source

    # Mix-ins are going away - so we will allow people to turn them off as needed
    self.disable_mixin_methods = True

    self.delete_on_close = delete_on_close

    # The ref is not stored and is used when we have initialized a document
    # from a remote store and want to keep track of that
    self.ref = ref

    # Metadata relating to the document
    self.metadata: DocumentMetadata = metadata
    # The root content node
    self._content_node: Optional[ContentNode] = content_node
    # Is the document virtual (deprecated)
    self.virtual: bool = False
    # A list of the mixins for this document
    self._mixins: List[str] = []
    # Freshly generated identifier for this document
    self.uuid: str = str(uuid.uuid4())
    # The version of the document
    self.version = Document.CURRENT_VERSION
    # Source metadata for this document
    self.source: SourceMetadata = source
    # A list of the document level labels for the document
    self.labels: List[str] = []
    # A list of tag instances, each pairing a tag with a set of nodes
    self.tag_instances: List[TagInstance] = []

    # Start persistence layer (imported here rather than at module level)
    from kodexa.model import PersistenceManager

    self._persistence_layer: Optional[PersistenceManager] = PersistenceManager(
        document=self,
        filename=kddb_path,
        delete_on_close=delete_on_close,
        inmemory=inmemory,
    )
    self._persistence_layer.initialize()
2530
-
2531
def remove_tags_by_owner(self, owner_uri: str):
    """Remove, from every node of matching tag instances, the tags owned by ``owner_uri``.

    A tag instance matches when its data has an ``owner_uri`` entry equal to
    the given value.

    Args:
        owner_uri: The owner whose tags are removed.
    """
    for tag_name in self.get_all_tags():
        for instance in self.get_tag_instances(tag_name):
            instance_data: dict = instance.get_data()
            if 'owner_uri' not in instance_data:
                continue
            if instance_data['owner_uri'] != owner_uri:
                continue
            for tagged_node in instance.nodes:
                tagged_node.remove_tag(tag_name)
2539
-
2540
def get_nodes_by_type(self, node_type: str) -> List[ContentNode]:
    """
    Fetch the nodes of a given type from the persistence layer.

    Args:
        node_type: the type of the node

    Returns:
        a list of nodes

    """
    persistence = self._persistence_layer
    return persistence.get_nodes_by_type(node_type)
2552
-
2553
def get_node_by_uuid(self, uuid: int) -> ContentNode:
    """
    Fetch a single node by its uuid from the persistence layer.

    Args:
        uuid: the uuid of the node

    Returns:
        the node

    """
    persistence = self._persistence_layer
    return persistence.get_node_by_uuid(uuid)
2565
-
2566
def add_tag_instance(self, tag_to_apply: str, node_list: List[ContentNode]):
    """
    Create one tag shared by a group of nodes and record it as a tag instance.

    :param tag_to_apply: name of the tag
    :param node_list: the nodes that make up the instance
    :return:
    """
    tag = Tag()

    # Attach the tag object itself to every node; the feature value must be
    # the instance created above, not the Tag class.
    for node in node_list:
        node.add_feature("tag", tag_to_apply, tag)

    self.tag_instances.append(TagInstance(tag, node_list))
2580
-
2581
def update_tag_instance(self, tag_uuid):
    """Look up the tag on every node of the tag instances whose tag has ``tag_uuid``.

    NOTE(review): the looked-up values are discarded, so nothing is updated
    here - confirm whether an update step is missing.
    """
    matching = (
        instance for instance in self.tag_instances if instance.tag.uuid == tag_uuid
    )
    for instance in matching:
        for node in instance.nodes:
            node.get_tag(instance.tag.value, tag_uuid=instance.tag.uuid)
2587
-
2588
def get_tag_instance(self, tag):
    """
    Return the recorded tag instances whose ``tag`` equals the given one.

    :param tag: the tag to compare against
    :return: a list of tag instance
    """
    matches = []
    for candidate in self.tag_instances:
        if candidate.tag == tag:
            matches.append(candidate)
    return matches
2599
-
2600
def get_persistence(self):
    """Return the persistence layer object backing this document."""
    persistence = self._persistence_layer
    return persistence
2602
-
2603
def get_all_tags(self):
    """Return the tags reported by the persistence layer for this document."""
    persistence = self._persistence_layer
    return persistence.get_all_tags()
2605
-
2606
def add_model_insight(self, model_insight: ModelInsight):
    """Add a model insight through the persistence layer."""
    persistence = self._persistence_layer
    persistence.add_model_insight(model_insight)
2608
-
2609
def clear_model_insights(self):
    """Ask the persistence layer to clear the model insights."""
    persistence = self._persistence_layer
    persistence.clear_model_insights()
2611
-
2612
def get_model_insights(self) -> List[ModelInsight]:
    """Return the model insights held by the persistence layer."""
    persistence = self._persistence_layer
    return persistence.get_model_insights()
2614
-
2615
def get_tagged_nodes(self, tag_name, tag_uuid=None):
    """Return the persistence layer's tagged nodes for ``tag_name`` (and ``tag_uuid``, if given)."""
    persistence = self._persistence_layer
    return persistence.get_tagged_nodes(tag_name, tag_uuid)
2617
-
2618
@property
def content_node(self):
    """The root content Node"""
    return self._content_node

@content_node.setter
def content_node(self, value):
    """Replace the root content node.

    A previous, different root is removed from the persistence layer. A new
    root gets index 0 and is added to the persistence layer without a parent.
    Assigning None only clears the root.
    """
    if value is not None:
        # Only a real node can take an index; None means "no root".
        value.index = 0

    if value != self._content_node and self._content_node is not None:
        self.get_persistence().remove_content_node(self._content_node)

    self._content_node = value
    if value is not None:
        self.get_persistence().add_content_node(self._content_node, None)
2632
-
2633
def get_tag_instances(self, tag):
    """Group the nodes related to ``tag`` into tag instances.

    The groups come from ``content_node.get_related_tag_nodes(tag, everywhere=True)``;
    each group key becomes the instance's ``tag_uuid``.

    Returns:
        list: One ``TagInstance`` per group.
    """

    class TagInstance:
        """
        A tag uuid together with the nodes that belong to it.

        Attributes
        ----------
        tag_uuid : str
            the unique identifier of the tag
        nodes : list
            the nodes associated with the tag
        """

        def __init__(self, tag_uuid, nodes):
            self.tag_uuid = tag_uuid
            self.nodes = nodes

        def get_value(self):
            """
            Returns the content of all nodes joined with single spaces.
            """
            return " ".join(node.get_all_content() for node in self.nodes)

        def get_data(self):
            """
            Returns the first tag feature entry whose uuid equals this tag's uuid.

            Only element [0] of each tag feature's value is inspected. An empty
            dictionary is returned when nothing matches.
            """
            for node in self.nodes:
                for feature in node.get_tag_features():
                    entry = feature.value[0]
                    if "uuid" in entry and entry["uuid"] == self.tag_uuid:
                        return entry
            return {}

    groups = self.content_node.get_related_tag_nodes(tag, everywhere=True)
    return [TagInstance(group_uuid, members) for group_uuid, members in groups.items()]
2695
-
2696
def add_label(self, label: str):
    """Add a label to the document unless it is already present.

    Args:
        label: str Label to add

    Returns:
        the document

    """
    already_present = label in self.labels
    if not already_present:
        self.labels.append(label)
    return self
2711
-
2712
def remove_label(self, label: str):
    """Remove a label from the document.

    Args:
        label: str Label to remove

    Returns:
        the document

    Raises:
        ValueError: if the label is not present (raised by ``list.remove``).

    """
    current_labels = self.labels
    current_labels.remove(label)
    return self
2725
-
2726
@classmethod
def from_text(cls, text, separator=None, inmemory=False):
    """Create a new Document from the text provided.

    Args:
        text: str Text to be used as content on the Document's ContentNode(s)
        separator: str If provided, the text is split on this string and each piece is placed
            on its own child of the root ContentNode. (Default value = None)
        inmemory: Passed through to the Document constructor. (Default value = False)

    Returns:
        the document

    """
    new_document = Document(inmemory=inmemory)
    new_document.source.original_filename = f"text-{uuid.uuid4()}"

    root = new_document.create_node(node_type="text", index=0)
    new_document.content_node = root

    if text and separator:
        for piece in text.split(separator):
            child = new_document.create_node(node_type="text", content=piece)
            new_document.content_node.add_child(child)
    elif text:
        new_document.content_node.content = text

    new_document.add_mixin("text")
    return new_document
2752
-
2753
def get_root(self):
    """Return the document's root content node (an alias for ``content_node``)."""
    root = self.content_node
    return root
2756
-
2757
def to_kdxa(self, file_path: str):
    """Write the document to a file in the kdxa format (msgpack), which can be
    used with the Kodexa platform.

    Args:
        file_path (str): The path of the kdxa file to create.

    Returns:
        None

    >>> document.to_kdxa('my-document.kdxa')
    """
    with open(file_path, "wb") as kdxa_file:
        payload = self.to_dict()
        msgpack.pack(payload, kdxa_file, use_bin_type=True)
2771
-
2772
@staticmethod
def open_kddb(file_path):
    """
    Open a Kodexa Document Database (KDDB) file as a Document.

    This is the Kodexa V4 default way to store documents; it provides high
    performance and the ability to handle very large document objects.

    :param file_path: The file path
    :return: The Document instance
    """
    document = Document(kddb_path=file_path)
    return document
2784
-
2785
def close(self):
    """
    Close the document by closing its persistence layer.
    """
    persistence = self.get_persistence()
    persistence.close()
2790
-
2791
def to_kddb(self, path=None):
    """
    Serialize this document as a KDDB.

    With no ``path`` the KDDB bytes from the persistence layer are returned;
    with a ``path`` those bytes are written to that file and nothing is
    returned.
    """
    if path is not None:
        with open(path, "wb") as kddb_file:
            kddb_file.write(self.get_persistence().get_bytes())
        return None

    return self.get_persistence().get_bytes()
2803
-
2804
@staticmethod
def from_kdxa(file_path):
    """Read a .kdxa (msgpack) file from the given path and build a Document
    from its contents.

    Args:
        file_path: The path to the kdxa file.

    Returns:
        Document: The document built from the unpacked dictionary.

    >>> document = Document.from_kdxa('my-document.kdxa')
    """
    with open(file_path, "rb") as kdxa_file:
        document_dict = msgpack.unpack(kdxa_file, raw=False)
    return Document.from_dict(document_dict)
2818
-
2819
def to_msgpack(self):
    """Serialize this document's dictionary form to message pack bytes."""
    document_dict = self.to_dict()
    return msgpack.packb(document_dict, use_bin_type=True)
2822
-
2823
def to_json(self):
    """Serialize this Document to a JSON string.

    Non-ASCII characters are written as-is rather than escaped.

    Returns:
        str: The JSON formatted string representation of this Document.

    >>> document.to_json()
    """
    document_dict = self.to_dict()
    return json.dumps(document_dict, ensure_ascii=False)
2834
-
2835
def to_dict(self):
    """Build a dictionary describing this Document's structure and content.

    Returns:
        dict: A dictionary with the keys ``version``, ``metadata``,
        ``content_node``, ``source``, ``mixins``, ``labels`` and ``uuid``.

    >>> document.to_dict()
    """

    def clean_none_values(d):
        """Return a copy of ``d`` without None values.

        Nested dictionaries are cleaned recursively, and a nested dictionary
        that ends up empty is dropped as well.

        Args:
            d (dict): The dictionary to clean.

        Returns:
            dict: The cleaned copy.
        """
        clean = {}
        for key, value in d.items():
            if value is None:
                continue
            if not isinstance(value, dict):
                clean[key] = value
                continue
            nested = clean_none_values(value)
            if nested:
                clean[key] = nested
        return clean

    serialized = {
        "version": Document.CURRENT_VERSION,
        "metadata": self.metadata,
    }
    serialized["content_node"] = (
        self.content_node.to_dict() if self.content_node else None
    )
    # None values are not stored for the source metadata.
    serialized["source"] = clean_none_values(dataclasses.asdict(self.source))
    serialized["mixins"] = self._mixins
    serialized["labels"] = self.labels
    serialized["uuid"] = self.uuid
    return serialized
2876
-
2877
@staticmethod
def from_dict(doc_dict):
    """Build a new Document from a dictionary.

    This is the inverse of ``to_dict``: version, uuid, mixins, content node,
    source and labels are restored when present in the dictionary.

    Args:
        doc_dict (dict): A dictionary representation of a Kodexa Document.
            The ``metadata`` key is required.

    Returns:
        Document: A complete Kodexa Document

    >>> Document.from_dict(doc_dict)
    """
    new_document = Document(DocumentMetadata(doc_dict["metadata"]))

    # Some older docs don't have a version, or it is None.
    new_document.version = doc_dict.get("version") or Document.PREVIOUS_VERSION

    # NOTE(review): the uuid5 fallback is deterministic, so every document
    # lacking a uuid receives the same value - confirm this is intended.
    new_document.uuid = (
        doc_dict["uuid"]
        if "uuid" in doc_dict
        else str(uuid.uuid5(uuid.NAMESPACE_DNS, "kodexa.com"))
    )

    # to_dict() serializes the mixins, so restore them here; otherwise a
    # to_dict()/from_dict() round trip silently loses them.
    if doc_dict.get("mixins"):
        new_document._mixins = list(doc_dict["mixins"])

    if doc_dict.get("content_node"):
        new_document.content_node = ContentNode.from_dict(
            new_document, doc_dict["content_node"]
        )

    if doc_dict.get("source"):
        new_document.source = SourceMetadata.from_dict(doc_dict["source"])
    if doc_dict.get("labels"):
        new_document.labels = doc_dict["labels"]

    new_document.get_persistence().update_metadata()
    return new_document
2914
-
2915
@staticmethod
def from_json(json_string):
    """Build a Document from its JSON string representation.

    Args:
        json_string (str): A JSON string representation of a Kodexa Document.

    Returns:
        Document: A complete Kodexa Document

    >>> Document.from_json(json_string)
    """
    document_dict = json.loads(json_string)
    return Document.from_dict(document_dict)
2929
-
2930
@staticmethod
def from_msgpack(msgpack_bytes):
    """Build a Document from a message pack byte array.

    Args:
        msgpack_bytes (bytes): A message pack byte array.

    Returns:
        Document: A complete Kodexa Document

    >>> Document.from_msgpack(open(os.path.join('news-doc.kdxa'), 'rb').read())
    """
    document_dict = msgpack.unpackb(msgpack_bytes, raw=False)
    return Document.from_dict(document_dict)
2943
-
2944
def get_mixins(self):
    """
    Return the mixins registered on this document.

    Returns:
        mixins: list[str] the list of mixin names (the document's own list,
        not a copy)
    """
    registered = self._mixins
    return registered
2952
-
2953
def add_mixin(self, mixin):
    """
    Register the given mixin name on this document and refresh the stored
    metadata through the persistence layer.

    Registering a name that is already present does not add it a second
    time, so the mixin list never contains duplicates.

    Args:
        mixin (str): the name of the mixin to add

    Returns:
        None

    >>> from kodexa import Document
    >>> document = Document()
    >>> document.add_mixin('spatial')
    """
    # Keep the list free of duplicates when a mixin is added repeatedly.
    if mixin not in self._mixins:
        self._mixins.append(mixin)
    self.get_persistence().update_metadata()
2969
-
2970
def create_node(
    self,
    node_type: str,
    content: Optional[str] = None,
    virtual: bool = False,
    parent: ContentNode = None,
    index: Optional[int] = None,
):
    """
    Create a new ContentNode that belongs to this document.

    When a parent is supplied the node is added as a child of that parent;
    otherwise it is registered with the persistence layer without a parent.

    Args:
        node_type (str): The type of node.
        content (str): The content for the node; defaults to None.
        virtual (bool): Indicates if this is a 'real' or 'virtual' node; default is False. 'Real' nodes contain
                        document content. 'Virtual' nodes are synthesized as necessary to fill gaps in between
                        non-consecutively indexed siblings. Such indexing arises when document content is sparse.
        parent (ContentNode): The parent for this newly created node; default is None.
        index (Optional[int]): The index property to be set on this node; default is None.

    Returns:
        ContentNode: This newly created node.

    >>> document.create_node(node_type='page')
    <kodexa.model.model.ContentNode object at 0x7f80605e53c8>
    """
    new_node = ContentNode(
        document=self,
        node_type=node_type,
        content=content,
        parent=parent,
        index=index,
        virtual=virtual,
    )

    if parent is None:
        self.get_persistence().add_content_node(new_node, None)
    else:
        parent.add_child(new_node, index)

    # Seed the content parts from the content when none exist yet.
    needs_content_parts = (
        content is not None and len(new_node.get_content_parts()) == 0
    )
    if needs_content_parts:
        new_node.set_content_parts([content])

    return new_node
3014
-
3015
@classmethod
def from_kddb(cls, source, detached: bool = True, inmemory: bool = False):
    """
    Load a document from a Kodexa Document Database (KDDB).

    Args:
        source: if a string, the path of the KDDB file to load; otherwise it
            is assumed to be the KDDB content as bytes, which is written to
            a temporary file and loaded from there
        detached (bool): when reading from a path, work on a temporary copy so
            the original file is not updated in place
        inmemory (bool): passed through to the Document constructor

    :return: the document
    """
    if isinstance(source, str):
        if not detached:
            return Document(kddb_path=source, inmemory=inmemory)
        # Read the source before creating the temp file, so a missing or
        # unreadable source does not leave an orphaned temp file behind, and
        # use a context manager so the handle is always closed.
        with open(source, "rb") as source_file:
            kddb_bytes = source_file.read()
    else:
        # We will assume the input is of byte type
        kddb_bytes = source

    import tempfile
    from kodexa import KodexaPlatform

    # delete=False keeps the file after it is closed; the Document is asked
    # to remove it on close instead.
    with tempfile.NamedTemporaryFile(
        suffix=".kddb", delete=False, dir=KodexaPlatform.get_tempdir()
    ) as fp:
        fp.write(kddb_bytes)

    return Document(kddb_path=fp.name, delete_on_close=True, inmemory=inmemory)
3055
-
3056
@classmethod
def from_file(cls, file, unpack: bool = False):
    """Create a Document for the specified file.

    Args:
        file: The path of the file.
        unpack (bool): When True the file is read as a kdxa file and the
            resulting Document is returned; when False a new Document with a
            'file-handle' connector pointing at the file is returned.
            (Default value = False)

    Returns:
        Document: The unpacked Document, or a Document connected to the
        specified file.
    """
    if unpack:
        # The unpacked document must be returned; previously the result was
        # discarded and the method returned None.
        return Document.from_kdxa(file)

    file_document = Document()
    file_document.metadata["connector"] = "file-handle"
    file_document.metadata["connector_options"] = {}
    file_document.metadata["connector_options"]["file"] = file
    file_document.source.connector = "file-handle"
    file_document.source.original_filename = os.path.basename(file)
    file_document.source.original_path = file
    return file_document
3080
-
3081
@classmethod
def from_url(cls, url, headers=None):
    """Create a Document that has a 'url' connector for the specified url.

    Args:
        url (str): The URL to which the new Document is connected.
        headers (dict): Headers that should be used when reading from the URL;
            an empty dictionary is used when omitted. (Default value = None)

    Returns:
        Document: A Document connected to the specified URL with the specified headers (if any).
    """
    headers = {} if headers is None else headers

    url_document = Document()
    url_document.metadata.connector = "url"
    url_document.metadata.connector_options.base_url = url
    url_document.metadata.connector_options.headers = headers

    # The source metadata mirrors the connector settings.
    source_fields = (
        ("connector", "url"),
        ("original_filename", url),
        ("original_path", url),
        ("headers", headers),
    )
    for field_name, field_value in source_fields:
        setattr(url_document.source, field_name, field_value)

    return url_document
3106
-
3107
def select_first(self, selector, variables=None) -> Optional[ContentNode]:
    """Run the selector through ``select`` with ``first_only=True`` and return the first match.

    Args:
        selector (str): The selector (ie. //*)
        variables (dict, optional): A dictionary of variable name/value to use in substitution; defaults to None.
                                    Dictionary keys should match a variable specified in the selector.

    Returns:
        Optional[ContentNode]: The first matching node, or None when nothing matches.

    >>> document.select_first('.')
    ContentNode

    >>> document.select_first('//*[hasTag($tagName)]', {"tagName": "div"})
    ContentNode
    """
    matches = self.select(selector, variables, first_only=True)
    if len(matches) > 0:
        return matches[0]
    return None
3126
-
3127
def select(
    self, selector: str, variables: Optional[dict] = None, first_only=False
) -> List[ContentNode]:
    """Execute a selector on the root node and return a list of the matching nodes.

    Args:
        selector (str): The selector (ie. //*)
        variables (Optional[dict]): A dictionary of variable name/value to use in substitution; defaults to an
                                    empty dictionary. Dictionary keys should match a variable specified in the
                                    selector.
        first_only (bool): Passed to the root node's ``select``; defaults to False.

    Returns:
        list[ContentNodes]: A list of the matching ContentNodes. If no matches are found, or the document has
        no root node, the list is empty. When the root node's ``select`` returns a non-list value, the result
        is a list holding the root node if that value is truthy, otherwise an empty list.

    >>> document.select('.')
    [ContentNode]
    """
    if variables is None:
        variables = {}

    if not self.content_node:
        return []

    result = self.content_node.select(selector, variables, first_only)
    if isinstance(result, list):
        return result

    if bool(result):
        return [self.content_node]
    return []
3153
-
3154
def get_labels(self) -> List[str]:
    """Return the labels associated with this document.

    Returns:
        List[str]: list of associated labels (the document's own list, not a copy)
    """
    current_labels = self.labels
    return current_labels
3164
-
3165
def get_feature_set(self, owner_uri: Optional[str] = None) -> FeatureSet:
    """Collect the tag features of every tagged node into a FeatureSet.

    Args:
        owner_uri (Optional[str]): When given, a tag feature whose first value
            has an ``owner_uri`` entry different from this one is left out.

    Returns:
        FeatureSet: A feature set whose ``node_features`` holds one entry per
        tagged node, each with the node's uuid and its tag features.
    """
    feature_set = FeatureSet()
    feature_set.node_features = []

    for tagged_node in self.get_all_tagged_nodes():
        node_feature = {"nodeUuid": str(tagged_node.uuid), "features": []}
        feature_set.node_features.append(node_feature)

        # TODO this needs to be cleaned up, also should it only really
        # be the tag features?
        for feature in tagged_node.get_features():
            if feature.feature_type != "tag":
                continue

            if owner_uri is not None:
                tag_data = feature.value[0]
                if "owner_uri" in tag_data and tag_data["owner_uri"] != owner_uri:
                    continue

            feature_dict = feature.to_dict()
            feature_dict["featureType"] = feature.feature_type
            feature_dict["name"] = feature.name

            # A Tag object in the value is replaced by its dictionary form.
            first_value = feature_dict['value'][0]
            if isinstance(first_value, Tag):
                feature_dict['value'] = [first_value.to_dict()]

            node_feature["features"].append(feature_dict)

    return feature_set
3195
-
3196
def get_all_tagged_nodes(self) -> List[ContentNode]:
    """
    Return all the tagged nodes in the document, as reported by the
    persistence layer.

    :return: the tagged content nodes
    """
    tagged_nodes = self._persistence_layer.get_all_tagged_nodes()
    return tagged_nodes
3203
-
3204
-
3205
class TagInstance:
    """
    Pairs a tag with the content nodes that carry it.

    Attributes
    ----------
    tag : Tag
        the tag this instance represents
    nodes : list
        the nodes belonging to this instance

    Methods
    -------
    add_node(nodes: List[ContentNode])
        Append further nodes to the instance.
    """

    def __init__(self, tag: Tag, nodes):
        # Both arguments are stored as given; the nodes list is not copied.
        self.tag = tag
        self.nodes = nodes

    def add_node(self, nodes: List[ContentNode]):
        """
        Append the given nodes to this instance's node list.

        Parameters
        ----------
        nodes : List[ContentNode]
            the additional nodes
        """
        additional_nodes = nodes
        self.nodes.extend(additional_nodes)
3236
-
3237
-
3238
class ContentObjectReference:
    """Bundles a content object with the document, store and family it relates to.

    Attributes:
        content_object (ContentObject): The content object being referenced.
        store: The store where the document is located.
        document (Document): The document in which the content object is located.
        document_family: The family to which the document belongs.
    """

    def __init__(
        self, content_object: ContentObject, store, document: Document, document_family
    ):
        # Plain value holder: every argument is stored unchanged.
        self.document_family = document_family
        self.document = document
        self.store = store
        self.content_object = content_object