kodexa 7.5.514404640805__py3-none-any.whl → 8.0.14958192442__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kodexa/dataclasses/__init__.py +1 -1
- kodexa/model/__init__.py +2 -2
- kodexa/model/objects.py +21 -1
- kodexa/model/utils.py +1 -1
- kodexa/pipeline/pipeline.py +1 -1
- kodexa/platform/client.py +1 -2
- kodexa/platform/kodexa.py +4 -1
- kodexa/platform/manifest.py +447 -0
- kodexa/selectors/__init__.py +1 -1
- kodexa/selectors/ast.py +371 -98
- kodexa/selectors/error.py +29 -0
- kodexa/selectors/kodexa-ast-visitor.py +268 -0
- kodexa/selectors/parser.py +91 -0
- kodexa/selectors/resources/KodexaSelector.interp +99 -0
- kodexa/selectors/resources/KodexaSelector.tokens +56 -0
- kodexa/selectors/resources/KodexaSelectorLexer.interp +119 -0
- kodexa/selectors/resources/KodexaSelectorLexer.py +204 -0
- kodexa/selectors/resources/KodexaSelectorLexer.tokens +56 -0
- kodexa/selectors/resources/KodexaSelectorListener.py +570 -0
- kodexa/selectors/resources/KodexaSelectorParser.py +3246 -0
- kodexa/selectors/resources/KodexaSelectorVisitor.py +323 -0
- kodexa/selectors/visitor.py +265 -0
- kodexa/steps/__init__.py +4 -2
- kodexa/steps/common.py +0 -68
- kodexa/testing/test_utils.py +1 -1
- {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/METADATA +7 -3
- kodexa-8.0.14958192442.dist-info/RECORD +53 -0
- {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/WHEEL +1 -1
- kodexa/model/model.py +0 -3259
- kodexa/model/persistence.py +0 -2017
- kodexa/selectors/core.py +0 -124
- kodexa/selectors/lexrules.py +0 -137
- kodexa/selectors/lextab.py +0 -83
- kodexa/selectors/lextab.pyi +0 -1
- kodexa/selectors/parserules.py +0 -414
- kodexa/selectors/parserules.pyi +0 -1
- kodexa/selectors/parsetab.py +0 -4149
- kodexa/selectors/parsetab.pyi +0 -1
- kodexa-7.5.514404640805.dist-info/RECORD +0 -50
- {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/LICENSE +0 -0
kodexa/model/model.py
DELETED
@@ -1,3259 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
The core model provides definitions for all the base objects in the Kodexa Content Model
|
3
|
-
"""
|
4
|
-
import dataclasses
|
5
|
-
import inspect
|
6
|
-
import json
|
7
|
-
import os
|
8
|
-
import re
|
9
|
-
import uuid
|
10
|
-
from enum import Enum
|
11
|
-
from typing import Any, List, Optional
|
12
|
-
from addict import Dict
|
13
|
-
import deepdiff
|
14
|
-
import msgpack
|
15
|
-
from pydantic import BaseModel, ConfigDict, Field
|
16
|
-
|
17
|
-
from kodexa.model.objects import ContentObject, FeatureSet, DocumentTaxonValidation
|
18
|
-
|
19
|
-
|
20
|
-
class Ref:
|
21
|
-
"""
|
22
|
-
A class to represent a reference.
|
23
|
-
|
24
|
-
Attributes
|
25
|
-
----------
|
26
|
-
ref : str
|
27
|
-
a string reference
|
28
|
-
version : str, optional
|
29
|
-
a version of the reference, default is None
|
30
|
-
resource : str, optional
|
31
|
-
a resource of the reference, default is None
|
32
|
-
slug : str
|
33
|
-
a slug of the reference, default is an empty string
|
34
|
-
org_slug : str
|
35
|
-
an organization slug of the reference, default is an empty string
|
36
|
-
object_ref : str
|
37
|
-
a formatted string of the reference
|
38
|
-
|
39
|
-
Methods
|
40
|
-
-------
|
41
|
-
__init__(self, ref: str)
|
42
|
-
Constructs all the necessary attributes for the Ref object.
|
43
|
-
"""
|
44
|
-
|
45
|
-
def __init__(self, ref: str):
|
46
|
-
self.ref: str = ref
|
47
|
-
first_part = ref
|
48
|
-
self.version: Optional[str] = None
|
49
|
-
self.resource: Optional[str] = None
|
50
|
-
self.slug: str = ""
|
51
|
-
self.org_slug: str = ""
|
52
|
-
|
53
|
-
if ":" in ref:
|
54
|
-
(first_part, self.version) = ref.split(":")
|
55
|
-
|
56
|
-
if "/" in self.version:
|
57
|
-
(self.version, self.resource) = self.version.split("/")
|
58
|
-
|
59
|
-
(self.org_slug, self.slug) = first_part.split("/")
|
60
|
-
|
61
|
-
self.object_ref = (
|
62
|
-
f"{self.org_slug}/{self.slug}:{self.version}"
|
63
|
-
if self.version
|
64
|
-
else f"{self.org_slug}/{self.slug}"
|
65
|
-
)
|
66
|
-
|
67
|
-
|
68
|
-
import addict
|
69
|
-
|
70
|
-
|
71
|
-
class DocumentMetadata(addict.Dict):
|
72
|
-
"""A flexible dict based approach to capturing metadata for the document.
|
73
|
-
|
74
|
-
This class extends from Dict to provide a flexible way to store and
|
75
|
-
manage metadata associated with a document.
|
76
|
-
|
77
|
-
Args:
|
78
|
-
*args: Variable length argument list.
|
79
|
-
**kwargs: Arbitrary keyword arguments.
|
80
|
-
"""
|
81
|
-
|
82
|
-
"""A flexible dict based approach to capturing metadata for the document"""
|
83
|
-
|
84
|
-
def __init__(self, *args, **kwargs):
|
85
|
-
super().__init__(*args, **kwargs)
|
86
|
-
|
87
|
-
|
88
|
-
class ContentException(dict):
|
89
|
-
"""A content exception represents an issue identified during labeling or validation at the document level.
|
90
|
-
|
91
|
-
Attributes:
|
92
|
-
tag (Optional[str]): Tag associated with the exception.
|
93
|
-
message (str): Message describing the exception.
|
94
|
-
exception_details (Optional[str]): Detailed information about the exception.
|
95
|
-
group_uuid (Optional[str]): UUID of the group associated with the exception.
|
96
|
-
tag_uuid (Optional[str]): UUID of the tag associated with the exception.
|
97
|
-
exception_type (str): Type of the exception.
|
98
|
-
node_uuid (Optional[str]): UUID of the node associated with the exception.
|
99
|
-
severity (str): Severity level of the exception, default is 'ERROR'.
|
100
|
-
value (Optional[str]): Value associated with the exception.
|
101
|
-
exception_type_id (Optional[str]): ID of the exception type.
|
102
|
-
"""
|
103
|
-
|
104
|
-
"""A content exception represents an issue identified during labeling or validation at the document level"""
|
105
|
-
|
106
|
-
def __init__(
|
107
|
-
self,
|
108
|
-
exception_type: str,
|
109
|
-
message: str,
|
110
|
-
severity: str = "ERROR",
|
111
|
-
tag: Optional[str] = None,
|
112
|
-
group_uuid: Optional[str] = None,
|
113
|
-
tag_uuid: Optional[str] = None,
|
114
|
-
exception_type_id: Optional[str] = None,
|
115
|
-
exception_details: Optional[str] = None,
|
116
|
-
node_uuid: Optional[str] = None,
|
117
|
-
value: Optional[str] = None,
|
118
|
-
boolean_value: Optional[bool] = None,
|
119
|
-
*args,
|
120
|
-
**kwargs,
|
121
|
-
):
|
122
|
-
super().__init__(*args, **kwargs)
|
123
|
-
self.tag = tag
|
124
|
-
self.message = message
|
125
|
-
self.exception_details = exception_details
|
126
|
-
self.group_uuid = group_uuid
|
127
|
-
self.tag_uuid = tag_uuid
|
128
|
-
self.exception_type = exception_type
|
129
|
-
self.node_uuid = node_uuid
|
130
|
-
self.severity = severity
|
131
|
-
self.value = value
|
132
|
-
self.exception_type_id = exception_type_id
|
133
|
-
self.boolean_value = boolean_value
|
134
|
-
|
135
|
-
|
136
|
-
class Tag(Dict):
|
137
|
-
"""A class to represent the metadata for a label that is applied as a feature on a content node.
|
138
|
-
|
139
|
-
Attributes:
|
140
|
-
start (Optional[int]): The start position (zero indexed) of the content within the node. If None, label is applied to the whole node.
|
141
|
-
end (Optional[int]): The end position (zero indexed) of the content within the node. If None, label is applied to the whole node.
|
142
|
-
value (Optional[str]): A string representing the value that was labelled in the node.
|
143
|
-
data (Optional[Any]): Any data object (JSON serializable) that you wish to associate with the label.
|
144
|
-
uuid (Optional[str]): The UUID for this tag instance. This allows tags that are on different content nodes to be related through the same UUID.
|
145
|
-
confidence (Optional[float]): The confidence of the tag in a range of 0-1.
|
146
|
-
index (Optional[int]): The tag index. This is used to allow us to order tags, and understand the ordering of parent child tag relationships.
|
147
|
-
bbox (Optional[List[int]]): The optional bounding box that can be used if the label is spatial (based on the node as the container).
|
148
|
-
group_uuid (Optional[str]): The UUID of the group that this tag belongs to. This is used to allow us to group tags together.
|
149
|
-
parent_group_uuid (Optional[str]): The UUID of the parent group that this tag belongs to. This is used to allow us to group tags together.
|
150
|
-
cell_index (Optional[int]): The cell index of the cell that this tag belongs to. This is used to allow us to group tags together.
|
151
|
-
note (Optional[str]): A note that can be associated with the tag.
|
152
|
-
status (Optional[str]): The status of the tag. This can be passed to an attribute status during extraction.
|
153
|
-
owner_uri (Optional[str]): The URI of the owner (ie. model://kodexa/narrative:1.0.0 or user://pdodds).
|
154
|
-
"""
|
155
|
-
|
156
|
-
"""A tag represents the metadata for a label that is applies as a feature on a content node"""
|
157
|
-
|
158
|
-
def __init__(
|
159
|
-
self,
|
160
|
-
start: Optional[int] = None,
|
161
|
-
end: Optional[int] = None,
|
162
|
-
value: Optional[str] = None,
|
163
|
-
uuid: Optional[str] = None,
|
164
|
-
data: Any = None,
|
165
|
-
*args,
|
166
|
-
confidence: Optional[float] = None,
|
167
|
-
group_uuid: Optional[str] = None,
|
168
|
-
parent_group_uuid: Optional[str] = None,
|
169
|
-
cell_index: Optional[int] = None,
|
170
|
-
index: Optional[int] = None,
|
171
|
-
bbox: Optional[List[int]] = None,
|
172
|
-
note: Optional[str] = None,
|
173
|
-
status: Optional[str] = None,
|
174
|
-
owner_uri: Optional[str] = None,
|
175
|
-
is_dirty: Optional[bool] = None,
|
176
|
-
**kwargs,
|
177
|
-
):
|
178
|
-
super().__init__(*args, **kwargs)
|
179
|
-
|
180
|
-
import uuid as uuid_gen
|
181
|
-
self.start: Optional[int] = start
|
182
|
-
"""The start position (zero indexed) of the content within the node, if None then label is applied to the whole node"""
|
183
|
-
self.end: Optional[int] = end
|
184
|
-
"""The end position (zero indexed) of the content within the node, if None then label is applied to the whole node"""
|
185
|
-
self.value: Optional[str] = value
|
186
|
-
"""A string representing the value that was labelled in the node"""
|
187
|
-
self.data: Optional[Any] = data
|
188
|
-
"""Any data object (JSON serializable) that you wish to associate with the label"""
|
189
|
-
self.uuid: Optional[str] = uuid or str(uuid_gen.uuid4())
|
190
|
-
"""The UUID for this tag instance, this allows tags that are on different content nodes to be related through the same UUID"""
|
191
|
-
self.confidence: Optional[float] = confidence
|
192
|
-
"""The confidence of the tag in a range of 0-1"""
|
193
|
-
self.index: Optional[int] = index
|
194
|
-
"""The tag index, this is used to allow us to order tags, and understand the ordering of parent child tag relationships"""
|
195
|
-
self.bbox: Optional[List[int]] = bbox
|
196
|
-
"""The optional bounding box that can be used if the label is spatial (based on the node as the container)"""
|
197
|
-
self.group_uuid: Optional[str] = group_uuid
|
198
|
-
"""The UUID of the group that this tag belongs to, this is used to allow us to group tags together"""
|
199
|
-
self.parent_group_uuid: Optional[str] = parent_group_uuid
|
200
|
-
"""The UUID of the parent group that this tag belongs to, this is used to allow us to group tags together"""
|
201
|
-
self.cell_index: Optional[int] = cell_index
|
202
|
-
"""The cell index of the cell that this tag belongs to, this is used to allow us to group tags together"""
|
203
|
-
self.note: Optional[str] = note
|
204
|
-
"""A note that can be associated with the tag"""
|
205
|
-
self.status: Optional[str] = status
|
206
|
-
"""The status of the tag, this can be passed to an attribute status during extraction"""
|
207
|
-
self.owner_uri: Optional[str] = owner_uri
|
208
|
-
"""The URI of the owner (ie. model://kodexa/narrative:1.0.0 or user://pdodds)"""
|
209
|
-
self.is_dirty: Optional[bool] = is_dirty
|
210
|
-
"""Whether or not the """
|
211
|
-
# Pull the cell index from the data to the tag if we have it in the data
|
212
|
-
if self.cell_index is None:
|
213
|
-
if data and "cell_index" in data:
|
214
|
-
self.cell_index = data["cell_index"]
|
215
|
-
|
216
|
-
|
217
|
-
class FindDirection(Enum):
|
218
|
-
"""
|
219
|
-
Enum class for defining the direction of search in a tree structure.
|
220
|
-
|
221
|
-
Attributes:
|
222
|
-
CHILDREN (int): Represents the direction towards children nodes.
|
223
|
-
PARENT (int): Represents the direction towards parent node.
|
224
|
-
"""
|
225
|
-
|
226
|
-
""" """
|
227
|
-
CHILDREN = 1
|
228
|
-
PARENT = 2
|
229
|
-
|
230
|
-
|
231
|
-
class Traverse(Enum):
|
232
|
-
"""
|
233
|
-
An enumeration class that represents different types of traversals.
|
234
|
-
|
235
|
-
Attributes:
|
236
|
-
SIBLING (int): Represents traversal to a sibling.
|
237
|
-
CHILDREN (int): Represents traversal to children.
|
238
|
-
PARENT (int): Represents traversal to a parent.
|
239
|
-
ALL (int): Represents traversal to all types of nodes.
|
240
|
-
"""
|
241
|
-
|
242
|
-
""" """
|
243
|
-
SIBLING = 1
|
244
|
-
CHILDREN = 2
|
245
|
-
PARENT = 3
|
246
|
-
ALL = 4
|
247
|
-
|
248
|
-
|
249
|
-
class ContentNode(object):
|
250
|
-
"""A Content Node identifies a section of the document containing logical
|
251
|
-
grouping of information.
|
252
|
-
|
253
|
-
The node will have content and can include any number of features.
|
254
|
-
|
255
|
-
You should always create a node using the Document's create_node method to
|
256
|
-
ensure that the correct mixins are applied.
|
257
|
-
|
258
|
-
>>> new_page = document.create_node(node_type='page')
|
259
|
-
<kodexa.model.model.ContentNode object at 0x7f80605e53c8>
|
260
|
-
>>> current_content_node.add_child(new_page)
|
261
|
-
|
262
|
-
>>> new_page = document.create_node(node_type='page', content='This is page 1')
|
263
|
-
<kodexa.model.model.ContentNode object at 0x7f80605e53c8>
|
264
|
-
>>> current_content_node.add_child(new_page)
|
265
|
-
|
266
|
-
"""
|
267
|
-
|
268
|
-
def __init__(
|
269
|
-
self,
|
270
|
-
document,
|
271
|
-
node_type: str,
|
272
|
-
content: Optional[str] = None,
|
273
|
-
content_parts: Optional[List[Any]] = None,
|
274
|
-
parent=None,
|
275
|
-
index: Optional[int] = None,
|
276
|
-
virtual: bool = False,
|
277
|
-
):
|
278
|
-
self.node_type: str = node_type
|
279
|
-
"""The node type (ie. line, page, cell etc)"""
|
280
|
-
self.document: Document = document
|
281
|
-
"""The document that the node belongs to"""
|
282
|
-
self._content_parts: Optional[List[Any]] = content_parts
|
283
|
-
"""The children of the content node"""
|
284
|
-
self.index: Optional[int] = index
|
285
|
-
"""The index of the content node"""
|
286
|
-
self.uuid: Optional[int] = None
|
287
|
-
"""The ID of the content node"""
|
288
|
-
self.virtual: bool = virtual
|
289
|
-
"""Is the node virtual (ie. it doesn't actually exist in the document)"""
|
290
|
-
|
291
|
-
self._parent_uuid = parent.uuid if parent else None
|
292
|
-
|
293
|
-
if content is not None and len(self.get_content_parts()) == 0:
|
294
|
-
self.set_content_parts([content])
|
295
|
-
|
296
|
-
def get_content_parts(self):
|
297
|
-
return self.document.get_persistence().get_content_parts(self)
|
298
|
-
|
299
|
-
def set_content_parts(self, content_parts):
|
300
|
-
self.document.get_persistence().update_content_parts(self, content_parts)
|
301
|
-
|
302
|
-
def update(self):
|
303
|
-
"""
|
304
|
-
Update this node in the document persistence
|
305
|
-
|
306
|
-
:return:
|
307
|
-
"""
|
308
|
-
self.document.get_persistence().update_node(self)
|
309
|
-
|
310
|
-
@property
|
311
|
-
def content(self):
|
312
|
-
if len(self.get_content_parts()) == 0:
|
313
|
-
return None
|
314
|
-
|
315
|
-
s = ""
|
316
|
-
for part in self.get_content_parts():
|
317
|
-
if isinstance(part, str):
|
318
|
-
if s != "":
|
319
|
-
s += " "
|
320
|
-
s += part
|
321
|
-
|
322
|
-
return s
|
323
|
-
|
324
|
-
@content.setter
|
325
|
-
def content(self, new_content):
|
326
|
-
if len(self.get_content_parts()) == 0:
|
327
|
-
self.set_content_parts([new_content])
|
328
|
-
else:
|
329
|
-
# We need to remove all the strings and add this one
|
330
|
-
# back at the front
|
331
|
-
parts = self.get_content_parts()
|
332
|
-
filtered_parts = list(filter(lambda part: isinstance(part, int), parts))
|
333
|
-
if new_content is not None and new_content != "":
|
334
|
-
filtered_parts.insert(0, new_content)
|
335
|
-
self.set_content_parts(filtered_parts)
|
336
|
-
|
337
|
-
def __eq__(self, other):
|
338
|
-
return (
|
339
|
-
other is not None
|
340
|
-
and self.uuid == other.uuid
|
341
|
-
and (self.uuid is not None and other.uuid is not None)
|
342
|
-
)
|
343
|
-
|
344
|
-
def __hash__(self):
|
345
|
-
return hash(self.uuid)
|
346
|
-
|
347
|
-
def get_parent(self):
|
348
|
-
return self.document.get_persistence().get_parent(self)
|
349
|
-
|
350
|
-
def __str__(self):
|
351
|
-
return (
|
352
|
-
f"ContentNode {self.uuid} [node_type:{self.node_type}] ({len(self.get_features())} features, {len(self.get_children())} children) ["
|
353
|
-
+ str(self.content)
|
354
|
-
+ "]"
|
355
|
-
)
|
356
|
-
|
357
|
-
def to_json(self):
|
358
|
-
"""Create a JSON string representation of this ContentNode.
|
359
|
-
|
360
|
-
Args:
|
361
|
-
|
362
|
-
Returns:
|
363
|
-
str: The JSON formatted string representation of this ContentNode.
|
364
|
-
|
365
|
-
>>> node.to_json()
|
366
|
-
"""
|
367
|
-
return json.dumps(self.to_dict())
|
368
|
-
|
369
|
-
def to_dict(self):
|
370
|
-
"""Create a dictionary representing this ContentNode's structure and content.
|
371
|
-
|
372
|
-
Args:
|
373
|
-
|
374
|
-
Returns:
|
375
|
-
dict: The properties of this ContentNode and all of its children structured as a dictionary.
|
376
|
-
|
377
|
-
>>> node.to_dict()
|
378
|
-
"""
|
379
|
-
new_dict = {
|
380
|
-
"node_type": self.node_type,
|
381
|
-
"content": self.content,
|
382
|
-
"content_parts": self.get_content_parts(),
|
383
|
-
"features": [],
|
384
|
-
"index": self.index,
|
385
|
-
"children": [],
|
386
|
-
"uuid": self.uuid,
|
387
|
-
}
|
388
|
-
for feature in self.get_features():
|
389
|
-
new_dict["features"].append(feature.to_dict())
|
390
|
-
|
391
|
-
for child in self.get_children():
|
392
|
-
new_dict["children"].append(child.to_dict())
|
393
|
-
return new_dict
|
394
|
-
|
395
|
-
@staticmethod
|
396
|
-
def from_dict(document, content_node_dict: dict, parent=None):
|
397
|
-
"""Build a new ContentNode from a dictionary represention.
|
398
|
-
|
399
|
-
Args:
|
400
|
-
document (Document): The Kodexa document from which the new ContentNode will be created (not added).
|
401
|
-
content_node_dict (Dict): The dictionary-structured representation of a ContentNode. This value will be unpacked into a ContentNode.
|
402
|
-
parent (Optional[ContentNode]): Optionally the parent content node
|
403
|
-
Returns:
|
404
|
-
ContentNode: A ContentNode containing the unpacked values from the content_node_dict parameter.
|
405
|
-
|
406
|
-
>>> ContentNode.from_dict(document, content_node_dict)
|
407
|
-
"""
|
408
|
-
|
409
|
-
node_type = (
|
410
|
-
content_node_dict["type"]
|
411
|
-
if document.version == Document.PREVIOUS_VERSION
|
412
|
-
else content_node_dict["node_type"]
|
413
|
-
)
|
414
|
-
|
415
|
-
new_content_node = document.create_node(
|
416
|
-
node_type=node_type,
|
417
|
-
content=content_node_dict["content"]
|
418
|
-
if "content" in content_node_dict
|
419
|
-
else None,
|
420
|
-
index=content_node_dict["index"],
|
421
|
-
parent=parent,
|
422
|
-
)
|
423
|
-
|
424
|
-
if (
|
425
|
-
"content_parts" in content_node_dict
|
426
|
-
and len(content_node_dict["content_parts"]) > 0
|
427
|
-
):
|
428
|
-
new_content_node.set_content_parts(content_node_dict["content_parts"])
|
429
|
-
|
430
|
-
for dict_feature in content_node_dict["features"]:
|
431
|
-
feature_type = dict_feature["name"].split(":")[0]
|
432
|
-
if feature_type == "tag":
|
433
|
-
new_content_node.add_feature(
|
434
|
-
feature_type,
|
435
|
-
dict_feature["name"].split(":")[1],
|
436
|
-
dict_feature["value"],
|
437
|
-
dict_feature["single"],
|
438
|
-
True,
|
439
|
-
)
|
440
|
-
else:
|
441
|
-
new_content_node.add_feature(
|
442
|
-
feature_type,
|
443
|
-
dict_feature["name"].split(":")[1],
|
444
|
-
dict_feature["value"],
|
445
|
-
dict_feature["single"],
|
446
|
-
True,
|
447
|
-
)
|
448
|
-
|
449
|
-
for dict_child in content_node_dict["children"]:
|
450
|
-
ContentNode.from_dict(document, dict_child, new_content_node)
|
451
|
-
|
452
|
-
return new_content_node
|
453
|
-
|
454
|
-
def add_child_content(
|
455
|
-
self, node_type: str, content: str, index: Optional[int] = None
|
456
|
-
) -> "ContentNode":
|
457
|
-
"""Convenience method to allow you to quick add a child node with a type and content
|
458
|
-
|
459
|
-
Args:
|
460
|
-
node_type: the node type
|
461
|
-
content: the content
|
462
|
-
index: the index (optional) (Default value = None)
|
463
|
-
|
464
|
-
Returns:
|
465
|
-
the new ContentNode
|
466
|
-
|
467
|
-
"""
|
468
|
-
new_node = self.document.create_node(
|
469
|
-
node_type=node_type, parent=self, content=content
|
470
|
-
)
|
471
|
-
self.add_child(new_node, index)
|
472
|
-
return new_node
|
473
|
-
|
474
|
-
def add_child(self, child, index: Optional[int] = None):
|
475
|
-
"""Add a ContentNode as a child of this ContentNode
|
476
|
-
|
477
|
-
Args:
|
478
|
-
child (ContentNode): The node that will be added as a child of this node
|
479
|
-
index (Optional[int]): The index at which this child node should be added; defaults to None. If None, index is set as the count of child node elements.
|
480
|
-
|
481
|
-
Returns:
|
482
|
-
|
483
|
-
>>> new_page = document.create_node(node_type='page')
|
484
|
-
<kodexa.model.model.ContentNode object at 0x7f80605e53c8>
|
485
|
-
>>> current_content_node.add_child(new_page)
|
486
|
-
"""
|
487
|
-
if index is None:
|
488
|
-
if len(self.get_children()) > 0:
|
489
|
-
child.index = self.get_children()[-1].index + 1
|
490
|
-
else:
|
491
|
-
child.index = 0
|
492
|
-
else:
|
493
|
-
child.index = index
|
494
|
-
|
495
|
-
self.document.get_persistence().add_content_node(child, self)
|
496
|
-
|
497
|
-
def remove_child(self, content_node):
|
498
|
-
self.document.get_persistence().remove_content_node(content_node)
|
499
|
-
|
500
|
-
def get_children(self):
|
501
|
-
"""Returns a list of the children of this node.
|
502
|
-
|
503
|
-
Returns:
|
504
|
-
list[ContentNode]: The list of child nodes for this ContentNode.
|
505
|
-
|
506
|
-
>>> node.get_children()
|
507
|
-
"""
|
508
|
-
return self.document.get_persistence().get_children(self)
|
509
|
-
|
510
|
-
def set_feature(self, feature_type, name, value):
|
511
|
-
"""Sets a feature for this ContentNode, replacing the value if a feature by this type and name already exists.
|
512
|
-
|
513
|
-
Args:
|
514
|
-
feature_type (str): The type of feature to be added to the node.
|
515
|
-
name (str): The name of the feature.
|
516
|
-
value (Any): The value of the feature.
|
517
|
-
|
518
|
-
Returns:
|
519
|
-
ContentFeature: The feature that was added to this ContentNode
|
520
|
-
|
521
|
-
>>> new_page = document.create_node(node_type='page')
|
522
|
-
<kodexa.model.model.ContentNode object at 0x7f80605e53c8>
|
523
|
-
>>> new_page.add_feature('pagination','pageNum',1)
|
524
|
-
"""
|
525
|
-
self.remove_feature(feature_type, name)
|
526
|
-
return self.add_feature(feature_type, name, value)
|
527
|
-
|
528
|
-
def update_feature(self, feature: "ContentFeature"):
|
529
|
-
"""
|
530
|
-
Update a feature on this node in document persistence
|
531
|
-
|
532
|
-
:param feature:
|
533
|
-
:return:
|
534
|
-
"""
|
535
|
-
self.document.get_persistence().remove_feature(
|
536
|
-
self, feature.feature_type, feature.name
|
537
|
-
)
|
538
|
-
self.document.get_persistence().add_feature(self, feature)
|
539
|
-
|
540
|
-
def add_feature(self, feature_type, name, value, single=True, serialized=False):
|
541
|
-
"""
|
542
|
-
Add a new feature to this ContentNode.
|
543
|
-
|
544
|
-
Note: if a feature for this feature_type/name already exists, the new value will be added to the existing feature;
|
545
|
-
therefore the feature value might become a list.
|
546
|
-
|
547
|
-
Args:
|
548
|
-
feature_type (str): The type of feature to be added to the node.
|
549
|
-
name (str): The name of the feature.
|
550
|
-
value (Any): The value of the feature.
|
551
|
-
single (boolean): Indicates that the value is singular, rather than a collection (ex: str vs list); defaults to True.
|
552
|
-
serialized (boolean): Indicates that the value is/is not already serialized; defaults to False.
|
553
|
-
|
554
|
-
Returns:
|
555
|
-
ContentFeature: The feature that was added to this ContentNode.
|
556
|
-
|
557
|
-
>>> new_page = document.create_node(node_type='page')
|
558
|
-
<kodexa.model.model.ContentNode object at 0x7f80605e53c8>
|
559
|
-
>>> new_page.add_feature('pagination','pageNum',1)
|
560
|
-
"""
|
561
|
-
if self.has_feature(feature_type, name):
|
562
|
-
existing_feature = self.get_feature(feature_type, name)
|
563
|
-
if isinstance(existing_feature.value, list):
|
564
|
-
existing_feature.value.append(value)
|
565
|
-
else:
|
566
|
-
existing_feature.value = [existing_feature.value, value]
|
567
|
-
self.update_feature(existing_feature)
|
568
|
-
return existing_feature
|
569
|
-
|
570
|
-
# Make sure that we treat the value as list all the time
|
571
|
-
new_feature = ContentFeature(
|
572
|
-
feature_type,
|
573
|
-
name,
|
574
|
-
[value] if single and not serialized else value,
|
575
|
-
single=single,
|
576
|
-
)
|
577
|
-
self.document.get_persistence().add_feature(self, new_feature)
|
578
|
-
return new_feature
|
579
|
-
|
580
|
-
def delete_children(
|
581
|
-
self, nodes: Optional[List] = None, exclude_nodes: Optional[List] = None
|
582
|
-
):
|
583
|
-
"""Delete the children of this node, you can either supply a list of the nodes to delete
|
584
|
-
or the nodes to exclude from the delete, if neither are supplied then we delete all the children.
|
585
|
-
|
586
|
-
Note there is precedence in place, if you have provided a list of nodes to delete then the nodes
|
587
|
-
to exclude is ignored.
|
588
|
-
|
589
|
-
Args:
|
590
|
-
nodes: Optional[List[ContentNode]] a list of content nodes that are children to delete
|
591
|
-
exclude_nodes: Optional[List[ContentNode]] a list of content node that are children not to delete
|
592
|
-
nodes: Optional[List]: (Default value = None)
|
593
|
-
exclude_nodes: Optional[List]: (Default value = None)
|
594
|
-
"""
|
595
|
-
children_to_delete = []
|
596
|
-
|
597
|
-
for child_node in self.get_children():
|
598
|
-
if nodes is not None:
|
599
|
-
for node_to_delete in nodes:
|
600
|
-
if node_to_delete.uuid == child_node.uuid:
|
601
|
-
children_to_delete.append(child_node)
|
602
|
-
elif exclude_nodes is not None:
|
603
|
-
if len(exclude_nodes) == 0:
|
604
|
-
children_to_delete.append(child_node)
|
605
|
-
else:
|
606
|
-
for nodes_to_exclude in exclude_nodes:
|
607
|
-
if nodes_to_exclude.uuid != child_node.uuid:
|
608
|
-
children_to_delete.append(child_node)
|
609
|
-
else:
|
610
|
-
children_to_delete.append(child_node)
|
611
|
-
|
612
|
-
for child_to_delete in children_to_delete:
|
613
|
-
if child_to_delete in self.get_children():
|
614
|
-
self.document.get_persistence().remove_content_node(child_to_delete)
|
615
|
-
|
616
|
-
def get_feature(self, feature_type, name):
|
617
|
-
"""Gets the value for the given feature.
|
618
|
-
|
619
|
-
Args:
|
620
|
-
feature_type (str): The type of the feature.
|
621
|
-
name (str): The name of the feature.
|
622
|
-
|
623
|
-
Returns:
|
624
|
-
ContentFeature or None: The feature with the specified type & name. If no feature is found, None is returned.
|
625
|
-
Note that if there are more than one instance of the feature you will only get the first one
|
626
|
-
|
627
|
-
>>> new_page.get_feature('pagination','pageNum')
|
628
|
-
1
|
629
|
-
"""
|
630
|
-
hits = [
|
631
|
-
i
|
632
|
-
for i in self.get_features()
|
633
|
-
if i.feature_type == feature_type and i.name == name
|
634
|
-
]
|
635
|
-
if len(hits) > 0:
|
636
|
-
|
637
|
-
# We have a situation where the feature isn't a dict since it
|
638
|
-
# was added as a "Tag", lets turn it back into a dict to be
|
639
|
-
# consistent
|
640
|
-
if isinstance(hits[0].value, Tag):
|
641
|
-
hits[0].value = hits[0].value.to_dict()
|
642
|
-
|
643
|
-
return hits[0]
|
644
|
-
|
645
|
-
return None
|
646
|
-
|
647
|
-
def get_features_of_type(self, feature_type):
|
648
|
-
"""Get all features of a specific type.
|
649
|
-
|
650
|
-
Args:
|
651
|
-
feature_type (str): The type of the feature.
|
652
|
-
|
653
|
-
Returns:
|
654
|
-
list[ContentFeature]: A list of feature with the specified type. If no features are found, an empty list is returned.
|
655
|
-
|
656
|
-
>>> new_page.get_features_of_type('my_type')
|
657
|
-
[]
|
658
|
-
"""
|
659
|
-
return [i for i in self.get_features() if i.feature_type == feature_type]
|
660
|
-
|
661
|
-
def has_feature(self, feature_type: str, name: str):
|
662
|
-
"""Determines if a feature with the given feature and name exists on this content node.
|
663
|
-
|
664
|
-
Args:
|
665
|
-
feature_type (str): The type of the feature.
|
666
|
-
name (str): The name of the feature.
|
667
|
-
|
668
|
-
Returns:
|
669
|
-
bool: True if the feature is present; else, False.
|
670
|
-
|
671
|
-
>>> new_page.has_feature('pagination','pageNum')
|
672
|
-
True
|
673
|
-
"""
|
674
|
-
return (
|
675
|
-
len(
|
676
|
-
[
|
677
|
-
i
|
678
|
-
for i in self.get_features()
|
679
|
-
if i.feature_type == feature_type and i.name == name
|
680
|
-
]
|
681
|
-
)
|
682
|
-
> 0
|
683
|
-
)
|
684
|
-
|
685
|
-
def get_features(self):
|
686
|
-
"""Get all features on this ContentNode.
|
687
|
-
|
688
|
-
Returns:
|
689
|
-
list[ContentFeature]: A list of the features on this ContentNode.
|
690
|
-
|
691
|
-
"""
|
692
|
-
return self.document.get_persistence().get_features(self)
|
693
|
-
|
694
|
-
def remove_feature(
|
695
|
-
self, feature_type: str, name: str, include_children: bool = False
|
696
|
-
):
|
697
|
-
"""Removes the feature with the given name and type from this node.
|
698
|
-
|
699
|
-
Args:
|
700
|
-
feature_type (str): The type of the feature.
|
701
|
-
name (str): The name of the feature.
|
702
|
-
include_children (bool): also remove the feature from nodes children
|
703
|
-
|
704
|
-
>>> new_page.remove_feature('pagination','pageNum')
|
705
|
-
"""
|
706
|
-
self.document.get_persistence().remove_feature(self, feature_type, name)
|
707
|
-
|
708
|
-
if include_children:
|
709
|
-
for child in self.get_children():
|
710
|
-
child.remove_feature(feature_type, name, include_children)
|
711
|
-
|
712
|
-
def get_feature_value(self, feature_type: str, name: str) -> Optional[Any]:
|
713
|
-
"""Get the value for a feature with the given name and type on this ContentNode.
|
714
|
-
|
715
|
-
Args:
|
716
|
-
feature_type (str): The type of the feature.
|
717
|
-
name (str): The name of the feature.
|
718
|
-
|
719
|
-
Returns:
|
720
|
-
Any or None: The value of the feature if it exists on this ContentNode otherwise, None, note this
|
721
|
-
only returns the first value (check single to determine if there are multiple)
|
722
|
-
|
723
|
-
>>> new_page.get_feature_value('pagination','pageNum')
|
724
|
-
1
|
725
|
-
"""
|
726
|
-
feature = self.get_feature(feature_type, name)
|
727
|
-
|
728
|
-
# Need to make sure we handle the idea of a single value for a feature
|
729
|
-
return None if feature is None else feature.value[0]
|
730
|
-
|
731
|
-
def get_feature_values(self, feature_type: str, name: str) -> Optional[List[Any]]:
    """Return all values of the named feature on this ContentNode.

    Args:
        feature_type (str): The type of the feature.
        name (str): The name of the feature.

    Returns:
        Optional[List[Any]]: The list of feature values, or None when the
        feature does not exist.

    >>> new_page.get_feature_values('pagination','pageNum')
    [1]
    """
    feature = self.get_feature(feature_type, name)
    if feature is None:
        return None
    return feature.value
|
748
|
-
|
749
|
-
def get_content(self):
    """Return the textual content stored directly on this node.

    Returns:
        str: The content of this ContentNode.

    >>> new_page.get_content()
    "This is page one"
    """
    return self.content
|
761
|
-
|
762
|
-
def get_node_type(self):
    """Return the type of this node.

    Returns:
        str: The node type of this ContentNode.

    >>> new_page.get_node_type()
    "page"
    """
    return self.node_type
|
774
|
-
|
775
|
-
def select_first(self, selector, variables=None) -> Optional["ContentNode"]:
    """Return the first descendant of this node matching the selector.

    Args:
        selector (str): The selector (ie. //*)
        variables (dict, optional): Variable name/value substitutions;
            keys should match variables used in the selector.

    Returns:
        Optional[ContentNode]: The first matching node, or None when
        nothing matches.

    >>> document.get_root().select_first('.')
    ContentNode

    >>> document.get_root().select_first('//*[hasTag($tagName)]', {"tagName": "div"})
    ContentNode
    """
    matches = self.select(selector, variables)
    return next(iter(matches), None)
|
793
|
-
|
794
|
-
def select(self, selector, variables=None, first_only=False):
    """Return the child nodes of this node that match the selector.

    Args:
        selector (str): The selector (ie. //*)
        variables (dict, optional): Variable name/value substitutions;
            keys should match variables used in the selector.
        first_only (bool, optional): When True, only the first matching
            node is returned; defaults to False.

    Returns:
        list[ContentNode]: The matching content nodes (empty when there
        are no matches).

    >>> document.get_root().select('.')
    [ContentNode]

    >>> document.get_root().select('//*[hasTag($tagName)]', {"tagName": "div"})
    [ContentNode]
    """
    from kodexa.selectors import parse
    from kodexa.selectors.ast import SelectorContext

    if variables is None:
        variables = {}

    context = SelectorContext(self.document, first_only=first_only)
    # Ensure the persistence layer is up to date before evaluating.
    self.document.get_persistence().flush_cache()
    return parse(selector).resolve(self, variables, context)
|
821
|
-
|
822
|
-
def get_all_content(self, separator=" ", strip=True):
    """Get this node's content, concatenated with all of its children's content.

    Args:
        separator(str, optional): The separator to use in joining content together; defaults to " ".
        strip(boolean, optional): Strip the result

    Returns:
        str: The complete content for this node concatenated with the content of all child nodes.

    >>> document.content_node.get_all_content()

    "This string is made up of multiple nodes"
    """
    s = ""
    children = self.get_content_parts()
    # Content parts are an ordered mix of literal strings and integer child
    # indexes: strings are appended directly, integers are expanded to the
    # matching child's full content.
    for part in children:
        if isinstance(part, str):
            if s != "":
                s += separator
            s += part
        if isinstance(part, int):
            if s != "":
                s += separator
            # Expand the child whose .index equals this part value.
            s += [
                child.get_all_content(separator, strip=strip)
                for child in self.get_children()
                if child.index == part
            ][0]

    # We need to determine if we have missing children and add them to the end
    for child in self.get_children():
        if child.index not in self.get_content_parts():
            if s != "":
                s += separator
            s += child.get_all_content(separator, strip=strip)

    return s.strip() if strip else s
|
860
|
-
|
861
|
-
def adopt_children(self, nodes_to_adopt, replace=False):
    """This will take a list of content nodes and adopt them under this node, ensuring they are re-parented.

    Args:
        nodes_to_adopt (List[ContentNode]): A list of ContentNodes that will be added to the end of this node's children collection
        replace (bool): If True, will remove all current children and replace them with the new list; defaults to True

    >>> # select all nodes of type 'line', then the root node 'adopts' them
    >>> # and replaces all it's existing children with these 'line' nodes.
    >>> document.get_root().adopt_children(document.select('//line'), replace=True)
    """
    child_idx_base = 0

    # We need to copy this since we might well mutate
    # it as we adopt
    children = nodes_to_adopt.copy()
    # First pass: re-index current children.  A child that is also being
    # adopted takes its index from its position in the adoption list and is
    # re-parented to this node; others keep a sequential index.
    for existing_child in self.get_children():
        if existing_child not in children:
            existing_child.index = child_idx_base
            self.document.get_persistence().update_node(existing_child)
        else:
            existing_child.index = children.index(existing_child)
            existing_child._parent_uuid = self.uuid
            self.document.get_persistence().update_node(existing_child)
        child_idx_base += 1

    # Copy to avoid mutation
    # Second pass: attach adoptees that are not yet children of this node.
    for new_child in children.copy():
        if new_child not in self.get_children():
            self.add_child(new_child, children.index(new_child))
            child_idx_base += 1

    if replace:
        # Copy to avoid mutation
        # Drop any pre-existing children that were not in the adoption list.
        for child in self.get_children().copy():
            if child not in children:
                self.remove_child(child)
|
898
|
-
|
899
|
-
def remove_tag(self, tag_name):
    """Remove the named tag from this content node.

    Args:
        tag_name (str): The name of the tag to remove.

    >>> document.get_root().remove_tag('foo')
    """
    self.remove_feature("tag", tag_name)
|
911
|
-
|
912
|
-
def set_statistics(self, statistics):
    """Store spatial statistics on this node as a 'spatial'/'statistics' feature.

    Args:
        statistics: the statistics object

    >>> document.select('//page')[0].set_statistics(NodeStatistics())
    """
    self.add_feature("spatial", "statistics", statistics)
|
923
|
-
|
924
|
-
def get_statistics(self):
    """Return the spatial statistics stored on this node, or None if unset.

    >>> document.select('//page')[0].get_statistics()
    <kodexa.spatial.NodeStatistics object at 0x7f80605e53c8>
    """
    return self.get_feature_value("spatial", "statistics")
|
938
|
-
|
939
|
-
def set_bbox(self, bbox):
    """Set the node's bounding box, structured as [x1, y1, x2, y2].

    Args:
        bbox: the bounding box array

    >>> document.select('//page')[0].set_bbox([10,20,50,100])
    """
    self.set_feature("spatial", "bbox", bbox)
|
951
|
-
|
952
|
-
def get_bbox(self):
    """Return the node's bounding box ([x1, y1, x2, y2]) or None if unset.

    >>> document.select('//page')[0].get_bbox()
    [10,20,50,100]
    """
    return self.get_feature_value("spatial", "bbox")
|
964
|
-
|
965
|
-
def set_bbox_from_children(self):
    """Set this node's bounding box to the union of its children's boxes.

    Children without a bbox are skipped; if no child has a bbox the node's
    bbox is left unchanged.

    Note: comparisons use ``is None`` so a legitimate coordinate of 0 is
    not mistaken for "unset" (the previous truthiness-based checks
    mishandled boxes touching x=0 or y=0).
    """
    x_min = None
    x_max = None
    y_min = None
    y_max = None

    for child in self.get_children():
        child_bbox = child.get_bbox()
        if not child_bbox:
            continue
        if x_min is None or x_min > child_bbox[0]:
            x_min = child_bbox[0]
        if x_max is None or x_max < child_bbox[2]:
            x_max = child_bbox[2]
        if y_min is None or y_min > child_bbox[1]:
            y_min = child_bbox[1]
        if y_max is None or y_max < child_bbox[3]:
            y_max = child_bbox[3]

    if x_min is not None:
        self.set_bbox([x_min, y_min, x_max, y_max])
|
987
|
-
|
988
|
-
def set_rotate(self, rotate):
    """Set the rotation of the node as a 'spatial'/'rotate' feature.

    Args:
        rotate: the rotation of the node

    >>> document.select('//page')[0].set_rotate(90)
    """
    self.add_feature("spatial", "rotate", rotate)
|
999
|
-
|
1000
|
-
def get_rotate(self):
    """Return the rotation of the node, or None if unset.

    >>> document.select('//page')[0].get_rotate()
    90
    """
    return self.get_feature_value("spatial", "rotate")
|
1014
|
-
|
1015
|
-
def get_x(self):
    """Return the left edge (x1) of the node's bbox, or None when no bbox is set.

    >>> document.select('//page')[0].get_x()
    10
    """
    bbox = self.get_bbox()
    return bbox[0] if bbox else None
|
1033
|
-
|
1034
|
-
def get_y(self):
    """Return the top edge (y1) of the node's bbox, or None when no bbox is set.

    >>> document.select('//page')[0].get_y()
    90
    """
    bbox = self.get_bbox()
    return bbox[1] if bbox else None
|
1052
|
-
|
1053
|
-
def get_width(self):
    """Return the width (x2 - x1) of the node's bbox, or None when no bbox is set.

    >>> document.select('//page')[0].get_width()
    70
    """
    bbox = self.get_bbox()
    return bbox[2] - bbox[0] if bbox else None
|
1071
|
-
|
1072
|
-
def get_height(self):
    """Return the height (y2 - y1) of the node's bbox, or None when no bbox is set.

    >>> document.select('//page')[0].get_height()
    40
    """
    bbox = self.get_bbox()
    return bbox[3] - bbox[1] if bbox else None
|
1090
|
-
|
1091
|
-
def copy_tag(self, selector=".", existing_tag_name=None, new_tag_name=None):
    """Copy an existing tag's values onto a new tag name on the selected nodes.

    Nothing happens unless both tag names are provided and differ from one
    another.  Nodes that do not carry the existing tag are skipped.

    Args:
        selector: Selector identifying the nodes to work on (default "." -
            the current node).
        existing_tag_name (str): Name of the tag whose values are copied
            (Default value = None).
        new_tag_name (str): Name of the new tag; must differ from
            existing_tag_name (Default value = None).

    >>> document.get_root().copy_tag(existing_tag_name='foo', new_tag_name='bar')
    """
    names_invalid = (
        existing_tag_name is None
        or new_tag_name is None
        or existing_tag_name == new_tag_name
    )
    if names_invalid:
        return  # nothing to do

    for node in self.select(selector):
        for val in node.get_feature_values("tag", existing_tag_name) or []:
            copied = Tag(
                start=val["start"],
                end=val["end"],
                value=val["value"],
                uuid=val["uuid"],
                data=val["data"],
            )
            node.add_feature("tag", new_tag_name, copied)
|
1126
|
-
|
1127
|
-
def collect_nodes_to(self, end_node):
    """Collect the sibling nodes from this node up to (excluding) end_node.

    Walks forward via next_node() and stops when end_node's uuid is reached
    or there is no next sibling.

    Args:
        end_node (ContentNode): The node to stop at (not included).

    Returns:
        list[ContentNode]: The nodes between this node and end_node.

    >>> document.content_node.get_children()[0].collect_nodes_to(end_node=document.content_node.get_children()[5])
    """
    collected = []
    cursor = self
    while cursor.uuid != end_node.uuid:
        collected.append(cursor)
        if not cursor.has_next_node():
            break
        cursor = cursor.next_node()
    return collected
|
1148
|
-
|
1149
|
-
def tag_nodes_to(self, end_node, tag_to_apply, tag_uuid: str = None):
    """Tag all the nodes from this node to the end_node with the given tag name.

    Args:
        end_node (ContentNode): The node to end with
        tag_to_apply (str): The tag name that will be applied to each node
        tag_uuid (str): The tag uuid used if you want to group them

    >>> document.content_node.get_children()[0].tag_nodes_to(document.content_node.get_children()[5], tag_to_apply='foo')
    """
    # A plain loop replaces the original list comprehension, which was used
    # purely for side effects and built a throwaway list.
    for node in self.collect_nodes_to(end_node):
        node.tag(tag_to_apply, tag_uuid=tag_uuid)
|
1163
|
-
|
1164
|
-
def tag_range(
    self,
    start_content_re,
    end_content_re,
    tag_to_apply,
    node_type_re=".*",
    use_all_content=False,
):
    """This will tag all the child nodes between the start and end content regular expressions

    Args:
        start_content_re: The regular expression to match the starting child
        end_content_re: The regular expression to match the ending child
        tag_to_apply: The tag name that will be applied to the nodes in range
        node_type_re: The node type to match (default is all)
        use_all_content: Use full content (including child nodes, default is False)

    Returns:

    >>> document.content_node.tag_range(start_content_re='.*Cheese.*', end_content_re='.*Fish.*', tag_to_apply='foo')
    """

    # Could be line, word, or content-area
    all_nodes = self.select(f"//*[typeRegex('{node_type_re}')]")

    # Indexes of nodes whose content matches the start expression.
    start_index_list = [
        n_idx
        for n_idx, node in enumerate(all_nodes)
        if re.compile(start_content_re).match(
            node.get_all_content() if use_all_content else node.content
        )
    ]
    # Indexes of nodes whose content matches the end expression.
    end_index_list = [
        n_idx
        for n_idx, node in enumerate(all_nodes)
        if re.compile(end_content_re).match(
            node.get_all_content() if use_all_content else node.content
        )
    ]

    # An empty start expression means "from the first node"; otherwise the
    # first match is used, or None when nothing matched.
    start_index = (
        0
        if start_content_re == ""
        else start_index_list[0]
        if len(start_index_list) > 0
        else None
    )
    if start_index is not None:
        # Only consider end matches at or after the start index.
        end_index_list = [i for i in end_index_list if i >= start_index]

    # An empty end expression (or no match) means "through the last node".
    end_index = (
        len(all_nodes)
        if end_content_re == ""
        else end_index_list[0]
        if len(end_index_list) > 0
        else len(all_nodes)
    )

    if start_index is not None:
        [node.tag(tag_to_apply) for node in all_nodes[start_index:end_index]]
|
1224
|
-
|
1225
|
-
def tag(
    self,
    tag_to_apply,
    selector=".",
    content_re=None,
    use_all_content=False,
    node_only=None,
    fixed_position=None,
    data=None,
    separator=" ",
    tag_uuid: str = None,
    confidence=None,
    value=None,
    use_match=True,
    index=None,
    cell_index=None,
    group_uuid=None,
    parent_group_uuid=None,
    note=None,
    status=None,
    owner_uri=None,
    is_dirty=None,
    sort_by_bbox: bool=False,
):
    """
    This will tag (see Feature Tagging) the expression groups identified by the regular expression.

    Note that if you use the flag use_all_content then node_only will default to True if not set, else it
    will default to False

    Args:
        tag_to_apply: The name of tag that will be applied to the node
        selector: The selector to identify the source nodes to work on (default . - the current node)
        content_re: The regular expression that you wish to use to tag, note that we will create a tag for each matching group (Default value = None)
        use_all_content: Apply the regular expression to the all_content (include content from child nodes) (Default value = False)
        separator: Separator to use for use_all_content (Default value = " ")
        node_only: Ignore the matching groups and tag the whole node (Default value = None)
        fixed_position: Use a fixed position, supplied as a tuple i.e. - (4,10) tag from position 4 to 10 (default None)
        data: A dictionary of data for the given tag (Default value = None)
        tag_uuid: A UUID used to tie tags in order to demonstrate they're related and form a single concept.
            For example, if tagging the two words "Wells" and "Fargo" as an ORGANIZATION, the tag on both words should have the
            same tag_uuid in order to indicate they are both needed to form the single ORGANIZATION. If a tag_uuid is provided, it is used
            on all tags created in this method. This may result in multiple nodes or multiple feature values having the same tag_uuid.
            For example, if the selector provided results in more than one node being selected, each node would be tagged with the same tag_uuid.
            The same holds true if a content_re value is provided, node_only is set to False, and multiple matches are found for the content_re
            pattern. In that case, each feature value would share the same UUID.
            If no tag_uuid is provided, a new uuid is generated for each tag instance.
        tag_uuid: str: (Default value = None)
        confidence: The confidence in the tag (0-1)
        value: The value you wish to store with the tag, this allows you to provide text that isn't part of the content but represents the data you wish tagged
        use_match: If True (default) we will use match for regex matching, if False we will use search
        index: The index for the tag
        cell_index: The cell index for the tag
        group_uuid: The group uuid for the tag
        parent_group_uuid: The parent group uuid for the tag
        note: a text note for the tag
        status: a status for the tag, this can be transistioned to an attribute status during extraction
        owner_uri: the uri of the entity that created the tag (model vs user; example: model://cdad-healthcare/cdad-excel-model:1.0.0 or user://pdodds)
        is_dirty: when the model is run, is_dirty = false for all tags. New tags and editted tags, is_dirty = true.

    >>> document.content_node.tag('is_cheese')
    """

    # use_all_content without an explicit node_only implies whole-node tagging.
    if use_all_content and node_only is None:
        node_only = True
    elif node_only is None:
        node_only = False

    def get_tag_uuid(tag_uuid):
        """
        This function returns the provided tag_uuid if it exists, otherwise it generates a new UUID.

        Args:
            tag_uuid (str): The UUID of the tag.

        Returns:
            str: The provided tag_uuid if it exists, otherwise a newly generated UUID.
        """
        if tag_uuid:
            return tag_uuid

        return str(uuid.uuid4())

    def tag_node_position(
        node_to_check, start, end, node_data, tag_uuid, offset=0, value=None, sort_by_bbox: bool=False
    ):
        """
        This function tags a node position in a given data structure. It iterates over the content parts of the node to check,
        and based on the type of the part (string or integer), it performs different operations. If the part is a string, it
        adjusts the start and end positions and adds a feature to the node. If the part is an integer, it finds the corresponding
        child node and recursively calls the function on the child node. After processing all parts, it checks for any missing
        children and processes them as well. Finally, it checks if the length of all content matches the calculated content length.

        Args:
            node_to_check (Node): The node to check and tag.
            start (int): The start position of the tag.
            end (int): The end position of the tag.
            node_data (dict): The data associated with the node.
            tag_uuid (str): The UUID of the tag.
            offset (int, optional): The offset to apply. Defaults to 0.
            value (str, optional): The value to use for the tag. If None, the part of the content at the start and end positions is used. Defaults to None.

        Raises:
            Exception: If an invalid part is encountered in the content parts of the node to check.
            Exception: If there is a mismatch between the length of all content and the calculated content length.

        Returns:
            int: The calculated content length.
        """
        content_length = 0
        original_start = start
        original_end = end
        for part_idx, part in enumerate(node_to_check.get_content_parts()):
            if isinstance(part, str):
                if len(part) > 0:
                    # It is just content
                    part_length = len(part)
                    # Account for the separator that joins consecutive parts.
                    if part_idx > 0:
                        end = end - len(separator)
                        content_length = content_length + len(separator)
                        offset = offset + len(separator)
                        start = (
                            0
                            if start - len(separator) < 0
                            else start - len(separator)
                        )

                    # The tagged span ends inside this part: emit the tag and stop.
                    if start < part_length and end < part_length:
                        node_to_check.add_feature(
                            "tag",
                            tag_to_apply,
                            Tag(
                                original_start,
                                original_end,
                                part[start:end] if value is None else value,
                                data=node_data,
                                uuid=tag_uuid,
                                confidence=confidence,
                                index=index,
                                parent_group_uuid=parent_group_uuid,
                                group_uuid=group_uuid,
                                cell_index=cell_index,
                                note=note,
                                status=status,
                                owner_uri=owner_uri,
                                is_dirty=is_dirty,
                            ),
                        )
                        return -1
                    # The tagged span starts in this part but continues past it.
                    if start < part_length <= end:
                        node_to_check.add_feature(
                            "tag",
                            tag_to_apply,
                            Tag(
                                original_start,
                                content_length + part_length,
                                value=part[start:] if value is None else value,
                                data=node_data,
                                uuid=tag_uuid,
                                confidence=confidence,
                                index=index,
                                parent_group_uuid=parent_group_uuid,
                                group_uuid=group_uuid,
                                cell_index=cell_index,
                                note=note,
                                status=status,
                                owner_uri=owner_uri,
                                is_dirty=is_dirty,
                            ),
                        )

                    end = end - part_length
                    content_length = content_length + part_length
                    offset = offset + part_length
                    start = 0 if start - part_length < 0 else start - part_length

            elif isinstance(part, int):
                # An integer part refers to the child node with that index.
                child_node = [
                    child
                    for child in node_to_check.get_children()
                    if child.index == part
                ][0]

                if part_idx > 0:
                    end = end - len(separator)
                    content_length = content_length + len(separator)
                    offset = offset + len(separator)
                    start = (
                        0 if start - len(separator) < 0 else start - len(separator)
                    )

                # Recurse into the child; result is the content length consumed.
                result = tag_node_position(
                    child_node,
                    start,
                    end,
                    node_data,
                    tag_uuid,
                    offset=offset,
                    value=value,
                    sort_by_bbox=sort_by_bbox,
                )

                if result < 0 or (end - result) <= 0:
                    return -1

                offset = offset + result
                end = end - result
                start = 0 if start - result < 0 else start - result

                content_length = content_length + result
            else:
                raise Exception("Invalid part?")

        # We need to determine if we have missing children and add them to the end
        node_children = node_to_check.get_children()
        if node_children and sort_by_bbox:
            # Sort nodes by x-coordinate if they have bboxes, otherwise use index
            try:
                node_children.sort(key=lambda x: x.get_bbox()[0] if hasattr(x, 'get_bbox') else x.index if hasattr(x, 'index') else 0)
            except (AttributeError, TypeError, IndexError):
                # If sorting fails, keep original order
                pass

        for child_idx, child_node in enumerate(node_children):
            if child_node.index not in node_to_check.get_content_parts():
                if content_length > 0:
                    end = end - len(separator)
                    content_length = content_length + len(separator)
                    offset = offset + len(separator)
                    start = (
                        0 if start - len(separator) < 0 else start - len(separator)
                    )

                result = tag_node_position(
                    child_node,
                    start,
                    end,
                    node_data,
                    tag_uuid,
                    offset=offset,
                    value=value,
                    sort_by_bbox=sort_by_bbox,
                )

                if result < 0 or (end - result) <= 0:
                    return -1

                offset = offset + result
                end = end - result
                start = 0 if start - result < 0 else start - result

                content_length = content_length + result

        # Sanity check: the bookkeeping above must account for every character.
        if len(node_to_check.get_all_content(strip=False)) != content_length:
            raise Exception(
                f"There is a problem in the structure? (2) Length mismatch ({len(node_to_check.get_all_content(strip=False))} != {content_length})"
            )

        return content_length

    if content_re:
        # When matching against joined all-content, collapse literal spaces in
        # the pattern to \s+ so separators do not break matches.
        pattern = re.compile(
            content_re.replace(" ", r"\s+")
            if use_all_content and not node_only
            else content_re
        )

    for node in self.select(selector):
        if fixed_position:
            # Caller supplied explicit (start, end) character positions.
            tag_node_position(
                node,
                fixed_position[0],
                fixed_position[1],
                data,
                get_tag_uuid(tag_uuid),
                0,
                value=value,
                sort_by_bbox=sort_by_bbox,
            )

        else:
            if not content_re:
                # No pattern: tag the whole node unconditionally.
                node.add_feature(
                    "tag",
                    tag_to_apply,
                    Tag(
                        data=data,
                        uuid=get_tag_uuid(tag_uuid),
                        confidence=confidence,
                        value=value,
                        index=index,
                        parent_group_uuid=parent_group_uuid,
                        group_uuid=group_uuid,
                        cell_index=cell_index,
                        note=note,
                        status=status,
                        owner_uri=owner_uri,
                        is_dirty=is_dirty,
                    ),
                )
            else:
                if not use_all_content:
                    if node.content:
                        content = node.content
                    else:
                        content = None
                else:
                    content = (
                        node.get_all_content(separator=separator, strip=False)
                        if not node_only
                        else node.get_all_content(separator=separator)
                    )

                if content is not None:
                    if use_match:
                        matches = pattern.finditer(content)

                        if node_only:
                            # Any match at all tags the whole node once.
                            if any(True for _ in matches):
                                node.add_feature(
                                    "tag",
                                    tag_to_apply,
                                    Tag(
                                        data=data,
                                        uuid=get_tag_uuid(tag_uuid),
                                        confidence=confidence,
                                        value=value,
                                        index=index,
                                        parent_group_uuid=parent_group_uuid,
                                        group_uuid=group_uuid,
                                        cell_index=cell_index,
                                        note=note,
                                        status=status,
                                        owner_uri=owner_uri,
                                        is_dirty=is_dirty,
                                    ),
                                )
                        else:
                            # Tag each match at its character span.
                            if matches:
                                for match in matches:
                                    start_offset = match.span()[0]
                                    end_offset = match.span()[1]
                                    tag_node_position(
                                        node,
                                        start_offset,
                                        end_offset,
                                        data,
                                        get_tag_uuid(tag_uuid),
                                        value=value,
                                        sort_by_bbox=sort_by_bbox,
                                    )

                    else:
                        # use_match=False: only the first search() hit is tagged.
                        search_match = pattern.search(content)
                        if search_match is not None:
                            start_offset = search_match.span()[0]
                            end_offset = search_match.span()[1]
                            tag_node_position(
                                node,
                                start_offset,
                                end_offset,
                                data,
                                get_tag_uuid(tag_uuid),
                                value=value,
                                sort_by_bbox=sort_by_bbox,
                            )
|
1591
|
-
|
1592
|
-
def get_tags(self):
    """Return the names of the tags present on this node.

    Returns:
        list[str]: The tag names.

    >>> document.content_node.select('*')[0].get_tags()
    ['is_cheese']
    """
    return [feature.name for feature in self.get_features_of_type("tag")]
|
1606
|
-
|
1607
|
-
def get_tag_features(self):
    """Return the features of type "tag" present on this node.

    Returns:
        list[ContentFeature]: The tag features belonging to this node.

    >>> document.content_node.select('*').get_tag_features()
    [ContentFeature()]
    """
    # list(...) replaces the redundant identity comprehension while keeping
    # the original contract of returning a fresh list each call.
    return list(self.get_features_of_type("tag"))
|
1621
|
-
|
1622
|
-
def get_tag_values(self, tag_name, include_children=False):
    """Collect the values recorded for a given tag name.

    Args:
        tag_name: tag name
        include_children: include the children of this node (Default value = False)

    Returns:
        a list of the tag values
    """
    values = [tag_instance["value"] for tag_instance in self.get_tag(tag_name)]

    if include_children:
        for child in self.get_children():
            values.extend(child.get_tag_values(tag_name, include_children))

    return values
|
1642
|
-
|
1643
|
-
def get_related_tag_values(
    self,
    tag_name: str,
    include_children: bool = False,
    value_separator: str = " ",
    tag_uuid=None,
):
    """Get the values for a specific tag name, grouped by tag UUID.

    Args:
        tag_name (str): tag name
        include_children (bool): include the children of this node
        value_separator (str): the string to be used to join related tag values
        tag_uuid (Optional[str]): only feature values carrying this tag UUID are
            collected; values with any other UUID are skipped

    Returns:
        a list of joined value strings, one per tag UUID group
    """

    def group_tag_values(group_dict, feature_val, tag_uuid, tag_node):
        """Fold one feature value into group_dict, keyed by its tag UUID.

        Values whose UUID does not match tag_uuid are ignored.  When the
        feature value carries no "value", the tag node's content is used
        as a fallback.

        Args:
            group_dict (dict): The dictionary to group the values in.
            feature_val (dict): The feature value to check.
            tag_uuid (str): The uuid of the tag.
            tag_node (Node): The node of the tag.

        Returns:
            None
        """
        # we know the names of all these tags are the same, but we want to
        # group them if they share the same uuid
        if feature_val["uuid"] != tag_uuid:
            return

        final_value = feature_val["value"] if "value" in feature_val else None
        if final_value is None:
            final_value = tag_node.content

        # BUG FIX: the membership test previously read the enclosing
        # value_groups variable instead of the group_dict parameter, which
        # only worked because the sole caller passed value_groups in.
        if feature_val["uuid"] in group_dict:
            # we've seen this UUID - add its value to the group
            group_dict[feature_val["uuid"]].append(final_value)
        else:
            # first occurrence
            group_dict[feature_val["uuid"]] = [final_value]

    if include_children:
        tagged_nodes = self.document.get_tagged_nodes(tag_name, tag_uuid=tag_uuid)
    else:
        tagged_nodes = self.select(".")

    value_groups: Dict[str, Any] = {}
    for tag_node in tagged_nodes:
        tag_feature_vals = tag_node.get_feature_value("tag", tag_name)
        if tag_feature_vals:
            if not isinstance(tag_feature_vals, list):
                tag_feature_vals = [tag_feature_vals]

            for v in tag_feature_vals:
                group_tag_values(value_groups, v, tag_uuid, tag_node)

    value_strings = []
    for group in value_groups.values():
        # Skip empty groups and groups whose first entry resolved to None
        # (mirrors the original guard).
        if group and group[0] is not None:
            value_strings.append(value_separator.join(group))

    return value_strings
|
1719
|
-
|
1720
|
-
def get_related_tag_nodes(
    self, tag_name: str, everywhere: bool = False, tag_uuid=None
):
    """Group the nodes carrying a given tag by that tag's UUID.

    Args:
        tag_name (str): tag name
        everywhere (bool): search the whole document rather than just this node
        tag_uuid (optional(str)): if set we will only get nodes related to this tag UUID

    Returns:
        a dictionary mapping tag UUID to the list of nodes carrying it
    """
    if everywhere:
        candidates = self.document.get_tagged_nodes(tag_name, tag_uuid)
    else:
        candidates = [self]

    # Bucket the nodes by the UUID recorded on each tag instance; instances
    # without a UUID are ignored.
    node_groups = {}
    for tagged_node in candidates:
        for tag_instance in tagged_node.get_tag(tag_name):
            if "uuid" in tag_instance:
                node_groups.setdefault(tag_instance["uuid"], []).append(tagged_node)

    return node_groups
|
1754
|
-
|
1755
|
-
def get_tag(self, tag_name, tag_uuid=None):
    """Return the tagged locations/values recorded on this node for a tag name.

    The result is a list of tag dictionaries; a single match yields one entry,
    multiple matches yield several, e.g.
    [[start1,end1,value1],[start2,end2,value2]].

    Args:
        tag_name: The name of the tag
        tag_uuid (Optional): Optionally you can also provide the tag UUID

    Returns:
        A list tagged location and values for this label in this node

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').get_tag('is_cheese')
    [0,10,'The Cheese Moved']
    """
    tag_details = self.get_feature_value("tag", tag_name)

    if tag_details is None:
        return []

    # Normalise a single tag dict into a one-element list.
    if not isinstance(tag_details, list):
        tag_details = [tag_details]

    final_result = []
    for tag_detail in tag_details:
        if tag_uuid and "uuid" in tag_detail:
            # Filter by UUID when one was requested and the detail carries one.
            if tag_detail["uuid"] == tag_uuid:
                final_result.append(tag_detail)
        else:
            # No UUID filter applicable: keep the detail.
            final_result.append(tag_detail)
    return final_result
|
1785
|
-
|
1786
|
-
def get_all_tags(self):
    """Return the distinct tag names applied to this node or any descendant.

    Returns:
        list[str]: A list of the tag names belonging to this node and/or its children.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').get_all_tags()
    ['is_cheese']
    """
    # Accumulate into a set so duplicates across the subtree collapse.
    collected = set(self.get_tags())
    for child in self.get_children():
        collected.update(child.get_all_tags())
    return list(collected)
|
1802
|
-
|
1803
|
-
def has_tags(self):
    """Determines if this node has any tags at all.

    Returns:
        bool: True if node has any tags; else, False

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tags()
    True
    """
    # any(...) short-circuits on the first feature instead of materialising a
    # throwaway list of every feature's value (which also assumed a .value
    # attribute it never used).
    return any(True for _ in self.get_features_of_type("tag"))
|
1815
|
-
|
1816
|
-
def has_tag(self, tag, include_children=False):
    """Determine if this node has a tag with the specified name.

    Args:
        tag(str): The name of the tag.
        include_children(bool): should we include child nodes

    Returns:
        bool: True if node has a tag by the specified name; else, False

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tag('is_cheese')
    True
    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tag('is_fish')
    False
    """
    for feature in self.get_features():
        if feature.feature_type == "tag" and feature.name == tag:
            return True
    if include_children:
        # Return as soon as any descendant carries the tag; the original kept
        # scanning every remaining child after a match was already found.
        for child in self.get_children():
            if child.has_tag(tag, True):
                return True
    return False
|
1840
|
-
|
1841
|
-
def is_first_child(self):
    """Determine whether this node is its parent's first child.

    Returns:
        bool: True if this node has no parent, or if its index is 0.
    """
    if self.parent:
        return self.index == 0
    # No parent: treat the node as a first child by convention.
    return True
|
1854
|
-
|
1855
|
-
def is_last_child(self):
    """Determine whether this node is its parent's last child.

    Returns:
        bool: True if this node has no parent, or if its index equals the
        parent's last child index.
    """
    parent = self.get_parent()
    if not parent:
        # No parent: treat the node as a last child by convention.
        return True
    return self.index == parent.get_last_child_index()
|
1867
|
-
|
1868
|
-
def get_last_child_index(self):
    """Return the largest child index on this node, or None without children.

    Returns:
        int or None: The max index of the children of this node, or None if
        there are no children.
    """
    children = self.get_children()
    if not children:
        return None

    # Seed with 0 to mirror the original accumulator's starting value.
    return max([0] + [child.index for child in children])
|
1885
|
-
|
1886
|
-
def get_node_at_index(self, index):
    """Returns the child node at the specified index. If the specified index is outside the first (0), or
    last child's index, None is returned.

    Note: documents allow for sparse representation and child nodes may not have consecutive index numbers.
    If there isn't a child node at the specfied index, a 'virtual' node will be returned. This 'virtual' node
    will have the node type of its nearest sibling and will have an index value, but will have no features or content.

    Args:
        index (int): The index (zero-based) for the child node.

    Returns:
        ContentNode or None: Node at index, or None if the index is outside the boundaries of child nodes.

    """
    if self.get_children():
        # Requested index precedes the first real child: synthesize a virtual
        # placeholder typed like that first child.
        if index < self.get_children()[0].index:
            virtual_node = self.document.create_node(
                node_type=self.get_children()[0].node_type,
                virtual=True,
                parent=self,
                index=index,
            )
            return virtual_node

        # Walk the children (assumed ordered by index — the early `break`
        # relies on that) looking for an exact match, remembering the closest
        # preceding child as we go.
        last_child = None
        for child in self.get_children():
            if child.index < index:
                last_child = child
            elif child.index == index:
                return child
            else:
                break

        if last_child:
            # Gap strictly inside the child index range: fill it with a
            # virtual node typed like the nearest preceding sibling.  Indexes
            # at or beyond the last child fall through to None.
            if last_child.index != index and index < self.get_children()[-1].index:
                virtual_node = self.document.create_node(
                    node_type=last_child.node_type,
                    virtual=True,
                    parent=self,
                    index=index,
                )
                return virtual_node
            else:
                return None
        else:
            return None
    else:
        return None
|
1933
|
-
|
1934
|
-
def has_next_node(self, node_type_re=".*", skip_virtual=False):
    """Determine whether a next sibling matching the given type regex exists.

    Args:
        node_type_re(str, optional): Regular expression matched against the
            next sibling node's type; default is '.*'.
        skip_virtual(bool, optional): Skip virtual nodes and only consider
            real nodes; default is False.

    Returns:
        bool: True if a matching next sibling exists; else, False.
    """
    next_sibling = self.next_node(node_type_re, skip_virtual=skip_virtual)
    return next_sibling is not None
|
1946
|
-
|
1947
|
-
def has_previous_node(self, node_type_re=".*", skip_virtual=False):
    """Determine whether a previous sibling matching the given type regex exists.

    Args:
        node_type_re(str, optional): Regular expression matched against the
            previous sibling node's type; default is '.*'.
        skip_virtual(bool, optional): Skip virtual nodes and only consider
            real nodes; default is False.

    Returns:
        bool: True if a matching previous sibling exists; else, False.
    """
    previous_sibling = self.previous_node(
        node_type_re=node_type_re, skip_virtual=skip_virtual
    )
    return previous_sibling is not None
|
1962
|
-
|
1963
|
-
def next_node(
    self,
    node_type_re=".*",
    skip_virtual=False,
    has_no_content=True,
    traverse=Traverse.SIBLING,
):
    """Returns the next sibling content node.

    Note: This logic relies on node indexes. Documents allow for sparse representation and child nodes may not have consecutive index numbers.
    Therefore, the next node might actually be a virtual node that is created to fill a gap in the document. You can skip virtual nodes by setting the
    skip_virtual parameter to False.

    Args:
        node_type_re(str, optional, optional): The regular expression to match against the next sibling node's type; default is '.*'.
        skip_virtual(bool, optional, optional): Skip virtual nodes and return the next real node; default is False.
        has_no_content(bool, optional, optional): Allow a node that has no content to be returned; default is True.
        traverse(Traverse(enum), optional, optional): The transition to traverse (SIBLING, PARENT, or ALL); default is Traverse.SIBLING.

    Returns:
        ContentNode or None: The next node or None, if no node exists

    """
    search_index = self.index + 1
    compiled_node_type_re = re.compile(node_type_re)

    while True:
        # Probe the parent for a (possibly virtual) node at the next index.
        node = (
            self.get_parent().get_node_at_index(search_index)
            if self.get_parent()
            else None
        )

        if not node:
            # Ran off the end of the siblings.  For PARENT/ALL traversal, try
            # hopping to the first child of the parent's next sibling.
            # NOTE(review): when self has no parent at all, the
            # self.get_parent().get_parent() call below raises AttributeError
            # rather than returning None — confirm callers guarantee a parent
            # when traverse is PARENT/ALL.
            if (
                traverse == traverse.ALL or traverse == traverse.PARENT
            ) and self.get_parent().get_parent():
                # noinspection PyBroadException
                try:
                    potential_next_node = (
                        self.get_parent()
                        .get_parent()
                        .get_children()[self.get_parent().index + 1]
                        .get_children()[0]
                    )
                    if potential_next_node:
                        return potential_next_node
                except Exception:

                    # traverse additional layer
                    # NOTE(review): if this grandparent-level hop also fails
                    # (e.g. IndexError past the last aunt/uncle), the
                    # exception propagates to the caller rather than
                    # returning None — confirm that is intended.
                    potential_next_node = (
                        self.get_parent()
                        .get_parent()
                        .get_parent()
                        .get_children()[self.get_parent().get_parent().index + 1]
                        .get_children()[0]
                        .get_children()[0]
                    )
                    if potential_next_node:
                        return potential_next_node
            return node

        # Candidate found: it must match the type regex and, when requested,
        # must not be virtual.
        if compiled_node_type_re.match(node.node_type) and (
            not skip_virtual or not node.virtual
        ):
            # With has_no_content=True (the default) any match is returned;
            # otherwise the node must actually carry content.
            if (not has_no_content and node.content) or has_no_content:
                return node

        search_index += 1
|
2031
|
-
|
2032
|
-
def previous_node(
    self,
    node_type_re=".*",
    skip_virtual=False,
    has_no_content=False,
    traverse=Traverse.SIBLING,
):
    """Returns the previous sibling content node.

    Note: This logic relies on node indexes. Documents allow for sparse representation and child nodes may not have consecutive index numbers.
    Therefore, the previous node might actually be a virtual node that is created to fill a gap in the document. You can skip virtual nodes by setting the
    skip_virtual parameter to False.

    Args:
        node_type_re(str, optional, optional): The regular expression to match against the previous node's type; default is '.*'.
        skip_virtual(bool, optional, optional): Skip virtual nodes and return the next real node; default is False.
        has_no_content(bool, optional, optional): Allow a node that has no content to be returned; default is False.
        traverse(Traverse(enum), optional, optional): The transition you'd like to traverse (SIBLING, CHILDREN, PARENT, or ALL); default is Traverse.SIBLING.

    Returns:
        ContentNode or None: The previous node or None, if no node exists

    """

    # TODO: implement/differentiate traverse logic for CHILDREN and SIBLING
    if self.index == 0:
        # NOTE(review): `A or B and C` parses as `A or (B and C)`, so
        # traverse.ALL recurses to the parent WITHOUT the get_parent() guard;
        # a parentless node with traverse=ALL would hit AttributeError on the
        # call below — confirm whether the guard was meant to apply to both.
        if (
            traverse == traverse.ALL
            or traverse == traverse.PARENT
            and self.get_parent()
        ):
            # Lets look for a previous node on the parent
            return self.get_parent().previous_node(
                node_type_re, skip_virtual, has_no_content, traverse
            )

        return None

    search_index = self.index - 1
    compiled_node_type_re = re.compile(node_type_re)

    while True:
        # Probe the parent for a (possibly virtual) node at the previous index.
        node = self.get_parent().get_node_at_index(search_index)

        if not node:
            # Exhausted the preceding siblings.
            return node

        # Candidate must match the type regex and, when requested, must not
        # be virtual.
        if compiled_node_type_re.match(node.node_type) and (
            not skip_virtual or not node.virtual
        ):
            # NOTE(review): has_no_content here means "only return nodes
            # WITHOUT content" — the opposite sense of the same flag in
            # next_node, where True allows any node.  Confirm this asymmetry
            # is intended.
            if (not has_no_content) or (has_no_content and not node.content):
                return node

        search_index -= 1
|
2086
|
-
|
2087
|
-
|
2088
|
-
class ContentFeature(object):
    """
    A feature allows you to capture almost any additional data or metadata and associate it with a ContentNode.
    """
    # The duplicate class-level string that followed the docstring was a no-op
    # statement and has been removed; the bare strings after each attribute
    # (also no-ops at runtime) are now comments.

    def __init__(self, feature_type: str, name: str, value: Any, single: bool = True):
        # The type of feature, a logical name to group feature types together (ie. spatial).
        self.feature_type: str = feature_type
        # The name of the feature (ie. bbox).
        self.name: str = name
        # The data payload carried by the feature.
        self.value: Any = value
        # True when the data is a single instance; once the same feature has
        # been added to the same node repeatedly, value holds multiple data
        # elements and this flag is False.
        self.single: bool = single

    def __str__(self):
        return f"Feature [type='{self.feature_type}' name='{self.name}' value='{self.value}' single='{self.single}']"

    def to_dict(self):
        """
        Create a dictionary representing this ContentFeature's structure and content.

        Returns:
            dict: The properties of this ContentFeature structured as a dictionary.
        """
        return {
            "name": self.feature_type + ":" + self.name,
            "value": self.value,
            "single": self.single,
        }

    def get_value(self):
        """
        Get the value from the feature. This method will handle the single flag.

        Returns:
            Any: The first element of value when single; otherwise value itself.
        """
        if self.single:
            # Assumes value is a non-empty sequence when single is set; an
            # empty value would raise IndexError here — TODO confirm callers
            # never store an empty list with single=True.
            return self.value[0]

        return self.value
|
2132
|
-
|
2133
|
-
|
2134
|
-
class ModelInsight(BaseModel):
    # Pydantic v2 configuration: accept population by field name (aliases),
    # serialise enums by value, allow arbitrary (non-pydantic) types, and
    # narrow the protected namespace so fields like model_ref do not trigger
    # the "model_" namespace warning.
    model_config = ConfigDict(populate_by_name=True, use_enum_values=True, arbitrary_types_allowed=True,
                              protected_namespaces=("model_config",))
    """
    A class used to represent the insights of a model.

    Attributes:
        model_ref (str): The reference to the model.
        insight_type (str): The type of the insight.
        description (Optional[str]): The description of the insight, default is None.
        details (Optional[str]): The details of the insight, default is None.
        properties (Optional[Dict]): The properties of the insight, default is None.
    """
    # NOTE(review): the string above is not the class docstring (it is not the
    # first statement in the class body), so it is a no-op expression and
    # __doc__ stays unset — consider moving it above model_config.

    model_ref: str
    insight_type: str
    description: Optional[str] = None
    details: Optional[str] = None
    properties: Optional[Dict] = None
|
2153
|
-
|
2154
|
-
|
2155
|
-
@dataclasses.dataclass()
class SourceMetadata:
    """Class for keeping track of an original source information for a document.

    Attributes:
        original_filename (Optional[str]): The original filename of the document.
        original_path (Optional[str]): The original path of the document.
        checksum (Optional[str]): The checksum of the document.
        cid (Optional[str]): The ID used for internal caching.
        last_modified (Optional[str]): The last modified date of the document.
        created (Optional[str]): The creation date of the document.
        connector (Optional[str]): The connector used for the document.
        mime_type (Optional[str]): The MIME type of the document.
        headers (Optional[Dict]): The headers of the document.
        lineage_document_uuid (Optional[str]): The UUID of the document that this document was derived from.
        source_document_uuid (Optional[str]): The UUID of the original first document.
        pdf_document_uuid (Optional[str]): The UUID of the document in a PDF form (used for archiving and preview).
    """
    # The second, redundant class-level docstring of the original was a no-op
    # statement and has been removed.

    original_filename: Optional[str] = None
    original_path: Optional[str] = None
    checksum: Optional[str] = None

    # The ID used for internal caching
    cid: Optional[str] = None
    last_modified: Optional[str] = None
    created: Optional[str] = None
    connector: Optional[str] = None
    mime_type: Optional[str] = None
    headers: Optional[Dict] = None

    # The UUID of the document that this document was derived from,
    # noting that multiple documents may come from an original source
    lineage_document_uuid: Optional[str] = None

    # The UUID of the original first document
    source_document_uuid: Optional[str] = None

    # The UUID of the document in a PDF form (used for archiving and preview)
    pdf_document_uuid: Optional[str] = None

    @classmethod
    def from_dict(cls, env):
        """Creates an instance of the class from a dictionary.

        Keys that do not correspond to a constructor parameter are silently
        ignored, so arbitrary metadata dictionaries can be passed in.

        Args:
            env (dict): A dictionary containing the attributes of the class.

        Returns:
            SourceMetadata: An instance of the class.
        """
        return cls(
            **{k: v for k, v in env.items() if k in inspect.signature(cls).parameters}
        )
|
2217
|
-
|
2218
|
-
|
2219
|
-
class FeatureSetDiff:
    """
    A utility class that can be used to diff two feature sets.
    """

    def __init__(self, first_feature_set: FeatureSet, second_feature_set: FeatureSet):
        # Maps of nodeUuid -> feature dict for each side of the comparison.
        self.first_feature_map = self.parse_feature_set(first_feature_set)
        self.second_feature_map = self.parse_feature_set(second_feature_set)
        self._differences = deepdiff.DeepDiff(
            self.first_feature_map,
            self.second_feature_map,
            exclude_obj_callback=self.exclude_callback,
        ).to_dict()

    def get_differences(self):
        """
        Gets the differences between the two feature sets.

        Returns:
            dict: A dictionary containing the differences between the two feature sets.
        """
        # Type changes are treated as noise for feature comparison purposes.
        if "type_changes" in self._differences:
            self._differences.pop("type_changes")

        return self._differences

    def get_exclude_paths(self):
        """
        Gets the key patterns to exclude from the diff.

        Returns:
            list: A list of key patterns to exclude.
        """
        return ["shape", "group_uuid", "uuid", "parent_group_uuid", "single"]

    def exclude_callback(self, path, key):
        """
        Checks if the key is to be excluded from the diff.

        Args:
            path (str): The path that contains the values of the key.
            key (str): The key of the data dictionary to compare.

        Returns:
            bool: True if the key is to be excluded, False otherwise.
        """
        return any(
            re.search(exclude_key, key) for exclude_key in self.get_exclude_paths()
        )

    def parse_feature_set(self, feature_set: FeatureSet):
        """
        Parses the feature set into a map keyed by node UUID.

        Args:
            feature_set (FeatureSet): The feature set to be parsed.

        Returns:
            dict: A dictionary of features with the key as the nodeUuid.
        """
        return {
            feature.get("nodeUuid"): feature for feature in feature_set.node_features
        }

    def parsed_values_changed(self):
        """
        Removes "values_changed" entries for nodes that still exist in the
        second feature map.
        """
        # BUG FIX: the original mutated the dict while iterating it and read
        # self.second_feature_map.node_features — second_feature_map is a
        # plain dict keyed by nodeUuid (see parse_feature_set), so that
        # attribute does not exist.
        # NOTE(review): interpreting "old value still present" as "the node
        # UUID still exists on the second side" — confirm against callers.
        values_changed = self._differences.get("values_changed") or {}
        for key in list(values_changed.keys()):
            if self.parsed_node_uuid(key) in self.second_feature_map:
                values_changed.pop(key)

    def is_equal(self) -> bool:
        """
        Checks if the two feature sets are equal to each other.

        Returns:
            bool: True if the feature sets are equal, False otherwise.
        """
        return self._differences == {}

    def get_changed_nodes(self):
        """
        Gets the nodes that were changed.

        Returns:
            dict: Added/removed/modified node UUIDs, or an empty list when
            the feature sets are identical.
        """
        if self.is_equal():
            return []

        # Check for new nodes added in the second_feature_map
        new_added_nodes = []

        # Checked for removed nodes in the first_feature_map
        removed_nodes = []

        # Modified nodes come from deepdiff's "values_changed" report, which
        # may be absent when only additions/removals occurred (the original
        # crashed on None in that case).
        modified_nodes = [
            self.parsed_node_uuid(key)
            for key in (self._differences.get("values_changed") or {})
        ]

        # Merge unique nodeUuid of first_feature_map and second_feature_map
        merged_node_uuids = set(self.first_feature_map.keys()).union(
            set(self.second_feature_map.keys())
        )
        for node_uuid in merged_node_uuids:
            if node_uuid not in self.first_feature_map:
                new_added_nodes.append(node_uuid)
            elif node_uuid not in self.second_feature_map:
                removed_nodes.append(node_uuid)

        return {
            "new_added_nodes": new_added_nodes,
            "removed_nodes": removed_nodes,
            "existing_modified_nodes": modified_nodes,
        }

    def get_difference_count(self):
        """
        Gets the total number of difference categories between the feature sets.

        Returns:
            int: The number of top-level difference keys reported by deepdiff.
        """
        # BUG FIX: _differences is a dict; the original called it like a
        # function (self._differences().keys()), which raised TypeError.
        return len(self._differences.keys())

    def parsed_item_added(self):
        """
        Annotates added items with details for nodes that are newly added.

        Returns:
            The difference count, or {} when nothing was added.
        """
        item_added: Dict = self._differences.get("iterable_item_added")
        # BUG FIX: the original guard was inverted ("if item_added: return {}")
        # and returned early precisely when items HAD been added.
        if not item_added:
            return {}

        # BUG FIX: the original referenced the undefined attribute
        # self._changed_nodes; compute the grouping once via the public method.
        changed_nodes = self.get_changed_nodes()
        for key, value in item_added.items():
            node = self.parsed_node_uuid(key)
            if node in changed_nodes["new_added_nodes"]:
                self._differences["iterable_item_added"][key][
                    "details"
                ] = f"Node: {node} was added"
                continue

        return self.get_difference_count()

    def parsed_node_uuid(self, key):
        """
        Parses the node uuid from a deepdiff path key such as "root['<uuid>']…".

        Args:
            key (str): The key of the data dictionary.

        Returns:
            str: The node uuid from the key.
        """
        return key.split("['")[1].split("']")[0]
|
2386
|
-
|
2387
|
-
|
2388
|
-
class ProcessingStep(BaseModel):
    """A node in a document-processing lineage graph.

    Each step carries metadata and bidirectional links to parent and child
    steps, forming a DAG that can be serialised with to_dict/to_json.
    """
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    name: str
    metadata: dict = Field(default_factory=lambda: {})
    presentation_metadata: dict = Field(default_factory=lambda: {}, alias='presentationMetadata')
    children: List['ProcessingStep'] = Field(default_factory=list)
    parents: List['ProcessingStep'] = Field(default_factory=list)

    def add_child(self, child_step: 'ProcessingStep'):
        """Link child_step under this step, maintaining both directions."""
        self.children.append(child_step)
        child_step.parents.append(self)

    @staticmethod
    def merge_with(*other_steps: 'ProcessingStep') -> 'ProcessingStep':
        """Create a step that joins several parent steps into one child."""
        merged_step = ProcessingStep(name=f"Merged({', '.join(step.name for step in other_steps)})")
        for step in other_steps:
            # Consistency fix: reuse add_child instead of duplicating the
            # bidirectional-link bookkeeping inline.
            step.add_child(merged_step)
        return merged_step

    class Config:
        arbitrary_types_allowed = True
        json_encoders = {
            'ProcessingStep': lambda step: step.to_dict()
        }

    def to_dict(self, seen=None):
        """Serialise the step (and its subtree) to a plain dictionary.

        Args:
            seen: Internal set of already-serialised step ids, used to break
                cycles in the graph.

        Returns:
            dict: The step's fields with children expanded recursively and
            parents reduced to id/name stubs.
        """
        if seen is None:
            seen = set()

        # Avoid circular references by skipping already seen objects
        if self.id in seen:
            return {'id': self.id, 'name': self.name}

        seen.add(self.id)

        return {
            'id': self.id,
            'name': self.name,
            'metadata': self.metadata,
            'presentationMetadata': self.presentation_metadata,
            'children': [child.to_dict(seen) for child in self.children],
            'parents': [{'id': parent.id, 'name': parent.name} for parent in self.parents],  # or parent.to_dict(seen) if full structure is needed
        }

    def to_json(self):
        """Serialise the step graph to a JSON string."""
        return json.dumps(self.to_dict())

    def __repr__(self):
        return f"Step(id={self.id}, name={self.name})"
|
2438
|
-
|
2439
|
-
|
2440
|
-
class Document(object):
    """A Document is a collection of metadata and a set of content nodes."""

    # Version written by older releases; used as a fallback in from_dict().
    PREVIOUS_VERSION: str = "1.0.0"
    # Version stamped onto documents serialized by this release.
    CURRENT_VERSION: str = "6.0.0"

    def __str__(self):
        """Return a kodexa:// URI built from the document's uuid."""
        return f"kodexa://{self.uuid}"
|
2448
|
-
|
2449
|
-
    def get_validations(self) -> list[DocumentTaxonValidation]:
        """Return the taxon validations stored with this document."""
        return self.get_persistence().get_validations()

    def set_validations(self, validations: list[DocumentTaxonValidation]):
        """Replace the taxon validations stored with this document."""
        self.get_persistence().set_validations(validations)

    def add_exception(self, exception: ContentException):
        """Record a content exception against this document."""
        self._persistence_layer.add_exception(exception)

    def get_exceptions(self) -> List[ContentException]:
        """Return all content exceptions recorded for this document."""
        return self._persistence_layer.get_exceptions()

    def get_external_data(self, key="default") -> dict:
        """Return the external data dictionary stored under the given key."""
        return self._persistence_layer.get_external_data(key)

    def get_external_data_keys(self) -> list[str]:
        """Return all keys under which external data has been stored."""
        return self._persistence_layer.get_external_data_keys()

    def set_external_data(self, external_data:dict, key="default"):
        """Store an external data dictionary under the given key."""
        return self._persistence_layer.set_external_data(external_data, key)

    def get_steps(self) -> list[ProcessingStep]:
        """Return the processing steps recorded for this document."""
        return self._persistence_layer.get_steps()

    def set_steps(self, steps: list[ProcessingStep]):
        """Replace the processing steps recorded for this document."""
        self._persistence_layer.set_steps(steps)

    def replace_exceptions(self, exceptions: List[ContentException]):
        """Replace all content exceptions recorded for this document."""
        self._persistence_layer.replace_exceptions(exceptions)
|
2478
|
-
|
2479
|
-
    def __init__(
        self,
        metadata=None,
        content_node: ContentNode = None,
        source=None,
        ref: str = None,
        kddb_path: str = None,
        delete_on_close=False,
        inmemory=False,
    ):
        """Create a new Document backed by a persistence layer.

        Args:
            metadata: the DocumentMetadata for the document (a new one is
                created when None)
            content_node: the root ContentNode (may be None)
            source: the SourceMetadata (a new one is created when None)
            ref: reference to the remote store the document came from
                (not persisted)
            kddb_path: path of the KDDB file backing the document; when None
                the persistence layer chooses its own storage
            delete_on_close: delete the backing file when the document is closed
            inmemory: keep the persistence layer in memory rather than on disk
        """
        if metadata is None:
            metadata = DocumentMetadata()
        if source is None:
            source = SourceMetadata()

        # Mix-ins are going away - so we will allow people to turn them off as needed
        self.disable_mixin_methods = True

        self.delete_on_close = delete_on_close

        # The ref is not stored and is used when we have
        # initialized a document from a remote store and want
        # to keep track of that
        self.ref = ref

        self.metadata: DocumentMetadata = metadata
        """Metadata relating to the document"""
        self._content_node: Optional[ContentNode] = content_node
        """The root content node"""
        self.virtual: bool = False
        """Is the document virtual (deprecated)"""
        self._mixins: List[str] = []
        """A list of the mixins for this document"""
        self.uuid: str = str(uuid.uuid4())
        """The unique identifier for this document"""
        self.version = Document.CURRENT_VERSION
        """The version of the document"""
        self.source: SourceMetadata = source
        """Source metadata for this document"""
        self.labels: List[str] = []
        """A list of the document level labels for the document"""
        self.tag_instances: List[TagInstance] = []
        """A list of tag instances that contains a set of tag that has a set of nodes"""

        # Start persistence layer
        from kodexa.model import PersistenceManager

        self._persistence_layer: Optional[PersistenceManager] = PersistenceManager(
            document=self, filename=kddb_path, delete_on_close=delete_on_close, inmemory=inmemory
        )
        self._persistence_layer.initialize()
|
2530
|
-
|
2531
|
-
    def remove_tags_by_owner(self, owner_uri: str):
        """Remove every tag whose metadata records the given owner.

        Scans all tag instances in the document and, where a tag's data
        contains an 'owner_uri' equal to owner_uri, removes that tag from
        each node of the instance.

        Args:
            owner_uri: the owner URI to match against each tag's data
        """
        for tag in self.get_all_tags():
            for tag_instance in self.get_tag_instances(tag):
                tag_meta: dict = tag_instance.get_data()
                if 'owner_uri' in tag_meta and tag_meta['owner_uri'] == owner_uri:
                    for node in tag_instance.nodes:
                        node.remove_tag(tag)
|
2539
|
-
|
2540
|
-
    def get_nodes_by_type(self, node_type: str) -> List[ContentNode]:
        """
        Get all the nodes of a specific type

        Args:
            node_type: the type of the node

        Returns:
            a list of nodes

        """
        return self._persistence_layer.get_nodes_by_type(node_type)

    def get_node_by_uuid(self, uuid: int) -> ContentNode:
        """
        Get a node by its uuid

        Args:
            uuid: the uuid of the node

        Returns:
            the node

        """
        # NOTE: the 'uuid' parameter shadows the stdlib uuid module inside
        # this method's scope.
        return self._persistence_layer.get_node_by_uuid(uuid)
|
2565
|
-
|
2566
|
-
    def add_tag_instance(self, tag_to_apply: str, node_list: List[ContentNode]):
        """
        This will create a group of a tag with indexes

        :param tag_to_apply: name of the tag
        :param node_list: contains the list of index of a node
        :return:
        """
        # For each node in the list create/update a feature
        tag = Tag()
        for node in node_list:
            # NOTE(review): the Tag *class* (not the 'tag' instance created
            # above) is passed as the feature value here - confirm this is
            # intentional.
            node.add_feature("tag", tag_to_apply, Tag)
        # Tag Object
        tag_instance = TagInstance(tag, node_list)
        self.tag_instances.append(tag_instance)
|
2580
|
-
|
2581
|
-
    def update_tag_instance(self, tag_uuid):
        """Refresh the tag on every node of the instance matching tag_uuid.

        Args:
            tag_uuid: uuid of the tag whose instance should be updated
        """
        for tag_instance in self.tag_instances:
            if tag_instance.tag.uuid == tag_uuid:
                # Update attributes of a Tag
                for node in tag_instance.nodes:
                    node.get_tag(tag_instance.tag.value, tag_uuid=tag_instance.tag.uuid)
|
2587
|
-
|
2588
|
-
def get_tag_instance(self, tag):
|
2589
|
-
"""
|
2590
|
-
Get the tag instance based on the tag itself
|
2591
|
-
:param tag: name of the tag
|
2592
|
-
:return: a list of tag instance
|
2593
|
-
"""
|
2594
|
-
return [
|
2595
|
-
tag_instance
|
2596
|
-
for tag_instance in self.tag_instances
|
2597
|
-
if tag_instance.tag == tag
|
2598
|
-
]
|
2599
|
-
|
2600
|
-
    def get_persistence(self):
        """Return the persistence layer backing this document."""
        return self._persistence_layer

    def get_all_tags(self):
        """Return all tag names present in this document."""
        return self._persistence_layer.get_all_tags()

    def add_model_insight(self, model_insight: ModelInsight):
        """Record a model insight against this document."""
        self._persistence_layer.add_model_insight(model_insight)

    def clear_model_insights(self):
        """Remove all model insights from this document."""
        self._persistence_layer.clear_model_insights()

    def get_model_insights(self) -> List[ModelInsight]:
        """Return all model insights recorded for this document."""
        return self._persistence_layer.get_model_insights()

    def get_tagged_nodes(self, tag_name, tag_uuid=None):
        """Return the nodes carrying the given tag (optionally a specific tag uuid)."""
        return self._persistence_layer.get_tagged_nodes(tag_name, tag_uuid)
|
2617
|
-
|
2618
|
-
@property
|
2619
|
-
def content_node(self):
|
2620
|
-
"""The root content Node"""
|
2621
|
-
return self._content_node
|
2622
|
-
|
2623
|
-
@content_node.setter
|
2624
|
-
def content_node(self, value):
|
2625
|
-
value.index = 0
|
2626
|
-
if value != self._content_node and self._content_node is not None:
|
2627
|
-
self.get_persistence().remove_content_node(self._content_node)
|
2628
|
-
|
2629
|
-
self._content_node = value
|
2630
|
-
if value is not None:
|
2631
|
-
self.get_persistence().add_content_node(self._content_node, None)
|
2632
|
-
|
2633
|
-
    def get_tag_instances(self, tag):
        """Group the nodes related to the given tag into per-instance objects.

        Looks up related tag nodes across the whole document and wraps each
        group (keyed by tag uuid) in a lightweight TagInstance.

        :param tag: name of the tag
        :return: a list of tag instances
        """
        groups = self.content_node.get_related_tag_nodes(tag, everywhere=True)
        tag_instances = []

        # NOTE: this local class shadows the module-level TagInstance; it is
        # keyed by tag uuid rather than holding a Tag object.
        class TagInstance:
            """
            A class to represent a TagInstance.

            ...

            Attributes
            ----------
            tag_uuid : str
                a string that represents the unique identifier of the tag
            nodes : list
                a list of nodes associated with the tag

            Methods
            -------
            get_value():
                Returns the combined content of all nodes.
            get_data():
                Returns the data of the tag feature with the same uuid as the tag.
            """

            def __init__(self, tag_uuid, nodes):
                self.tag_uuid = tag_uuid
                self.nodes = nodes

            def get_value(self):
                """
                Combines and returns the content of all nodes.

                Returns
                -------
                str
                    a string that represents the combined content of all nodes
                """
                content_parts = []
                for node in self.nodes:
                    content_parts.append(node.get_all_content())
                return " ".join(content_parts)

            def get_data(self):
                """
                Returns the data of the tag feature with the same uuid as the tag.

                Returns
                -------
                dict
                    a dictionary that represents the data of the tag feature with the same uuid as the tag
                """
                for node in self.nodes:
                    for tag_feature in node.get_tag_features():
                        data = tag_feature.value[0]
                        if "uuid" in data and data["uuid"] == self.tag_uuid:
                            return data
                # No matching tag feature found on any node.
                return {}

        for key in groups.keys():
            tag_instances.append(TagInstance(key, groups[key]))
        return tag_instances
|
2695
|
-
|
2696
|
-
def add_label(self, label: str):
|
2697
|
-
"""Add a label to the document
|
2698
|
-
|
2699
|
-
Args:
|
2700
|
-
label: str Label to add
|
2701
|
-
label: str:
|
2702
|
-
|
2703
|
-
Returns:
|
2704
|
-
the document
|
2705
|
-
|
2706
|
-
"""
|
2707
|
-
if label not in self.labels:
|
2708
|
-
self.labels.append(label)
|
2709
|
-
|
2710
|
-
return self
|
2711
|
-
|
2712
|
-
    def remove_label(self, label: str):
        """Remove a label from the document

        Args:
            label: str Label to remove

        Returns:
            the document

        Raises:
            ValueError: if the label is not present on the document
        """
        self.labels.remove(label)
        return self
|
2725
|
-
|
2726
|
-
    @classmethod
    def from_text(cls, text, separator=None, inmemory=False):
        """Creates a new Document from the text provided.

        Args:
            text: str Text to be used as content on the Document's ContentNode(s)
            separator: str If provided, this string will be used to split the text and the resulting text will be placed on children of the root ContentNode. (Default value = None)
            inmemory: bool If True, the backing persistence layer is kept in memory. (Default value = False)

        Returns:
            the document

        """
        new_document = Document(inmemory=inmemory)
        # Synthetic filename so the source metadata is never empty.
        new_document.source.original_filename = f"text-{uuid.uuid4()}"
        new_document.content_node = new_document.create_node(node_type="text", index=0)
        if text:
            if separator:
                # One child text node per separated segment.
                for s in text.split(separator):
                    new_document.content_node.add_child(
                        new_document.create_node(node_type="text", content=s)
                    )
            else:
                new_document.content_node.content = text

        new_document.add_mixin("text")
        return new_document
|
2752
|
-
|
2753
|
-
    def get_root(self):
        """Get the root content node for the document (same as content_node)"""
        return self.content_node
|
2756
|
-
|
2757
|
-
    def to_kdxa(self, file_path: str):
        """Write the document to the kdxa format (msgpack) which can be
        used with the Kodexa platform

        Args:
            file_path: the path to the kdxa file you wish to create
            file_path: str:

        Returns:

        >>> document.to_kdxa('my-document.kdxa')
        """
        with open(file_path, "wb") as outfile:
            msgpack.pack(self.to_dict(), outfile, use_bin_type=True)
|
2771
|
-
|
2772
|
-
    @staticmethod
    def open_kddb(file_path):
        """
        Opens a Kodexa Document Database.

        This is the Kodexa V4 default way to store documents, it provides high-performance
        and also the ability to handle very large document objects

        Note: the document opened this way is NOT detached - changes are
        written back to the file at file_path.

        :param file_path: The file path
        :return: The Document instance
        """
        return Document(kddb_path=file_path)
|
2784
|
-
|
2785
|
-
    def close(self):
        """
        Close the document and clean up the resources
        """
        self.get_persistence().close()
|
2790
|
-
|
2791
|
-
    def to_kddb(self, path=None):
        """
        Either write this document to a KDDB file or convert this document object structure into a KDDB and return a bytes-like object

        This is dependent on whether you provide a path to write to

        Args:
            path: destination file path; when None the KDDB bytes are returned instead

        Returns:
            bytes when path is None, otherwise None (the file is written)
        """

        if path is None:
            return self.get_persistence().get_bytes()

        with open(path, "wb") as output_file:
            output_file.write(self.get_persistence().get_bytes())
|
2803
|
-
|
2804
|
-
    @staticmethod
    def from_kdxa(file_path):
        """Read a .kdxa (msgpack) file from the given file_path and build a Document from it.

        Args:
            file_path: the path to the kdxa file

        Returns:
            Document: the deserialized document

        >>> document = Document.from_kdxa('my-document.kdxa')
        """
        with open(file_path, "rb") as data_file:
            data_loaded = msgpack.unpack(data_file, raw=False)
        return Document.from_dict(data_loaded)
|
2818
|
-
|
2819
|
-
    def to_msgpack(self):
        """Convert this document object structure into a message pack"""
        return msgpack.packb(self.to_dict(), use_bin_type=True)

    def to_json(self):
        """Create a JSON string representation of this Document.

        Args:

        Returns:
            str: The JSON formatted string representation of this Document.

        >>> document.to_json()
        """
        return json.dumps(self.to_dict(), ensure_ascii=False)
|
2834
|
-
|
2835
|
-
    def to_dict(self):
        """Create a dictionary representing this Document's structure and content.

        Args:

        Returns:
            dict: A dictionary representation of this Document.

        >>> document.to_dict()
        """

        # We don't want to store the none values
        def clean_none_values(d):
            """
            This function recursively cleans a dictionary by removing keys with None values.

            Args:
                d (dict): The dictionary to clean.

            Returns:
                dict: A new dictionary with the same structure as the input, but without keys that had None values.
            """
            clean = {}
            for k, v in d.items():
                if isinstance(v, dict):
                    nested = clean_none_values(v)
                    # Drop nested dicts that became empty after cleaning.
                    if len(nested.keys()) > 0:
                        clean[k] = nested
                elif v is not None:
                    clean[k] = v
            return clean

        return {
            # Always stamp the current version on newly serialized documents.
            "version": Document.CURRENT_VERSION,
            "metadata": self.metadata,
            "content_node": self.content_node.to_dict() if self.content_node else None,
            "source": clean_none_values(dataclasses.asdict(self.source)),
            "mixins": self._mixins,
            "labels": self.labels,
            "uuid": self.uuid,
        }
|
2876
|
-
|
2877
|
-
    @staticmethod
    def from_dict(doc_dict):
        """Build a new Document from a dictionary.

        Args:
            doc_dict (dict): A dictionary representation of a Kodexa Document.

        Returns:
            Document: A complete Kodexa Document

        >>> Document.from_dict(doc_dict)
        """
        new_document = Document(DocumentMetadata(doc_dict["metadata"]))
        new_document.version = (
            doc_dict["version"]
            if "version" in doc_dict and doc_dict["version"]
            else Document.PREVIOUS_VERSION
        )  # some older docs don't have a version or it's None
        new_document.uuid = (
            doc_dict["uuid"]
            if "uuid" in doc_dict
            # Deterministic fallback uuid for documents serialized without one.
            else str(uuid.uuid5(uuid.NAMESPACE_DNS, "kodexa.com"))
        )

        if "content_node" in doc_dict and doc_dict["content_node"]:
            new_document.content_node = ContentNode.from_dict(
                new_document, doc_dict["content_node"]
            )

        if "source" in doc_dict and doc_dict["source"]:
            new_document.source = SourceMetadata.from_dict(doc_dict["source"])
        if "labels" in doc_dict and doc_dict["labels"]:
            new_document.labels = doc_dict["labels"]

        # Push the rebuilt metadata down into the persistence layer.
        new_document.get_persistence().update_metadata()
        return new_document
|
2914
|
-
|
2915
|
-
    @staticmethod
    def from_json(json_string):
        """Create an instance of a Document from a JSON string.

        Args:
            json_string (str): A JSON string representation of a Kodexa Document

        Returns:
            Document: A complete Kodexa Document

        >>> Document.from_json(json_string)
        """
        return Document.from_dict(json.loads(json_string))

    @staticmethod
    def from_msgpack(msgpack_bytes):
        """Create an instance of a Document from a message pack byte array.

        Args:
            msgpack_bytes: bytes: A message pack byte array.

        Returns:
            Document: A complete Kodexa Document

        >>> Document.from_msgpack(open(os.path.join('news-doc.kdxa'), 'rb').read())
        """
        return Document.from_dict(msgpack.unpackb(msgpack_bytes, raw=False))
|
2943
|
-
|
2944
|
-
    def get_mixins(self):
        """
        Get the list of mixins that have been enabled on this document

        Returns:
            mixins: list[str] a list of the mixin names
        """
        return self._mixins
|
2952
|
-
|
2953
|
-
    def add_mixin(self, mixin):
        """
        Add the given mixin to this document, this will apply the mixin to all the content nodes,
        and also register it with the document so that future invocations of create_node will ensure
        the node has the mixin applied.

        Args:
            mixin:str the name of the mixin to add

        Returns:
            >>> from kodexa import *
            >>> document = Document()
            >>> document.add_mixin('spatial')
        """
        self._mixins.append(mixin)
        # Persist the updated mixin list.
        self.get_persistence().update_metadata()
|
2969
|
-
|
2970
|
-
    def create_node(
        self,
        node_type: str,
        content: Optional[str] = None,
        virtual: bool = False,
        parent: ContentNode = None,
        index: Optional[int] = None,
    ):
        """
        Creates a new node for the document. The new node is not added to the document, but any mixins that have been
        applied to the document will also be available on the new node.

        Args:
            node_type (str): The type of node.
            content (str): The content for the node; defaults to None.
            virtual (bool): Indicates if this is a 'real' or 'virtual' node; default is False. 'Real' nodes contain
                            document content. 'Virtual' nodes are synthesized as necessary to fill gaps in between
                            non-consecutively indexed siblings. Such indexing arises when document content is sparse.
            parent (ContentNode): The parent for this newly created node; default is None;
            index (Optional[int)): The index property to be set on this node; default is 0;

        Returns:
            ContentNode: This newly created node.

        >>> document.create_node(node_type='page')
            <kodexa.model.model.ContentNode object at 0x7f80605e53c8>
        """
        content_node = ContentNode(
            document=self,
            node_type=node_type,
            content=content,
            parent=parent,
            index=index,
            virtual=virtual,
        )
        if parent is not None:
            parent.add_child(content_node, index)
        else:
            # No parent: register the node directly with the persistence layer.
            self.get_persistence().add_content_node(content_node, None)

        # Only seed content parts when the constructor did not already do so.
        if content is not None and len(content_node.get_content_parts()) == 0:
            content_node.set_content_parts([content])

        return content_node
|
3014
|
-
|
3015
|
-
@classmethod
|
3016
|
-
def from_kddb(cls, source, detached: bool = True, inmemory: bool = False):
|
3017
|
-
"""
|
3018
|
-
Loads a document from a Kodexa Document Database (KDDB) file
|
3019
|
-
|
3020
|
-
Args:
|
3021
|
-
|
3022
|
-
input: if a string we will load the file at that path, if bytes we will create a temp file and
|
3023
|
-
load the KDDB to it
|
3024
|
-
detached (bool): if reading from a file we will create a copy so we don't update in place
|
3025
|
-
inmemory (bool): if true we will load the KDDB into memory
|
3026
|
-
|
3027
|
-
:return: the document
|
3028
|
-
"""
|
3029
|
-
if isinstance(source, str):
|
3030
|
-
if isinstance(source, str):
|
3031
|
-
# If we are using the detached flag we will create a copy of the KDDB file
|
3032
|
-
if detached:
|
3033
|
-
import tempfile
|
3034
|
-
from kodexa import KodexaPlatform
|
3035
|
-
|
3036
|
-
fp = tempfile.NamedTemporaryFile(
|
3037
|
-
suffix=".kddb", delete=False, dir=KodexaPlatform.get_tempdir()
|
3038
|
-
)
|
3039
|
-
fp.write(open(source, "rb").read())
|
3040
|
-
fp.close()
|
3041
|
-
return Document(kddb_path=fp.name, delete_on_close=True, inmemory=inmemory)
|
3042
|
-
|
3043
|
-
return Document(kddb_path=source, inmemory=inmemory)
|
3044
|
-
|
3045
|
-
# We will assume the input is of byte type
|
3046
|
-
import tempfile
|
3047
|
-
from kodexa import KodexaPlatform
|
3048
|
-
|
3049
|
-
fp = tempfile.NamedTemporaryFile(
|
3050
|
-
suffix=".kddb", delete=False, dir=KodexaPlatform.get_tempdir()
|
3051
|
-
)
|
3052
|
-
fp.write(source)
|
3053
|
-
fp.close()
|
3054
|
-
return Document(kddb_path=fp.name, delete_on_close=True, inmemory=inmemory)
|
3055
|
-
|
3056
|
-
@classmethod
|
3057
|
-
def from_file(cls, file, unpack: bool = False):
|
3058
|
-
"""Creates a Document that has a 'file-handle' connector to the specified file.
|
3059
|
-
|
3060
|
-
Args:
|
3061
|
-
file: file: The file to which the new Document is connected.
|
3062
|
-
unpack: bool: (Default value = False)
|
3063
|
-
|
3064
|
-
Returns:
|
3065
|
-
Document: A Document connected to the specified file.
|
3066
|
-
|
3067
|
-
"""
|
3068
|
-
if unpack:
|
3069
|
-
Document.from_kdxa(file)
|
3070
|
-
else:
|
3071
|
-
file_document = Document()
|
3072
|
-
file_document.metadata["connector"] = "file-handle"
|
3073
|
-
file_document.metadata["connector_options"] = {};
|
3074
|
-
file_document.metadata["connector_options"]["file"] = file
|
3075
|
-
file_document.source
|
3076
|
-
file_document.source.connector = "file-handle"
|
3077
|
-
file_document.source.original_filename = os.path.basename(file)
|
3078
|
-
file_document.source.original_path = file
|
3079
|
-
return file_document
|
3080
|
-
|
3081
|
-
    @classmethod
    def from_url(cls, url, headers=None):
        """Creates a Document that has a 'url' connector for the specified url.

        Args:
            url (str): The URL to which the new Document is connected.
            headers (dict): Headers that should be used when reading from the URL
                (Default value = None)

        Returns:
            Document: A Document connected to the specified URL with the specified headers (if any).

        """
        if headers is None:
            headers = {}
        url_document = Document()
        url_document.metadata.connector = "url"
        url_document.metadata.connector_options.base_url = url
        url_document.metadata.connector_options.headers = headers
        url_document.source.connector = "url"
        # The URL doubles as the original filename/path in source metadata.
        url_document.source.original_filename = url
        url_document.source.original_path = url
        url_document.source.headers = headers
        return url_document
|
3106
|
-
|
3107
|
-
def select_first(self, selector, variables=None) -> Optional[ContentNode]:
|
3108
|
-
"""Select and return the first child of this node that match the selector value.
|
3109
|
-
|
3110
|
-
Args:
|
3111
|
-
selector (str): The selector (ie. //*)
|
3112
|
-
variables (dict, optional): A dictionary of variable name/value to use in substituion; defaults to None.
|
3113
|
-
Dictionary keys should match a variable specified in the selector.
|
3114
|
-
|
3115
|
-
Returns:
|
3116
|
-
Optional[ContentNode]: The first matching node or none
|
3117
|
-
|
3118
|
-
>>> document.get_root().select_first('.')
|
3119
|
-
ContentNode
|
3120
|
-
|
3121
|
-
>>> document.get_root().select_first('//*[hasTag($tagName)]', {"tagName": "div"})
|
3122
|
-
ContentNode
|
3123
|
-
"""
|
3124
|
-
result = self.select(selector, variables, first_only=True)
|
3125
|
-
return result[0] if len(result) > 0 else None
|
3126
|
-
|
3127
|
-
    def select(
        self, selector: str, variables: Optional[dict] = None, first_only=False
    ) -> List[ContentNode]:
        """Execute a selector on the root node and then return a list of the matching nodes.

        Args:
            selector (str): The selector (ie. //*)
            variables (Optional[dict]): A dictionary of variable name/value to use in substituion; defaults to an empty
                dictionary. Dictionary keys should match a variable specified in the selector.
            first_only (bool): If True, only the first matching node is returned; defaults to False.

        Returns:
            list[ContentNodes]: A list of the matching ContentNodes. If no matches found, list is empty.

        >>> document.select('.')
            [ContentNode]
        """
        if variables is None:
            variables = {}
        if self.content_node:
            result = self.content_node.select(selector, variables, first_only)
            if isinstance(result, list):
                return result

            # Non-list results are treated as a truthiness test on the root.
            return [self.content_node] if bool(result) else []
        return []
|
3153
|
-
|
3154
|
-
    def get_labels(self) -> List[str]:
        """
        Get the document-level labels.

        Args:

        Returns:
            List[str]: list of associated labels

        """
        return self.labels
|
3164
|
-
|
3165
|
-
    def get_feature_set(self, owner_uri: Optional[str] = None) -> FeatureSet:
        """Build a FeatureSet of the tag features on all tagged nodes.

        Args:
            owner_uri: when given, tag features whose data records a different
                'owner_uri' are excluded.

        Returns:
            FeatureSet: node features for every tagged node in the document.
        """
        feature_set = FeatureSet()
        feature_set.node_features = []
        for tagged_node in self.get_all_tagged_nodes():
            node_feature = {"nodeUuid": str(tagged_node.uuid), "features": []}

            feature_set.node_features.append(node_feature)

            # TODO this needs to be cleaned up, also should it only really
            # be the tag features?
            for feature in tagged_node.get_features():
                if feature.feature_type == "tag":
                    if owner_uri is not None:
                        # Skip features explicitly owned by someone else.
                        if (
                                "owner_uri" in feature.value[0]
                                and feature.value[0]["owner_uri"] != owner_uri
                        ):
                            continue

                    feature_dict = feature.to_dict()
                    feature_dict["featureType"] = feature.feature_type
                    feature_dict["name"] = feature.name

                    # Tag objects are not JSON-serializable; expand them.
                    if isinstance(feature_dict['value'][0], Tag):
                        feature_dict['value'] = [feature_dict['value'][0].to_dict()]

                    node_feature["features"].append(feature_dict)

        return feature_set
|
3195
|
-
|
3196
|
-
    def get_all_tagged_nodes(self) -> List[ContentNode]:
        """
        Get all the tagged nodes in the document

        :return: a list of the nodes that carry at least one tag
        """
        return self._persistence_layer.get_all_tagged_nodes()
|
3203
|
-
|
3204
|
-
|
3205
|
-
class TagInstance:
    """
    A class to represent a TagInstance.

    Attributes
    ----------
    tag : Tag
        an instance of Tag class
    nodes : list
        a list of nodes

    Methods
    -------
    add_node(nodes: List[ContentNode])
        Extend the list of nodes with new nodes.
    """

    def __init__(self, tag: Tag, nodes):
        self.tag = tag
        self.nodes = nodes

    def add_node(self, nodes: List[ContentNode]):
        """
        Extend the list of nodes with new nodes.

        Parameters
        ----------
        nodes : List[ContentNode]
            a list of new nodes to be added
        """
        self.nodes.extend(nodes)
|
3236
|
-
|
3237
|
-
|
3238
|
-
class ContentObjectReference:
    """A reference to a content object within a document.

    This class provides a way to reference a specific content object within a document,
    and includes information about the document's family and the store where the document is located.

    Attributes:
        content_object (ContentObject): The content object being referenced.
        store: The store where the document is located.
        document (Document): The document in which the content object is located.
        document_family: The family to which the document belongs.
    """

    def __init__(
            self, content_object: ContentObject, store, document: Document, document_family
    ):
        """Capture the reference components; no validation is performed."""
        self.content_object = content_object
        self.store = store
        self.document = document
        self.document_family = document_family
|