kodexa-document 7.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kodexa-document might be problematic. Click here for more details.
- kodexa_document/connectors.py +456 -0
- kodexa_document/model.py +3642 -0
- kodexa_document/persistence.py +2057 -0
- kodexa_document/persistence_models.py +421 -0
- kodexa_document/selectors/__init__.py +5 -0
- kodexa_document/selectors/ast.py +677 -0
- kodexa_document/selectors/error.py +29 -0
- kodexa_document/selectors/kodexa-ast-visitor.py +268 -0
- kodexa_document/selectors/parser.py +91 -0
- kodexa_document/selectors/resources/KodexaSelector.interp +99 -0
- kodexa_document/selectors/resources/KodexaSelector.tokens +56 -0
- kodexa_document/selectors/resources/KodexaSelectorLexer.interp +119 -0
- kodexa_document/selectors/resources/KodexaSelectorLexer.py +204 -0
- kodexa_document/selectors/resources/KodexaSelectorLexer.tokens +56 -0
- kodexa_document/selectors/resources/KodexaSelectorListener.py +570 -0
- kodexa_document/selectors/resources/KodexaSelectorParser.py +3246 -0
- kodexa_document/selectors/resources/KodexaSelectorVisitor.py +323 -0
- kodexa_document/selectors/visitor.py +265 -0
- kodexa_document/steps.py +109 -0
- kodexa_document-7.5.0.dist-info/METADATA +27 -0
- kodexa_document-7.5.0.dist-info/RECORD +22 -0
- kodexa_document-7.5.0.dist-info/WHEEL +4 -0
kodexa_document/model.py
ADDED
|
@@ -0,0 +1,3642 @@
|
|
|
1
|
+
"""
|
|
2
|
+
The core model provides definitions for all the base objects in the Kodexa Content Model
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import dataclasses
|
|
6
|
+
import datetime
|
|
7
|
+
import inspect
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import re
|
|
11
|
+
import uuid
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from typing import Any, List, Optional, Union
|
|
14
|
+
from addict import Dict
|
|
15
|
+
import deepdiff
|
|
16
|
+
import msgpack
|
|
17
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
18
|
+
from typing import Optional, Annotated
|
|
19
|
+
|
|
20
|
+
from pydantic import BaseModel, Field, WithJsonSchema, PlainSerializer, ConfigDict
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def to_camel(string: str) -> str:
    """Convert a snake_case identifier to PascalCase (UpperCamelCase).

    Each underscore-separated word is capitalized and the pieces are
    concatenated, e.g. ``"change_sequence"`` -> ``"ChangeSequence"``.
    """
    words = string.split("_")
    return "".join(map(str.capitalize, words))
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# A datetime annotated for pydantic serialization: datetimes are rendered as
# ISO-8601 strings with millisecond precision and a trailing "Z"
# (e.g. "2024-01-01T12:00:00.000Z"); string values pass through unchanged.
# BUG FIX: the original wrote ``Annotated[datetime, ...]`` — with only
# ``import datetime`` in scope that names the *module*, which is not a valid
# annotation target; the fields using this alias hold ``datetime.datetime``.
StandardDateTime = Annotated[
    datetime.datetime,
    PlainSerializer(
        lambda v: (
            v.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
            if not isinstance(v, str)
            else v
        ),
        return_type=str,
    ),
    WithJsonSchema({"type": "datetime"}, mode="serialization"),
]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ContentType(Enum):
    """A class representing the content type of a document or native file."""

    # A parsed Kodexa document (node tree with features)
    document = "DOCUMENT"
    # The original, unparsed native file
    native = "NATIVE"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Label(BaseModel):
    """

    """
    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    # The labels from the latest content object in the family

    # Identity / change tracking
    id: Optional[str] = Field(default=None)
    uuid: Optional[str] = None
    change_sequence: Optional[int] = Field(default=None, alias="changeSequence")
    created_on: Optional[StandardDateTime] = Field(default=None, alias="createdOn")
    updated_on: Optional[StandardDateTime] = Field(default=None, alias="updatedOn")

    # Label content (name is required, color is optional presentation data)
    name: str
    color: Optional[str] = None
    label: str
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class TaxonValidation(BaseModel):
    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    # Validation rule metadata: the rule/message/detail formulas drive the
    # check, the remaining fields describe how failures are surfaced.
    name: Optional[str] = Field(default=None)
    description: Optional[str] = Field(default=None)
    rule_formula: Optional[str] = Field(default=None, alias="ruleFormula")
    message_formula: Optional[str] = Field(default=None, alias="messageFormula")
    detail_formula: Optional[str] = Field(default=None, alias="detailFormula")
    exception_id: Optional[str] = Field(default=None, alias="exceptionId")
    support_article_id: Optional[str] = Field(default=None, alias="supportArticleId")
    overridable: Optional[bool] = None
    disabled: Optional[bool] = None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class DocumentTaxonValidation(BaseModel):
    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    # Binds a TaxonValidation to a specific taxon within a taxonomy.
    taxonomy_ref: Optional[str] = Field(default=None, alias="taxonomyRef")
    taxon_path: Optional[str] = Field(default=None, alias="taxonPath")
    validation: Optional[TaxonValidation] = None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class ContentFeature(BaseModel):
    """ """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    # The feature namespace (e.g. "tag" — see ContentNode.from_dict)
    feature_type: Optional[str] = Field(default=None, alias="featureType")
    # The feature name within that namespace
    name: Optional[str] = None
    # The feature values (note: Dict here is addict.Dict from the imports)
    value: Optional[List[Dict[str, Any]]] = None
    # True when the feature holds a single value rather than a list
    single: Optional[bool] = None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class ContentObject(BaseModel):
    """ """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    # Identity / change tracking
    id: Optional[str] = Field(default=None)
    uuid: Optional[str] = None
    change_sequence: Optional[int] = Field(default=None, alias="changeSequence")
    created_on: Optional[StandardDateTime] = Field(default=None, alias="createdOn")
    updated_on: Optional[StandardDateTime] = Field(default=None, alias="updatedOn")

    # Whether this object is a parsed DOCUMENT or the NATIVE file
    content_type: ContentType = Field(
        ..., alias="contentType", description="The type of content"
    )
    document_version: Optional[str] = Field(default=None, alias="documentVersion")
    index: Optional[int] = None
    labels: Optional[List[Label]] = Field(default_factory=list)
    metadata: Optional[Dict[str, Any]] = None
    mixins: Optional[List[str]] = Field(default_factory=list)
    created: Optional[StandardDateTime] = None
    modified: Optional[StandardDateTime] = None
    size: Optional[int] = None
    store_ref: Optional[str] = Field(default=None, alias="storeRef")
    document_family_id: Optional[str] = Field(default=None, alias="documentFamilyId")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class NodeFeatures(BaseModel):
    """ """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    # UUID of the node these features belong to
    node_uuid: Optional[str] = Field(default=None, alias="nodeUuid")
    # The features attached to that node
    features: Optional[List[ContentFeature]] = None
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class FeatureSet(BaseModel):
    """ """

    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    # URI of the owner of this feature set
    owner_uri: Optional[str] = Field(default=None, alias="ownerUri")
    # Per-node feature collections
    node_features: Optional[List[NodeFeatures]] = Field(None, alias="nodeFeatures")
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
class Ref:
    """A parsed Kodexa resource reference.

    A reference has the general form ``org_slug/slug[:version[/resource]]``,
    e.g. ``kodexa/my-store:1.0.0/contents``.

    Attributes
    ----------
    ref : str
        the raw string reference that was parsed
    version : str, optional
        the version component, or None when the reference has no ``:``
    resource : str, optional
        the resource component, or None when the version has no ``/``
    slug : str
        the slug component of the reference
    org_slug : str
        the organization slug component of the reference
    object_ref : str
        the normalized ``org/slug[:version]`` form (resource excluded)

    Raises
    ------
    ValueError
        if the part before the first ``:`` contains no ``/`` separator

    Methods
    -------
    __init__(self, ref: str)
        Constructs all the necessary attributes for the Ref object.
    """

    def __init__(self, ref: str):
        self.ref: str = ref
        first_part = ref
        self.version: Optional[str] = None
        self.resource: Optional[str] = None
        self.slug: str = ""
        self.org_slug: str = ""

        if ":" in ref:
            # BUG FIX: split on the *first* colon only — the original bare
            # split(":") raised ValueError for refs whose version contained
            # an extra colon; single-colon refs behave exactly as before.
            (first_part, self.version) = ref.split(":", 1)

            if "/" in self.version:
                # Likewise split once: everything after the first slash is
                # the resource (which may itself contain slashes).
                (self.version, self.resource) = self.version.split("/", 1)

        # org/slug is mandatory; still raises ValueError for references
        # without a "/", matching the original behavior for malformed input.
        (self.org_slug, self.slug) = first_part.split("/", 1)

        self.object_ref = (
            f"{self.org_slug}/{self.slug}:{self.version}"
            if self.version
            else f"{self.org_slug}/{self.slug}"
        )
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
import addict
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class DocumentMetadata(addict.Dict):
    """A flexible dict based approach to capturing metadata for the document.

    Extends :class:`addict.Dict` so document metadata can be read and written
    with attribute access as well as normal item access.

    Args:
        *args: Variable length argument list.
        **kwargs: Arbitrary keyword arguments.
    """

    def __init__(self, *args, **kwargs):
        # No behavior beyond addict.Dict is added; the explicit constructor
        # is kept as a stable extension point.
        super().__init__(*args, **kwargs)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class ContentException(dict):
    """An issue identified during labeling or validation at the document level.

    Attributes:
        exception_type (str): Type of the exception.
        message (str): Message describing the exception.
        severity (str): Severity level of the exception, default is 'ERROR'.
        tag (Optional[str]): Tag associated with the exception.
        group_uuid (Optional[str]): UUID of the group associated with the exception.
        tag_uuid (Optional[str]): UUID of the tag associated with the exception.
        exception_type_id (Optional[str]): ID of the exception type.
        exception_details (Optional[str]): Detailed information about the exception.
        node_uuid (Optional[str]): UUID of the node associated with the exception.
        value (Optional[str]): Value associated with the exception.
        boolean_value (Optional[bool]): Boolean value associated with the exception.
    """

    def __init__(
        self,
        exception_type: str,
        message: str,
        severity: str = "ERROR",
        tag: Optional[str] = None,
        group_uuid: Optional[str] = None,
        tag_uuid: Optional[str] = None,
        exception_type_id: Optional[str] = None,
        exception_details: Optional[str] = None,
        node_uuid: Optional[str] = None,
        value: Optional[str] = None,
        boolean_value: Optional[bool] = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        # NOTE(review): values are stored as *attributes* of this dict
        # subclass, not as dict entries — the mapping itself only contains
        # whatever was passed through *args/**kwargs. Confirm this is the
        # intended serialization behavior.
        self.exception_type = exception_type
        self.message = message
        self.severity = severity
        self.tag = tag
        self.group_uuid = group_uuid
        self.tag_uuid = tag_uuid
        self.exception_type_id = exception_type_id
        self.exception_details = exception_details
        self.node_uuid = node_uuid
        self.value = value
        self.boolean_value = boolean_value
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
class Tag(object):
    """The metadata for a label that is applied as a feature on a content node.

    Attributes:
        start (Optional[int]): The start position (zero indexed) of the content within the node; None applies the label to the whole node.
        end (Optional[int]): The end position (zero indexed) of the content within the node; None applies the label to the whole node.
        value (Optional[str]): The string value that was labelled in the node.
        data (Optional[Any]): Any JSON-serializable data object associated with the label.
        uuid (Optional[str]): The UUID for this tag instance; tags on different content nodes can be related through a shared UUID.
        confidence (Optional[float]): The confidence of the tag in a range of 0-1.
        index (Optional[int]): The tag index, used to order tags and understand parent/child tag relationships.
        bbox (Optional[List[int]]): Optional bounding box for spatial labels (relative to the node as container).
        group_uuid (Optional[str]): UUID of the group this tag belongs to.
        parent_group_uuid (Optional[str]): UUID of the parent group this tag belongs to.
        cell_index (Optional[int]): The cell index of the cell this tag belongs to.
        note (Optional[str]): A note associated with the tag.
        status (Optional[str]): The status of the tag; can be passed to an attribute status during extraction.
        owner_uri (Optional[str]): The URI of the owner (ie. model://kodexa/narrative:1.0.0 or user://pdodds).
    """

    def __init__(
        self,
        start: Optional[int] = None,
        end: Optional[int] = None,
        value: Optional[str] = None,
        uuid: Optional[str] = None,
        data: Any = None,
        *args,
        confidence: Optional[float] = None,
        group_uuid: Optional[str] = None,
        parent_group_uuid: Optional[str] = None,
        cell_index: Optional[int] = None,
        index: Optional[int] = None,
        bbox: Optional[List[int]] = None,
        note: Optional[str] = None,
        status: Optional[str] = None,
        owner_uri: Optional[str] = None,
        is_dirty: Optional[bool] = None,
        **kwargs,
    ):
        # The module-level ``uuid`` import is shadowed by the parameter, so
        # alias it locally for UUID generation.
        import uuid as uuid_gen

        self.start = start
        self.end = end
        self.value = value
        self.data = data
        # Generate an identifier when the caller did not supply one so tags
        # on different nodes can still be related through a shared UUID.
        self.uuid = uuid if uuid else str(uuid_gen.uuid4())
        self.confidence = confidence
        self.index = index
        self.bbox = bbox
        self.group_uuid = group_uuid
        self.parent_group_uuid = parent_group_uuid
        self.cell_index = cell_index
        self.note = note
        self.status = status
        self.owner_uri = owner_uri
        self.is_dirty = is_dirty

        # Promote a cell_index carried inside the data payload when none was
        # supplied explicitly.
        if self.cell_index is None and isinstance(data, dict) and "cell_index" in data:
            self.cell_index = data["cell_index"]

    def to_dict(self):
        """Create a dictionary representing this Tag's structure and content.

        ``uuid`` is always present; every other attribute is included only
        when it is not None.

        Returns:
            dict: The properties of this Tag structured as a dictionary.
        """
        result = {"uuid": self.uuid}
        for attr in (
            "start",
            "end",
            "value",
            "data",
            "confidence",
            "index",
            "bbox",
            "group_uuid",
            "parent_group_uuid",
            "cell_index",
            "note",
            "status",
            "owner_uri",
            "is_dirty",
        ):
            attr_value = getattr(self, attr)
            if attr_value is not None:
                result[attr] = attr_value
        return result
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
class FindDirection(Enum):
    """Direction of a search within the content tree.

    Attributes:
        CHILDREN (int): Search downwards, towards children nodes.
        PARENT (int): Search upwards, towards the parent node.
    """

    CHILDREN = 1
    PARENT = 2
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
class Traverse(Enum):
    """The kinds of traversal through the content tree.

    Attributes:
        SIBLING (int): Traverse to sibling nodes.
        CHILDREN (int): Traverse to child nodes.
        PARENT (int): Traverse to the parent node.
        ALL (int): Traverse all node relationships.
    """

    SIBLING = 1
    CHILDREN = 2
    PARENT = 3
    ALL = 4
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
class ContentNode(object):
|
|
462
|
+
"""A Content Node identifies a section of the document containing logical
|
|
463
|
+
grouping of information.
|
|
464
|
+
|
|
465
|
+
The node will have content and can include any number of features.
|
|
466
|
+
|
|
467
|
+
You should always create a node using the Document's create_node method to
|
|
468
|
+
ensure that the correct mixins are applied.
|
|
469
|
+
|
|
470
|
+
>>> new_page = document.create_node(node_type='page')
|
|
471
|
+
<kodexa_document.model.ContentNode object at 0x7f80605e53c8>
|
|
472
|
+
>>> current_content_node.add_child(new_page)
|
|
473
|
+
|
|
474
|
+
>>> new_page = document.create_node(node_type='page', content='This is page 1')
|
|
475
|
+
<kodexa_document.model.ContentNode object at 0x7f80605e53c8>
|
|
476
|
+
>>> current_content_node.add_child(new_page)
|
|
477
|
+
|
|
478
|
+
"""
|
|
479
|
+
|
|
480
|
+
def __init__(
|
|
481
|
+
self,
|
|
482
|
+
document,
|
|
483
|
+
node_type: str,
|
|
484
|
+
id: Optional[int] = None,
|
|
485
|
+
content: Optional[str] = None,
|
|
486
|
+
content_parts: Optional[List[Any]] = None,
|
|
487
|
+
parent=None,
|
|
488
|
+
index: Optional[int] = None,
|
|
489
|
+
virtual: bool = False,
|
|
490
|
+
):
|
|
491
|
+
self.id: Optional[int] = id
|
|
492
|
+
"""The ID of the content node"""
|
|
493
|
+
self.node_type: str = node_type
|
|
494
|
+
"""The node type (ie. line, page, cell etc)"""
|
|
495
|
+
self.document: Document = document
|
|
496
|
+
"""The document that the node belongs to"""
|
|
497
|
+
self._content_parts: Optional[List[Any]] = content_parts
|
|
498
|
+
"""The children of the content node"""
|
|
499
|
+
self._index: Optional[int] = index
|
|
500
|
+
"""The index of the content node"""
|
|
501
|
+
self.id: Optional[int] = None
|
|
502
|
+
"""The ID of the content node"""
|
|
503
|
+
self.virtual: bool = virtual
|
|
504
|
+
"""Is the node virtual (ie. it doesn't actually exist in the document)"""
|
|
505
|
+
|
|
506
|
+
self._parent_id = parent.id if parent else None
|
|
507
|
+
|
|
508
|
+
if content_parts is not None:
|
|
509
|
+
self.set_content_parts(content_parts)
|
|
510
|
+
self._content_parts = self.get_content_parts()
|
|
511
|
+
|
|
512
|
+
if content is not None and len(self._content_parts) == 0:
|
|
513
|
+
self.set_content_parts([content])
|
|
514
|
+
|
|
515
|
+
    # The public ``id`` is a plain property backed by the private ``_id``
    # attribute.
    @property
    def id(self) -> Optional[int]:
        """Get the ID of this node"""
        return self._id

    @id.setter
    def id(self, value: Optional[int]):
        """Set the ID of this node"""
        self._id = value
|
|
524
|
+
|
|
525
|
+
    # The public ``index`` is a plain property backed by the private
    # ``_index`` attribute (the node's position among its siblings).
    @property
    def index(self) -> Optional[int]:
        """Get the index of this node"""
        return self._index

    @index.setter
    def index(self, value: Optional[int]):
        """Set the index of this node"""
        self._index = value
|
|
534
|
+
|
|
535
|
+
    def get_content_parts(self):
        """Return this node's content parts from the document's persistence layer."""
        return self.document.get_persistence().get_content_parts(self)
|
|
537
|
+
|
|
538
|
+
    def set_content_parts(self, content_parts):
        """Persist a new list of content parts for this node."""
        self.document.get_persistence().update_content_parts(self, content_parts)
|
|
540
|
+
|
|
541
|
+
    def update(self):
        """
        Update this node in the document persistence

        :return: None
        """
        self.document.get_persistence().update_node(self)
|
|
548
|
+
|
|
549
|
+
@property
|
|
550
|
+
def content(self):
|
|
551
|
+
if len(self.get_content_parts()) == 0:
|
|
552
|
+
return None
|
|
553
|
+
|
|
554
|
+
s = ""
|
|
555
|
+
for part in self.get_content_parts():
|
|
556
|
+
if isinstance(part, str):
|
|
557
|
+
if s != "":
|
|
558
|
+
s += " "
|
|
559
|
+
s += part
|
|
560
|
+
|
|
561
|
+
return s
|
|
562
|
+
|
|
563
|
+
@content.setter
|
|
564
|
+
def content(self, new_content):
|
|
565
|
+
if len(self.get_content_parts()) == 0:
|
|
566
|
+
self.set_content_parts([new_content])
|
|
567
|
+
else:
|
|
568
|
+
# We need to remove all the strings and add this one
|
|
569
|
+
# back at the front
|
|
570
|
+
parts = self.get_content_parts()
|
|
571
|
+
filtered_parts = list(filter(lambda part: isinstance(part, int), parts))
|
|
572
|
+
if new_content is not None and new_content != "":
|
|
573
|
+
filtered_parts.insert(0, new_content)
|
|
574
|
+
self.set_content_parts(filtered_parts)
|
|
575
|
+
|
|
576
|
+
def __eq__(self, other):
|
|
577
|
+
return (
|
|
578
|
+
other is not None
|
|
579
|
+
and self.id == other.id
|
|
580
|
+
and (self.id is not None and other.id is not None)
|
|
581
|
+
)
|
|
582
|
+
|
|
583
|
+
    def __hash__(self):
        """Hash on the node ID (consistent with __eq__'s ID comparison)."""
        return hash(self.id)
|
|
585
|
+
|
|
586
|
+
    def get_parent(self) -> Optional["ContentNode"]:
        """Return this node's parent from the persistence layer, if any."""
        return self.document.get_persistence().get_parent(self)
|
|
588
|
+
|
|
589
|
+
def __str__(self):
|
|
590
|
+
return (
|
|
591
|
+
f"ContentNode {self.id} [node_type:{self.node_type}] ({len(self.get_features())} features, {len(self.get_children())} children) ["
|
|
592
|
+
+ str(self.content)
|
|
593
|
+
+ "]"
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
    def to_json(self):
        """Create a JSON string representation of this ContentNode.

        Returns:
            str: The JSON formatted string representation of this ContentNode
                (the serialized form of :meth:`to_dict`).

        >>> node.to_json()
        """
        return json.dumps(self.to_dict())
|
|
607
|
+
|
|
608
|
+
def to_dict(self):
|
|
609
|
+
"""Create a dictionary representing this ContentNode's structure and content.
|
|
610
|
+
|
|
611
|
+
Args:
|
|
612
|
+
|
|
613
|
+
Returns:
|
|
614
|
+
dict: The properties of this ContentNode and all of its children structured as a dictionary.
|
|
615
|
+
|
|
616
|
+
>>> node.to_dict()
|
|
617
|
+
"""
|
|
618
|
+
new_dict = {
|
|
619
|
+
"node_type": self.node_type,
|
|
620
|
+
"content": self.content,
|
|
621
|
+
"content_parts": self.get_content_parts(),
|
|
622
|
+
"features": [],
|
|
623
|
+
"index": self.index,
|
|
624
|
+
"children": [],
|
|
625
|
+
"uuid": self.id,
|
|
626
|
+
}
|
|
627
|
+
for feature in self.get_features():
|
|
628
|
+
new_dict["features"].append(feature.to_dict())
|
|
629
|
+
|
|
630
|
+
for child in self.get_children():
|
|
631
|
+
new_dict["children"].append(child.to_dict())
|
|
632
|
+
return new_dict
|
|
633
|
+
|
|
634
|
+
    @staticmethod
    def from_dict(
        document: "Document",
        content_node_dict: dict,
        parent: Optional["ContentNode"] = None,
    ):
        """Build a new ContentNode from a dictionary represention.

        Args:
            document (Document): The Kodexa document from which the new ContentNode will be created (not added).
            content_node_dict (Dict): The dictionary-structured representation of a ContentNode. This value will be unpacked into a ContentNode.
            parent (Optional[ContentNode]): Optionally the parent content node
        Returns:
            ContentNode: A ContentNode containing the unpacked values from the content_node_dict parameter.

        >>> ContentNode.from_dict(document, content_node_dict)
        """

        # Older document versions store the type under "type"; current
        # versions use "node_type".
        node_type = (
            content_node_dict["type"]
            if document.version == Document.PREVIOUS_VERSION
            else content_node_dict["node_type"]
        )

        new_content_node = document.create_node(
            node_type=node_type,
            content=(
                content_node_dict["content"] if "content" in content_node_dict else None
            ),
            index=content_node_dict["index"],
            parent=parent,
        )

        if (
            "content_parts" in content_node_dict
            and len(content_node_dict["content_parts"]) > 0
        ):
            new_content_node.set_content_parts(content_node_dict["content_parts"])

        # Features carry a combined "<feature_type>:<feature_name>" name.
        for dict_feature in content_node_dict["features"]:
            feature_type = dict_feature["name"].split(":")[0]
            feature_name = dict_feature["name"].split(":")[1]
            feature_value = dict_feature["value"]

            if feature_type == "tag":
                # Handle both single tag and list of tags
                if isinstance(feature_value, list):
                    # It's a list of tags
                    for tag_value in feature_value:
                        if isinstance(tag_value, Tag):
                            new_content_node.add_feature(
                                feature_type, feature_name, tag_value
                            )
                        else:
                            if isinstance(tag_value, list):
                                # if it is an empty list, turn it into a {}
                                if len(tag_value) == 0:
                                    tag_value = {}
                                else:
                                    raise ValueError(
                                        f"Tag values cannot be a list of lists {tag_value}"
                                    )
                            # Dictionary tag values are expanded into Tag
                            # instances via keyword arguments.
                            new_content_node.add_feature(
                                feature_type, feature_name, Tag(**tag_value)
                            )
                else:
                    # It's a single tag
                    if isinstance(feature_value, Tag):
                        new_content_node.add_feature(
                            feature_type, feature_name, feature_value
                        )
                    else:
                        new_content_node.add_feature(
                            feature_type, feature_name, Tag(**feature_value)
                        )
            else:
                # For non-tag features, check if it's a list of values
                if isinstance(feature_value, list):
                    # Add each value in the list individually
                    for value in feature_value:
                        new_content_node.add_feature(feature_type, feature_name, value)
                else:
                    # Add the single value directly
                    new_content_node.add_feature(
                        feature_type, feature_name, feature_value
                    )

        # Recurse into the serialized children, attaching them to this node.
        for dict_child in content_node_dict["children"]:
            ContentNode.from_dict(document, dict_child, new_content_node)

        return new_content_node
|
|
725
|
+
|
|
726
|
+
    def add_child_content(
        self, node_type: str, content: str, index: Optional[int] = None
    ) -> "ContentNode":
        """Convenience method to allow you to quick add a child node with a type and content

        Args:
            node_type: the node type
            content: the content
            index: the index (optional) (Default value = None)

        Returns:
            the new ContentNode

        """
        # Create the node attached to this parent, then register it as a
        # child at the requested index.
        new_node = self.document.create_node(
            node_type=node_type, parent=self, content=content
        )
        self.add_child(new_node, index)
        return new_node
|
|
745
|
+
|
|
746
|
+
    def add_child(self, child: "ContentNode", index: Optional[int] = None):
        """Add a ContentNode as a child of this ContentNode

        Args:
            child (ContentNode): The node that will be added as a child of this node
            index (Optional[int]): The index at which this child node should be added; defaults to None. If None, index is set as the count of child node elements.

        Returns:

        >>> new_page = document.create_node(node_type='page')
        <kodexa_document.model.ContentNode object at 0x7f80605e53c8>
        >>> current_content_node.add_child(new_page)
        """
        # Don't do any index manipulation for virtual nodes
        if child.virtual:
            child.index = index if index is not None else 0
            # Skip directly to setting the parent without persistence
            # NOTE(review): a falsy id (0) also maps to None here — confirm
            # persistence never issues node id 0.
            child._parent_id = self.id if self.id else None
            return

        existing_children = self.get_children()
        num_existing_children = len(existing_children)

        final_child_index: int

        if index is None:
            # If no index is provided, append the child.
            final_child_index = num_existing_children
            # No shifting of other children is needed for an append.
        else:
            # An index is provided. This is an insertion.
            # The new child will take this index. Assumes index is non-negative.
            final_child_index = index

            # Existing children at or after this insertion point need their indices incremented.
            children_to_shift = [
                ec
                for ec in existing_children
                if ec.index is not None and ec.index >= final_child_index
            ]

            # Sort children to be shifted by their current index in descending order
            # to prevent index collisions during sequential updates.
            children_to_shift.sort(key=lambda c: c.index, reverse=True)

            for c_to_shift in children_to_shift:
                c_to_shift.index += 1
                # Persist the updated index for each shifted child.
                self.document.get_persistence().update_node(c_to_shift)

        # Set the index on the child before adding it
        child.index = final_child_index

        # Add the child to the persistence layer
        self.document.get_persistence().add_content_node(child, self)
|
|
801
|
+
|
|
802
|
+
def remove_child(self, content_node):
    """Remove the given child node from this node via the persistence layer.

    Args:
        content_node (ContentNode): the child node to remove
    """
    persistence = self.document.get_persistence()
    persistence.remove_content_node(content_node)
|
|
804
|
+
|
|
805
|
+
def get_children(self) -> List["ContentNode"]:
    """Return the list of child nodes of this node.

    The children are fetched from the document's persistence layer.

    Returns:
        list[ContentNode]: the child nodes of this ContentNode

    >>> node.get_children()
    """
    persistence = self.document.get_persistence()
    return persistence.get_children(self)
|
|
814
|
+
|
|
815
|
+
def set_feature(self, feature_type, name, value):
    """Set a feature on this node, replacing any existing feature with the
    same type and name.

    Args:
        feature_type (str): the type of the feature
        name (str): the name of the feature
        value (Any): the value of the feature

    Returns:
        ContentFeature: the feature that was added to this ContentNode

    >>> new_page = document.create_node(node_type='page')
    <kodexa_document.model.ContentNode object at 0x7f80605e53c8>
    >>> new_page.add_feature('pagination','pageNum',1)
    """
    # Drop any existing feature first so the add is effectively a replace.
    self.remove_feature(feature_type, name)
    new_feature = self.add_feature(feature_type, name, value)
    return new_feature
|
|
832
|
+
|
|
833
|
+
def update_feature(self, feature: "ContentFeature"):
    """Replace the stored copy of the given feature in document persistence.

    The existing feature with the same type and name is removed and the
    supplied feature is written in its place.

    :param feature: the feature to write back
    :return: None
    """
    persistence = self.document.get_persistence()
    persistence.remove_feature(self, feature.feature_type, feature.name)
    persistence.add_feature(self, feature, replace=True)
|
|
844
|
+
|
|
845
|
+
def add_feature(self, feature_type, name, value):
    """Add a new feature to this ContentNode.

    For tag features this adds a new tag instance; multiple tags of the same
    type/name can coexist, each with its own data and UUID, sharing one
    underlying feature entry. For non-tag features the persistence layer
    behavior for `replace=False` applies.

    Args:
        feature_type (str): the type of feature to add
        name (str): the name of the feature
        value (Any): the value of the feature (a Tag for tag features,
            arbitrary data otherwise)

    Returns:
        ContentFeature: the ContentFeature that was constructed and stored
    """
    if feature_type == "tag" and not isinstance(value, Tag):
        # Normalize raw dicts (or lists of dicts) into Tag instances.
        if isinstance(value, list):
            value = [item if isinstance(item, Tag) else Tag(**item) for item in value]
        else:
            value = Tag(**value)

    new_feature = ContentFeature(feature_type, name, value)
    self.document.get_persistence().add_feature(self, new_feature, replace=False)
    return new_feature
|
|
876
|
+
|
|
877
|
+
def delete_children(
    self, nodes: Optional[List] = None, exclude_nodes: Optional[List] = None
):
    """Delete the children of this node.

    You may supply either a list of the nodes to delete or a list of nodes to
    keep; if neither is supplied all children are deleted. If `nodes` is
    provided it takes precedence and `exclude_nodes` is ignored.

    Args:
        nodes: Optional[List[ContentNode]] children to delete
            (Default value = None)
        exclude_nodes: Optional[List[ContentNode]] children NOT to delete
            (Default value = None)
    """
    existing_children = self.get_children()

    if nodes is not None:
        # Delete only the children whose id appears in the supplied list.
        ids_to_delete = {n.id for n in nodes}
        children_to_delete = [c for c in existing_children if c.id in ids_to_delete]
    elif exclude_nodes is not None:
        # Delete every child whose id is NOT in the exclusion list.
        # (The previous implementation appended a child once per
        # non-matching exclusion, which both duplicated deletions and
        # deleted excluded children when more than one exclusion was given.)
        ids_to_keep = {n.id for n in exclude_nodes}
        children_to_delete = [c for c in existing_children if c.id not in ids_to_keep]
    else:
        children_to_delete = list(existing_children)

    for child_to_delete in children_to_delete:
        # Re-check membership in case an earlier removal already detached it.
        if child_to_delete in self.get_children():
            self.document.get_persistence().remove_content_node(child_to_delete)
|
|
912
|
+
|
|
913
|
+
def get_feature(self, feature_type, name):
    """Get the feature with the given type and name.

    Args:
        feature_type (str): the type of the feature
        name (str): the name of the feature

    Returns:
        ContentFeature or None: the first feature with the specified type and
        name, or None if no such feature exists. If multiple instances exist
        only the first one is returned.

    >>> new_page.get_feature('pagination','pageNum')
    1
    """
    for feature in self.get_features():
        if feature.feature_type == feature_type and feature.name == name:
            return feature
    return None
|
|
936
|
+
|
|
937
|
+
def get_features_of_type(self, feature_type):
    """Get all features of a specific type.

    Args:
        feature_type (str): the type of the feature

    Returns:
        list[ContentFeature]: the features with the specified type; an empty
        list when none are found

    >>> new_page.get_features_of_type('my_type')
    []
    """
    matches = []
    for feature in self.get_features():
        if feature.feature_type == feature_type:
            matches.append(feature)
    return matches
|
|
950
|
+
|
|
951
|
+
def has_feature(self, feature_type: str, name: str):
    """Determine whether a feature with the given type and name exists.

    Args:
        feature_type (str): the type of the feature
        name (str): the name of the feature

    Returns:
        bool: True if the feature is present; otherwise False

    >>> new_page.has_feature('pagination','pageNum')
    True
    """
    return any(
        feature.feature_type == feature_type and feature.name == name
        for feature in self.get_features()
    )
|
|
974
|
+
|
|
975
|
+
def get_features(self) -> List["ContentFeature"]:
    """Return all features on this ContentNode.

    Returns:
        list[ContentFeature]: the features stored for this node in the
        document's persistence layer
    """
    persistence = self.document.get_persistence()
    return persistence.get_features(self)
|
|
983
|
+
|
|
984
|
+
def remove_feature(
    self, feature_type: str, name: str, include_children: bool = False
):
    """Remove the feature with the given type and name from this node.

    Args:
        feature_type (str): the type of the feature
        name (str): the name of the feature
        include_children (bool): when True, also remove the feature from all
            descendants of this node

    >>> new_page.remove_feature('pagination','pageNum')
    """
    self.document.get_persistence().remove_feature(self, feature_type, name)

    if include_children:
        # Recurse so the removal cascades through the whole subtree.
        for child_node in self.get_children():
            child_node.remove_feature(feature_type, name, include_children)
|
|
1001
|
+
|
|
1002
|
+
def get_feature_value(self, feature_type: str, name: str) -> Optional[Any]:
    """Get the value of the feature with the given type and name.

    Args:
        feature_type (str): the type of the feature
        name (str): the name of the feature

    Returns:
        Any or None: the value of the feature if it exists, otherwise None.
        Only the first value is returned (check `single` to determine
        whether multiple values exist).

    >>> new_page.get_feature_value('pagination','pageNum')
    1
    """
    feature = self.get_feature(feature_type, name)
    if feature is None:
        return None
    return feature.value
|
|
1019
|
+
|
|
1020
|
+
def get_feature_values(self, feature_type: str, name: str) -> Optional[List[Any]]:
    """Get the stored value of the feature with the given type and name.

    Args:
        feature_type (str): the type of the feature
        name (str): the name of the feature

    Returns:
        The feature's stored value (for multi-valued features this is the
        list of values), or None when the feature does not exist.
        NOTE(review): despite the name this returns ``feature.value``
        unchanged, exactly as :meth:`get_feature_value` does — confirm
        whether the value is always a list for multi-valued features.

    >>> new_page.get_feature_value('pagination','pageNum')
    1
    """
    feature = self.get_feature(feature_type, name)
    if feature is None:
        return None
    return feature.value
|
|
1037
|
+
|
|
1038
|
+
def get_content(self):
    """Return the content of this node.

    Returns:
        str: the content of this ContentNode

    >>> new_page.get_content()
    "This is page one"
    """
    return self.content
|
|
1050
|
+
|
|
1051
|
+
def get_node_type(self):
    """Return the type of this node.

    Returns:
        str: the type of this ContentNode

    >>> new_page.get_node_type()
    "page"
    """
    return self.node_type
|
|
1063
|
+
|
|
1064
|
+
def select_first(self, selector, variables=None) -> Optional["ContentNode"]:
    """Select and return the first child of this node matching the selector.

    Args:
        selector (str): the selector (ie. //*)
        variables (dict, optional): variable name/value pairs for
            substitution; keys should match variables in the selector.
            Defaults to None.

    Returns:
        Optional[ContentNode]: the first matching node, or None

    >>> document.get_root().select_first('.')
    ContentNode

    >>> document.get_root().select_first('//*[hasTag($tagName)]', {"tagName": "div"})
    ContentNode
    """
    matches = self.select(selector, variables)
    return matches[0] if matches else None
|
|
1082
|
+
|
|
1083
|
+
def select(self, selector, variables=None, first_only=False):
    """Select and return the child nodes of this node matching the selector.

    Args:
        selector (str): the selector (ie. //*)
        variables (dict, optional): variable name/value pairs for
            substitution; keys should match variables in the selector.
            Defaults to None.
        first_only (bool, optional): when True, only the first matching node
            is returned; defaults to False.

    Returns:
        list[ContentNode]: the matching content nodes (empty when none match)

    >>> document.get_root().select('.')
    [ContentNode]

    >>> document.get_root().select('//*[hasTag($tagName)]', {"tagName": "div"})
    [ContentNode]
    """
    from kodexa_document.selectors import parse
    from kodexa_document.selectors.parser import SelectorContext

    selector_variables = variables if variables is not None else {}
    selector_context = SelectorContext(self.document, first_only=first_only)
    return parse(selector).resolve(self, selector_variables, selector_context)
|
|
1109
|
+
|
|
1110
|
+
def get_all_content(self, separator=" ", strip=True):
    """Return this node's content concatenated with all of its children's.

    Content parts are assembled in order: string parts are appended directly,
    integer parts reference the index of a child node whose full content is
    inserted at that position. Children whose index does not appear in the
    content parts are appended at the end.

    Args:
        separator (str, optional): separator used between parts; defaults to " "
        strip (bool, optional): strip the final result; defaults to True

    Returns:
        str: the combined content of this node and all descendants

    >>> document.content_node.get_all_content()
    "This string is made up of multiple nodes"
    """
    children = self.get_children()
    parts = self.get_content_parts()

    assembled = ""
    for part in parts:
        if isinstance(part, str):
            fragment = part
        elif isinstance(part, int):
            # An int part points at the child with that index.
            fragment = [
                c.get_all_content(separator, strip=strip)
                for c in children
                if c.index == part
            ][0]
        else:
            continue
        if assembled:
            assembled += separator
        assembled += fragment

    # Append any children whose index was not referenced in the parts list.
    for child in children:
        if child.index not in parts:
            if assembled:
                assembled += separator
            assembled += child.get_all_content(separator, strip=strip)

    return assembled.strip() if strip else assembled
|
|
1148
|
+
|
|
1149
|
+
def adopt_children(self, nodes_to_adopt, replace=False):
    """Adopt the given content nodes under this node, re-parenting them.

    Args:
        nodes_to_adopt (List[ContentNode]): nodes added to the end of this
            node's children collection
        replace (bool): when True, remove all current children that are not
            in the adopted list; defaults to False

    >>> # select all nodes of type 'line', then the root node 'adopts' them
    >>> # and replaces all it's existing children with these 'line' nodes.
    >>> document.get_root().adopt_children(document.select('//line'), replace=True)
    """
    persistence = self.document.get_persistence()
    position = 0

    # Work on a copy since the adoption below may mutate the original list.
    adopted = nodes_to_adopt.copy()

    for current_child in self.get_children():
        if current_child not in adopted:
            # Not being adopted: keep it, just renumber its position.
            current_child.index = position
        else:
            # Already a child AND adopted: take its slot in the adopted list.
            current_child.index = adopted.index(current_child)
            current_child._parent_id = self.id
        persistence.update_node(current_child)
        position += 1

    # Copy again so we don't mutate the list while iterating.
    for incoming in adopted.copy():
        if incoming not in self.get_children():
            self.add_child(incoming, adopted.index(incoming))
            position += 1

    if replace:
        # Drop every current child that is not part of the adopted set.
        for child in self.get_children().copy():
            if child not in adopted:
                self.remove_child(child)
|
|
1186
|
+
|
|
1187
|
+
def remove_tag(self, tag_name):
    """Remove the tag feature with the given name from this node.

    Args:
        tag_name (str): the name of the tag to remove

    >>> document.get_root().remove_tag('foo')
    """
    self.remove_feature("tag", tag_name)
|
|
1199
|
+
|
|
1200
|
+
def set_statistics(self, statistics):
    """Set the spatial statistics for this node.

    Args:
        statistics: the statistics object

    >>> document.select.('//page')[0].set_statistics(NodeStatistics())
    """
    self.add_feature("spatial", "statistics", statistics)
|
|
1211
|
+
|
|
1212
|
+
def get_statistics(self):
    """Get the spatial statistics for this node.

    Returns:
        the statistics object, or None if not set

    >>> document.select.('//page')[0].get_statistics()
    <kodexa.spatial.NodeStatistics object at 0x7f80605e53c8>
    """
    return self.get_feature_value("spatial", "statistics")
|
|
1226
|
+
|
|
1227
|
+
def set_bbox(self, bbox):
    """Set the bounding box for the node, structured as [x1, y1, x2, y2].

    Args:
        bbox: the bounding box array

    >>> document.select.('//page')[0].set_bbox([10,20,50,100])
    """
    self.set_feature("spatial", "bbox", bbox)
|
|
1239
|
+
|
|
1240
|
+
def get_bbox(self):
    """Get the bounding box for the node, structured as [x1, y1, x2, y2].

    Some stores wrap the box in a single-element list; that wrapper is
    unwrapped here.

    :return: the bounding box array, or None when no bbox feature exists

    >>> document.select.('//page')[0].get_bbox()
    [10,20,50,100]
    """
    bbox_value = self.get_feature_value("spatial", "bbox")
    if bbox_value is None:
        return None

    if len(bbox_value) == 4:
        return bbox_value

    if len(bbox_value) == 1:
        # Unwrap a bbox stored as a single-element list.
        return bbox_value[0]

    # Fall back to the raw stored value (previously this re-queried
    # persistence for the same value — a redundant second lookup).
    return bbox_value
|
|
1262
|
+
|
|
1263
|
+
def set_bbox_from_children(self):
    """Set this node's bounding box to the union of its children's boxes.

    Children without a bbox are ignored; if no child has a bbox this node's
    bbox is left unchanged. Explicit ``is None`` checks are used so that a
    coordinate of 0 is handled correctly (the previous truthiness checks
    overwrote a legitimate 0 minimum and skipped ``set_bbox`` entirely when
    x_min was 0).
    """
    x_min = x_max = y_min = y_max = None

    for child in self.get_children():
        child_bbox = child.get_bbox()
        if not child_bbox:
            continue
        if x_min is None or child_bbox[0] < x_min:
            x_min = child_bbox[0]
        if x_max is None or child_bbox[2] > x_max:
            x_max = child_bbox[2]
        if y_min is None or child_bbox[1] < y_min:
            y_min = child_bbox[1]
        if y_max is None or child_bbox[3] > y_max:
            y_max = child_bbox[3]

    if x_min is not None:
        self.set_bbox([x_min, y_min, x_max, y_max])
|
|
1285
|
+
|
|
1286
|
+
def set_rotate(self, rotate):
    """Set the rotation of the node.

    Args:
        rotate: the rotation of the node

    >>> document.select.('//page')[0].set_rotate(90)
    """
    self.add_feature("spatial", "rotate", rotate)
|
|
1297
|
+
|
|
1298
|
+
def get_rotate(self):
    """Get the rotation of the node.

    :return: the rotation of the node

    >>> document.select.('//page')[0].get_rotate()
    90
    """
    return self.get_feature_value("spatial", "rotate")
|
|
1312
|
+
|
|
1313
|
+
def get_x(self):
    """Get the X position (left edge, x1) of the node's bounding box.

    :return: the X position, or None when the node has no bbox

    >>> document.select.('//page')[0].get_x()
    10
    """
    bbox = self.get_bbox()
    return bbox[0] if bbox else None
|
|
1331
|
+
|
|
1332
|
+
def get_y(self):
    """Get the Y position (top edge, y1) of the node's bounding box.

    :return: the Y position, or None when the node has no bbox

    >>> document.select.('//page')[0].get_y()
    90
    """
    bbox = self.get_bbox()
    return bbox[1] if bbox else None
|
|
1350
|
+
|
|
1351
|
+
def get_width(self):
    """Get the width of the node's bounding box (x2 - x1).

    :return: the width, or None when the node has no bbox

    >>> document.select.('//page')[0].get_width()
    70
    """
    bbox = self.get_bbox()
    return bbox[2] - bbox[0] if bbox else None
|
|
1369
|
+
|
|
1370
|
+
def get_height(self):
    """Get the height of the node's bounding box (y2 - y1).

    :return: the height, or None when the node has no bbox

    >>> document.select.('//page')[0].get_height()
    40
    """
    bbox = self.get_bbox()
    return bbox[3] - bbox[1] if bbox else None
|
|
1388
|
+
|
|
1389
|
+
def copy_tag(self, selector=".", existing_tag_name=None, new_tag_name=None):
    """Copy the values of an existing tag onto a new tag on selected nodes.

    Creates a tag named `new_tag_name` on every node matched by `selector`,
    carrying the same information as the tag named `existing_tag_name`.
    Both names are required and must differ; otherwise nothing happens. Nodes
    that lack the existing tag are skipped.

    Args:
        selector: selector identifying the source nodes (default `.`)
        existing_tag_name: name of the tag whose values are copied
            (Default value = None)
        new_tag_name: name of the new tag; must differ from
            existing_tag_name (Default value = None)

    >>> document.get_root().copy_tag('foo', 'bar')
    """
    # Nothing to do without both names, or when they are identical.
    if existing_tag_name is None or new_tag_name is None:
        return
    if existing_tag_name == new_tag_name:
        return

    for node in self.select(selector):
        existing_values = node.get_feature_values("tag", existing_tag_name)
        if not existing_values:
            continue
        for existing in existing_values:
            copied_tag = Tag(
                start=existing.start,
                end=existing.end,
                value=existing.value,
                uuid=existing.uuid,
                data=existing.data if existing.data else {},
            )
            node.add_feature("tag", new_tag_name, copied_tag)
|
|
1424
|
+
|
|
1425
|
+
def collect_nodes_to(self, end_node):
    """Collect the sibling nodes between this node and `end_node`.

    Walks forward from this node via `next_node()` until `end_node` is
    reached (exclusive) or there is no next sibling.

    Args:
        end_node (ContentNode): the node to stop at

    Returns:
        list[ContentNode]: the sibling nodes between this node and end_node

    >>> document.content_node.get_children()[0].collect_nodes_to(end_node=document.content_node.get_children()[5])
    """
    collected = []
    current = self
    while current.id != end_node.id:
        collected.append(current)
        if not current.has_next_node():
            break
        current = current.next_node()
    return collected
|
|
1446
|
+
|
|
1447
|
+
def tag_nodes_to(self, end_node, tag_to_apply, tag_uuid: str = None):
    """Tag every node from this node up to `end_node` with the given tag.

    Args:
        end_node (ContentNode): the node to end with
        tag_to_apply (str): the tag name applied to each node
        tag_uuid (str): the tag uuid used if you want to group them

    >>> document.content_node.get_children()[0].tag_nodes_to(document.content_node.get_children()[5], tag_name='foo')
    """
    # A plain loop rather than a side-effect list comprehension: no
    # throwaway list is built and the intent (side effects) is explicit.
    for node in self.collect_nodes_to(end_node):
        node.tag(tag_to_apply, tag_uuid=tag_uuid)
|
|
1461
|
+
|
|
1462
|
+
def tag_range(
    self,
    start_content_re,
    end_content_re,
    tag_to_apply,
    node_type_re=".*",
    use_all_content=False,
):
    """Tag all child nodes between the start and end content patterns.

    Args:
        start_content_re: regular expression matching the starting child
        end_content_re: regular expression matching the ending child
        tag_to_apply: the tag name applied to the nodes in range
        node_type_re: the node type to match (default is all)
        use_all_content: match against full content including child nodes
            (default is False)

    >>> document.content_node.tag_range(start_content_re='.*Cheese.*', end_content_re='.*Fish.*', tag_to_apply='foo')
    """
    # Could be line, word, or content-area.
    candidate_nodes = self.select(f"//*[typeRegex('{node_type_re}')]")

    # Compile once instead of per-node.
    start_pattern = re.compile(start_content_re)
    end_pattern = re.compile(end_content_re)

    def node_text(node):
        # Full recursive content, or just this node's own content.
        return node.get_all_content() if use_all_content else node.content

    start_matches = [
        i for i, node in enumerate(candidate_nodes) if start_pattern.match(node_text(node))
    ]
    end_matches = [
        i for i, node in enumerate(candidate_nodes) if end_pattern.match(node_text(node))
    ]

    if start_content_re == "":
        start_index = 0
    else:
        start_index = start_matches[0] if start_matches else None

    if start_index is not None:
        # The end match must not precede the start match.
        end_matches = [i for i in end_matches if i >= start_index]

    if end_content_re == "":
        end_index = len(candidate_nodes)
    else:
        end_index = end_matches[0] if end_matches else len(candidate_nodes)

    if start_index is not None:
        for node in candidate_nodes[start_index:end_index]:
            node.tag(tag_to_apply)
|
|
1518
|
+
|
|
1519
|
+
def tag(
|
|
1520
|
+
self,
|
|
1521
|
+
tag_to_apply,
|
|
1522
|
+
selector=".",
|
|
1523
|
+
content_re=None,
|
|
1524
|
+
use_all_content=False,
|
|
1525
|
+
node_only=None,
|
|
1526
|
+
fixed_position=None,
|
|
1527
|
+
data=None,
|
|
1528
|
+
separator=" ",
|
|
1529
|
+
tag_uuid: str = None,
|
|
1530
|
+
confidence=None,
|
|
1531
|
+
value=None,
|
|
1532
|
+
use_match=True,
|
|
1533
|
+
index=None,
|
|
1534
|
+
cell_index=None,
|
|
1535
|
+
group_uuid=None,
|
|
1536
|
+
parent_group_uuid=None,
|
|
1537
|
+
note=None,
|
|
1538
|
+
status=None,
|
|
1539
|
+
owner_uri=None,
|
|
1540
|
+
is_dirty=None,
|
|
1541
|
+
sort_by_bbox: bool = False,
|
|
1542
|
+
):
|
|
1543
|
+
"""
|
|
1544
|
+
This will tag (see Feature Tagging) the expression groups identified by the regular expression.
|
|
1545
|
+
|
|
1546
|
+
Note that if you use the flag use_all_content then node_only will default to True if not set, else it
|
|
1547
|
+
will default to False
|
|
1548
|
+
|
|
1549
|
+
Args:
|
|
1550
|
+
tag_to_apply: The name of tag that will be applied to the node
|
|
1551
|
+
selector: The selector to identify the source nodes to work on (default . - the current node)
|
|
1552
|
+
content_re: The regular expression that you wish to use to tag, note that we will create a tag for each matching group (Default value = None)
|
|
1553
|
+
use_all_content: Apply the regular expression to the all_content (include content from child nodes) (Default value = False)
|
|
1554
|
+
separator: Separator to use for use_all_content (Default value = " ")
|
|
1555
|
+
node_only: Ignore the matching groups and tag the whole node (Default value = None)
|
|
1556
|
+
fixed_position: Use a fixed position, supplied as a tuple i.e. - (4,10) tag from position 4 to 10 (default None)
|
|
1557
|
+
data: A dictionary of data for the given tag (Default value = None)
|
|
1558
|
+
tag_uuid: A UUID used to tie tags in order to demonstrate they're related and form a single concept.
|
|
1559
|
+
For example, if tagging the two words "Wells" and "Fargo" as an ORGANIZATION, the tag on both words should have the
|
|
1560
|
+
same tag_uuid in order to indicate they are both needed to form the single ORGANIZATION. If a tag_uuid is provided, it is used
|
|
1561
|
+
on all tags created in this method. This may result in multiple nodes or multiple feature values having the same tag_uuid.
|
|
1562
|
+
For example, if the selector provided results in more than one node being selected, each node would be tagged with the same tag_uuid.
|
|
1563
|
+
The same holds true if a content_re value is provided, node_only is set to False, and multiple matches are found for the content_re
|
|
1564
|
+
pattern. In that case, each feature value would share the same UUID.
|
|
1565
|
+
If no tag_uuid is provided, a new uuid is generated for each tag instance.
|
|
1566
|
+
tag_uuid: str: (Default value = None)
|
|
1567
|
+
confidence: The confidence in the tag (0-1)
|
|
1568
|
+
value: The value you wish to store with the tag, this allows you to provide text that isn't part of the content but represents the data you wish tagged
|
|
1569
|
+
use_match: If True (default) we will use match for regex matching, if False we will use search
|
|
1570
|
+
index: The index for the tag
|
|
1571
|
+
cell_index: The cell index for the tag
|
|
1572
|
+
group_uuid: The group uuid for the tag
|
|
1573
|
+
parent_group_uuid: The parent group uuid for the tag
|
|
1574
|
+
note: a text note for the tag
|
|
1575
|
+
status: a status for the tag, this can be transistioned to an attribute status during extraction
|
|
1576
|
+
owner_uri: the uri of the entity that created the tag (model vs user; example: model://cdad-healthcare/cdad-excel-model:1.0.0 or user://pdodds)
|
|
1577
|
+
is_dirty: when the model is run, is_dirty = false for all tags. New tags and editted tags, is_dirty = true.
|
|
1578
|
+
|
|
1579
|
+
>>> document.content_node.tag('is_cheese')
|
|
1580
|
+
"""
|
|
1581
|
+
|
|
1582
|
+
if use_all_content and node_only is None:
|
|
1583
|
+
node_only = True
|
|
1584
|
+
elif node_only is None:
|
|
1585
|
+
node_only = False
|
|
1586
|
+
|
|
1587
|
+
def get_tag_uuid(tag_uuid):
    """Return the supplied tag UUID, or mint a fresh one when none was given.

    Args:
        tag_uuid (str): An existing tag UUID, or a falsy value (None/"").

    Returns:
        str: The original ``tag_uuid`` when truthy, otherwise a newly
        generated random UUID string.
    """
    return tag_uuid if tag_uuid else str(uuid.uuid4())
|
|
1601
|
+
|
|
1602
|
+
def tag_node_position(
    node_to_check,
    start,
    end,
    node_data,
    tag_uuid,
    offset=0,
    value=None,
    sort_by_bbox: bool = False,
):
    """
    This function tags a node position in a given data structure. It iterates over the content parts of the node to check,
    and based on the type of the part (string or integer), it performs different operations. If the part is a string, it
    adjusts the start and end positions and adds a feature to the node. If the part is an integer, it finds the corresponding
    child node and recursively calls the function on the child node. After processing all parts, it checks for any missing
    children and processes them as well. Finally, it checks if the length of all content matches the calculated content length.

    NOTE: this is a closure — it reads `separator`, `tag_to_apply`, `confidence`,
    `index`, `parent_group_uuid`, `group_uuid`, `cell_index`, `note`, `status`,
    `owner_uri`, `is_dirty` and the `Tag` type from the enclosing `tag(...)` scope.

    Args:
        node_to_check (Node): The node to check and tag.
        start (int): The start position of the tag.
        end (int): The end position of the tag.
        node_data (dict): The data associated with the node.
        tag_uuid (str): The UUID of the tag.
        offset (int, optional): The offset to apply. Defaults to 0.
        value (str, optional): The value to use for the tag. If None, the part of the content at the start and end positions is used. Defaults to None.
        sort_by_bbox (bool, optional): When True, children are visited in left-to-right
            bbox order instead of index order. Defaults to False.

    Raises:
        Exception: If an invalid part is encountered in the content parts of the node to check.
        Exception: If there is a mismatch between the length of all content and the calculated content length.

    Returns:
        int: The calculated content length, or -1 once the tag has been fully applied
        (a sentinel that tells the recursive callers to stop).
    """
    content_length = 0
    original_start = start
    original_end = end
    for part_idx, part in enumerate(node_to_check.get_content_parts()):
        if isinstance(part, str):
            if len(part) > 0:
                # It is just content
                part_length = len(part)
                if part_idx > 0:
                    # Every part after the first is preceded by a separator;
                    # consume its length from the running window.
                    end = end - len(separator)
                    content_length = content_length + len(separator)
                    offset = offset + len(separator)
                    start = (
                        0
                        if start - len(separator) < 0
                        else start - len(separator)
                    )

                if start < part_length and end < part_length:
                    # The whole [start, end) window fits inside this part:
                    # tag it and signal completion with -1.
                    node_to_check.add_feature(
                        "tag",
                        tag_to_apply,
                        Tag(
                            original_start,
                            original_end,
                            part[start:end] if value is None else value,
                            data=node_data,
                            uuid=tag_uuid,
                            confidence=confidence,
                            index=index,
                            parent_group_uuid=parent_group_uuid,
                            group_uuid=group_uuid,
                            cell_index=cell_index,
                            note=note,
                            status=status,
                            owner_uri=owner_uri,
                            is_dirty=is_dirty,
                        ),
                    )
                    return -1
                if start < part_length <= end:
                    # The window starts in this part but continues past it:
                    # tag the tail of this part and keep scanning.
                    node_to_check.add_feature(
                        "tag",
                        tag_to_apply,
                        Tag(
                            original_start,
                            content_length + part_length,
                            value=part[start:] if value is None else value,
                            data=node_data,
                            uuid=tag_uuid,
                            confidence=confidence,
                            index=index,
                            parent_group_uuid=parent_group_uuid,
                            group_uuid=group_uuid,
                            cell_index=cell_index,
                            note=note,
                            status=status,
                            owner_uri=owner_uri,
                            is_dirty=is_dirty,
                        ),
                    )

                # Slide the window past this part.
                end = end - part_length
                content_length = content_length + part_length
                offset = offset + part_length
                start = 0 if start - part_length < 0 else start - part_length

        elif isinstance(part, int):
            # An int part is a reference to a child node by its index.
            child_node = [
                child
                for child in node_to_check.get_children()
                if child.index == part
            ][0]

            if part_idx > 0:
                end = end - len(separator)
                content_length = content_length + len(separator)
                offset = offset + len(separator)
                start = (
                    0 if start - len(separator) < 0 else start - len(separator)
                )

            result = tag_node_position(
                child_node,
                start,
                end,
                node_data,
                tag_uuid,
                offset=offset,
                value=value,
                sort_by_bbox=sort_by_bbox,
            )

            # -1 means the child finished the tag; (end - result) <= 0 means the
            # window was exhausted inside the child.
            if result < 0 or (end - result) <= 0:
                return -1

            offset = offset + result
            end = end - result
            start = 0 if start - result < 0 else start - result

            content_length = content_length + result
        else:
            raise Exception("Invalid part?")

    # We need to determine if we have missing children and add them to the end
    node_children = node_to_check.get_children()
    if node_children and sort_by_bbox:
        # Sort nodes by x-coordinate if they have bboxes, otherwise use index
        try:
            node_children.sort(
                key=lambda x: (
                    x.get_bbox()[0]
                    if hasattr(x, "get_bbox")
                    else x.index if hasattr(x, "index") else 0
                )
            )
        except (AttributeError, TypeError, IndexError):
            # If sorting fails, keep original order
            pass

    for child_idx, child_node in enumerate(node_children):
        # Children not referenced by a content part are processed after the parts.
        if child_node.index not in node_to_check.get_content_parts():
            if content_length > 0:
                end = end - len(separator)
                content_length = content_length + len(separator)
                offset = offset + len(separator)
                start = (
                    0 if start - len(separator) < 0 else start - len(separator)
                )

            result = tag_node_position(
                child_node,
                start,
                end,
                node_data,
                tag_uuid,
                offset=offset,
                value=value,
                sort_by_bbox=sort_by_bbox,
            )

            if result < 0 or (end - result) <= 0:
                return -1

            offset = offset + result
            end = end - result
            start = 0 if start - result < 0 else start - result

            content_length = content_length + result

    # Sanity check: the characters we accounted for must equal the node's
    # full (unstripped) content length, otherwise the structure is corrupt.
    if len(node_to_check.get_all_content(strip=False)) != content_length:
        raise Exception(
            f"There is a problem in the structure? (2) Length mismatch ({len(node_to_check.get_all_content(strip=False))} != {content_length})"
        )

    return content_length
|
|
1791
|
+
|
|
1792
|
+
if content_re:
|
|
1793
|
+
pattern = re.compile(
|
|
1794
|
+
content_re.replace(" ", r"\s+")
|
|
1795
|
+
if use_all_content and not node_only
|
|
1796
|
+
else content_re
|
|
1797
|
+
)
|
|
1798
|
+
|
|
1799
|
+
for node in self.select(selector):
|
|
1800
|
+
if fixed_position:
|
|
1801
|
+
tag_node_position(
|
|
1802
|
+
node,
|
|
1803
|
+
fixed_position[0],
|
|
1804
|
+
fixed_position[1],
|
|
1805
|
+
data,
|
|
1806
|
+
get_tag_uuid(tag_uuid),
|
|
1807
|
+
0,
|
|
1808
|
+
value=value,
|
|
1809
|
+
sort_by_bbox=sort_by_bbox,
|
|
1810
|
+
)
|
|
1811
|
+
|
|
1812
|
+
else:
|
|
1813
|
+
if not content_re:
|
|
1814
|
+
node.add_feature(
|
|
1815
|
+
"tag",
|
|
1816
|
+
tag_to_apply,
|
|
1817
|
+
Tag(
|
|
1818
|
+
data=data,
|
|
1819
|
+
uuid=get_tag_uuid(tag_uuid),
|
|
1820
|
+
confidence=confidence,
|
|
1821
|
+
value=value,
|
|
1822
|
+
index=index,
|
|
1823
|
+
parent_group_uuid=parent_group_uuid,
|
|
1824
|
+
group_uuid=group_uuid,
|
|
1825
|
+
cell_index=cell_index,
|
|
1826
|
+
note=note,
|
|
1827
|
+
status=status,
|
|
1828
|
+
owner_uri=owner_uri,
|
|
1829
|
+
is_dirty=is_dirty,
|
|
1830
|
+
),
|
|
1831
|
+
)
|
|
1832
|
+
else:
|
|
1833
|
+
if not use_all_content:
|
|
1834
|
+
if node.content:
|
|
1835
|
+
content = node.content
|
|
1836
|
+
else:
|
|
1837
|
+
content = None
|
|
1838
|
+
else:
|
|
1839
|
+
content = (
|
|
1840
|
+
node.get_all_content(separator=separator, strip=False)
|
|
1841
|
+
if not node_only
|
|
1842
|
+
else node.get_all_content(separator=separator)
|
|
1843
|
+
)
|
|
1844
|
+
|
|
1845
|
+
if content is not None:
|
|
1846
|
+
if use_match:
|
|
1847
|
+
matches = pattern.finditer(content)
|
|
1848
|
+
|
|
1849
|
+
if node_only:
|
|
1850
|
+
if any(True for _ in matches):
|
|
1851
|
+
node.add_feature(
|
|
1852
|
+
"tag",
|
|
1853
|
+
tag_to_apply,
|
|
1854
|
+
Tag(
|
|
1855
|
+
data=data,
|
|
1856
|
+
uuid=get_tag_uuid(tag_uuid),
|
|
1857
|
+
confidence=confidence,
|
|
1858
|
+
value=value,
|
|
1859
|
+
index=index,
|
|
1860
|
+
parent_group_uuid=parent_group_uuid,
|
|
1861
|
+
group_uuid=group_uuid,
|
|
1862
|
+
cell_index=cell_index,
|
|
1863
|
+
note=note,
|
|
1864
|
+
status=status,
|
|
1865
|
+
owner_uri=owner_uri,
|
|
1866
|
+
is_dirty=is_dirty,
|
|
1867
|
+
),
|
|
1868
|
+
)
|
|
1869
|
+
else:
|
|
1870
|
+
if matches:
|
|
1871
|
+
for match in matches:
|
|
1872
|
+
start_offset = match.span()[0]
|
|
1873
|
+
end_offset = match.span()[1]
|
|
1874
|
+
tag_node_position(
|
|
1875
|
+
node,
|
|
1876
|
+
start_offset,
|
|
1877
|
+
end_offset,
|
|
1878
|
+
data,
|
|
1879
|
+
get_tag_uuid(tag_uuid),
|
|
1880
|
+
value=value,
|
|
1881
|
+
sort_by_bbox=sort_by_bbox,
|
|
1882
|
+
)
|
|
1883
|
+
|
|
1884
|
+
else:
|
|
1885
|
+
search_match = pattern.search(content)
|
|
1886
|
+
if search_match is not None:
|
|
1887
|
+
start_offset = search_match.span()[0]
|
|
1888
|
+
end_offset = search_match.span()[1]
|
|
1889
|
+
tag_node_position(
|
|
1890
|
+
node,
|
|
1891
|
+
start_offset,
|
|
1892
|
+
end_offset,
|
|
1893
|
+
data,
|
|
1894
|
+
get_tag_uuid(tag_uuid),
|
|
1895
|
+
value=value,
|
|
1896
|
+
sort_by_bbox=sort_by_bbox,
|
|
1897
|
+
)
|
|
1898
|
+
|
|
1899
|
+
def get_tags(self):
    """Return the names of the tags applied to this node.

    Returns:
        list[str]: One entry per tag feature on this node.

    >>> document.content_node.select('*').get_tags()
    ['is_cheese']
    """
    names = []
    for feature in self.get_features_of_type("tag"):
        names.append(feature.name)
    return names
|
|
1913
|
+
|
|
1914
|
+
def get_tag_features(self):
    """Return the tag features attached to this node.

    Returns:
        list[ContentFeature]: Every feature on this node whose type is "tag".

    >>> document.content_node.select('*').get_tag_features()
    [ContentFeature()]
    """
    return list(self.get_features_of_type("tag"))
|
|
1928
|
+
|
|
1929
|
+
def get_tag_values(self, tag_name, include_children=False):
    """Collect the values held by a named tag on this node.

    Args:
        tag_name: The tag name to look up.
        include_children: When True, also collect values from this node's
            descendants (Default value = False).

    Returns:
        list: The tag values found, in document order.
    """
    collected = [tag.value for tag in self.get_tag(tag_name)]

    if include_children:
        for kid in self.get_children():
            collected += kid.get_tag_values(tag_name, include_children)

    return collected
|
|
1949
|
+
|
|
1950
|
+
def get_related_tag_values(
    self,
    tag_name: str,
    include_children: bool = False,
    value_separator: str = " ",
    tag_uuid=None,
):
    """Get the values for a specific tag name, grouped by uuid

    Args:
        tag_name (str): tag name
        include_children (bool): when True, gather nodes via
            document.get_tagged_nodes; otherwise only consider this node
        value_separator (str): the string to be used to join related tag values
        tag_uuid (Optional[str]): only values whose tag uuid equals this are grouped

    Returns:
        a list of the tag values

    """

    def group_tag_values(group_dict, feature_val, tag_uuid, tag_node):
        """
        This function groups tag values if they share the same uuid. It checks if the uuid of the feature value matches the tag uuid.
        If they match, it sets the final value to the feature value if it exists, otherwise it sets it to the tag node content.
        Then, it checks if the uuid is in the value groups keys. If it is, it appends the final value to the group.
        If it's the first occurrence, it sets the group to the final value.

        Args:
            group_dict (dict): The dictionary to group the values in.
            feature_val (dict): The feature value to check.
            tag_uuid (str): The uuid of the tag.
            tag_node (Node): The node of the tag.

        Returns:
            None
        """
        # we know the names of all these tags are the same, but we want to group them if they share the same uuid

        if feature_val["uuid"] != tag_uuid:
            return

        final_value = feature_val["value"] if "value" in feature_val else None
        if final_value is None:
            # fall back to the node's raw content when the tag carries no value
            final_value = tag_node.content

        # NOTE(review): this membership test reads the closure variable
        # `value_groups` rather than the `group_dict` parameter; the sole
        # caller passes `value_groups` as `group_dict`, so they are the
        # same dict and behavior is unaffected.
        if feature_val["uuid"] in value_groups.keys():
            # we've seen this UUID - add its value to the group
            group_dict[feature_val["uuid"]].append(final_value)
        else:
            # first occurrence
            group_dict[feature_val["uuid"]] = [final_value]

    if include_children:
        tagged_nodes = self.document.get_tagged_nodes(tag_name, tag_uuid=tag_uuid)
    else:
        tagged_nodes = self.select(".")

    # uuid -> list of values sharing that tag uuid
    value_groups: Dict[str, Any] = {}
    for tag_node in tagged_nodes:
        tag_feature_vals = tag_node.get_feature_value("tag", tag_name)
        if tag_feature_vals:
            if not isinstance(tag_feature_vals, list):
                tag_feature_vals = [tag_feature_vals]

            for v in tag_feature_vals:
                group_tag_values(value_groups, v, tag_uuid, tag_node)

    # Join each non-empty group into a single string.
    value_strings = []
    for k in value_groups.keys():
        if (
            value_groups[k]
            and len(value_groups[k]) > 0
            and value_groups[k][0] is not None
        ):
            value_strings.append(value_separator.join(value_groups[k]))

    return value_strings
|
|
2026
|
+
|
|
2027
|
+
def get_related_tag_nodes(
    self, tag_name: str, everywhere: bool = False, tag_uuid=None
):
    """Group the nodes carrying a named tag by the tag instance's UUID.

    Args:
        tag_name (str): tag name
        everywhere (bool): when True, consider every tagged node in the
            document; otherwise only this node
        tag_uuid (optional(str)): if set we will only get nodes related to this tag UUID

    Returns:
        dict: Maps each tag UUID to the list of nodes carrying a tag
        instance with that UUID.

    """
    if everywhere:
        candidates = self.document.get_tagged_nodes(tag_name, tag_uuid)
    else:
        candidates = [self]

    # Bucket nodes by the UUID on each of their tag instances; instances
    # without a uuid key are ignored.
    groups = {}
    for node in candidates:
        for instance in node.get_tag(tag_name):
            if "uuid" in instance:
                groups.setdefault(instance["uuid"], []).append(node)

    return groups
|
|
2061
|
+
|
|
2062
|
+
def get_tag(self, tag_name, tag_uuid=None):
    """Return the tag instances for a named tag on this node, always as a list.

    A single match yields [[start, end, value]]; multiple matches within this
    node's content yield one entry per match.

    Args:
        tag_name: The name of the tag
        tag_uuid (Optional): Optionally you can also provide the tag UUID

    Returns:
        A list of tagged locations and values for this label on this node
        (empty list when the tag is absent).

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').get_tag('is_cheese')
    [0,10,'The Cheese Moved']
    """
    details = self.get_feature_value("tag", tag_name)

    if details is None:
        return []

    # Normalize a single instance to a one-element list.
    return details if isinstance(details, list) else [details]
|
|
2085
|
+
|
|
2086
|
+
def get_all_tags(self):
    """Return the distinct tag names applied to this node or any descendant.

    Returns:
        list[str]: The de-duplicated tag names for this node and its children.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').get_all_tags()
    ['is_cheese']
    """
    collected = set(self.get_tags())
    for kid in self.get_children():
        collected.update(kid.get_all_tags())
    return list(collected)
|
|
2102
|
+
|
|
2103
|
+
def has_tags(self):
    """Report whether this node carries any tag features at all.

    Returns:
        bool: True if at least one tag feature exists on this node; else False.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tags()
    True
    """
    return len(self.get_features_of_type("tag")) > 0
|
|
2115
|
+
|
|
2116
|
+
def has_tag(self, tag, include_children=False):
    """Report whether this node (optionally including descendants) has a named tag.

    Args:
        tag(str): The name of the tag.
        include_children(bool): should we include child nodes

    Returns:
        bool: True if the tag is present; else False.

    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tag('is_cheese')
    True
    >>> document.content_node.select_first('//*[contentRegex(".*Cheese.*")]').has_tag('is_fish')
    False
    """
    if any(
        f.feature_type == "tag" and f.name == tag for f in self.get_features()
    ):
        return True
    if include_children:
        return any(kid.has_tag(tag, True) for kid in self.get_children())
    return False
|
|
2140
|
+
|
|
2141
|
+
def is_first_child(self):
    """Report whether this node is its parent's first child (or has no parent).

    Returns:
        bool: True when this node has no parent or its index is 0; else False.

    """
    # A parentless node is treated as a first child by definition.
    return (not self.parent) or self.index == 0
|
|
2154
|
+
|
|
2155
|
+
def is_last_child(self):
    """Report whether this node is its parent's last child (or has no parent).

    Returns:
        bool: True when this node has no parent or its index equals the
        parent's highest child index; else False.

    """
    parent = self.get_parent()
    # A parentless node is treated as a last child by definition.
    if not parent:
        return True

    return self.index == parent.get_last_child_index()
|
|
2167
|
+
|
|
2168
|
+
def get_last_child_index(self):
    """Return the largest index among this node's children.

    Returns:
        int or None: The maximum child index, or None when the node has
        no children. The result is floored at 0 (a child with a negative
        index never lowers it).

    """
    children = self.get_children()
    if not children:
        return None

    return max([child.index for child in children] + [0])
|
|
2185
|
+
|
|
2186
|
+
def get_node_at_index(self, index):
    """Returns the child node at the specified index. If the specified index is outside the first (0), or
    last child's index, None is returned.

    Note: documents allow for sparse representation and child nodes may not have consecutive index numbers.
    If there isn't a child node at the specified index, a 'virtual' node will be returned. This 'virtual' node
    will have the node type of its nearest sibling and will have an index value, but will have no features or content.

    Args:
        index (int): The index (zero-based) for the child node.

    Returns:
        ContentNode or None: Node at index, or None if the index is outside the boundaries of child nodes.

    """
    children = self.get_children()

    if children:
        # First check if we have a real node at this index
        for child in children:
            if child.index == index:
                return child

        # Check if index is before the first child
        if index < children[0].index:
            virtual_node = self.document.create_node(
                node_type=children[0].node_type,
                virtual=True,
                parent=self,
                index=index,
            )
            return virtual_node

        # Check if index is between existing children
        last_child = None
        next_child = None

        for i, child in enumerate(children):
            if child.index < index:
                last_child = child
                # Look for the next child after this index
                if i + 1 < len(children):
                    if children[i + 1].index > index:
                        next_child = children[i + 1]
                        break
            elif child.index > index:
                next_child = child
                break

        # NOTE(review): `next_child` is computed but never read below; only
        # `last_child` and the last child's index decide the outcome.

        # If we have a gap between two nodes that contains our index
        if last_child and index < children[-1].index:
            virtual_node = self.document.create_node(
                node_type=last_child.node_type,
                virtual=True,
                parent=self,
                index=index,
            )
            return virtual_node

        # Index is beyond the last child: no node (real or virtual) to return.
        return None
    else:
        return None
|
|
2248
|
+
|
|
2249
|
+
def has_next_node(self, node_type_re=".*", skip_virtual=False):
    """Report whether a next sibling matching the node-type regex exists.

    Args:
        node_type_re(str, optional, optional): The regular expression to match against the next sibling node's type; default is '.*'.
        skip_virtual(bool, optional, optional): Skip virtual nodes and return the next real node; default is False.

    Returns:
        bool: True if a matching next sibling exists; else False.

    """
    candidate = self.next_node(node_type_re, skip_virtual=skip_virtual)
    return candidate is not None
|
|
2261
|
+
|
|
2262
|
+
def has_previous_node(self, node_type_re=".*", skip_virtual=False):
    """Report whether a previous sibling matching the node-type regex exists.

    Args:
        node_type_re(str, optional, optional): The regular expression to match against the previous sibling node's type; default is '.*'.
        skip_virtual(bool, optional, optional): Skip virtual nodes and return the next real node; default is False.

    Returns:
        bool: True if a matching previous sibling exists; else False.

    """
    candidate = self.previous_node(
        node_type_re=node_type_re, skip_virtual=skip_virtual
    )
    return candidate is not None
|
|
2277
|
+
|
|
2278
|
+
def next_node(
    self,
    node_type_re=".*",
    skip_virtual=False,
    has_no_content=True,
    traverse=Traverse.SIBLING,
):
    """
    Returns the next sibling content node.

    Note: This logic relies on node indexes. Documents allow for sparse representation and child nodes may not have consecutive index numbers.
    Therefore, the next node might actually be a virtual node that is created to fill a gap in the document. You can skip virtual nodes by setting the
    skip_virtual parameter to True.

    Args:
        node_type_re(str, optional, optional): The regular expression to match against the next sibling node's type; default is '.*'.
        skip_virtual(bool, optional, optional): Skip virtual nodes and return the next real node; default is False.
        has_no_content(bool, optional, optional): Allow a node that has no content to be returned; default is True.
        traverse(Traverse(enum), optional, optional): When ALL or PARENT, the search may climb to the
            parent's (or grandparent's) siblings once this level is exhausted; default is Traverse.SIBLING.

    Returns:
        ContentNode or None: The next node or None, if no node exists

    """
    # If this node has no index, we can't determine the next node by index
    # Use sibling enumeration instead
    if self.index is None:
        # Get all siblings and find the node right after this one
        if self.get_parent():
            siblings = self.get_parent().get_children()
            for i, sibling in enumerate(siblings):
                if sibling.id == self.id and i + 1 < len(siblings):
                    return siblings[i + 1]
        return None

    # If we have a valid index, use the original implementation
    search_index = self.index + 1
    compiled_node_type_re = re.compile(node_type_re)

    while True:
        # get_node_at_index may synthesize a virtual node for sparse gaps.
        node = (
            self.get_parent().get_node_at_index(search_index)
            if self.get_parent()
            else None
        )

        if not node:
            if (
                (traverse == Traverse.ALL or traverse == Traverse.PARENT)
                and self.get_parent()
                and self.get_parent().get_parent()
            ):
                # Siblings exhausted: try the parent's next sibling.
                # noinspection PyBroadException
                try:
                    potential_next_node = (
                        self.get_parent()
                        .get_parent()
                        .get_children()[self.get_parent().index + 1]
                    )
                    if potential_next_node:
                        return potential_next_node
                except Exception:
                    # traverse additional layer
                    try:
                        potential_next_node = (
                            self.get_parent()
                            .get_parent()
                            .get_parent()
                            .get_children()[
                                self.get_parent().get_parent().index + 1
                            ]
                        )
                        if potential_next_node:
                            return potential_next_node
                    except Exception:
                        pass
            return node

        if compiled_node_type_re.match(node.node_type) and (
            not skip_virtual or not node.virtual
        ):
            # has_no_content=True accepts any node; False requires content.
            if (not has_no_content and node.content) or has_no_content:
                return node

        search_index += 1
|
|
2362
|
+
|
|
2363
|
+
def previous_node(
    self,
    node_type_re=".*",
    skip_virtual=False,
    has_no_content=False,
    traverse=Traverse.SIBLING,
):
    """Returns the previous sibling content node.

    Note: This logic relies on node indexes. Documents allow for sparse representation and child nodes may not have consecutive index numbers.
    Therefore, the previous node might actually be a virtual node that is created to fill a gap in the document. You can skip virtual nodes by setting the
    skip_virtual parameter to True.

    Args:
        node_type_re(str, optional, optional): The regular expression to match against the previous node's type; default is '.*'.
        skip_virtual(bool, optional, optional): Skip virtual nodes and return the next real node; default is False.
        has_no_content(bool, optional, optional): Allow a node that has no content to be returned; default is False.
        traverse(Traverse(enum), optional, optional): The transition you'd like to traverse (SIBLING, CHILDREN, PARENT, or ALL); default is Traverse.SIBLING.

    Returns:
        ContentNode or None: The previous node or None, if no node exists

    """

    # TODO: implement/differentiate traverse logic for CHILDREN and SIBLING
    if self.index == 0:
        # BUG FIX: the original condition was
        #   traverse == ALL or (traverse == PARENT and self.get_parent())
        # because `and` binds tighter than `or`. With Traverse.ALL and no
        # parent it entered this branch and raised AttributeError on
        # None.previous_node(...). The parentheses below express the intent:
        # climb only when a parent actually exists.
        if (
            traverse == Traverse.ALL or traverse == Traverse.PARENT
        ) and self.get_parent():
            # Lets look for a previous node on the parent
            return self.get_parent().previous_node(
                node_type_re, skip_virtual, has_no_content, traverse
            )

        return None

    search_index = self.index - 1
    compiled_node_type_re = re.compile(node_type_re)

    while True:
        # NOTE(review): assumes a parent exists when index != 0 — confirm
        # detached nodes can never reach this loop.
        node = self.get_parent().get_node_at_index(search_index)

        if not node:
            return node

        if compiled_node_type_re.match(node.node_type) and (
            not skip_virtual or not node.virtual
        ):
            # has_no_content=False accepts any match; True requires the
            # matched node to be empty.
            if (not has_no_content) or (has_no_content and not node.content):
                return node

        search_index -= 1
|
|
2417
|
+
|
|
2418
|
+
|
|
2419
|
+
class ContentFeature(object):
    """
    A feature attaches arbitrary data or metadata to a ContentNode.

    The 'value' of a feature is always a list, so a single feature name
    (e.g. 'ner') can carry multiple data points (e.g. several recognized
    entities).
    """

    def __init__(self, feature_type: str, name: str, value: Any):
        # Logical grouping for the feature (e.g. 'spatial').
        self.feature_type: str = feature_type
        # The feature's name within its type (e.g. 'bbox').
        self.name: str = name
        # Normalise to a list so every feature holds a uniform value shape.
        self.value: List[Any] = value if isinstance(value, list) else [value]

    def __str__(self):
        # Consider showing a snippet of values if not too long, or just type and count for brevity
        return f"Feature [type='{self.feature_type}' name='{self.name}' value_count='{len(self.value)}']"

    def to_dict(self):
        """
        Create a dictionary representing this ContentFeature's structure and content.
        The 'value' in the dictionary is a list of serialized items.

        Returns:
            dict: The properties of this ContentFeature structured as a dictionary.
        """
        # Items exposing a callable to_dict are serialized through it;
        # primitives (and None) pass through unchanged.
        serialized_values = [
            item.to_dict() if callable(getattr(item, "to_dict", None)) else item
            for item in self.value
        ]
        return {
            "name": self.feature_type + ":" + self.name,
            "value": serialized_values,
        }

    def get_value(self) -> List[Any]:
        """
        Get the list of values for the feature.

        Returns:
            List[Any]: The values of the feature; always a list, even when it
            contains a single item or is empty.
        """
        return self.value
|
|
2472
|
+
|
|
2473
|
+
|
|
2474
|
+
class ModelInsight(BaseModel):
    """
    A class used to represent the insights of a model.

    Attributes:
        model_ref (str): The reference to the model.
        insight_type (str): The type of the insight.
        description (Optional[str]): The description of the insight, default is None.
        details (Optional[str]): The details of the insight, default is None.
        properties (Optional[Dict]): The properties of the insight, default is None.
    """

    # Pydantic v2 configuration: allow population by field name as well as
    # alias, store enum values (not members), permit arbitrary types, and keep
    # "model_*" field names usable despite pydantic's protected-namespace default.
    model_config = ConfigDict(
        populate_by_name=True,
        use_enum_values=True,
        arbitrary_types_allowed=True,
        protected_namespaces=("model_config",),
    )

    model_ref: str
    insight_type: str
    description: Optional[str] = None
    details: Optional[str] = None
    properties: Optional[Dict] = None
|
|
2497
|
+
|
|
2498
|
+
|
|
2499
|
+
@dataclasses.dataclass()
class SourceMetadata:
    """Class for keeping track of the original source information for a document.

    Attributes:
        original_filename (Optional[str]): The original filename of the document.
        original_path (Optional[str]): The original path of the document.
        checksum (Optional[str]): The checksum of the document.
        cid (Optional[str]): The ID used for internal caching.
        last_modified (Optional[str]): The last modified date of the document.
        created (Optional[str]): The creation date of the document.
        connector (Optional[str]): The connector used for the document.
        mime_type (Optional[str]): The MIME type of the document.
        headers (Optional[Dict]): The headers of the document.
        lineage_document_uuid (Optional[str]): The UUID of the document that this document was derived from.
        source_document_uuid (Optional[str]): The UUID of the original first document.
        pdf_document_uuid (Optional[str]): The UUID of the document in a PDF form (used for archiving and preview).
    """

    # NOTE: a second, redundant bare-string "docstring" statement previously
    # followed the class docstring; it had no runtime effect and was removed.

    original_filename: Optional[str] = None
    original_path: Optional[str] = None
    checksum: Optional[str] = None

    # The ID used for internal caching
    cid: Optional[str] = None
    last_modified: Optional[str] = None
    created: Optional[str] = None
    connector: Optional[str] = None
    mime_type: Optional[str] = None
    headers: Optional[Dict] = None

    # The UUID of the document that this document was derived from,
    # noting that multiple documents may come from one original source
    lineage_document_uuid: Optional[str] = None

    # The UUID of the original first document
    source_document_uuid: Optional[str] = None

    # The UUID of the document in a PDF form (used for archiving and preview)
    pdf_document_uuid: Optional[str] = None

    @classmethod
    def from_dict(cls, env):
        """Create an instance of the class from a dictionary, silently ignoring
        keys that do not correspond to a dataclass field.

        Args:
            env (dict): A dictionary containing the attributes of the class.

        Returns:
            SourceMetadata: An instance of the class.
        """
        # Filter to known constructor parameters so extra keys do not raise.
        return cls(
            **{k: v for k, v in env.items() if k in inspect.signature(cls).parameters}
        )
|
|
2561
|
+
|
|
2562
|
+
|
|
2563
|
+
class FeatureSetDiff:
    """
    A utility class that can be used to diff two feature sets.

    Each feature set is indexed by node UUID and the two maps are compared
    with deepdiff; identifier-style keys (uuids, group uuids, etc.) are
    excluded from the comparison.
    """

    def __init__(self, first_feature_set: FeatureSet, second_feature_set: FeatureSet):
        self.first_feature_map = self.parse_feature_set(first_feature_set)
        self.second_feature_map = self.parse_feature_set(second_feature_set)
        self._differences = deepdiff.DeepDiff(
            self.first_feature_map,
            self.second_feature_map,
            exclude_obj_callback=self.exclude_callback,
        ).to_dict()

    def get_differences(self):
        """
        Gets the differences between the two feature sets.

        Type changes are dropped, as they are not considered meaningful
        differences between feature sets.

        Returns:
            dict: A dictionary containing the differences between the two feature sets.
        """
        if "type_changes" in self._differences:
            self._differences.pop("type_changes")

        return self._differences

    def get_exclude_paths(self):
        """
        Gets the key patterns to exclude from the comparison.

        Returns:
            list: A list of key patterns to exclude.
        """
        return ["shape", "group_uuid", "uuid", "parent_group_uuid", "single"]

    def exclude_callback(self, path, key):
        """
        Checks if the key is to be excluded from the diff.

        Args:
            path (str): The path that contains the values of the key.
            key (str): The key of the data dictionary to compare.

        Returns:
            bool: True if the key is to be excluded, False otherwise.
        """
        return any(
            re.search(exclude_key, key) for exclude_key in self.get_exclude_paths()
        )

    def parse_feature_set(self, feature_set: FeatureSet):
        """
        Indexes a feature set's node features by node UUID.

        Args:
            feature_set (FeatureSet): The feature set to be parsed.

        Returns:
            dict: A dictionary of features keyed by nodeUuid.
        """
        return {
            feature.get("nodeUuid"): feature for feature in feature_set.node_features
        }

    def parsed_values_changed(self):
        """
        Drops 'values_changed' entries whose node is still present in the
        second feature map.

        NOTE(review): the original implementation compared the raw deepdiff
        path key against ``second_feature_map.node_features`` (an attribute
        that does not exist on a dict) and then called ``remove`` on a dict,
        so it could never run successfully. The assumed intent implemented
        below should be confirmed against callers.
        """
        values_changed = self._differences.get("values_changed", {})
        # Iterate over a snapshot so entries can be removed while scanning.
        for key in list(values_changed.keys()):
            if self.parsed_node_uuid(key) in self.second_feature_map:
                values_changed.pop(key)

    def is_equal(self) -> bool:
        """
        Checks if the two feature sets are equal to each other.

        Returns:
            bool: True if the feature sets are equal, False otherwise.
        """
        return self._differences == {}

    def get_changed_nodes(self):
        """
        Gets the nodes that were added, removed or modified.

        Returns:
            dict: Keys 'new_added_nodes', 'removed_nodes' and
            'existing_modified_nodes' (node UUID lists), or an empty list when
            the two feature sets are identical.
        """
        if self.is_equal():
            return []

        # Nodes only present in the second feature map.
        new_added_nodes = []

        # Nodes only present in the first feature map.
        removed_nodes = []

        # Nodes present in both maps whose values changed. Default to {} so a
        # diff without value changes does not raise on iteration.
        modified_nodes = []
        for key in self._differences.get("values_changed", {}):
            modified_nodes.append(self.parsed_node_uuid(key))

        # Merge the unique node UUIDs of both maps and classify each one.
        merged_node_uuids = set(self.first_feature_map.keys()).union(
            set(self.second_feature_map.keys())
        )
        for node_uuid in merged_node_uuids:
            if node_uuid not in self.first_feature_map:
                new_added_nodes.append(node_uuid)
            elif node_uuid not in self.second_feature_map:
                removed_nodes.append(node_uuid)

        return {
            "new_added_nodes": new_added_nodes,
            "removed_nodes": removed_nodes,
            "existing_modified_nodes": modified_nodes,
        }

    def get_difference_count(self):
        """
        Gets the total number of difference categories between the feature sets.

        Returns:
            int: The number of difference categories.
        """
        # BUG FIX: ``self._differences`` is a dict; the original invoked it as
        # a function (``self._differences()``), which raised TypeError.
        return len(self._differences.keys())

    def parsed_item_added(self):
        """
        Annotates 'iterable_item_added' entries whose node was newly added.

        Returns:
            The difference count, or an empty dict when nothing was added.
        """
        item_added: Dict = self._differences.get("iterable_item_added")
        # BUG FIX: the original guard was inverted -- it returned {} precisely
        # when items *had* been added.
        if not item_added:
            return {}

        # BUG FIX: the original referenced an undefined ``self._changed_nodes``
        # attribute; derive the added nodes from get_changed_nodes() instead.
        new_added_nodes = self.get_changed_nodes()["new_added_nodes"]
        for key, value in item_added.items():
            node = self.parsed_node_uuid(key)
            if node in new_added_nodes:
                self._differences["iterable_item_added"][key][
                    "details"
                ] = f"Node: {node} was added"
                continue

        # TODO(review): preserved from the original -- returning the
        # difference count from this method looks unintentional; confirm.
        return self.get_difference_count()

    def parsed_node_uuid(self, key):
        """
        Parses the node uuid from a deepdiff path key such as
        ``"root['<uuid>']['features']..."``.

        Args:
            key (str): The deepdiff path key.

        Returns:
            str: The node uuid embedded in the key.
        """
        node = key.split("['")[1].split("']")[0]
        return node
|
|
2730
|
+
|
|
2731
|
+
|
|
2732
|
+
class ProcessingStep(BaseModel):
    """A step in a document's processing lineage.

    Steps form a directed graph: each step records the steps that fed into it
    (``parents``) and the steps it produced (``children``).
    """

    # Unique identifier for this step (random UUID4 by default).
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    name: str
    # Free-form metadata captured for the step.
    metadata: dict = Field(default_factory=lambda: {})
    # Presentation-only metadata; serialized under the alias "presentationMetadata".
    presentation_metadata: dict = Field(
        default_factory=lambda: {}, alias="presentationMetadata"
    )
    children: List["ProcessingStep"] = Field(default_factory=list)
    parents: List["ProcessingStep"] = Field(default_factory=list)

    def add_child(self, child_step: "ProcessingStep"):
        """Attach *child_step* below this step, maintaining both directions of the link."""
        self.children.append(child_step)
        child_step.parents.append(self)

    @staticmethod
    def merge_with(*other_steps: "ProcessingStep") -> "ProcessingStep":
        """Create a new step that merges several steps into one common child.

        Every step in *other_steps* gains the merged step as a child, and the
        merged step records each of them as a parent.
        """
        merged_step = ProcessingStep(
            name=f"Merged({', '.join(step.name for step in other_steps)})"
        )
        for step in other_steps:
            step.children.append(merged_step)
            merged_step.parents.append(step)
        return merged_step

    # NOTE(review): pydantic's ``json_encoders`` expects *types* as keys, not
    # strings, so this entry is likely inert -- confirm before relying on it.
    model_config = ConfigDict(
        arbitrary_types_allowed = True,
        json_encoders = {"ProcessingStep": lambda step: step.to_dict()}
    )

    def to_dict(self, seen=None):
        """Serialize the step graph to a dict, guarding against cycles.

        Args:
            seen: Set of step ids already serialized on this path; reused by
                the recursive calls to stop infinite recursion on cycles.
        """
        if seen is None:
            seen = set()

        # Avoid circular references by skipping already seen objects
        if self.id in seen:
            return {"id": self.id, "name": self.name}

        seen.add(self.id)

        return {
            "id": self.id,
            "name": self.name,
            "metadata": self.metadata,
            "presentationMetadata": self.presentation_metadata,
            # Children are serialized fully; parents only as id/name stubs to
            # keep the output finite.
            "children": [child.to_dict(seen) for child in self.children],
            "parents": [
                {"id": parent.id, "name": parent.name} for parent in self.parents
            ],  # or parent.to_dict(seen) if full structure is needed
        }

    def to_json(self):
        """Serialize this step (and its children) to a JSON string."""
        return json.dumps(self.to_dict())

    def __repr__(self):
        return f"Step(id={self.id}, name={self.name})"
|
|
2787
|
+
|
|
2788
|
+
|
|
2789
|
+
class Document(object):
|
|
2790
|
+
"""A Document is a collection of metadata and a set of content nodes."""
|
|
2791
|
+
|
|
2792
|
+
PREVIOUS_VERSION: str = "1.0.0"
|
|
2793
|
+
CURRENT_VERSION: str = "8.0.0"
|
|
2794
|
+
|
|
2795
|
+
    def __init__(
        self,
        metadata=None,
        source=None,
        ref: str = None,
        kddb_path: str = None,
        delete_on_close=False,
        inmemory=False,
    ):
        """A Kodexa Document has content nodes and metadata to represent the information.

        Args:
            metadata (DocumentMetadata): The metadata for the document (default is empty)
            source (SourceMetadata): The source metadata for the document (optional)
            ref (str): The reference (if it is a remote document)
            kddb_path (str): If we want to open an existing kddb
            delete_on_close (boolean): Whether to delete on close
            inmemory (boolean): Whether to operate in memory (faster but more memory intensive)
        """
        self.metadata = metadata if metadata is not None else DocumentMetadata()
        """The metadata for the document"""

        # Mixins enabled on this document and the (not yet created) persistence layer.
        self._mixins = []
        self._persistence_layer = None

        self.labels = []
        """A list of document level labels"""
        # Goes through the content_node property setter; None returns early,
        # so this is safe before the persistence layer exists.
        self.content_node = None
        self.source = source if source is not None else SourceMetadata()
        """The source of the document"""

        self.uuid = str(uuid.uuid4())
        """A UUID representing this document"""

        # Opens (or creates) the backing SQLite store for this document.
        self.create_persistence_layer(kddb_path, delete_on_close, inmemory)

        if ref is not None:
            self.ref = Ref(ref)

        self.version = self.CURRENT_VERSION
|
|
2835
|
+
|
|
2836
|
+
    def __str__(self):
        """Return a kodexa:// URI built from this document's UUID."""
        return f"kodexa://{self.uuid}"
|
|
2838
|
+
|
|
2839
|
+
    def get_validations(self) -> list[DocumentTaxonValidation]:
        """Return the taxon validations stored for this document (delegates to the persistence layer)."""
        return self.get_persistence().get_validations()
|
|
2841
|
+
|
|
2842
|
+
    def set_validations(self, validations: list[DocumentTaxonValidation]):
        """Store the given taxon validations on this document (delegates to the persistence layer)."""
        self.get_persistence().set_validations(validations)
|
|
2844
|
+
|
|
2845
|
+
    def add_exception(self, exception: ContentException):
        """Record a content exception against this document (delegates to the persistence layer)."""
        self._persistence_layer.add_exception(exception)
|
|
2847
|
+
|
|
2848
|
+
    def get_exceptions(self) -> List[ContentException]:
        """Return all content exceptions recorded for this document."""
        return self._persistence_layer.get_exceptions()
|
|
2850
|
+
|
|
2851
|
+
    def get_external_data(self, key="default") -> dict:
        """Return the external data dictionary stored under *key* (default "default")."""
        return self._persistence_layer.get_external_data(key)
|
|
2853
|
+
|
|
2854
|
+
    def get_external_data_keys(self) -> list[str]:
        """Return the keys under which external data has been stored for this document."""
        return self._persistence_layer.get_external_data_keys()
|
|
2856
|
+
|
|
2857
|
+
    def set_external_data(self, external_data: dict, key="default"):
        """Store *external_data* under *key*, returning whatever the persistence layer returns."""
        return self._persistence_layer.set_external_data(external_data, key)
|
|
2859
|
+
|
|
2860
|
+
    def get_steps(self) -> list[ProcessingStep]:
        """Return the processing steps recorded for this document."""
        return self._persistence_layer.get_steps()
|
|
2862
|
+
|
|
2863
|
+
    def set_steps(self, steps: list[ProcessingStep]):
        """Replace the processing steps recorded for this document."""
        self._persistence_layer.set_steps(steps)
|
|
2865
|
+
|
|
2866
|
+
    def replace_exceptions(self, exceptions: List[ContentException]):
        """Replace all content exceptions on this document with the given list."""
        self._persistence_layer.replace_exceptions(exceptions)
|
|
2868
|
+
|
|
2869
|
+
    def create_persistence_layer(
        self, kddb_path=None, delete_on_close=False, inmemory=False
    ):
        """
        Creates a persistence layer for the document

        Args:
            kddb_path: Path to the KDDB file (None creates a fresh store)
            delete_on_close: Whether to delete the file on close
            inmemory: Whether to operate in memory
        """
        # Imported locally to avoid a circular import between model and persistence.
        from kodexa_document.persistence import SqliteDocumentPersistence

        self._persistence_layer = SqliteDocumentPersistence(
            document=self,
            filename=kddb_path,
            delete_on_close=delete_on_close,
            inmemory=inmemory,
        )
        self._persistence_layer.initialize()
|
|
2889
|
+
|
|
2890
|
+
def remove_tags_by_owner(self, owner_uri: str):
|
|
2891
|
+
|
|
2892
|
+
for tag in self.get_all_tags():
|
|
2893
|
+
for tag_instance in self.get_tag_instances(tag):
|
|
2894
|
+
tag_meta: dict = tag_instance.get_data()
|
|
2895
|
+
if "owner_uri" in tag_meta and tag_meta["owner_uri"] == owner_uri:
|
|
2896
|
+
for node in tag_instance.nodes:
|
|
2897
|
+
node.remove_tag(tag)
|
|
2898
|
+
|
|
2899
|
+
    def get_nodes_by_type(self, node_type: str) -> List[ContentNode]:
        """
        Get all the nodes of a specific type

        Args:
            node_type: the type of the node (exact match, resolved by the persistence layer)

        Returns:
            a list of nodes

        """
        return self._persistence_layer.get_nodes_by_type(node_type)
|
|
2911
|
+
|
|
2912
|
+
    def get_node_by_uuid(self, uuid: int) -> ContentNode:
        """
        Get a node by its uuid

        Args:
            uuid: the uuid of the node (note: typed int here, matching the persistence layer's node ids)

        Returns:
            the node

        """
        return self._persistence_layer.get_node_by_uuid(uuid)
|
|
2924
|
+
|
|
2925
|
+
    def add_tag_instance(
        self, tag_to_apply: str, node_list: List[ContentNode], tag_uuid: str = None
    ):
        """
        Apply a single tag instance across a group of nodes.

        One Tag object (sharing one uuid) is created and added as a
        'tag' feature to every node in *node_list*, grouping them into a
        single tag instance.

        :param tag_to_apply: name of the tag
        :param node_list: the nodes to tag as one instance
        :param tag_uuid: optional uuid for the tag; a new uuid4 is generated when omitted
        :return: None
        """
        # One shared Tag instance groups all the nodes under the same uuid.
        tag = Tag()
        tag.uuid = tag_uuid if tag_uuid else str(uuid.uuid4())
        for node in node_list:
            node.add_feature("tag", tag_to_apply, tag)
|
|
2939
|
+
|
|
2940
|
+
    def update_tag_instance(self, tag_uuid):
        """Refresh the tag on every node belonging to the tag instance with *tag_uuid*.

        NOTE(review): this only calls ``node.get_tag`` for matching instances;
        despite the comment below, no attribute is visibly written -- confirm
        whether ``get_tag`` has side effects or this method is incomplete.
        """
        for tag_instance in self.get_tag_instances(tag_uuid):
            if tag_instance.tag.id == tag_uuid:
                # Update attributes of a Tag
                for node in tag_instance.nodes:
                    node.get_tag(tag_instance.tag.value, tag_uuid=tag_instance.tag.id)
|
|
2946
|
+
|
|
2947
|
+
    def get_tag_instance(self, tag):
        """
        Get the tag instances whose tag matches the given tag exactly.

        :param tag: name of the tag
        :return: a list of matching TagInstance objects
        """
        return [
            tag_instance
            for tag_instance in self.get_tag_instances(tag)
            if tag_instance.tag == tag
        ]
|
|
2958
|
+
|
|
2959
|
+
    def get_persistence(self):
        """Return the document's persistence layer."""
        return self._persistence_layer
|
|
2961
|
+
|
|
2962
|
+
    def get_all_tags(self):
        """Return all tag names present on this document (delegates to the persistence layer)."""
        return self._persistence_layer.get_all_tags()
|
|
2964
|
+
|
|
2965
|
+
    def add_model_insight(self, model_insight: ModelInsight):
        """Record a model insight against this document."""
        self._persistence_layer.add_model_insight(model_insight)
|
|
2967
|
+
|
|
2968
|
+
    def clear_model_insights(self):
        """Remove all model insights recorded for this document."""
        self._persistence_layer.clear_model_insights()
|
|
2970
|
+
|
|
2971
|
+
    def get_model_insights(self) -> List[ModelInsight]:
        """Return all model insights recorded for this document."""
        return self._persistence_layer.get_model_insights()
|
|
2973
|
+
|
|
2974
|
+
    def get_tagged_nodes(self, tag_name, tag_uuid=None):
        """Return the nodes carrying *tag_name*, optionally restricted to one tag uuid."""
        return self._persistence_layer.get_tagged_nodes(tag_name, tag_uuid)
|
|
2976
|
+
|
|
2977
|
+
    @property
    def content_node(self) -> ContentNode:
        """The root content Node"""
        return self._content_node
|
|
2981
|
+
|
|
2982
|
+
    @content_node.setter
    def content_node(self, value):
        """Set the root content node, replacing any previous root in persistence."""
        # Initialize _content_node attribute if it doesn't exist (first assignment)
        if not hasattr(self, "_content_node"):
            self._content_node = None

        # Handle None value: clear the reference without touching persistence
        if value is None:
            self._content_node = None
            return

        # The root node always has index 0 (assignment is unconditional).
        value.index = 0

        # Remove old content node if it exists and differs from the new one
        if self._content_node is not None and value != self._content_node:
            self.get_persistence().remove_content_node(self._content_node)

        # Set the new content node
        self._content_node = value

        # Add the content node to persistence as the root (no parent)
        self.get_persistence().add_content_node(self._content_node, None)
|
|
3006
|
+
|
|
3007
|
+
def get_tag_instances(self, tag):
|
|
3008
|
+
groups = self.content_node.get_related_tag_nodes(tag, everywhere=True)
|
|
3009
|
+
tag_instances = []
|
|
3010
|
+
for key in groups.keys():
|
|
3011
|
+
tag_instances.append(TagInstance(key, groups[key]))
|
|
3012
|
+
return tag_instances
|
|
3013
|
+
|
|
3014
|
+
    def add_label(self, label: str):
        """Add a document-level label (no-op when the label is already present).

        Args:
            label (str): Label to add

        Returns:
            Document: this document, for chaining
        """
        if label not in self.labels:
            self.labels.append(label)

        return self
|
|
3029
|
+
|
|
3030
|
+
def remove_label(self, label: str):
|
|
3031
|
+
"""Remove a label from the document
|
|
3032
|
+
|
|
3033
|
+
Args:
|
|
3034
|
+
label: str Label to remove
|
|
3035
|
+
label: str:
|
|
3036
|
+
|
|
3037
|
+
Returns:
|
|
3038
|
+
the document
|
|
3039
|
+
|
|
3040
|
+
"""
|
|
3041
|
+
self.labels.remove(label)
|
|
3042
|
+
return self
|
|
3043
|
+
|
|
3044
|
+
    @classmethod
    def from_text(cls, text, separator=None, inmemory=False):
        """Creates a new Document from the text provided.

        Args:
            text: str Text to be used as content on the Document's ContentNode(s)
            separator: str If provided, this string will be used to split the text and the resulting text will be placed on children of the root ContentNode. (Default value = None)
            inmemory: bool Whether the backing store should be in memory (Default value = False)

        Returns:
            the document

        """
        new_document = Document(inmemory=inmemory)
        # Synthetic filename so the source metadata is never empty.
        new_document.source.original_filename = f"text-{uuid.uuid4()}"
        new_document.content_node = new_document.create_node(node_type="text", index=0)
        if text:
            if separator:
                for s in text.split(separator):
                    # Create the node with content
                    child_node = new_document.create_node(node_type="text", content=s)
                    # Add as a child to the content node
                    new_document.content_node.add_child(child_node)
                    # Explicitly make sure content parts are set
                    if s:
                        child_node.set_content_parts([s])
            else:
                new_document.content_node.content = text
                new_document.content_node.set_content_parts([text])

        new_document.add_mixin("text")
        return new_document
|
|
3075
|
+
|
|
3076
|
+
    def get_root(self):
        """Get the root content node for the document (same as content_node)"""
        return self.content_node
|
|
3079
|
+
|
|
3080
|
+
    def to_kdxa(self, file_path: str):
        """Write the document to the kdxa format (msgpack) which can be
        used with the Kodexa platform

        Args:
            file_path: str: the path to the kdxa file you wish to create

        Returns:
            None

        >>> document.to_kdxa('my-document.kdxa')
        """
        with open(file_path, "wb") as outfile:
            msgpack.pack(self.to_dict(), outfile, use_bin_type=True)
|
|
3094
|
+
|
|
3095
|
+
    @staticmethod
    def open_kddb(file_path):
        """
        Opens a Kodexa Document Database.

        This is the Kodexa V4 default way to store documents, it provides high-performance
        and also the ability to handle very large document objects

        :param file_path: The file path
        :return: The Document instance
        """
        return Document(kddb_path=file_path)
|
|
3107
|
+
|
|
3108
|
+
    def close(self):
        """
        Close the document and clean up the persistence layer's resources.
        """
        self.get_persistence().close()
|
|
3113
|
+
|
|
3114
|
+
    def to_kddb(self, path=None):
        """
        Either write this document to a KDDB file or convert this document object structure into a KDDB and return a bytes-like object

        Args:
            path: destination file path; when None the KDDB bytes are returned instead

        Returns:
            bytes when *path* is None, otherwise None (the file is written)
        """

        if path is None:
            return self.get_persistence().get_bytes()

        with open(path, "wb") as output_file:
            output_file.write(self.get_persistence().get_bytes())
|
|
3126
|
+
|
|
3127
|
+
    @staticmethod
    def from_kdxa(file_path):
        """Read a .kdxa (msgpack) file from the given file_path and build a Document.

        Args:
            file_path: the path to the kdxa file

        Returns:
            Document: the deserialized document

        >>> document = Document.from_kdxa('my-document.kdxa')
        """
        with open(file_path, "rb") as data_file:
            data_loaded = msgpack.unpack(data_file, raw=False)
        return Document.from_dict(data_loaded)
|
|
3141
|
+
|
|
3142
|
+
    def to_msgpack(self):
        """Convert this document object structure into a message pack byte array."""
        return msgpack.packb(self.to_dict(), use_bin_type=True)
|
|
3145
|
+
|
|
3146
|
+
    def to_json(self):
        """Create a JSON string representation of this Document.

        Returns:
            str: The JSON formatted string representation of this Document
            (non-ASCII characters are preserved, not escaped).

        >>> document.to_json()
        """
        return json.dumps(self.to_dict(), ensure_ascii=False)
|
|
3157
|
+
|
|
3158
|
+
def to_dict(self):
|
|
3159
|
+
"""Create a dictionary representing this Document's structure and content.
|
|
3160
|
+
|
|
3161
|
+
Args:
|
|
3162
|
+
|
|
3163
|
+
Returns:
|
|
3164
|
+
dict: A dictionary representation of this Document.
|
|
3165
|
+
|
|
3166
|
+
>>> document.to_dict()
|
|
3167
|
+
"""
|
|
3168
|
+
|
|
3169
|
+
# We don't want to store the none values
|
|
3170
|
+
def clean_none_values(d):
|
|
3171
|
+
"""
|
|
3172
|
+
This function recursively cleans a dictionary by removing keys with None values.
|
|
3173
|
+
|
|
3174
|
+
Args:
|
|
3175
|
+
d (dict): The dictionary to clean.
|
|
3176
|
+
|
|
3177
|
+
Returns:
|
|
3178
|
+
dict: A new dictionary with the same structure as the input, but without keys that had None values.
|
|
3179
|
+
"""
|
|
3180
|
+
clean = {}
|
|
3181
|
+
for k, v in d.items():
|
|
3182
|
+
if isinstance(v, dict):
|
|
3183
|
+
nested = clean_none_values(v)
|
|
3184
|
+
if len(nested.keys()) > 0:
|
|
3185
|
+
clean[k] = nested
|
|
3186
|
+
elif v is not None:
|
|
3187
|
+
clean[k] = v
|
|
3188
|
+
return clean
|
|
3189
|
+
|
|
3190
|
+
return {
|
|
3191
|
+
"version": Document.CURRENT_VERSION,
|
|
3192
|
+
"metadata": self.metadata,
|
|
3193
|
+
"content_node": self.content_node.to_dict() if self.content_node else None,
|
|
3194
|
+
"source": clean_none_values(dataclasses.asdict(self.source)),
|
|
3195
|
+
"mixins": self._mixins,
|
|
3196
|
+
"labels": self.labels,
|
|
3197
|
+
"uuid": self.uuid,
|
|
3198
|
+
}
|
|
3199
|
+
|
|
3200
|
+
@staticmethod
|
|
3201
|
+
def from_dict(doc_dict):
|
|
3202
|
+
"""Build a new Document from a dictionary.
|
|
3203
|
+
|
|
3204
|
+
Args:
|
|
3205
|
+
dict: doc_dict: A dictionary representation of a Kodexa Document.
|
|
3206
|
+
doc_dict:
|
|
3207
|
+
|
|
3208
|
+
Returns:
|
|
3209
|
+
Document: A complete Kodexa Document
|
|
3210
|
+
|
|
3211
|
+
>>> Document.from_dict(doc_dict)
|
|
3212
|
+
"""
|
|
3213
|
+
new_document = Document(DocumentMetadata(doc_dict["metadata"]))
|
|
3214
|
+
new_document.version = (
|
|
3215
|
+
doc_dict["version"]
|
|
3216
|
+
if "version" in doc_dict and doc_dict["version"]
|
|
3217
|
+
else Document.PREVIOUS_VERSION
|
|
3218
|
+
) # some older docs don't have a version or it's None
|
|
3219
|
+
new_document.uuid = (
|
|
3220
|
+
doc_dict["uuid"]
|
|
3221
|
+
if "uuid" in doc_dict
|
|
3222
|
+
else str(uuid.uuid5(uuid.NAMESPACE_DNS, "kodexa.com"))
|
|
3223
|
+
)
|
|
3224
|
+
|
|
3225
|
+
if "content_node" in doc_dict and doc_dict["content_node"]:
|
|
3226
|
+
new_document.content_node = ContentNode.from_dict(
|
|
3227
|
+
new_document, doc_dict["content_node"]
|
|
3228
|
+
)
|
|
3229
|
+
|
|
3230
|
+
if "source" in doc_dict and doc_dict["source"]:
|
|
3231
|
+
new_document.source = SourceMetadata.from_dict(doc_dict["source"])
|
|
3232
|
+
if "labels" in doc_dict and doc_dict["labels"]:
|
|
3233
|
+
new_document.labels = doc_dict["labels"]
|
|
3234
|
+
|
|
3235
|
+
new_document.get_persistence().update_metadata()
|
|
3236
|
+
return new_document
|
|
3237
|
+
|
|
3238
|
+
@staticmethod
|
|
3239
|
+
def from_json(json_string):
|
|
3240
|
+
"""Create an instance of a Document from a JSON string.
|
|
3241
|
+
|
|
3242
|
+
Args:
|
|
3243
|
+
str: json_string: A JSON string representation of a Kodexa Document
|
|
3244
|
+
json_string:
|
|
3245
|
+
|
|
3246
|
+
Returns:
|
|
3247
|
+
Document: A complete Kodexa Document
|
|
3248
|
+
|
|
3249
|
+
>>> Document.from_json(json_string)
|
|
3250
|
+
"""
|
|
3251
|
+
return Document.from_dict(json.loads(json_string))
|
|
3252
|
+
|
|
3253
|
+
@staticmethod
|
|
3254
|
+
def from_msgpack(msgpack_bytes):
|
|
3255
|
+
"""Create an instance of a Document from a message pack byte array.
|
|
3256
|
+
|
|
3257
|
+
Args:
|
|
3258
|
+
msgpack_bytes: bytes: A message pack byte array.
|
|
3259
|
+
|
|
3260
|
+
Returns:
|
|
3261
|
+
Document: A complete Kodexa Document
|
|
3262
|
+
|
|
3263
|
+
>>> Document.from_msgpack(open(os.path.join('news-doc.kdxa'), 'rb').read())
|
|
3264
|
+
"""
|
|
3265
|
+
return Document.from_dict(msgpack.unpackb(msgpack_bytes, raw=False))
|
|
3266
|
+
|
|
3267
|
+
def get_mixins(self):
|
|
3268
|
+
"""
|
|
3269
|
+
Get the list of mixins that have been enabled on this document
|
|
3270
|
+
|
|
3271
|
+
Returns:
|
|
3272
|
+
mixins: list[str] a list of the mixin names
|
|
3273
|
+
"""
|
|
3274
|
+
return self._mixins
|
|
3275
|
+
|
|
3276
|
+
def add_mixin(self, mixin):
|
|
3277
|
+
"""
|
|
3278
|
+
Add the given mixin to this document, this will apply the mixin to all the content nodes,
|
|
3279
|
+
and also register it with the document so that future invocations of create_node will ensure
|
|
3280
|
+
the node has the mixin appled.
|
|
3281
|
+
|
|
3282
|
+
Args:
|
|
3283
|
+
mixin:str the name of the mixin to add
|
|
3284
|
+
|
|
3285
|
+
Returns:
|
|
3286
|
+
>>> import * from kodexa
|
|
3287
|
+
>>> document = Document()
|
|
3288
|
+
>>> document.add_mixin('spatial')
|
|
3289
|
+
"""
|
|
3290
|
+
self._mixins.append(mixin)
|
|
3291
|
+
self.get_persistence().update_metadata()
|
|
3292
|
+
|
|
3293
|
+
    def create_node(
        self,
        node_type: str,
        content: Optional[str] = None,
        virtual: bool = False,
        parent: Optional[ContentNode] = None,
        index: Optional[int] = None,
    ) -> ContentNode:
        """
        Creates a new node for the document. The new node is not added to the document, but any mixins that have been
        applied to the document will also be available on the new node.

        Args:
            node_type (str): The type of node.
            content (Optional[str]): The content for the node; defaults to None.
            virtual (bool): Indicates if this is a 'real' or 'virtual' node; default is False. 'Real' nodes contain
                            document content. 'Virtual' nodes are synthesized as necessary to fill gaps in between
                            non-consecutively indexed siblings. Such indexing arises when document content is sparse.
            parent (Optional[ContentNode]): The parent for this newly created node; default is None.
            index (Optional[int]): The index property to be set on this node; default is None.

        Returns:
            ContentNode: This newly created node.

        >>> document.create_node(node_type='page')
        <kodexa_document.model.ContentNode object at 0x7f80605e53c8>
        """
        content_node = ContentNode(
            document=self,
            node_type=node_type,
            content=content,
            parent=parent,
            index=index,
            virtual=virtual,
        )

        if virtual:
            # Virtual nodes are never persisted: only record the parent's id
            # (when a parent is given) so the node can locate its parent later.
            if parent is not None:
                content_node._parent_id = parent.id if parent.id else None
        else:
            # Real nodes are persisted: either attach to the parent (which
            # handles persistence) or register the node directly with the
            # persistence layer when it has no parent.
            if parent is not None:
                parent.add_child(content_node, index)
            else:
                self.get_persistence().add_content_node(content_node)

        # NOTE(review): content was already passed to the ContentNode constructor
        # above; this re-set only fires when the constructor left the parts empty.
        # Candidate for removal once ContentNode reliably stores content itself.
        if content is not None and len(content_node.get_content_parts()) == 0:
            content_node.set_content_parts([content])

        return content_node
|
|
3346
|
+
|
|
3347
|
+
@classmethod
|
|
3348
|
+
def from_kddb(
|
|
3349
|
+
cls, source: Union[str, bytes], detached: bool = True, inmemory: bool = False
|
|
3350
|
+
):
|
|
3351
|
+
"""
|
|
3352
|
+
Create a document from a KDDB (Kodexa Document Database) source. The source can either be a file path or the KDDB bytes.
|
|
3353
|
+
|
|
3354
|
+
Args:
|
|
3355
|
+
source (str or bytes): The KDDB source.
|
|
3356
|
+
detached (bool, optional): Whether to create a detached Document. Defaults to True.
|
|
3357
|
+
inmemory (bool, optional): Whether to load the document in memory. Defaults to False.
|
|
3358
|
+
|
|
3359
|
+
Returns:
|
|
3360
|
+
Document: A new Document instance loaded from the KDDB source.
|
|
3361
|
+
|
|
3362
|
+
>>> document = Document.from_kddb('path/to/document.kddb')
|
|
3363
|
+
"""
|
|
3364
|
+
from kodexa_document.persistence import SqliteDocumentPersistence
|
|
3365
|
+
|
|
3366
|
+
document = cls(
|
|
3367
|
+
kddb_path=source if isinstance(source, str) else None, inmemory=inmemory
|
|
3368
|
+
)
|
|
3369
|
+
temp_file = None
|
|
3370
|
+
|
|
3371
|
+
try:
|
|
3372
|
+
if isinstance(source, bytes):
|
|
3373
|
+
# We are getting byte source
|
|
3374
|
+
import tempfile
|
|
3375
|
+
import os
|
|
3376
|
+
temp_file = tempfile.NamedTemporaryFile(
|
|
3377
|
+
suffix=".kddb", delete=False
|
|
3378
|
+
)
|
|
3379
|
+
temp_file.write(source)
|
|
3380
|
+
temp_file.close()
|
|
3381
|
+
file_name = temp_file.name
|
|
3382
|
+
else:
|
|
3383
|
+
file_name = source
|
|
3384
|
+
|
|
3385
|
+
# We should make sure that we transfer features
|
|
3386
|
+
document._persistence_layer = SqliteDocumentPersistence(
|
|
3387
|
+
document, file_name, True, inmemory
|
|
3388
|
+
)
|
|
3389
|
+
document._persistence_layer.initialize()
|
|
3390
|
+
|
|
3391
|
+
if detached:
|
|
3392
|
+
document._detached = True
|
|
3393
|
+
|
|
3394
|
+
# Save the document type for easier checking
|
|
3395
|
+
document._document_type = "kddb"
|
|
3396
|
+
|
|
3397
|
+
return document
|
|
3398
|
+
except Exception as e:
|
|
3399
|
+
# Clean up the document resources if initialization failed
|
|
3400
|
+
if hasattr(document, "_persistence_layer") and document._persistence_layer:
|
|
3401
|
+
try:
|
|
3402
|
+
document._persistence_layer.close()
|
|
3403
|
+
except:
|
|
3404
|
+
pass
|
|
3405
|
+
raise e
|
|
3406
|
+
finally:
|
|
3407
|
+
# Clean up the temporary file if we created one
|
|
3408
|
+
if temp_file and os.path.exists(temp_file.name):
|
|
3409
|
+
try:
|
|
3410
|
+
os.unlink(temp_file.name)
|
|
3411
|
+
except:
|
|
3412
|
+
pass
|
|
3413
|
+
|
|
3414
|
+
@classmethod
|
|
3415
|
+
def from_file(cls, file, unpack: bool = False):
|
|
3416
|
+
"""Creates a Document that has a 'file-handle' connector to the specified file.
|
|
3417
|
+
|
|
3418
|
+
Args:
|
|
3419
|
+
file: file: The file to which the new Document is connected.
|
|
3420
|
+
unpack: bool: (Default value = False)
|
|
3421
|
+
|
|
3422
|
+
Returns:
|
|
3423
|
+
Document: A Document connected to the specified file.
|
|
3424
|
+
|
|
3425
|
+
"""
|
|
3426
|
+
if unpack:
|
|
3427
|
+
Document.from_kdxa(file)
|
|
3428
|
+
else:
|
|
3429
|
+
file_document = Document()
|
|
3430
|
+
file_document.metadata["connector"] = "file-handle"
|
|
3431
|
+
file_document.metadata["connector_options"] = {}
|
|
3432
|
+
file_document.metadata["connector_options"]["file"] = file
|
|
3433
|
+
file_document.source
|
|
3434
|
+
file_document.source.connector = "file-handle"
|
|
3435
|
+
file_document.source.original_filename = os.path.basename(file)
|
|
3436
|
+
file_document.source.original_path = file
|
|
3437
|
+
return file_document
|
|
3438
|
+
|
|
3439
|
+
@classmethod
|
|
3440
|
+
def from_url(cls, url, headers=None):
|
|
3441
|
+
"""Creates a Document that has a 'url' connector for the specified url.
|
|
3442
|
+
|
|
3443
|
+
Args:
|
|
3444
|
+
str: url: The URL to which the new Document is connected.
|
|
3445
|
+
dict: headers: Headers that should be used when reading from the URL
|
|
3446
|
+
url:
|
|
3447
|
+
headers: (Default value = None)
|
|
3448
|
+
|
|
3449
|
+
Returns:
|
|
3450
|
+
Document: A Document connected to the specified URL with the specified headers (if any).
|
|
3451
|
+
|
|
3452
|
+
"""
|
|
3453
|
+
if headers is None:
|
|
3454
|
+
headers = {}
|
|
3455
|
+
url_document = Document()
|
|
3456
|
+
url_document.metadata.connector = "url"
|
|
3457
|
+
url_document.metadata.connector_options.base_url = url
|
|
3458
|
+
url_document.metadata.connector_options.headers = headers
|
|
3459
|
+
url_document.source.connector = "url"
|
|
3460
|
+
url_document.source.original_filename = url
|
|
3461
|
+
url_document.source.original_path = url
|
|
3462
|
+
url_document.source.headers = headers
|
|
3463
|
+
return url_document
|
|
3464
|
+
|
|
3465
|
+
def select_first(self, selector, variables=None) -> Optional[ContentNode]:
|
|
3466
|
+
"""Select and return the first child of this node that match the selector value.
|
|
3467
|
+
|
|
3468
|
+
Args:
|
|
3469
|
+
selector (str): The selector (ie. //*)
|
|
3470
|
+
variables (dict, optional): A dictionary of variable name/value to use in substituion; defaults to None.
|
|
3471
|
+
Dictionary keys should match a variable specified in the selector.
|
|
3472
|
+
|
|
3473
|
+
Returns:
|
|
3474
|
+
Optional[ContentNode]: The first matching node or none
|
|
3475
|
+
|
|
3476
|
+
>>> document.get_root().select_first('.')
|
|
3477
|
+
ContentNode
|
|
3478
|
+
|
|
3479
|
+
>>> document.get_root().select_first('//*[hasTag($tagName)]', {"tagName": "div"})
|
|
3480
|
+
ContentNode
|
|
3481
|
+
"""
|
|
3482
|
+
result = self.select(selector, variables, first_only=True)
|
|
3483
|
+
return result[0] if len(result) > 0 else None
|
|
3484
|
+
|
|
3485
|
+
def select(
|
|
3486
|
+
self, selector: str, variables: Optional[dict] = None, first_only=False
|
|
3487
|
+
) -> List[ContentNode]:
|
|
3488
|
+
"""Execute a selector on the root node and then return a list of the matching nodes.
|
|
3489
|
+
|
|
3490
|
+
Args:
|
|
3491
|
+
selector (str): The selector (ie. //*)
|
|
3492
|
+
variables (Optional[dict): A dictionary of variable name/value to use in substituion; defaults to an empty
|
|
3493
|
+
first_only (bool): If True, only the first matching node is returned; defaults to False.
|
|
3494
|
+
dictionary. Dictionary keys should match a variable specified in the selector.
|
|
3495
|
+
|
|
3496
|
+
Returns:
|
|
3497
|
+
list[ContentNodes]: A list of the matching ContentNodes. If no matches found, list is empty.
|
|
3498
|
+
|
|
3499
|
+
>>> document.select('.')
|
|
3500
|
+
[ContentNode]
|
|
3501
|
+
"""
|
|
3502
|
+
if variables is None:
|
|
3503
|
+
variables = {}
|
|
3504
|
+
if self.content_node:
|
|
3505
|
+
result = self.content_node.select(selector, variables, first_only)
|
|
3506
|
+
if isinstance(result, list):
|
|
3507
|
+
return result
|
|
3508
|
+
elif isinstance(result, ContentNode):
|
|
3509
|
+
return [result]
|
|
3510
|
+
|
|
3511
|
+
return [self.content_node] if bool(result) else []
|
|
3512
|
+
return []
|
|
3513
|
+
|
|
3514
|
+
def get_labels(self) -> List[str]:
|
|
3515
|
+
"""
|
|
3516
|
+
|
|
3517
|
+
Args:
|
|
3518
|
+
|
|
3519
|
+
Returns:
|
|
3520
|
+
List[str]: list of associated labels
|
|
3521
|
+
|
|
3522
|
+
"""
|
|
3523
|
+
return self.labels
|
|
3524
|
+
|
|
3525
|
+
def get_feature_set(self, owner_uri: Optional[str] = None) -> FeatureSet:
|
|
3526
|
+
""" """
|
|
3527
|
+
feature_set = FeatureSet()
|
|
3528
|
+
feature_set.node_features = []
|
|
3529
|
+
for tagged_node in self.get_all_tagged_nodes():
|
|
3530
|
+
node_feature = {"nodeUuid": str(tagged_node.id), "features": []}
|
|
3531
|
+
|
|
3532
|
+
feature_set.node_features.append(node_feature)
|
|
3533
|
+
|
|
3534
|
+
# TODO this needs to be cleaned up, also should it only really
|
|
3535
|
+
# be the tag features?
|
|
3536
|
+
for feature in tagged_node.get_features():
|
|
3537
|
+
if feature.feature_type == "tag":
|
|
3538
|
+
if owner_uri is not None:
|
|
3539
|
+
if (
|
|
3540
|
+
"owner_uri" in feature.value[0]
|
|
3541
|
+
and feature.value[0]["owner_uri"] != owner_uri
|
|
3542
|
+
):
|
|
3543
|
+
continue
|
|
3544
|
+
|
|
3545
|
+
feature_dict = feature.to_dict()
|
|
3546
|
+
feature_dict["featureType"] = feature.feature_type
|
|
3547
|
+
feature_dict["name"] = feature.name
|
|
3548
|
+
|
|
3549
|
+
if isinstance(feature_dict["value"][0], Tag):
|
|
3550
|
+
feature_dict["value"] = [feature_dict["value"][0].to_dict()]
|
|
3551
|
+
|
|
3552
|
+
node_feature["features"].append(feature_dict)
|
|
3553
|
+
|
|
3554
|
+
return feature_set
|
|
3555
|
+
|
|
3556
|
+
def get_all_tagged_nodes(self) -> List[ContentNode]:
|
|
3557
|
+
"""
|
|
3558
|
+
Get all the tagged nodes in the document
|
|
3559
|
+
|
|
3560
|
+
:return:
|
|
3561
|
+
"""
|
|
3562
|
+
return self._persistence_layer.get_all_tagged_nodes()
|
|
3563
|
+
|
|
3564
|
+
|
|
3565
|
+
class TagInstance:
    """Represents a single tag instance spanning one or more content nodes.

    ...

    Attributes
    ----------
    tag_uuid : str
        a string that represents the unique identifier of the tag
    nodes : list
        a list of nodes associated with the tag

    Methods
    -------
    get_value():
        Returns the combined content of all nodes.
    get_data():
        Returns the data of the tag feature with the same uuid as the tag.
    """

    def __init__(self, tag_uuid, nodes):
        self.tag_uuid = tag_uuid
        self.nodes = nodes

    def get_value(self):
        """
        Combine and return the content of all associated nodes.

        Returns
        -------
        str
            the node contents joined with single spaces
        """
        return " ".join(node.get_all_content() for node in self.nodes)

    def get_data(self):
        """
        Return the data of the tag feature whose uuid matches this tag.

        Returns
        -------
        dict
            the matching tag feature's data, or an empty dict when none matches
        """
        for node in self.nodes:
            for tag_feature in node.get_tag_features():
                candidate = tag_feature.value[0]
                if "uuid" in candidate and candidate["uuid"] == self.tag_uuid:
                    return candidate
        return {}
|
|
3619
|
+
|
|
3620
|
+
|
|
3621
|
+
class ContentObjectReference:
    """A reference to a content object within a document.

    This class provides a way to reference a specific content object within a document,
    and includes information about the document's family and the store where the document is located.

    Attributes:
        content_object (ContentObject): The content object being referenced.
        store: The store where the document is located.
        document (Document): The document in which the content object is located.
        document_family: The family to which the document belongs.
    """

    def __init__(
        self, content_object: ContentObject, store, document: Document, document_family
    ):
        # Plain data holder: capture the references that locate the content
        # object (the object itself, its store, document, and document family).
        self.content_object = content_object
        self.store = store
        self.document = document
        self.document_family = document_family
|